[system/pressure] Add plugin to monitor pressure stall information (psi)

Adds a plugin to monitor the pressure stall information (psi) as reported by the Linux kernel.

- groups averages per resource
- rate/derive totals for ease of reading
- resources, intervals and scopes configurable

See: https://www.kernel.org/doc/html/latest/accounting/psi.html
This commit is contained in:
HaseHarald 2022-03-21 18:12:43 +01:00 committed by Lars Kruse
parent c293e35216
commit 5389b09abe
1 changed files with 331 additions and 0 deletions

331
plugins/system/pressure Executable file
View File

@ -0,0 +1,331 @@
#!/bin/bash
: << =cut
=head1 NAME
pressure - Plugin to monitor the pressure stall information for CPU, Memory and
IO as reported by the Linux kernel.
This plugin monitors the pressure stall information (psi) as reported by the
Linux Kernel. By default it reports all average intervals (10 seconds,
60 seconds and 300 seconds) as well as the total values as a rate of change
(DERIVE) for all resources (cpu, memory, io). The average intervals can be
configured if you only deem some of them useful. See CONFIGURATION for
explanations on that.
This is a multigraph plugin that, by default, will create six detail graphs and
one summary graph (so seven in total). The summary graph will contain the 300
seconds average percentages of all resources. The detail graphs are split in two
graphs per resource. One combining all average intervals and one for the
"totals" (rate of change) for the given resource.
There are no defaults for warnings and criticals, because this highly depends on
the system, so you need to configure them yourself (if you want any). It is
recommended that you first look up the meaning of the different values.
For more information on psi see:
https://www.kernel.org/doc/html/latest/accounting/psi.html
=head1 CONFIGURATION
Simply create a symlink in your plugins directory like with any other plugin.
No additional configuration needed, no specific user required (typically).
If you want to configure alerts, just add "warn_" or "crit_" in front of the
internal name.
Optional configuration examples:
[pressure]
env.resources cpu io memory - Specify the resources to monitor. Leave one
out if you don't want this one to be
monitored.
env.intervals avg10 avg60 avg300 - Specify the average intervals to monitor.
Leave one out if you don't want this one to
be monitored.
env.scopes some full - Specify the scopes to monitor. Leave one out
if you don't want it to be monitored.
env.summary_interval avg300 - Specify the interval to be used for the
summary-graph.
env.warn_psi_cpu_avg300_some 5 - Set a warning-level of 5 for
"psi_cpu_avg300_some"
env.crit_psi_io_total_full 2000 - Set a critical-level of 2000 for
"psi_io_total_full"
=head1 AUTHOR
2022, HaseHarald
=head1 LICENSE
LGPLv3
=head1 BUGS
=head1 TODO
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf
=cut
# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory
# and IO, as reported by the Linux kernel.
#
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this plugin. If not, see <http://www.gnu.org/licenses/>.
# Built-in defaults, used whenever the matching environment setting
# (resources, intervals, scopes) is unset or empty.
resource_defaults=('cpu' 'io' 'memory')
interval_defaults=('avg10' 'avg60' 'avg300')
scope_defaults=('some' 'full')

# Directory that holds one pressure file per resource.
pressure_dir=${pressure_dir:-'/proc/pressure/'}

# The settings arrive as single whitespace-separated strings. They are split
# into real arrays here, so that the "${name[@]}" style expansions used by the
# functions below operate on actual arrays instead of on one joined string.
# The word splitting in these three assignments is intentional.
# shellcheck disable=SC2206
pressure_resources=(${resources:-${resource_defaults[*]}})
# shellcheck disable=SC2206
pressure_intervals=(${intervals:-${interval_defaults[*]}})
# shellcheck disable=SC2206
pressure_scopes=(${scopes:-${scope_defaults[*]}})

# Interval that is shown in the summary graph.
summary_interval=${summary_interval:-avg300}
#######################################
# Handler for the "autoconf" request: tell whether the plugin can work here.
# Globals:   pressure_dir (read)
# Outputs:   "yes" when the pressure directory exists,
#            "no (<dir> not found)" otherwise
#######################################
check_autoconf() {
    if [ -d "${pressure_dir}" ]; then
        printf 'yes\n'
    else
        # The path is quoted so that a directory name containing whitespace is
        # passed as ONE argument; unquoted, printf would repeat the whole
        # format string once per word.
        printf 'no (%s not found)\n' "${pressure_dir}"
    fi
}
#######################################
# Extract a single reading from one of the pressure files.
# Globals:   pressure_dir (read)
# Arguments: $1 - resource, used as the file name below pressure_dir
#            $2 - key to extract (e.g. avg10, avg60, avg300, total)
#            $3 - scope selecting the line to read (default: some)
# Outputs:   the number following "<key>=" on the matching line; nothing if
#            the file, the line or the key cannot be found
#######################################
get_pressure_value() {
    local resource=$1
    local interval=$2
    local scope=${3:-some}

    # The path is quoted so a pressure_dir containing whitespace still
    # resolves to a single file name. "--" protects against patterns that
    # start with a dash.
    grep -- "${scope}" "${pressure_dir}/${resource}" \
        | grep -o -E -- "${interval}=[0-9]+(\.[0-9]+)?" \
        | cut -d '=' -f 2
}
#######################################
# Translate an internal identifier into the text used in graph labels.
# Arguments: $1 - kind of identifier: interval, scope or resource
#            $2 - the identifier itself
# Outputs:   the printable name on stdout; an error message on stderr for an
#            unknown kind or identifier
# Returns:   exits with status 2 on unknown input. Callers invoke this through
#            command substitution, so only that subshell terminates.
#######################################
get_printable_name() {
    local kind=$1
    local value=$2
    local printable_name

    case "$kind" in
        interval)
            # Dispatch on the argument. Looking at the global $interval here
            # only worked as long as every caller had set that global to the
            # very same value beforehand.
            case "$value" in
                avg10)
                    printable_name="10sec"
                    ;;
                avg60)
                    printable_name="60sec"
                    ;;
                avg300)
                    printable_name="5min"
                    ;;
                total)
                    printable_name="Total"
                    ;;
                *)
                    printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        scope)
            case "$value" in
                some)
                    printable_name="Some"
                    ;;
                full)
                    printable_name="Full"
                    ;;
                *)
                    printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        resource)
            case "$value" in
                cpu)
                    printable_name="CPU"
                    ;;
                io)
                    printable_name="IO"
                    ;;
                memory)
                    printable_name="Memory"
                    ;;
                *)
                    printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;
        *)
            printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" "$kind" >&2
            exit 2
            ;;
    esac

    printf '%s\n' "$printable_name"
}
#######################################
# Print the complete multigraph configuration: one "average" graph and one
# "total" (rate) graph per resource, followed by the summary graph.
# Globals:   pressure_resources, pressure_intervals, summary_interval (read)
# Outputs:   graph configuration on stdout, each graph closed by a blank line
#######################################
iterate_config() {
    local resource interval printable_resource

    # NOTE: the list expansions in this function stay unquoted on purpose.
    # The lists may be stored as one whitespace-separated string, and word
    # splitting is what turns them into individual items.

    # Detail graphs, part 1: all average intervals of a resource in one graph.
    for resource in ${pressure_resources[@]}; do
        printable_resource=$(get_printable_name resource "$resource")
        printf 'multigraph pressure.%s_avg\n' "$resource"
        printf 'graph_title %s Pressure Stall Information - Average\n' "$printable_resource"
        printf 'graph_category system\n'
        printf 'graph_info Average PSI based latency caused by lack of %s resources.\n' "$printable_resource"
        printf 'graph_vlabel %%\n'
        printf 'graph_scale no\n'
        # output_config resolves the printable interval name itself, so it is
        # not looked up here as well.
        for interval in ${pressure_intervals[@]}; do
            output_config "$resource" "$interval"
        done
        printf '\n'
    done

    # Detail graphs, part 2: the "total" counter of each resource.
    for resource in ${pressure_resources[@]}; do
        printable_resource=$(get_printable_name resource "$resource")
        printf 'multigraph pressure.%s_total\n' "$resource"
        printf 'graph_title %s Pressure Stall Information - Rate\n' "$printable_resource"
        printf 'graph_category system\n'
        printf 'graph_info Total PSI based latency rate caused by lack of %s resources.\n' "$printable_resource"
        printf 'graph_vlabel rate\n'
        output_config "$resource" total
        printf '\n'
    done

    # Summary graph: the configured summary interval of every resource.
    printf 'multigraph pressure\n'
    printf 'graph_title Pressure Stall Information - Average\n'
    printf 'graph_vlabel %%\n'
    printf 'graph_scale no\n'
    printf 'graph_category system\n'
    printf 'graph_info Average PSI based latency caused by lack of resources.\n'
    for resource in ${pressure_resources[@]}; do
        output_config "$resource" "$summary_interval"
    done
    printf '\n'
}
# Emit the current readings for every graph, in the same order in which the
# graphs are configured: per-resource averages, per-resource totals, summary.
# Reads pressure_resources, pressure_intervals and summary_interval; each
# graph section is terminated by an empty line.
iterate_values() {
    # Averages: one graph per resource, all configured intervals inside.
    # (List expansions are deliberately left unquoted so they get word-split.)
    for resource in ${pressure_resources[@]}; do
        printf '%s\n' "multigraph pressure.${resource}_avg"
        for interval in ${pressure_intervals[@]}; do
            output_values "${resource}" "${interval}"
        done
        printf '\n'
    done

    # Totals: one graph per resource.
    for resource in ${pressure_resources[@]}; do
        printf '%s\n' "multigraph pressure.${resource}_total"
        interval='total'
        output_values "${resource}" "${interval}"
        printf '\n'
    done

    # Summary: the summary interval of all resources in a single graph.
    printf '%s\n' 'multigraph pressure'
    for resource in ${pressure_resources[@]}; do
        output_values "${resource}" "${summary_interval}"
    done
    printf '\n'
}
#######################################
# Print the field configuration of one resource/interval pair, one field per
# configured scope.
# Globals:   pressure_scopes (read); warn_psi_<resource>_<interval>_<scope>
#            and crit_psi_<resource>_<interval>_<scope> (read, optional)
# Arguments: $1 - resource (cpu, io, memory)
#            $2 - interval (avg10, avg60, avg300, total)
# Outputs:   .min and .label for every field, .warning/.critical when the
#            matching threshold variable is set, .type DERIVE for "total"
#######################################
output_config() {
    local resource=$1
    local interval=$2
    local scope field printable_resource printable_interval printable_scope
    local this_warn_var this_crit_var

    printable_resource=$(get_printable_name resource "$resource")
    printable_interval=$(get_printable_name interval "$interval")

    # Left unquoted on purpose: the scope list may be one whitespace-separated
    # string that has to be word-split.
    for scope in ${pressure_scopes[@]}; do
        # cpu is only graphed with the "some" scope; any other scope is skipped.
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        field="psi_${resource}_${interval}_${scope}"
        printable_scope=$(get_printable_name scope "$scope")

        printf '%s.min 0\n' "$field"
        printf '%s.label %s %s %s\n' "$field" \
            "$printable_resource" "$printable_interval" "$printable_scope"

        # Thresholds are looked up indirectly by variable name. Characters
        # that are not valid in a variable name are mapped to "_" first.
        this_warn_var="warn_${field}"
        this_warn_var=${this_warn_var//[^A-Za-z0-9_]/_}
        if [ -n "${!this_warn_var}" ]; then
            printf '%s.warning %s\n' "$field" "${!this_warn_var}"
        fi

        this_crit_var="crit_${field}"
        this_crit_var=${this_crit_var//[^A-Za-z0-9_]/_}
        if [ -n "${!this_crit_var}" ]; then
            printf '%s.critical %s\n' "$field" "${!this_crit_var}"
        fi

        # "total" is graphed as a rate of change.
        if [ "$interval" = "total" ]; then
            printf '%s.type DERIVE\n' "$field"
        fi
    done
}
#######################################
# Print the current value of one resource/interval pair, one line per
# configured scope.
# Globals:   pressure_scopes (read)
# Arguments: $1 - resource (cpu, io, memory)
#            $2 - interval (avg10, avg60, avg300, total)
# Outputs:   "psi_<resource>_<interval>_<scope>.value <number>" per scope;
#            "U" (unknown) replaces the number when no reading is available
#######################################
output_values() {
    local resource=$1
    local interval=$2
    local scope value

    # Left unquoted on purpose: the scope list may be one whitespace-separated
    # string that has to be word-split.
    for scope in ${pressure_scopes[@]}; do
        # cpu is only graphed with the "some" scope; any other scope is skipped.
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        value=$(get_pressure_value "$resource" "$interval" "$scope")
        # An empty reading would yield a line without any value. Report it as
        # unknown ("U") instead, which is what munin expects in that case.
        printf 'psi_%s_%s_%s.value %s\n' \
            "$resource" "$interval" "$scope" "${value:-U}"
    done
}
#######################################
# Print a short help text, including a configuration example.
# Outputs:   the help text on stderr; nothing on stdout
#######################################
output_usage() {
    # Name the script was called as, without its directory part. It is quoted
    # below so that an unusual file name cannot split into several arguments.
    local self=${0##*/}

    printf >&2 '%s\n' \
        "${self} - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel." \
        "Usage: ${self} [config]" \
        "You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings." \
        "You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels." \
        "To do so use a syntax like this:" \
        "[pressure]" \
        "env.resources cpu io memory" \
        "env.intervals avg10 avg60 avg300" \
        "env.scopes some full" \
        "env.summary_interval avg300" \
        "env.warn_psi_cpu_avg300_some 5" \
        "env.crit_psi_io_total_full 2000"
}
# Entry point: pick the action from the command line.
#   no argument          -> print the current values
#   "auto" / "autoconf"  -> report whether the plugin is usable
#   "config"             -> print the graph configuration
#   anything else        -> print the usage text and fail with status 1
if [ "$#" -eq 0 ]; then
    iterate_values
elif [ "$#" -eq 1 ]; then
    case "$1" in
        autoconf | auto)
            check_autoconf
            ;;
        config)
            iterate_config
            ;;
        *)
            output_usage
            exit 1
            ;;
    esac
else
    # More than one argument is never valid.
    output_usage
    exit 1
fi