[system/pressure] Add plugin to monitor pressure stall information (psi)
Adds a plugin to monitor the pressure stall information (psi) as reported by the Linux kernel. - groups averages per resource - rate/derive totals for ease of reading - resources, intervals and scopes configurable See: https://www.kernel.org/doc/html/latest/accounting/psi.html
This commit is contained in:
parent
c293e35216
commit
5389b09abe
|
@ -0,0 +1,331 @@
|
|||
#!/bin/bash
|
||||
|
||||
|
||||
: << =cut
|
||||
|
||||
=head1 NAME
|
||||
|
||||
pressure - Plugin to monitor the pressure stall information for CPU, Memory and
|
||||
IO as reported by the Linux kernel.
|
||||
|
||||
This plugin monitors the pressure stall information (psi) as reported by the
|
||||
Linux Kernel. By default it reports all average intervals (10 seconds,
|
||||
60 seconds and 300 seconds) as well as the total values as a rate of change
|
||||
(DERIVE) for all resources (cpu, memory, io). The average intervals can be
|
||||
configured if you only deem some of them useful. See CONFIGURATION for
|
||||
explanations on that.
|
||||
|
||||
This is a multigraph plugin that, by default, will create six detail graphs and
|
||||
one summary graph (so seven in total). The summary graph will contain the 300
|
||||
seconds average percentages of all resources. The detail graphs are split in two
|
||||
graphs per resource. One combining all average intervals and one for the
|
||||
"totals" (rate of change) for the given resource.
|
||||
|
||||
There are no defaults for warnings and criticals, because this highly depends on
|
||||
the system, so you need to configure them yourself (if you want any). It is
|
||||
recommended that you first look up the meaning of the different values.
|
||||
|
||||
For more information on psi see:
|
||||
https://www.kernel.org/doc/html/latest/accounting/psi.html
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
Simply create a symlink in your plugins directory like with any other plugin.
|
||||
No additional configuration needed, no specific user required (typically).
|
||||
|
||||
If you want to configure alerts, just add "warn_" or "crit_" in front of the
|
||||
internal name.
|
||||
|
||||
Optional configuration examples:
|
||||
|
||||
[pressure]
|
||||
env.resources cpu io memory - Specify the resources to monitor. Leave one
|
||||
out if you don't want this one to be
|
||||
monitored.
|
||||
env.intervals avg10 avg60 avg300 - Specify the average intervals to monitor.
|
||||
Leave one out if you don't want this one to
|
||||
be monitored
|
||||
env.scopes some full - Specify the scopes to monitor. Leave one out
|
||||
if you don't want it to be monitored.
|
||||
env.summary_interval avg300 - Specify the interval to be used for the
|
||||
summary-graph.
|
||||
env.warn_psi_cpu_avg300_some 5 - Set a warning-level of 5 for
|
||||
"psi_cpu_avg300_some"
|
||||
env.crit_psi_io_total_full 2000 - Set a critical-level of 2000 for
|
||||
"psi_io_total_full"
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
2022, HaseHarald
|
||||
|
||||
=head1 LICENSE
|
||||
|
||||
LGPLv3
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
=head1 TODO
|
||||
|
||||
=head1 MAGIC MARKERS
|
||||
|
||||
#%# family=auto
|
||||
#%# capabilities=autoconf
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
# This file contains a munin-plugin to graph the psi (pressure) for CPU, Memory
|
||||
# and IO, as reported by the Linux kernel.
|
||||
#
|
||||
# This is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Lesser General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public License
|
||||
# along with this plugin. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
# Built-in defaults, used whenever the matching environment setting is unset
# or empty.
resource_defaults=(cpu io memory)
interval_defaults=(avg10 avg60 avg300)
scope_defaults=(some full)

# Scalar settings: the directory holding the psi files and the average
# interval that is plotted on the summary graph.
pressure_dir="${pressure_dir:-/proc/pressure/}"
summary_interval="${summary_interval:-avg300}"

# List settings. Each of these ends up as ONE space-separated string (not an
# array); the loops in this plugin expand them unquoted so that the shell
# splits them back into words.
pressure_resources=${resources[@]:-${resource_defaults[@]}}
pressure_intervals=${intervals[@]:-${interval_defaults[@]}}
pressure_scopes=${scopes[@]:-${scope_defaults[@]}}
|
||||
|
||||
# Answer munin's "autoconf" request.
#
# Prints "yes" when the directory named by ${pressure_dir} exists, and
# "no (<dir> not found)" otherwise.
check_autoconf() {
    if [ -d "${pressure_dir}" ]; then
        printf 'yes\n'
    else
        # The path must be quoted: printf re-applies its format once per
        # surplus argument, so a path containing whitespace would otherwise
        # be printed as several garbled lines.
        printf 'no (%s not found)\n' "${pressure_dir}"
    fi
}
|
||||
|
||||
# Print a single value from one of the psi files below ${pressure_dir}.
#
# Arguments:
#   $1 - resource; used as the file name inside ${pressure_dir}
#   $2 - key to extract from the line (e.g. avg10, avg60, avg300, total)
#   $3 - scope, i.e. the word the wanted line starts with (default: some)
#
# Output: the number following "<key>=" on the matching line. Nothing is
# printed when the line or the key is not present.
get_pressure_value() {
    local resource=$1
    local interval=$2
    local scope=${3:-some}

    # - The scope is anchored to the start of the line so it can never match
    #   somewhere in the middle of another line.
    # - The path is quoted (and a trailing slash of the directory dropped) so
    #   directories containing whitespace work.
    grep -E "^${scope}[[:space:]]" "${pressure_dir%/}/${resource}" \
        | grep -o -E "${interval}=[0-9]+(\.[0-9]+)?" \
        | cut -d '=' -f 2
}
|
||||
|
||||
# Translate an internal identifier into the name shown in graph titles and
# labels.
#
# Arguments:
#   $1 - kind of identifier: interval, scope or resource
#   $2 - the identifier itself
#
# Output: the printable name on stdout.
# Errors: for an unknown kind or identifier a message is written to stderr
# and "exit 2" is executed. When the function is called through command
# substitution that only terminates the substitution's subshell.
get_printable_name() {
    local kind=$1
    local value=$2
    local printable_name

    case "$kind" in

        interval)
            # Decide on the value that was passed in. Looking at a variable
            # named "interval" instead would only work while the caller
            # happens to keep the same value in a variable of that name.
            case "$value" in
                avg10)
                    printable_name="10sec"
                    ;;
                avg60)
                    printable_name="60sec"
                    ;;
                avg300)
                    printable_name="5min"
                    ;;
                total)
                    printable_name="Total"
                    ;;
                *)
                    printf "ERROR: Could not determine interval %s ! Must be one of 'avg10' 'avg60' 'avg300' 'total'\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        scope)
            case "$value" in
                some)
                    printable_name="Some"
                    ;;
                full)
                    printable_name="Full"
                    ;;
                *)
                    printf "ERROR: Could not determine scope %s ! Must be one of 'full' 'some'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        resource)
            case "$value" in
                cpu)
                    printable_name="CPU"
                    ;;
                io)
                    printable_name="IO"
                    ;;
                memory)
                    printable_name="Memory"
                    ;;
                *)
                    printf "ERROR: Could not determine resource-type %s ! Must be one of 'cpu' 'io' 'memory'.\n" "$value" >&2
                    exit 2
                    ;;
            esac
            ;;

        *)
            printf "ERROR: Could not determine kind %s ! Must be one of 'interval' 'scope' 'resource'\n" "$kind" >&2
            exit 2
            ;;
    esac

    printf "%s\n" "$printable_name"
}
|
||||
|
||||
# Print the complete munin "config" output.
#
# Emits, in this order:
#   1. one "pressure.<resource>_avg" graph per resource, holding every
#      configured average interval,
#   2. one "pressure.<resource>_total" graph per resource, holding the
#      "total" field,
#   3. the "pressure" summary graph, holding ${summary_interval} of every
#      resource.
# The per-field lines are produced by output_config.
#
# Reads: pressure_resources, pressure_intervals, summary_interval.
iterate_config() {
    local resource interval printable_resource

    # NOTE: the list variables are expanded unquoted on purpose; they hold
    # space-separated words that the shell has to split.
    for resource in ${pressure_resources[@]}; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph pressure.%s_avg\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Average\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Average PSI based latency caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel %%\n"
        printf "graph_scale no\n"
        for interval in ${pressure_intervals[@]}; do
            # output_config resolves the printable interval name itself, so
            # no separate lookup is needed here.
            output_config "$resource" "$interval"
        done
        echo ""
    done

    for resource in ${pressure_resources[@]}; do
        printable_resource=$( get_printable_name resource "$resource" )
        printf "multigraph pressure.%s_total\n" "$resource"
        printf "graph_title %s Pressure Stall Information - Rate\n" "$printable_resource"
        printf "graph_category system\n"
        printf "graph_info Total PSI based latency rate caused by lack of %s resources.\n" "$printable_resource"
        printf "graph_vlabel rate\n"
        output_config "$resource" "total"
        echo ""
    done

    printf "multigraph pressure\n"
    printf "graph_title Pressure Stall Information - Average\n"
    printf "graph_vlabel %%\n"
    printf "graph_scale no\n"
    printf "graph_category system\n"
    printf "graph_info Average PSI based latency caused by lack of resources.\n"
    for resource in ${pressure_resources[@]}; do
        output_config "$resource" "$summary_interval"
    done
    echo ""
}
|
||||
|
||||
# Print the current values for every graph of this plugin.
#
# The graphs appear in this order: "pressure.<resource>_avg" for each
# resource, "pressure.<resource>_total" for each resource, and finally the
# "pressure" summary graph. The value lines themselves are produced by
# output_values.
#
# Reads: pressure_resources, pressure_intervals, summary_interval.
iterate_values() {
    local resource interval

    # NOTE: the list variables are expanded unquoted on purpose; they hold
    # space-separated words that the shell has to split. Everything handed to
    # printf or output_values is quoted so it is passed on as one argument.
    for resource in ${pressure_resources[@]}; do
        printf "multigraph pressure.%s_avg\n" "$resource"
        for interval in ${pressure_intervals[@]}; do
            output_values "$resource" "$interval"
        done
        echo ""
    done

    for resource in ${pressure_resources[@]}; do
        printf "multigraph pressure.%s_total\n" "$resource"
        output_values "$resource" "total"
        echo ""
    done

    printf "multigraph pressure\n"
    for resource in ${pressure_resources[@]}; do
        output_values "$resource" "$summary_interval"
    done
    echo ""
}
|
||||
|
||||
# Print the munin field configuration for one resource/interval pair.
#
# Arguments:
#   $1 - resource (cpu, io, memory)
#   $2 - interval (avg10, avg60, avg300, total)
#
# For every configured scope the field "psi_<resource>_<interval>_<scope>"
# gets a ".min 0" and a ".label" line. A ".warning" / ".critical" line is
# added when the variable "warn_<field>" / "crit_<field>" is set, and
# ".type DERIVE" is added for the "total" interval.
#
# Reads: pressure_scopes and the optional warn_*/crit_* variables.
output_config() {
    local resource=$1
    local interval=$2
    local scope field
    local printable_resource printable_interval printable_scope
    local this_warn_var this_crit_var

    printable_resource=$( get_printable_name resource "$resource" )
    printable_interval=$( get_printable_name interval "$interval" )

    # pressure_scopes is expanded unquoted on purpose: it holds
    # space-separated words that the shell has to split.
    for scope in ${pressure_scopes[@]}; do
        # For cpu only the "some" scope is exported.
        # NOTE(review): presumably because the kernel historically offered no
        # meaningful "full" line for cpu - confirm before changing.
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        field="psi_${resource}_${interval}_${scope}"
        printable_scope=$( get_printable_name scope "$scope" )

        printf "%s.min 0\n" "$field"
        printf "%s.label %s %s %s\n" "$field" "$printable_resource" "$printable_interval" "$printable_scope"

        # Threshold variables are looked up indirectly by name. Characters
        # that are not valid in a variable name are replaced by "_" first.
        this_warn_var="warn_${field//[^A-Za-z0-9_]/_}"
        if [ -n "${!this_warn_var}" ]; then
            printf "%s.warning %s\n" "$field" "${!this_warn_var}"
        fi

        this_crit_var="crit_${field//[^A-Za-z0-9_]/_}"
        if [ -n "${!this_crit_var}" ]; then
            printf "%s.critical %s\n" "$field" "${!this_crit_var}"
        fi

        if [ "$interval" = "total" ]; then
            printf "%s.type DERIVE\n" "$field"
        fi
    done
}
|
||||
|
||||
# Print the munin value lines for one resource/interval pair.
#
# Arguments:
#   $1 - resource (cpu, io, memory)
#   $2 - interval (avg10, avg60, avg300, total)
#
# One "psi_<resource>_<interval>_<scope>.value <n>" line is printed per
# configured scope; the number is obtained from get_pressure_value.
#
# Reads: pressure_scopes.
output_values() {
    local resource=$1
    local interval=$2
    local scope value

    # pressure_scopes is expanded unquoted on purpose: it holds
    # space-separated words that the shell has to split.
    for scope in ${pressure_scopes[@]}; do
        # For cpu only the "some" scope is exported (mirrors output_config).
        if [ "$resource" = "cpu" ] && [ "$scope" != "some" ]; then
            continue
        fi

        value=$( get_pressure_value "$resource" "$interval" "$scope" )
        # Report "U" (munin's marker for an unknown value) instead of an
        # empty value when nothing could be read.
        printf "psi_%s_%s_%s.value %s\n" "$resource" "$interval" "$scope" "${value:-U}"
    done
}
|
||||
|
||||
# Print a short description, the usage line and a configuration example.
# Everything goes to stderr; stdout stays untouched.
output_usage() {
    # Quoted below so that a plugin (symlink) name containing whitespace is
    # printed as one word instead of repeating the printf format.
    local self=${0##*/}

    printf >&2 "%s - munin plugin to graph pressure stall information for CPU, Memory and IO as reported by the Linux kernel.\n" "$self"
    printf >&2 "Usage: %s [config]\n" "$self"
    # Static text: the quoted delimiter disables every expansion in the body.
    cat >&2 << 'EOF'
You may use environment settings in a plugin-config file, used by munin (for example /etc/munin/plugin-conf.d/munin-node) to further adjust settings.
You can use these settings to configure which resources, intervals or scopes are monitored or to configure warning and critical levels.
To do so use a syntax like this:
[pressure]
env.resources cpu io memory
env.intervals avg10 avg60 avg300
env.scopes some full
env.summary_interval avg300
env.warn_psi_cpu_avg300_some 5
env.crit_psi_io_total_full 2000
EOF
}
|
||||
|
||||
# Entry point. The decision is made on "<argument count>:<first argument>":
#   no argument        -> print the current values
#   auto | autoconf    -> report whether the plugin can run here
#   config             -> print the graph configuration
#   anything else      -> print the usage text and exit with status 1
case "$#:${1-}" in
    0:)
        iterate_values
        ;;
    1:auto | 1:autoconf)
        check_autoconf
        ;;
    1:config)
        iterate_config
        ;;
    *)
        output_usage
        exit 1
        ;;
esac
|
Loading…
Reference in New Issue