munin-contrib/plugins/gpu/nvidia_gpu_

310 lines
8.9 KiB
Bash
Executable File

#!/bin/bash
# -*- sh -*-
: << =cut
=head1 NAME
nvidia_gpu_ - Wildcard plugin to monitor NVIDIA GPUs. Uses nvidia-smi utility,
usually bundled with NVIDIA GPU driver, to obtain information.
=head1 CONFIGURATION
This is a wildcard plugin. The suffix of the symlink name (the part after
"nvidia_gpu_") selects the quantity to monitor: temp, mem, fan, power,
utilization, rx or tx.
This plugin uses the following configuration variables:
[nvidia_gpu_*]
env.smiexec - Location of nvidia-smi executable.
env.warning - Warning temperature
env.critical - Critical temperature
=head2 DEFAULT CONFIGURATION
The default configuration is to set "env.smiexec" to /usr/bin/nvidia-smi and
assume warning and critical temperatures of 75 and 95 degrees Celsius, respectively.
=head2 EXAMPLE WILDCARD USAGE
C<ln -s /usr/share/munin/plugins/nvidia_gpu_ /etc/munin/plugins/nvidia_gpu_temp>
...will monitor the temperature of available GPUs.
=head1 TODO
=over 4
=item *
Add support for specific professional GPU features such as number of compute processes, clocks and so on.
=item *
Use multigraphs for multiple GPUs (http://munin-monitoring.org/wiki/MultigraphSampleOutput).
=back
=head1 AUTHOR
Nuno Fachada
faken@fakenmc.com
=head1 LICENSE
GNU General Public License, version 2
http://www.gnu.org/licenses/gpl-2.0.html
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf suggest
=cut
# The quantity to graph is encoded in the file name this plugin was invoked
# under: drop the leading "nvidia_gpu_" and keep whatever follows.
pluginFile=$(basename "$0")
name=${pluginFile#nvidia_gpu_}
# Path of the nvidia-smi binary: env.smiexec when it is set and non-empty,
# otherwise the stock location.
nvSmiExec='/usr/bin/nvidia-smi'
if [ -n "$smiexec" ]; then
  nvSmiExec=$smiexec
fi
# Munin "autoconf" probe: answer "yes" only when the nvidia-smi binary exists
# and is executable, otherwise "no" with the reason. Either way the probe
# itself succeeds, so the exit status is always 0.
case "$1" in
  autoconf)
    [ -x "$nvSmiExec" ] || {
      echo "no (nvidia-smi executable not found)"
      exit 0
    }
    echo yes
    exit 0
    ;;
esac
# Munin "suggest" request: list every quantity this wildcard plugin can graph,
# one per line, then stop.
case "$1" in
  suggest)
    printf '%s\n' "temp" "mem" "fan" "power" "utilization" "rx" "tx"
    exit 0
    ;;
esac
# List the GPUs. Only lines of the form "GPU <index>: ..." are kept, so an
# empty listing, an error message printed on stdout or any other extra line
# is never mistaken for a device.
nGpusOutput=$("$nvSmiExec" -L | grep -E '^GPU [0-9]+:')
# Count the device lines. grep -c prints 0 when nothing matches, whereas the
# previous `echo ... | wc -l` reported 1 even for an empty listing, which
# made the check below unreachable.
nGpus=$(echo "$nGpusOutput" | grep -c -E '^GPU [0-9]+:')
if [ "$nGpus" -eq 0 ]; then
  # Nothing to monitor: report on stderr so the message is not parsed as
  # plugin output, and fail.
  echo "No NVIDIA GPUs detected. Exiting." >&2
  exit 1
fi
# Get full output from nvidia-smi; every later stage parses this text.
smiOutput=$("$nvSmiExec" -q)
# Print the display name of GPU number $1 (0-based): the text that precedes
# the first "(" on the corresponding line of $nGpusOutput.
gpu_name() {
  echo "$nGpusOutput" | sed -n "$(($1 + 1))p" | cut -d '(' -f 1
}

# Print one ".info" line per GPU for the current quantity.
#   $1 - descriptive text placed between ".info" and the GPU name
print_gpu_info() {
  local idx=0
  while [ "$idx" -lt "$nGpus" ]; do
    echo "${name}${idx}.info $1 $(gpu_name "$idx")"
    idx=$((idx + 1))
  done
}

# Check if config was requested
if [ "$1" = "config" ]; then
  # Get driver version (shown in every graph_info line)
  driverVersion=$(echo "$smiOutput" | grep "Driver Version" | cut -d : -f 2 | tr -d ' ')
  # Configure the graph depending on which quantity will be plotted
  case $name in
    temp)
      echo 'graph_title GPU temperature'
      echo 'graph_args -l 0 -u 120'
      echo 'graph_vlabel degrees Celsius'
      echo 'graph_category sensors'
      echo "graph_info Temperature information for NVIDIA GPUs using driver version $driverVersion"
      # Temperature is the only quantity with alert thresholds, taken from
      # env.warning / env.critical with defaults of 75 / 95.
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        echo "${name}${nGpusCounter}.warning ${warning:-75}"
        echo "${name}${nGpusCounter}.critical ${critical:-95}"
        echo "${name}${nGpusCounter}.info Temperature information for $(gpu_name "$nGpusCounter")"
        nGpusCounter=$((nGpusCounter + 1))
      done
      ;;
    mem)
      print_gpu_info "Memory information for"
      # Collect the total memory of each GPU into one comma-separated
      # description for graph_info.
      gpusTotalMemOutput=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | tr -d ' ')
      gpusTotalMem=''
      nGpusCounter=0
      while [ "$nGpusCounter" -lt "$nGpus" ]; do
        gpuMem=$(echo "$gpusTotalMemOutput" | sed -n "$((nGpusCounter + 1))p")
        # Prefix a separator for every entry but the first.
        gpusTotalMem="${gpusTotalMem:+${gpusTotalMem}, }${gpuMem} for GPU ${nGpusCounter}"
        nGpusCounter=$((nGpusCounter + 1))
      done
      echo 'graph_title GPU memory usage'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category memory'
      echo "graph_info FB Memory usage for NVIDIA GPUs using driver version $driverVersion (total memory is $gpusTotalMem)"
      ;;
    fan)
      echo 'graph_title GPU fan speed'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category sensors'
      echo "graph_info Fan speed of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info "Fan information for"
      ;;
    power)
      echo 'graph_title GPU power consumption'
      echo 'graph_vlabel Watt'
      echo 'graph_category sensors'
      echo "graph_info power consumption of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info "power consumption of"
      ;;
    utilization)
      echo 'graph_title GPU utilization'
      echo 'graph_args -l 0 -u 100'
      echo 'graph_vlabel %'
      echo 'graph_category system'
      echo "graph_info GPU utilization of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info "GPU utilization information for"
      ;;
    rx)
      echo 'graph_title GPU Rx Throughput'
      # The plotted value is the nvidia-smi KB/s reading multiplied by 1024,
      # i.e. bytes (not bits) per second.
      echo 'graph_vlabel Bytes/sec'
      echo 'graph_category system'
      echo "graph_info Rx Throughput of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info "Rx Throughput of"
      ;;
    tx)
      echo 'graph_title GPU Tx Throughput'
      # Same unit reasoning as for rx: KB/s reading times 1024 is bytes/sec.
      echo 'graph_vlabel Bytes/sec'
      echo 'graph_category system'
      echo "graph_info Tx Throughput of NVIDIA GPUs using driver version $driverVersion"
      print_gpu_info "Tx Throughput of"
      ;;
    *)
      # Unknown symlink suffix: diagnostics go to stderr so they are not
      # parsed as graph configuration.
      echo "Can't run without a proper symlink. Exiting." >&2
      echo "Try running munin-node-configure --suggest." >&2
      exit 1
      ;;
  esac
  # Common stuff for all quantities: one labelled data series per GPU
  nGpusCounter=0
  while [ "$nGpusCounter" -lt "$nGpus" ]; do
    echo "${name}${nGpusCounter}.label $(gpu_name "$nGpusCounter")"
    nGpusCounter=$((nGpusCounter + 1))
  done
  exit 0
fi
# Succeed when $1 is a non-empty string of decimal digits, i.e. a value that
# is safe to feed into shell arithmetic.
is_uint() {
  case $1 in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# For each detected GPU, take the matching line of $1, multiply it by 1024
# and print the result on its own line. A reading that is not a plain
# integer (for example "N/A" or a missing line) is printed as "U", Munin's
# marker for an unknown value, instead of triggering an arithmetic error.
#   $1 - newline-separated readings, one per GPU
scale_by_1024() {
  local idx=0 reading
  while [ "$idx" -lt "$nGpus" ]; do
    reading=$(echo "$1" | sed -n "$((idx + 1))p")
    if is_uint "$reading"; then
      # 10# forces base 10 so a leading zero is not read as octal.
      echo "$((10#$reading * 1024))"
    else
      echo U
    fi
    idx=$((idx + 1))
  done
}

# Get requested value: $valueGpus ends up holding one reading per line, in
# GPU order.
case $name in
  temp)
    valueGpus=$(echo "$smiOutput" | grep "GPU Current Temp" | cut -d : -f 2 | cut -d ' ' -f 2)
    ;;
  mem)
    totalMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Total" | cut -d : -f 2 | cut -d ' ' -f 2)
    usedMemGpus=$(echo "$smiOutput" | grep -v BAR1 | grep -A 3 "Memory Usage" | grep "Used" | cut -d : -f 2 | cut -d ' ' -f 2)
    valueGpus=''
    nGpusCounter=0
    while [ "$nGpusCounter" -lt "$nGpus" ]; do
      totalMemGpu=$(echo "$totalMemGpus" | sed -n "$((nGpusCounter + 1))p")
      usedMemGpu=$(echo "$usedMemGpus" | sed -n "$((nGpusCounter + 1))p")
      # Only compute a percentage from two integers and a non-zero total;
      # anything else would be a syntax or division-by-zero error.
      if is_uint "$usedMemGpu" && is_uint "$totalMemGpu" && [ "$((10#$totalMemGpu))" -ne 0 ]; then
        percentMemUsed=$((10#$usedMemGpu * 100 / 10#$totalMemGpu))
      else
        percentMemUsed=U
      fi
      valueGpus="${valueGpus}${percentMemUsed}"$'\n'
      nGpusCounter=$((nGpusCounter + 1))
    done
    ;;
  fan)
    valueGpus=$(echo "$smiOutput" | grep "Fan Speed" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  power)
    valueGpus=$(echo "$smiOutput" | grep "Power Draw" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  utilization)
    valueGpus=$(echo "$smiOutput" | grep "Gpu" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    ;;
  rx)
    rxGpus=$(echo "$smiOutput" | grep "Rx Throughput" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    valueGpus=$(scale_by_1024 "$rxGpus")
    ;;
  tx)
    txGpus=$(echo "$smiOutput" | grep "Tx Throughput" | cut -d ':' -f 2 | cut -d ' ' -f 2)
    valueGpus=$(scale_by_1024 "$txGpus")
    ;;
  *)
    # Unknown symlink suffix: diagnostics go to stderr so they are not
    # parsed as plugin values.
    echo "Can't run without a proper symlink. Exiting." >&2
    echo "Try running munin-node-configure --suggest." >&2
    exit 1
    ;;
esac
# Print requested value: one "<name><N>.value <reading>" line per GPU, taking
# line N+1 of $valueGpus as the reading for GPU N.
# A reading that is missing or is not a plain decimal number (for example
# "N/A") is reported as "U", Munin's marker for an unknown value, rather than
# being passed through verbatim.
numberPattern='^-?[0-9]+([.][0-9]+)?$'
nGpusCounter=0
while [ "$nGpusCounter" -lt "${nGpus:-0}" ]; do
  value=$(echo "$valueGpus" | sed -n "$((nGpusCounter + 1))p")
  if ! [[ $value =~ $numberPattern ]]; then
    value=U
  fi
  echo "${name}${nGpusCounter}.value $value"
  nGpusCounter=$((nGpusCounter + 1))
done