munin-contrib/plugins/disk/btrfs_device_stats

276 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
=pod
=head1 NAME
btrfs_device_stats - Script to monitor btrfs device statistics
=head1 CONFIGURATION
Simply create a symlink in your plugins directory like with any other plugin.
Must be run as root.
[btrfs_device_stats]
user root
You can optionally configure the warning and critical limits. By default,
warning is set to 1 and critical is not set at all. You can set the limits
either for the entire plugin, per individual metric, or down to a specific
device. The more specific values take precedence over the general ones.
See the following example:
[btrfs_device_stats]
user root
env.warning 2
env.critical 4
env.flags_warning 23
env.read_errs_critical 42
env.generation_errs_a04f3d6b_438c_4b61_979b_e5fda7fb858c_1_warning 187
=head2 DEFAULT CONFIGURATION
=head1 BUGS
=head1 AUTHOR
2019-2021, HaseHarald
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf
=head1 LICENSE
LGPLv3
=cut
"""
# This file contains a munin-plugin to gather btrfs statistics per device.
#
# This is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this plugin. If not, see <http://www.gnu.org/licenses/>.
import btrfs
import os
import sys
# Per-device fields in output order: (field name, label, has limits).
# nr_items is graphed but carries no warning/critical limits.
_DEVICE_FIELDS = (
    ('corruption_errs', 'Corruption Errors', True),
    ('flush_errs', 'Flush Errors', True),
    ('generation_errs', 'Generation Errors', True),
    ('read_errs', 'Read Errors', True),
    ('write_errs', 'Write Errors', True),
    ('nr_items', 'Nr. of Items', False),
    ('flags', 'Nr. of Flags', True),
)

# The per-device read_errs warning used to be looked up under the prefix
# "read_" instead of "read_errs_". That spelling is still honoured (at lower
# priority than the correct one) so existing configurations keep working.
_LEGACY_DEVICE_PREFIXES = {('read_errs', 'warning'): 'read'}


def _limit_from_env(field, device_key, level, default):
    """Return the limit for one field of one device from the environment.

    Settings are consulted from least to most specific, each one overriding
    the previous: ``default`` (the plugin-wide value), ``<field>_<level>``
    and finally ``<field>_<device_key>_<level>``.

    field      -- field name, e.g. "read_errs"
    device_key -- "<fsid>_<devid>", fsid written with underscores
    level      -- "warning" or "critical"
    default    -- value returned when nothing more specific is set
    """
    limit = os.getenv(field + '_' + level, default)
    legacy_prefix = _LEGACY_DEVICE_PREFIXES.get((field, level))
    if legacy_prefix is not None:
        legacy_name = legacy_prefix + '_' + device_key + '_' + level
        limit = os.getenv(legacy_name, limit)
    return os.getenv(field + '_' + device_key + '_' + level, limit)


def munin_config(fs):
    """Print the munin "config" section for one btrfs filesystem.

    Emits one multigraph with the totals over all devices, followed by one
    sub-graph per device that carries the warning/critical limits read from
    the environment.

    fs -- filesystem object providing ``fsid``, ``path``, ``devices()`` and
          ``dev_info(devid)``; main() passes a btrfs.FileSystem.
    """
    # Dashes of the fsid are replaced so it can be used in graph names and
    # in environment variable names.
    fsid = str(fs.fsid).replace('-', '_')

    print("multigraph btrfs_device_stats_" + fsid)
    print("graph_args --base 1000 -l 0")
    print("graph_vlabel total btrfs attribute value")
    print("graph_title btrfs total device stats for " + fs.path)
    print("graph_category disk")
    print("graph_info This graph shows the total stats of devices used by btrfs")
    for field, label, _has_limits in _DEVICE_FIELDS:
        print(field + "_total.label " + label)
    print("")

    # Plugin-wide defaults: warning is 1, critical is unset (False).
    warning = os.getenv('warning', default="1")
    critical = os.getenv('critical', default=False)

    for this_device in fs.devices():
        devid = str(this_device.devid)
        device_key = fsid + "_" + devid
        dev_name = fs.dev_info(this_device.devid).path.replace('/dev/', '')

        print("multigraph btrfs_device_stats_" + fsid + "." + devid)
        print("graph_args --base 1000 -l 0")
        print("graph_vlabel btrfs attribute value")
        print("graph_title btrfs device stats for " + dev_name)
        print("graph_category disk")
        print("graph_info This graph shows stats of devices used by btrfs")

        for field, label, has_limits in _DEVICE_FIELDS:
            print(field + ".label " + label)
            if not has_limits:
                continue
            print(field + ".warning " +
                  _limit_from_env(field, device_key, 'warning', warning))
            crit = _limit_from_env(field, device_key, 'critical', critical)
            # An unset (False) or empty critical limit emits no line at all.
            if crit:
                print(field + ".critical " + crit)
        print("")
def munin_values(fs):
    """Print the current counter values for one btrfs filesystem.

    For every device of the filesystem a per-device multigraph section is
    printed, then a final section holding the sums over all devices.

    fs -- filesystem object providing ``fsid``, ``devices()`` and
          ``dev_stats(devid, reset)``; main() passes a btrfs.FileSystem.
    """
    # Attributes read from each device's stats object, in output order.
    counters = (
        "corruption_errs",
        "flush_errs",
        "generation_errs",
        "read_errs",
        "write_errs",
        "nr_items",
        "flags",
    )
    sums = dict.fromkeys(counters, 0)
    graph = "btrfs_device_stats_" + str(fs.fsid).replace('-', '_')

    for device in fs.devices():
        # The second argument is passed as False exactly as before;
        # presumably it is a "reset counters" switch (confirm in python-btrfs).
        stats = fs.dev_stats(device.devid, False)
        readings = [(name, getattr(stats, name)) for name in counters]
        for name, reading in readings:
            sums[name] = sums[name] + reading

        print("multigraph", graph + "." + str(device.devid))
        for name, reading in readings:
            print(name + ".value", reading)
        print("")

    print("multigraph", graph)
    for name in counters:
        print(name + "_total.value", sums[name])
    print("")
def main():
    """Dispatch on the first command line argument.

    "autoconf" -- answer "yes" when at least one mounted btrfs filesystem
                  is found, otherwise "no (<reason>)". The file header
                  declares ``capabilities=autoconf``, so this argument must
                  not fall through to the value output.
    "config"   -- print the graph configuration of every mounted btrfs
                  filesystem.
    otherwise  -- print the current values of every mounted btrfs
                  filesystem.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else ""
    paths = list(btrfs.utils.mounted_filesystem_paths())

    if mode == "autoconf":
        if paths:
            print("yes")
        else:
            print("no (no mounted btrfs filesystem found)")
        return

    # The mode is the same for every filesystem, so pick the printer once.
    report = munin_config if mode == "config" else munin_values
    for path in paths:
        with btrfs.FileSystem(path) as fs:
            report(fs)
if __name__ == "__main__":
    main()
    # sys.exit instead of the bare exit(): the latter is a helper injected
    # by the site module and is missing when Python is started with -S.
    sys.exit(0)