munin-contrib/plugins/varnish/varnish

453 lines
15 KiB
Python
Executable File

#!/usr/bin/env python3
# pylint: disable=invalid-name
# pylint: enable=invalid-name
"""Munin plugin to monitor Varnish status.
=head1 NAME
varnish - monitor Varnish status
=head1 APPLICABLE SYSTEMS
Systems running Varnish cache.
=head1 CONFIGURATION
You need to create a file named varnish placed in the directory
/etc/munin/plugin-conf.d/ with the following config:
=over 2
[varnish]
env.backend_unhealthy_warning :1
env.threads_destroyed_warning :1
env.threads_failed_warning :1
env.threads_warning :1
=back
=head1 NOTES
This plugin is tested on Varnish 4 but it should also work with v5 and v6.
=head1 AUTHOR
Kim B. Heino <b@bbbs.net>
This is heavily based on the varnish4_ plugin from Munin contrib
by Kristian Lyngstol <kristian@bohemians.org> / Redpill Linpro AS.
=head1 LICENSE
GPLv2
=head1 MAGIC MARKERS
#%# family=auto
#%# capabilities=autoconf
=cut
"""
import json
import subprocess
import sys
import os
def run_binary(arg):
    """Execute the command ``arg`` and return its standard output as text.

    ``arg`` is the argument list handed to ``subprocess`` (no shell is
    involved).  Standard error is captured and discarded, and the exit
    status is not inspected.  The output bytes are decoded as UTF-8 with
    undecodable bytes dropped.

    Returns an empty string when the process cannot be started
    (``OSError``, e.g. the binary does not exist).
    """
    try:
        finished = subprocess.run(
            arg, shell=False, close_fds=True, check=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return ''
    return finished.stdout.decode('utf-8', 'ignore')
def get_values(output=None):
    """Run varnishstat and parse its output.

    ``output`` is the JSON text to parse.  When it is omitted (the normal
    case) the text is obtained by running ``/usr/bin/varnishstat -j``.

    Two JSON layouts are accepted:

    * counters nested under a top-level ``counters`` object;
    * counters stored directly as top-level keys (flat layout).

    NOTE(review): the nested layout is believed to have been introduced
    with Varnish 6.5 and the flat one to be what Varnish 4/5/6.0 emit;
    confirm against varnishstat(1) for the versions you deploy.

    Returns a dict mapping counter names (e.g. ``MAIN.uptime``) to their
    description objects, or ``{}`` when the text is empty, is not valid
    JSON, or does not contain a JSON object.
    """
    if output is None:
        output = run_binary(['/usr/bin/varnishstat', '-j'])
    try:
        parsed = json.loads(output)
    except (TypeError, ValueError):
        return {}
    if not isinstance(parsed, dict):
        return {}
    # Fall back to the document itself when there is no ``counters``
    # member, instead of failing with KeyError on the flat layout.
    counters = parsed.get('counters', parsed)
    return counters if isinstance(counters, dict) else {}
def _print_warning(field):
    """Print ``<field>.warning`` from the ``<field>_warning`` env variable.

    Nothing is printed when the variable is unset or empty, so a blank
    plugin-conf entry does not produce a ``.warning`` line with no value.
    """
    threshold = os.getenv(field + '_warning')
    if threshold:
        print('{0}.warning {1}'.format(field, threshold))


def config():
    """Print the Munin configuration for every graph of this plugin.

    One ``multigraph`` section is written to standard output per graph
    (backend traffic, misbehavior, expunging, hit rates, memory usage,
    objects, request rates, threads, transfer rates and uptime).

    Warning thresholds are optional and taken from the environment
    variables ``backend_unhealthy_warning``, ``threads_failed_warning``,
    ``threads_destroyed_warning`` and ``threads_warning``.
    """
    # pylint: disable=too-many-statements
    # backend_traffic
    print('multigraph varnish_backend_traffic')
    print('graph_category webserver')
    print('graph_title Backend traffic')
    print('backend_busy.label Backend conn. too many')
    print('backend_busy.min 0')
    print('backend_busy.type DERIVE')
    print('backend_conn.label Backend conn. success')
    print('backend_conn.min 0')
    print('backend_conn.type DERIVE')
    print('backend_retry.label Backend conn. retry')
    print('backend_retry.min 0')
    print('backend_retry.type DERIVE')
    print('backend_unhealthy.label Backend conn. not attempted')
    print('backend_unhealthy.min 0')
    _print_warning('backend_unhealthy')
    print('backend_unhealthy.type DERIVE')
    print('backend_recycle.label Backend conn. recycles')
    print('backend_recycle.min 0')
    print('backend_recycle.type DERIVE')
    print('backend_fail.label Backend conn. failures')
    print('backend_fail.min 0')
    print('backend_fail.type DERIVE')
    print('backend_reuse.label Backend conn. reuses')
    print('backend_reuse.min 0')
    print('backend_reuse.type DERIVE')
    print('backend_req.label Backend requests made')
    print('backend_req.min 0')
    print('backend_req.type DERIVE')

    # bad
    print('multigraph varnish_bad')
    print('graph_category webserver')
    print('graph_title Misbehavior')
    print('sess_dropped.label Sessions dropped')
    print('sess_dropped.type DERIVE')
    print('threads_failed.label Thread creation failed')
    print('threads_failed.type DERIVE')
    print('threads_destroyed.label Threads destroyed')
    print('threads_destroyed.type DERIVE')
    print('thread_queue_len.label Length of session queue')
    print('thread_queue_len.type GAUGE')
    print('sc_pipe_overflow.label Session pipe overflow')
    print('sc_pipe_overflow.type DERIVE')
    print('esi_warnings.label ESI parse warnings (unlock)')
    print('esi_warnings.type DERIVE')
    print('sess_fail.label Session accept failures')
    print('sess_fail.type DERIVE')
    print('backend_busy.label Backend conn. too many')
    print('backend_busy.type DERIVE')
    print('esi_errors.label ESI parse errors (unlock)')
    print('esi_errors.type DERIVE')
    print('SMA_Transient_c_fail.label Allocator failures SMA Transient')
    print('SMA_Transient_c_fail.type DERIVE')
    print('losthdr.label HTTP header overflows')
    print('losthdr.type DERIVE')
    print('backend_unhealthy.label Backend conn. not attempted')
    print('backend_unhealthy.type DERIVE')
    print('threads_limited.label Threads hit max')
    print('threads_limited.type DERIVE')
    print('fetch_failed.label Fetch failed (all causes)')
    print('fetch_failed.type DERIVE')

    # expunge
    print('multigraph varnish_expunge')
    print('graph_category webserver')
    print('graph_title Object expunging')
    print('graph_order n_expired n_lru_nuked')
    print('n_lru_nuked.label Number of LRU nuked objects')
    print('n_lru_nuked.min 0')
    print('n_lru_nuked.type DERIVE')
    print('n_expired.label Number of expired objects')
    print('n_expired.min 0')
    print('n_expired.type DERIVE')

    # hit_rate
    print('multigraph varnish_hit_rate')
    print('graph_category webserver')
    print('graph_title Hit rates')
    print('graph_order client_req cache_hit cache_miss cache_hitpass')
    print('graph_scale no')
    print('graph_vlabel %')
    print('graph_args -l 0 -u 100 --rigid')
    # client_req is hidden; it only serves as the divisor in the cdefs.
    print('client_req.label Good client requests received')
    print('client_req.graph off')
    print('client_req.min 0')
    print('client_req.type DERIVE')
    print('cache_miss.label Cache misses')
    print('cache_miss.min 0')
    print('cache_miss.draw STACK')
    print('cache_miss.cdef cache_miss,client_req,/,100,*')
    print('cache_miss.type DERIVE')
    print('cache_hit.label Cache hits')
    print('cache_hit.min 0')
    print('cache_hit.draw AREA')
    print('cache_hit.cdef cache_hit,client_req,/,100,*')
    print('cache_hit.type DERIVE')
    print('cache_hitpass.label Cache hits for pass')
    print('cache_hitpass.min 0')
    print('cache_hitpass.draw STACK')
    print('cache_hitpass.cdef cache_hitpass,client_req,/,100,*')
    print('cache_hitpass.type DERIVE')

    # memory_usage
    print('multigraph varnish_memory_usage')
    print('graph_category webserver')
    print('graph_title Memory usage')
    print('graph_vlabel bytes')
    print('graph_args --base 1024')
    print('SMA_Transient_g_bytes.label Bytes outstanding SMA Transient')
    print('SMA_Transient_g_bytes.type GAUGE')
    print('SMA_Transient_g_space.label Bytes available SMA Transient')
    print('SMA_Transient_g_space.type GAUGE')
    print('SMA_Transient_c_bytes.label Bytes allocated SMA Transient')
    print('SMA_Transient_c_bytes.type DERIVE')

    # objects
    print('multigraph varnish_objects')
    print('graph_category webserver')
    print('graph_title Number of objects')
    print('graph_order n_object n_objectcore n_vampireobject n_objecthead')
    print('n_object.label Number of objects')
    print('n_object.type GAUGE')
    print('n_vampireobject.label Number of unresurrected objects')
    print('n_vampireobject.type GAUGE')
    print('n_objectcore.label Number of object cores')
    print('n_objectcore.type GAUGE')
    print('n_objecthead.label Number of object heads')
    print('n_objecthead.info Each object head can have one or more object '
          'attached, typically based on the Vary: header')
    print('n_objecthead.type GAUGE')

    # request_rate
    print('multigraph varnish_request_rate')
    print('graph_category webserver')
    print('graph_title Request rates')
    # The last entry must name a field defined below: the accepted-session
    # counter in this graph is sess_conn (there is no client_conn field).
    print('graph_order cache_hit cache_hitpass cache_miss backend_conn '
          'backend_unhealthy client_req sess_conn')
    print('client_req.label Good client requests received')
    print('client_req.min 0')
    print('client_req.colour 111111')
    print('client_req.type DERIVE')
    print('s_pipe.label Total pipe sessions seen')
    print('s_pipe.min 0')
    print('s_pipe.colour 1d2bdf')
    print('s_pipe.type DERIVE')
    print('sess_conn.label Sessions accepted')
    print('sess_conn.graph ON')
    print('sess_conn.min 0')
    print('sess_conn.colour 444444')
    print('sess_conn.type DERIVE')
    print('cache_miss.label Cache misses')
    print('cache_miss.min 0')
    print('cache_miss.draw STACK')
    print('cache_miss.colour FF0000')
    print('cache_miss.type DERIVE')
    print('backend_conn.label Backend conn. success')
    print('backend_conn.min 0')
    print('backend_conn.colour 995599')
    print('backend_conn.type DERIVE')
    print('s_pass.label Total pass-ed requests seen')
    print('s_pass.min 0')
    print('s_pass.colour 785d0d')
    print('s_pass.type DERIVE')
    print('backend_unhealthy.label Backend conn. not attempted')
    print('backend_unhealthy.min 0')
    print('backend_unhealthy.colour FF55FF')
    print('backend_unhealthy.type DERIVE')
    print('cache_hitpass.label Cache hits for pass')
    print('cache_hitpass.min 0')
    print('cache_hitpass.draw STACK')
    print('cache_hitpass.colour FFFF00')
    print('cache_hitpass.info Hitpass are cached passes: An entry in the '
          'cache instructing Varnish to pass. Typically achieved after a '
          'pass in vcl_fetch.')
    print('cache_hitpass.type DERIVE')
    print('cache_hit.label Cache hits')
    print('cache_hit.min 0')
    print('cache_hit.draw AREA')
    print('cache_hit.colour 00FF00')
    print('cache_hit.type DERIVE')

    # threads
    print('multigraph varnish_threads')
    print('graph_category webserver')
    print('graph_title Thread status')
    print('threads_created.label Threads created')
    print('threads_created.min 0')
    print('threads_created.type DERIVE')
    print('threads_failed.label Thread creation failed')
    print('threads_failed.min 0')
    _print_warning('threads_failed')
    print('threads_failed.type DERIVE')
    print('threads_destroyed.label Threads destroyed')
    print('threads_destroyed.min 0')
    _print_warning('threads_destroyed')
    print('threads_destroyed.type DERIVE')
    print('threads_limited.label Threads hit max')
    print('threads_limited.min 0')
    print('threads_limited.type DERIVE')
    print('threads.label Total number of threads')
    print('threads.min 0')
    _print_warning('threads')
    print('threads.type GAUGE')

    # transfer_rates
    print('multigraph varnish_transfer_rates')
    print('graph_category webserver')
    print('graph_title Transfer rates')
    print('graph_order s_resp_bodybytes s_resp_hdrbytes')
    print('graph_vlabel bit/s')
    print('graph_args -l 0')
    # The cdefs multiply the byte counters by 8 to plot bits per second.
    print('s_resp_bodybytes.label Body traffic')
    print('s_resp_bodybytes.min 0')
    print('s_resp_bodybytes.draw AREA')
    print('s_resp_bodybytes.cdef s_resp_bodybytes,8,*')
    print('s_resp_bodybytes.type DERIVE')
    print('s_resp_hdrbytes.label Header traffic')
    print('s_resp_hdrbytes.min 0')
    print('s_resp_hdrbytes.draw STACK')
    print('s_resp_hdrbytes.cdef s_resp_hdrbytes,8,*')
    print('s_resp_hdrbytes.info HTTP Header traffic. TCP/IP overhead is not '
          'included.')
    print('s_resp_hdrbytes.type DERIVE')

    # uptime
    print('multigraph varnish_uptime')
    print('graph_category webserver')
    print('graph_title Varnish uptime')
    print('graph_scale no')
    print('graph_vlabel days')
    print('uptime.label Management process uptime')
    # The cdef divides the value by 86400 so the graph shows days.
    print('uptime.cdef uptime,86400,/')
    print('uptime.type GAUGE')
# Fields printed for each multigraph, in output order.  A field named
# ``SMA_Transient_<x>`` is read from counter ``SMA.Transient.<x>``; every
# other field ``<x>`` is read from counter ``MAIN.<x>``.
_FETCH_GRAPHS = (
    ('varnish_backend_traffic', (
        'backend_busy',
        'backend_conn',
        'backend_retry',
        'backend_unhealthy',
        'backend_recycle',
        'backend_fail',
        'backend_reuse',
        'backend_req',
    )),
    ('varnish_bad', (
        'sess_dropped',
        'threads_failed',
        'threads_destroyed',
        'thread_queue_len',
        'sc_pipe_overflow',
        'esi_warnings',
        'sess_fail',
        'backend_busy',
        'esi_errors',
        'losthdr',
        'backend_unhealthy',
        'threads_limited',
        'fetch_failed',
        'SMA_Transient_c_fail',
    )),
    ('varnish_expunge', (
        'n_lru_nuked',
        'n_expired',
    )),
    ('varnish_hit_rate', (
        'cache_miss',
        'cache_hit',
        'cache_hitpass',
    )),
    ('varnish_memory_usage', (
        'SMA_Transient_g_bytes',
        'SMA_Transient_g_space',
        'SMA_Transient_c_bytes',
    )),
    ('varnish_objects', (
        'n_object',
        'n_vampireobject',
        'n_objectcore',
        'n_objecthead',
    )),
    ('varnish_request_rate', (
        'client_req',
        's_pipe',
        'sess_conn',
        'cache_miss',
        'backend_conn',
        's_pass',
        'backend_unhealthy',
        'cache_hitpass',
        'cache_hit',
    )),
    ('varnish_threads', (
        'threads_created',
        'threads_failed',
        'threads_destroyed',
        'threads_limited',
        'threads',
    )),
    ('varnish_transfer_rates', (
        's_resp_bodybytes',
        's_resp_hdrbytes',
    )),
    ('varnish_uptime', (
        'uptime',
    )),
)

_SMA_FIELD_PREFIX = 'SMA_Transient_'


def _counter_name(field):
    """Map a Munin field name to the varnishstat counter it is read from."""
    if field.startswith(_SMA_FIELD_PREFIX):
        return 'SMA.Transient.' + field[len(_SMA_FIELD_PREFIX):]
    return 'MAIN.' + field


def _counter_value(data, name):
    """Return the ``value`` of counter ``name``, or ``'U'`` if unavailable.

    ``U`` is the value Munin accepts for "unknown", so one counter that
    this Varnish build does not report cannot abort the whole run.
    """
    try:
        return data[name]['value']
    except (KeyError, TypeError):
        return 'U'


def fetch(data=None):
    """Print the current value of every field, grouped by multigraph.

    ``data`` is the counter dict to report, keyed by counter name with
    each entry holding a ``value`` member.  When omitted it is obtained
    from ``get_values()``.  Nothing is printed when no counters are
    available.

    Counters missing from ``data`` are printed as ``U`` instead of
    raising ``KeyError``.
    """
    if data is None:
        data = get_values()
    if not data:
        return

    for graph, fields in _FETCH_GRAPHS:
        print('multigraph ' + graph)
        if graph == 'varnish_hit_rate':
            # The hit-rate graph uses hits + misses + hitpasses as its
            # "client_req" divisor rather than the MAIN.client_req counter.
            parts = [_counter_value(data, 'MAIN.' + key)
                     for key in ('cache_hit', 'cache_miss', 'cache_hitpass')]
            try:
                total = sum(parts)
            except TypeError:
                # At least one component is unknown ('U').
                total = 'U'
            print('client_req.value {0}'.format(total))
        for field in fields:
            print('{0}.value {1}'.format(
                field, _counter_value(data, _counter_name(field))))
if __name__ == '__main__':
    # Munin passes the requested action as the first argument; with no
    # argument (or any other word) the current values are printed.
    action = sys.argv[1] if len(sys.argv) > 1 else None
    if action == 'autoconf':
        if get_values():
            print('yes')
        else:
            print('no (varnish is not running)')
    elif action == 'config':
        config()
    else:
        fetch()