#!/usr/bin/env python3 """ =head1 NAME docker_ - Docker wildcard-plugin to monitor a L host. This wildcard plugin provides series C, C, C, C, C, C and C as separate graphs. It also supports a C suffix that provides all of those as a multigraph. =head1 INSTALLATION - Copy this plugin in your munin plugins directory - Install Python3 "docker" package =over 2 If you want all the graphs as a multigraph, create a single multi symlink. ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_multi Or choose a subset of those you want. ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_containers ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_cpu ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_images ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_memory ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_network ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_status ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_volumes =back After the installation you need to restart your munin-node: =over 2 systemctl restart munin-node =back =head1 CONFIGURATION This plugin need to run as root, you need to create a file named docker placed in the directory /etc/munin/plugin-conf.d/ with the following config (you can also use Docker environment variables here as described in https://docs.docker.com/compose/reference/envvars/): You can use the EXCLUDE_CONTAINER_NAME environment variable to specify a regular expression which if matched will exclude the matching containers from the memory and cpu graphs. For example env.EXCLUDE_CONTAINER_NAME runner Would exclude all containers with the word "runner" in the name. =over 2 [docker_*] group docker env.DOCKER_HOST unix://run/docker.sock env.EXCLUDE_CONTAINER_NAME regexp =back You may need to pick a different group depending on the name schema of your distribution. Or maybe use "user root", if nothing else works. =head1 AUTHORS This section has been reverse-engineered from git logs Codimp : original rewrite Rowan Wookey : performance improvement Olivier Mehani : Network support, ClientWrapper, general cleanup, multigraph =head1 MAGIC MARKERS #%# family=auto #%# capabilities=autoconf suggest multigraph =cut """ import os import sys import re try: from functools import cached_property except ImportError: # If cached_property is not available, # just use the property decorator, without caching # This is for backward compatibility with Python<3.8 cached_property = property from multiprocessing import Process, Queue def sorted_by_creation_date(func): def sorted_func(*args, **kwargs): return sorted( func(*args, **kwargs), key=( lambda x: x.attrs['CreatedAt'] if 'CreatedAt' in x.attrs else x.attrs['Created'] ) ) return sorted_func def clean_fieldname(text): if text == "root": # "root" is a magic (forbidden) word return "_root" else: return re.sub(r"(^[^A-Za-z_]|[^A-Za-z0-9_])", "_", text) class ClientWrapper: """ A small wrapper for the docker client, to centralise some parsing logic, and support caching. In addition, when the exclude_re parameter is not None, any container which name is matched by the RE will not be excluded from reports. """ client = None exclude = None def __init__(self, client, exclude_re=None): self.client = client if exclude_re: self.exclude = re.compile(exclude_re) @property def api(self): return self.client.api @cached_property @sorted_by_creation_date def all_containers(self): return [ c for c in self.client.containers.list(all=True) if (c.status == 'running') and (not self.exclude or not self.exclude.search(c.name)) ] @cached_property @sorted_by_creation_date def intermediate_images(self): return list( set(self.all_images) .difference( set(self.images) .difference( set(self.dangling_images) ) ) ) @cached_property @sorted_by_creation_date def all_images(self): return self.client.images.list(all=True) @cached_property @sorted_by_creation_date def images(self): images = self.client.images.list() return list( set(images) .difference( set(self.dangling_images)) ) @cached_property @sorted_by_creation_date def dangling_images(self): return self.client.images.list(filters={'dangling': True}) @cached_property @sorted_by_creation_date def volumes(self): return self.client.volumes.list() def container_summary(container, *args): summary = container.name attributes = container_attributes(container, *args) if attributes: summary += f' ({attributes})' return summary def container_attributes(container, *args): attributes = container.image.tags attributes.append(container.attrs['Created']) return ', '.join(attributes + list(args)) def print_containers_status(client): running = [] unhealthy = [] paused = [] created = [] restarting = [] removing = [] exited = [] dead = [] for container in client.all_containers: if container.status == 'running': state = client.api.inspect_container(container.name)['State'] if state.get('Health', {}).get('Status') == 'unhealthy': unhealthy.append(container) else: running.append(container) elif container.status == 'paused': paused.append(container) elif container.status == 'created': created.append(container) elif container.status == 'restarting': restarting.append(container) elif container.status == 'removing': removing.append(container) elif container.status == 'exited': exited.append(container) elif container.status == 'dead': dead.append(container) print('running.value', len(running)) print('running.extinfo', ', '.join(container_summary(c) for c in running)) print('unhealthy.value', len(unhealthy)) print('unhealthy.extinfo', ', '.join(container_summary(c) for c in unhealthy)) print('paused.value', len(paused)) print('paused.extinfo', ', '.join(container_summary(c) for c in paused)) print('created.value', len(created)) print('created.extinfo', ', '.join(container_summary(c) for c in created)) print('restarting.value', len(restarting)) print('restarting.extinfo', ', '.join(container_summary(c) for c in restarting)) print('removing.value', len(removing)) print('removing.extinfo', ', '.join(container_summary(c) for c in removing)) print('exited.value', len(exited)) print('exited.extinfo', ', '.join(container_summary(c) for c in exited)) print('dead.value', len(dead)) print('dead.extinfo', ', '.join(container_summary(c) for c in dead)) def image_summary(image): attributes = image.tags attributes.append(image.attrs['Created']) attributes.append(f"{round(image.attrs['Size']/1024**2, 2)} MiB") return f"{image.short_id} ({', '.join(attributes)})" def print_images_count(client): images = client.images intermediate = client.intermediate_images dangling = client.dangling_images print('intermediate_quantity.value', len(intermediate)) print('intermediate_quantity.extinfo', ', '.join(image_summary(i) for i in intermediate)) print('images_quantity.value', len(images)) print('images_quantity.extinfo', ', '.join(image_summary(i) for i in images)) print('dangling_quantity.value', len(dangling)) print('dangling_quantity.extinfo', ', '.join(image_summary(i) for i in dangling)) def get_container_stats(container, q): q.put(container.stats(stream=False)) def parallel_container_stats(client): proc_list = [] stats = {} for container in client.all_containers: q = Queue() p = Process(target=get_container_stats, args=(container, q)) proc_list.append({'proc': p, 'queue': q, 'container': container}) p.start() for proc in proc_list: proc['proc'].join() stats[proc['container']] = proc['queue'].get() return stats.items() def print_containers_cpu(client): for container, stats in parallel_container_stats(client): cpu_percent = 0.0 cpu_delta = (float(stats["cpu_stats"]["cpu_usage"]["total_usage"]) - float(stats["precpu_stats"]["cpu_usage"]["total_usage"])) system_delta = (float(stats["cpu_stats"]["system_cpu_usage"]) - float(stats["precpu_stats"]["system_cpu_usage"])) if system_delta > 0.0: cpu_percent = cpu_delta / system_delta * 100.0 * os.cpu_count() clean_container_name = clean_fieldname(container.name) print(clean_container_name + '.value', cpu_percent) print(clean_container_name + '.extinfo', container_attributes(container)) def print_containers_memory(client): for container, stats in parallel_container_stats(client): if 'total_rss' in stats['memory_stats']['stats']: # cgroupv1 only? memory_usage = stats['memory_stats']['stats']['total_rss'] extinfo = 'Resident Set Size' else: memory_usage = stats['memory_stats']['usage'] extinfo = 'Total memory usage' clean_container_name = clean_fieldname(container.name) print(clean_container_name + '.value', memory_usage) print(clean_container_name + '.extinfo', container_attributes(container, extinfo)) def print_containers_network(client): for container, stats in parallel_container_stats(client): tx_bytes = 0 rx_bytes = 0 if "networks" in stats: for data in stats['networks'].values(): tx_bytes += data['tx_bytes'] rx_bytes += data['rx_bytes'] clean_container_name = clean_fieldname(container.name) print(clean_container_name + '_up.value', tx_bytes) print(clean_container_name + '_down.value', rx_bytes) print(clean_container_name + '_up.extinfo', container_attributes(container)) def volume_summary(volume): summary = f"{volume.short_id}" if volume.attrs['Labels']: summary += f" ({', '.join(volume.attrs['Labels'])})" return summary def status(client, mode): if mode == "config": print("graph_title Docker status") print("graph_vlabel containers") print("graph_category virtualization") print("graph_total All containers") print("running.label RUNNING") print("running.draw AREASTACK") print("running.info Running containers can be manipulated with " "`docker container [attach|kill|logs|pause|restart|stop] ` or " "commands run in them with `docker container exec " "[--detach|--interactive,--privileged,--tty] `" ) print("unhealthy.label UNHEALTHY") print("unhealthy.draw AREASTACK") print("unhealthy.warning 1") print("unhealthy.info Unhealthy containers can be restarted with " "`docker container restart `") print("paused.label PAUSED") print("paused.draw AREASTACK") print("paused.info Paused containers can be resumed with " "`docker container unpause `") print("created.label CREATED") print("created.draw AREASTACK") print("created.info New containers can be created with " "`docker container create --name ` or " "`docker container run --name `") print("restarting.label RESTARTING") print("restarting.draw AREASTACK") print("restarting.info Containers can be restarted with " "`docker container restart `") print("removing.label REMOVING") print("removing.draw AREASTACK") print("removing.info Containers can be removed with " "`docker container rm `") print("exited.label EXITED") print("exited.draw AREASTACK") print("exited.info Exited containers can be started with " "`docker container start [--attach] `") print("dead.label DEAD") print("dead.draw AREASTACK") print("dead.warning 1") print("dead.info Dead containers can be started with " "`docker container start `") else: print_containers_status(client) def containers(client, mode): if mode == "config": print("graph_title Docker containers") print("graph_vlabel containers") print("graph_category virtualization") print("containers_quantity.label Containers") else: print('containers_quantity.value', len(client.all_containers)) def images(client, mode): if mode == "config": print("graph_title Docker images") print("graph_vlabel images") print("graph_category virtualization") print("graph_total All images") print("intermediate_quantity.label Intermediate images") print("intermediate_quantity.draw AREASTACK") print("intermediate_quantity.info All unused images can be deleted with " "`docker image prune --all`") print("images_quantity.label Images") print("images_quantity.draw AREASTACK") print("images_quantity.info Images can be used in containers with " "`docker container create --name ` or " "`docker container run --name `") print("dangling_quantity.label Dangling images") print("dangling_quantity.draw AREASTACK") print("dangling_quantity.info Dangling images can be deleted with " "`docker image prune`" "or tagged with `docker image tag `") print("dangling_quantity.warning 10") else: print_images_count(client) def volumes(client, mode): if mode == "config": print("graph_title Docker volumes") print("graph_vlabel volumes") print("graph_category virtualization") print("volumes_quantity.label Volumes") print("volumes_quantity.draw AREASTACK") print("volumes_quantity.info Unused volumes can be deleted with " "`docker volume prune`") else: print('volumes_quantity.value', len(client.volumes)) print('volumes_quantity.extinfo', ', '.join(volume_summary(v) for v in client.volumes)) def cpu(client, mode): if mode == "config": graphlimit = str(os.cpu_count() * 100) print("graph_title Docker containers CPU usage") print("graph_args --base 1000 -r --lower-limit 0 --upper-limit " + graphlimit) print("graph_scale no") print("graph_period second") print("graph_vlabel CPU usage (%)") print("graph_category virtualization") print("graph_info This graph shows docker container CPU usage.") print("graph_total Total CPU usage") for container in client.all_containers: fieldname = clean_fieldname(container.name) print("{}.label {}".format(fieldname, container.name)) print("{}.draw AREASTACK".format(fieldname)) print("{}.info {}".format(fieldname, container_attributes(container))) else: print_containers_cpu(client) def network(client, mode): if mode == "config": print("graph_title Docker containers network usage") print("graph_args --base 1024 -l 0") print("graph_vlabel bits in (-) / out (+) per ${graph_period}") print("graph_category virtualization") print("graph_info This graph shows docker container network usage.") print("graph_total Total network usage") for container in client.all_containers: fieldname = clean_fieldname(container.name) print("{}_down.label {}_received".format(fieldname, container.name)) print("{}_down.type DERIVE".format(fieldname)) print("{}_down.min 0".format(fieldname)) print("{}_down.graph no".format(fieldname)) print("{}_down.cdef {}_down,8,*".format(fieldname, fieldname)) print("{}_up.label {}".format(fieldname, container.name)) print("{}_up.draw LINESTACK1".format(fieldname)) print("{}_up.type DERIVE".format(fieldname)) print("{}_up.min 0".format(fieldname)) print("{}_up.negative {}_down".format(fieldname, fieldname)) print("{}_up.cdef {}_up,8,*".format(fieldname, fieldname)) print("{}_up.info {}".format(fieldname, container_attributes(container))) else: print_containers_network(client) def memory(client, mode): if mode == "config": print("graph_title Docker containers memory usage") print("graph_args --base 1024 -l 0") print("graph_vlabel Bytes") print("graph_category virtualization") print("graph_info This graph shows docker container memory usage.") print("graph_total Total memory usage") for container in client.all_containers: fieldname = clean_fieldname(container.name) print("{}.label {}".format(fieldname, container.name)) print("{}.draw AREASTACK".format(fieldname)) print("{}.info {}".format(fieldname, container_attributes(container))) else: print_containers_memory(client) def main(): series = [ 'containers', 'cpu', 'images', 'memory', 'network', 'status', 'volumes', ] try: mode = sys.argv[1] except IndexError: mode = "" wildcard = sys.argv[0].split("docker_")[1].split("_")[0] try: import docker client = docker.from_env() if mode == "autoconf": client.ping() print('yes') sys.exit(0) except Exception as e: print(f'no ({e})') if mode == "autoconf": sys.exit(0) sys.exit(1) if mode == "suggest": # The multigraph covers all other graphs, # so we only need to suggest one print("multi") sys.exit(0) client = ClientWrapper(client, exclude_re=os.getenv('EXCLUDE_CONTAINER_NAME')) if wildcard in series: # dereference the function name by looking in the globals() # this assumes that the function name matches the series name exactly # if this were to change, a different approach would be needed, # most likely using a Dict of series name string to callable globals()[wildcard](client, mode) elif wildcard == 'multi': for s in series: print(f'multigraph docker_{s}') # ditto globals()[s](client, mode) else: print(f'unknown series ({wildcard})', file=sys.stderr) sys.exit(1) if __name__ == '__main__': main()