553 lines
19 KiB
Text
553 lines
19 KiB
Text
|
#!/usr/bin/env python3
|
||
|
"""
|
||
|
=head1 NAME
|
||
|
|
||
|
docker_ - Docker wildcard-plugin to monitor a L<Docker|https://www.docker.com> host.
|
||
|
|
||
|
This wildcard plugin provides series C<containers>, C<images>, C<status>,
|
||
|
C<volumes>, C<cpu>, C<memory> and C<network> as separate graphs. It also
|
||
|
supports a C<multi> suffix that provides all of those as a multigraph.
|
||
|
|
||
|
=head1 INSTALLATION
|
||
|
|
||
|
- Copy this plugin into your munin plugins directory
|
||
|
- Install Python3 "docker" package
|
||
|
|
||
|
=over 2
|
||
|
|
||
|
If you want all the graphs as a multigraph, create a single multi symlink.
|
||
|
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_multi
|
||
|
|
||
|
Or choose a subset of those you want.
|
||
|
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_containers
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_cpu
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_images
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_memory
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_network
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_status
|
||
|
ln -s /usr/share/munin/plugins/docker_ /etc/munin/plugins/docker_volumes
|
||
|
|
||
|
=back
|
||
|
|
||
|
After the installation you need to restart your munin-node:
|
||
|
|
||
|
=over 2
|
||
|
|
||
|
systemctl restart munin-node
|
||
|
|
||
|
=back
|
||
|
|
||
|
=head1 CONFIGURATION
|
||
|
|
||
|
This plugin needs to run as root; you need to create a file named docker placed in the
|
||
|
directory /etc/munin/plugin-conf.d/ with the following config (you can also use
|
||
|
Docker environment variables here as described in
|
||
|
https://docs.docker.com/compose/reference/envvars/):
|
||
|
|
||
|
You can use the EXCLUDE_CONTAINER_NAME environment variable to specify a regular expression
|
||
|
which if matched will exclude the matching containers from the memory and cpu graphs.
|
||
|
|
||
|
For example
|
||
|
|
||
|
env.EXCLUDE_CONTAINER_NAME runner
|
||
|
|
||
|
Would exclude all containers with the word "runner" in the name.
|
||
|
|
||
|
|
||
|
=over 2
|
||
|
|
||
|
[docker_*]
|
||
|
group docker
|
||
|
env.DOCKER_HOST unix://run/docker.sock
|
||
|
env.EXCLUDE_CONTAINER_NAME regexp
|
||
|
|
||
|
=back
|
||
|
|
||
|
You may need to pick a different group depending on the name schema of your
|
||
|
distribution. Or maybe use "user root", if nothing else works.
|
||
|
|
||
|
=head1 AUTHORS
|
||
|
|
||
|
This section has been reverse-engineered from git logs
|
||
|
|
||
|
Codimp <contact@lithio.fr>: original rewrite
|
||
|
|
||
|
Rowan Wookey <admin@rwky.net>: performance improvement
|
||
|
|
||
|
Olivier Mehani <shtrom@ssji.net>: Network support, ClientWrapper, general cleanup, multigraph
|
||
|
|
||
|
=head1 MAGIC MARKERS
|
||
|
|
||
|
#%# family=auto
|
||
|
#%# capabilities=autoconf suggest multigraph
|
||
|
|
||
|
=cut
|
||
|
"""
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
import re
|
||
|
try:
|
||
|
from functools import cached_property
|
||
|
except ImportError:
|
||
|
# If cached_property is not available,
|
||
|
# just use the property decorator, without caching
|
||
|
# This is for backward compatibility with Python<3.8
|
||
|
cached_property = property
|
||
|
from multiprocessing import Process, Queue
|
||
|
|
||
|
|
||
|
def sorted_by_creation_date(func):
    """Decorate ``func`` so that the iterable it returns comes back sorted by creation date.

    Every element returned by ``func`` must expose an ``attrs`` mapping.
    The sort key is ``attrs['CreatedAt']`` when that key is present,
    and ``attrs['Created']`` otherwise.

    Returns a new callable with the same signature as ``func`` that returns a list.
    """
    # Local import: keeps this decorator independent of the file-level imports.
    from functools import wraps

    def creation_date(item):
        attrs = item.attrs
        return attrs['CreatedAt'] if 'CreatedAt' in attrs else attrs['Created']

    @wraps(func)  # preserve the wrapped function's name and docstring
    def sorted_func(*args, **kwargs):
        return sorted(func(*args, **kwargs), key=creation_date)

    return sorted_func
|
||
|
|
||
|
|
||
|
def clean_fieldname(text):
    """Turn ``text`` into a string usable as a field name.

    The exact string "root" becomes "_root". Otherwise a first character that
    is not a letter or underscore, and any other character that is not a
    letter, digit or underscore, is replaced by an underscore.
    """
    if text == "root":
        # "root" is a magic (forbidden) word
        return "_root"
    return re.sub(r"(^[^A-Za-z_]|[^A-Za-z0-9_])", "_", text)
|
||
|
|
||
|
|
||
|
class ClientWrapper:
    """
    A small wrapper for the docker client, to centralise some parsing logic,
    and support caching.

    In addition, when the exclude_re parameter is not None,
    any container whose name is matched by the RE will be excluded from reports.
    """
    # The wrapped docker client object; set by __init__.
    client = None
    # Compiled exclusion pattern, or None when no exclusion was requested.
    exclude = None

    def __init__(self, client, exclude_re=None):
        """Wrap ``client``; compile ``exclude_re`` when it is a non-empty pattern."""
        self.client = client
        if exclude_re:
            self.exclude = re.compile(exclude_re)

    @property
    def api(self):
        """The ``api`` attribute of the wrapped client."""
        return self.client.api

    @cached_property
    @sorted_by_creation_date
    def all_containers(self):
        """Containers to report on, sorted by creation date.

        Although the listing is requested with ``all=True``, only containers
        whose status is 'running' and whose name is not matched by the
        exclusion pattern are kept.
        """
        return [
            c for c in self.client.containers.list(all=True)
            if (c.status == 'running') and (not self.exclude or not self.exclude.search(c.name))
        ]

    @cached_property
    @sorted_by_creation_date
    def intermediate_images(self):
        """Images that are in ``all_images`` but in neither ``images`` nor ``dangling_images``.

        The previous expression subtracted ``images - dangling_images`` from
        ``all_images``. Because ``images`` already excludes the dangling ones,
        that left every dangling image in this set as well, so a total over
        the three image series counted them twice.
        """
        return list(
            set(self.all_images)
            .difference(self.images)
            .difference(self.dangling_images)
        )

    @cached_property
    @sorted_by_creation_date
    def all_images(self):
        """Every image returned by the client when listing with ``all=True``."""
        return self.client.images.list(all=True)

    @cached_property
    @sorted_by_creation_date
    def images(self):
        """Images from the client's default listing, minus ``dangling_images``."""
        listed = self.client.images.list()
        return list(set(listed).difference(self.dangling_images))

    @cached_property
    @sorted_by_creation_date
    def dangling_images(self):
        """Images returned by the client for the ``dangling`` filter."""
        return self.client.images.list(filters={'dangling': True})

    @cached_property
    @sorted_by_creation_date
    def volumes(self):
        """Volumes returned by the client, sorted by creation date."""
        return self.client.volumes.list()
|
||
|
|
||
|
|
||
|
def container_summary(container, *args):
    """Return ``"<name> (<attributes>)"`` for ``container``.

    The attributes come from ``container_attributes(container, *args)``;
    when that string is empty, only the container name is returned.
    """
    attributes = container_attributes(container, *args)
    if not attributes:
        return container.name
    return f'{container.name} ({attributes})'
|
||
|
|
||
|
|
||
|
def container_attributes(container, *args):
    """Return a comma-separated description of ``container``.

    The parts are, in order: the tags of the container's image, the
    container's ``attrs['Created']`` value, then every extra string in ``args``.
    """
    # Copy the tags so that appending never modifies a list owned by the image object.
    attributes = list(container.image.tags)
    attributes.append(container.attrs['Created'])
    attributes.extend(args)
    return ', '.join(attributes)
|
||
|
|
||
|
|
||
|
def print_containers_status(client):
    """Print, per container state, the number of containers and their summaries.

    For every state a ``<state>.value`` line (count) and a ``<state>.extinfo``
    line (comma-separated ``container_summary`` strings) are printed.
    A container whose status is 'running' is inspected through ``client.api``
    and counted as 'unhealthy' instead when its health status says so.
    Containers with any other, unlisted status are ignored.
    """
    # States that map directly from container.status to an output series.
    plain_states = ('paused', 'created', 'restarting', 'removing', 'exited', 'dead')
    # Insertion order of this dict is the output order.
    by_state = {
        state: [] for state in ('running', 'unhealthy') + plain_states
    }

    for container in client.all_containers:
        current = container.status
        if current == 'running':
            state = client.api.inspect_container(container.name)['State']
            if state.get('Health', {}).get('Status') == 'unhealthy':
                current = 'unhealthy'
        elif current not in plain_states:
            continue
        by_state[current].append(container)

    for state, members in by_state.items():
        print(f'{state}.value', len(members))
        print(f'{state}.extinfo', ', '.join(container_summary(c) for c in members))
|
||
|
|
||
|
|
||
|
def image_summary(image):
    """Return ``"<short_id> (<tags>, <created>, <size> MiB)"`` for ``image``.

    The size is ``image.attrs['Size']`` divided by 1024**2 and rounded to
    two decimals.
    """
    # Copy the tags so that appending never modifies a list owned by the image object.
    attributes = list(image.tags)
    attributes.append(image.attrs['Created'])
    attributes.append(f"{round(image.attrs['Size']/1024**2, 2)} MiB")
    return f"{image.short_id} ({', '.join(attributes)})"
|
||
|
|
||
|
|
||
|
def print_images_count(client):
    """Print the count and summaries of intermediate, regular and dangling images.

    For each group a ``<field>.value`` line (count) and a ``<field>.extinfo``
    line (comma-separated ``image_summary`` strings) are printed, in the
    order intermediate, images, dangling.
    """
    images = client.images
    intermediate = client.intermediate_images
    dangling = client.dangling_images

    groups = (
        ('intermediate_quantity', intermediate),
        ('images_quantity', images),
        ('dangling_quantity', dangling),
    )
    for field, group in groups:
        print(f'{field}.value', len(group))
        print(f'{field}.extinfo', ', '.join(image_summary(i) for i in group))
|
||
|
|
||
|
|
||
|
def get_container_stats(container, q):
    """Fetch one non-streamed stats sample for ``container`` and put it on queue ``q``.

    Used as the target of the worker processes started by
    ``parallel_container_stats``.
    """
    q.put(container.stats(stream=False))
|
||
|
|
||
|
|
||
|
def parallel_container_stats(client):
    """Collect a stats sample for every container in ``client.all_containers``.

    One worker process per container is started, each running
    ``get_container_stats`` with its own queue, so the samples are fetched
    concurrently.

    Returns the ``items()`` view of a dict mapping each container to its
    stats sample.
    """
    workers = []
    for container in client.all_containers:
        q = Queue()
        p = Process(target=get_container_stats, args=(container, q))
        workers.append({'proc': p, 'queue': q, 'container': container})
        p.start()

    stats = {}
    for worker in workers:
        # Read the result *before* joining: a process that has put data on a
        # queue may not terminate until that data has been consumed, so
        # join()-then-get() can deadlock.
        # NOTE(review): get() has no timeout, so a worker that dies without
        # producing a result still blocks here, as it did before.
        stats[worker['container']] = worker['queue'].get()
        worker['proc'].join()
    return stats.items()
|
||
|
|
||
|
|
||
|
def print_containers_cpu(client):
    """Print the CPU usage percentage and attributes of every container.

    The percentage is the container's CPU-time delta divided by the system
    CPU-time delta between the previous and current sample, times 100, times
    the number of CPUs reported by ``os.cpu_count()``. It stays at 0.0 when
    the system delta is not positive.
    """
    # os.cpu_count() may return None; fall back to a single CPU.
    cpu_count = os.cpu_count() or 1

    for container, stats in parallel_container_stats(client):
        current = stats.get("cpu_stats", {})
        previous = stats.get("precpu_stats", {})
        # Missing counters are treated as 0 rather than aborting the whole
        # plugin run with a KeyError.
        cpu_delta = (float(current.get("cpu_usage", {}).get("total_usage", 0))
                     - float(previous.get("cpu_usage", {}).get("total_usage", 0)))
        system_delta = (float(current.get("system_cpu_usage", 0))
                        - float(previous.get("system_cpu_usage", 0)))

        cpu_percent = 0.0
        if system_delta > 0.0:
            cpu_percent = cpu_delta / system_delta * 100.0 * cpu_count

        clean_container_name = clean_fieldname(container.name)
        print(clean_container_name + '.value', cpu_percent)
        print(clean_container_name + '.extinfo', container_attributes(container))
|
||
|
|
||
|
|
||
|
def print_containers_memory(client):
    """Print the memory usage and attributes of every container.

    ``memory_stats['stats']['total_rss']`` is reported when present;
    otherwise ``memory_stats['usage']`` is used. The extinfo line names
    which of the two was reported.
    """
    for container, stats in parallel_container_stats(client):
        memory_stats = stats.get('memory_stats', {})
        detailed = memory_stats.get('stats', {})
        if 'total_rss' in detailed:  # cgroupv1 only?
            memory_usage = detailed['total_rss']
            extinfo = 'Resident Set Size'
        else:
            # 'U' is Munin's marker for an unknown value; it is reported when
            # the sample carries no usage figure, instead of raising KeyError
            # and losing the values of every other container.
            memory_usage = memory_stats.get('usage', 'U')
            extinfo = 'Total memory usage'

        clean_container_name = clean_fieldname(container.name)
        print(clean_container_name + '.value', memory_usage)
        print(clean_container_name + '.extinfo', container_attributes(container, extinfo))
|
||
|
|
||
|
|
||
|
def print_containers_network(client):
    """Print the transmitted and received byte counters of every container.

    ``tx_bytes`` and ``rx_bytes`` are summed over every entry of the sample's
    ``networks`` mapping; both are 0 when the sample has no such mapping.
    The attributes line is only emitted for the ``_up`` field.
    """
    for container, stats in parallel_container_stats(client):
        interfaces = stats['networks'].values() if "networks" in stats else ()
        tx_bytes = 0
        rx_bytes = 0
        for data in interfaces:
            tx_bytes += data['tx_bytes']
            rx_bytes += data['rx_bytes']

        clean_container_name = clean_fieldname(container.name)
        print(clean_container_name + '_up.value', tx_bytes)
        print(clean_container_name + '_down.value', rx_bytes)
        print(clean_container_name + '_up.extinfo', container_attributes(container))
|
||
|
|
||
|
|
||
|
def volume_summary(volume):
    """Return the volume's short id, followed by its labels in parentheses if it has any.

    Iterating ``attrs['Labels']`` yields the label names when it is a
    mapping, so only the names appear in the summary.
    """
    summary = f"{volume.short_id}"
    # .get(): a volume without a 'Labels' entry is treated like one with no labels.
    labels = volume.attrs.get('Labels')
    if labels:
        summary += f" ({', '.join(labels)})"
    return summary
|
||
|
|
||
|
|
||
|
def status(client, mode):
    """Emit the "Docker status" graph.

    With ``mode == "config"`` the graph and field definitions are printed;
    for any other mode the current values are printed through
    ``print_containers_status(client)``.
    """
    if mode != "config":
        print_containers_status(client)
        return

    print("graph_title Docker status")
    print("graph_vlabel containers")
    print("graph_category virtualization")
    print("graph_total All containers")

    print("running.label RUNNING")
    print("running.draw AREASTACK")
    print("running.info Running containers can be manipulated with "
          "`docker container [attach|kill|logs|pause|restart|stop] <NAME>` or "
          "commands run in them with `docker container exec "
          "[--detach|--interactive,--privileged,--tty] <NAME> <COMMAND>`"
          )

    print("unhealthy.label UNHEALTHY")
    print("unhealthy.draw AREASTACK")
    print("unhealthy.warning 1")
    print("unhealthy.info Unhealthy containers can be restarted with "
          "`docker container restart <NAME>`")

    print("paused.label PAUSED")
    print("paused.draw AREASTACK")
    print("paused.info Paused containers can be resumed with "
          "`docker container unpause <NAME>`")

    print("created.label CREATED")
    print("created.draw AREASTACK")
    print("created.info New containers can be created with "
          "`docker container create --name <NAME> <IMAGE_ID >` or "
          "`docker container run --name <NAME> <IMAGE_ID> <COMMAND>`")

    print("restarting.label RESTARTING")
    print("restarting.draw AREASTACK")
    print("restarting.info Containers can be restarted with "
          "`docker container restart <NAME>`")

    print("removing.label REMOVING")
    print("removing.draw AREASTACK")
    print("removing.info Containers can be removed with "
          "`docker container rm <NAME>`")

    print("exited.label EXITED")
    print("exited.draw AREASTACK")
    print("exited.info Exited containers can be started with "
          "`docker container start [--attach] <NAME>`")

    print("dead.label DEAD")
    print("dead.draw AREASTACK")
    print("dead.warning 1")
    print("dead.info Dead containers can be started with "
          "`docker container start <NAME>`")
|
||
|
|
||
|
|
||
|
def containers(client, mode):
    """Emit the "Docker containers" graph.

    With ``mode == "config"`` the graph and field definitions are printed;
    for any other mode the number of entries in ``client.all_containers``
    is printed.
    """
    if mode != "config":
        print('containers_quantity.value', len(client.all_containers))
        return

    print("graph_title Docker containers")
    print("graph_vlabel containers")
    print("graph_category virtualization")
    print("containers_quantity.label Containers")
|
||
|
|
||
|
|
||
|
def images(client, mode):
    """Emit the "Docker images" graph.

    With ``mode == "config"`` the graph and field definitions are printed;
    for any other mode the current values are printed through
    ``print_images_count(client)``.
    """
    if mode != "config":
        print_images_count(client)
        return

    print("graph_title Docker images")
    print("graph_vlabel images")
    print("graph_category virtualization")
    print("graph_total All images")

    print("intermediate_quantity.label Intermediate images")
    print("intermediate_quantity.draw AREASTACK")
    print("intermediate_quantity.info All unused images can be deleted with "
          "`docker image prune --all`")

    print("images_quantity.label Images")
    print("images_quantity.draw AREASTACK")
    print("images_quantity.info Images can be used in containers with "
          "`docker container create --name <NAME> <IMAGE_ID >` or "
          "`docker container run --name <NAME> <IMAGE_ID> <COMMAND>`")

    print("dangling_quantity.label Dangling images")
    print("dangling_quantity.draw AREASTACK")
    # The trailing space after the first command keeps it from running
    # straight into "or tagged with" once the literals are concatenated.
    print("dangling_quantity.info Dangling images can be deleted with "
          "`docker image prune` "
          "or tagged with `docker image tag <IMAGE_ID> <NAME>`")
    print("dangling_quantity.warning 10")
|
||
|
|
||
|
|
||
|
def volumes(client, mode):
    """Emit the "Docker volumes" graph.

    With ``mode == "config"`` the graph and field definitions are printed;
    for any other mode the number of volumes and a comma-separated list of
    their ``volume_summary`` strings are printed.
    """
    if mode != "config":
        # Read the attribute once so both lines describe the same listing.
        known_volumes = client.volumes
        print('volumes_quantity.value', len(known_volumes))
        print('volumes_quantity.extinfo', ', '.join(volume_summary(v) for v in known_volumes))
        return

    print("graph_title Docker volumes")
    print("graph_vlabel volumes")
    print("graph_category virtualization")
    print("volumes_quantity.label Volumes")
    print("volumes_quantity.draw AREASTACK")
    print("volumes_quantity.info Unused volumes can be deleted with "
          "`docker volume prune`")
|
||
|
|
||
|
|
||
|
def cpu(client, mode):
    """Emit the "Docker containers CPU usage" graph.

    With ``mode == "config"`` the graph definition and one stacked field per
    container in ``client.all_containers`` are printed; the graph's upper
    limit is 100 times the CPU count. For any other mode the current values
    are printed through ``print_containers_cpu(client)``.
    """
    if mode != "config":
        print_containers_cpu(client)
        return

    # os.cpu_count() may return None; fall back to a single CPU.
    graphlimit = str((os.cpu_count() or 1) * 100)
    print("graph_title Docker containers CPU usage")
    print("graph_args --base 1000 -r --lower-limit 0 --upper-limit " + graphlimit)
    print("graph_scale no")
    print("graph_period second")
    print("graph_vlabel CPU usage (%)")
    print("graph_category virtualization")
    print("graph_info This graph shows docker container CPU usage.")
    print("graph_total Total CPU usage")

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)
        print(f"{fieldname}.label {container.name}")
        print(f"{fieldname}.draw AREASTACK")
        print(f"{fieldname}.info {container_attributes(container)}")
|
||
|
|
||
|
|
||
|
def network(client, mode):
    """Emit the "Docker containers network usage" graph.

    With ``mode == "config"`` the graph definition and, per container in
    ``client.all_containers``, a hidden ``_down`` field and a visible ``_up``
    field (which names ``_down`` as its negative) are printed. Both fields
    are DERIVE counters multiplied by 8 through their cdef. For any other
    mode the current values are printed through
    ``print_containers_network(client)``.
    """
    if mode != "config":
        print_containers_network(client)
        return

    print("graph_title Docker containers network usage")
    print("graph_args --base 1024 -l 0")
    print("graph_vlabel bits in (-) / out (+) per ${graph_period}")
    print("graph_category virtualization")
    print("graph_info This graph shows docker container network usage.")
    print("graph_total Total network usage")

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)

        print(f"{fieldname}_down.label {container.name}_received")
        print(f"{fieldname}_down.type DERIVE")
        print(f"{fieldname}_down.min 0")
        print(f"{fieldname}_down.graph no")
        print(f"{fieldname}_down.cdef {fieldname}_down,8,*")

        print(f"{fieldname}_up.label {container.name}")
        print(f"{fieldname}_up.draw LINESTACK1")
        print(f"{fieldname}_up.type DERIVE")
        print(f"{fieldname}_up.min 0")
        print(f"{fieldname}_up.negative {fieldname}_down")
        print(f"{fieldname}_up.cdef {fieldname}_up,8,*")
        print(f"{fieldname}_up.info {container_attributes(container)}")
|
||
|
|
||
|
|
||
|
def memory(client, mode):
    """Emit the "Docker containers memory usage" graph.

    With ``mode == "config"`` the graph definition and one stacked field per
    container in ``client.all_containers`` are printed; for any other mode
    the current values are printed through ``print_containers_memory(client)``.
    """
    if mode != "config":
        print_containers_memory(client)
        return

    print("graph_title Docker containers memory usage")
    print("graph_args --base 1024 -l 0")
    print("graph_vlabel Bytes")
    print("graph_category virtualization")
    print("graph_info This graph shows docker container memory usage.")
    print("graph_total Total memory usage")

    for container in client.all_containers:
        fieldname = clean_fieldname(container.name)
        print(f"{fieldname}.label {container.name}")
        print(f"{fieldname}.draw AREASTACK")
        print(f"{fieldname}.info {container_attributes(container)}")
|
||
|
|
||
|
|
||
|
def main():
    """Entry point: work out the requested series and mode, then emit the output.

    The mode is the first command-line argument ("" when absent). The series
    is taken from the script's file name: the text after "docker_", up to the
    next underscore. "multi" emits every series as a multigraph.

    Exits with status 1 when the docker client cannot be set up (outside of
    autoconf) or when the series is unknown.
    """
    # Series name -> function emitting it. Insertion order is the multigraph order.
    series = {
        'containers': containers,
        'cpu': cpu,
        'images': images,
        'memory': memory,
        'network': network,
        'status': status,
        'volumes': volumes,
    }

    mode = sys.argv[1] if len(sys.argv) > 1 else ""

    # Only the file name is inspected, so a directory containing "docker_"
    # cannot be mistaken for the series, and a name without "docker_" gives
    # an empty series instead of raising IndexError.
    script_name = os.path.basename(sys.argv[0])
    _, marker, suffix = script_name.partition("docker_")
    wildcard = suffix.split("_")[0] if marker else ""

    try:
        import docker
        client = docker.from_env()
        if mode == "autoconf":
            client.ping()
            print('yes')
            sys.exit(0)
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception, so the
        # successful autoconf exit above is not caught here.
        print(f'no ({e})')
        if mode == "autoconf":
            sys.exit(0)
        sys.exit(1)

    if mode == "suggest":
        # The multigraph covers all other graphs,
        # so we only need to suggest one
        print("multi")
        sys.exit(0)

    client = ClientWrapper(client,
                           exclude_re=os.getenv('EXCLUDE_CONTAINER_NAME'))

    if wildcard in series:
        series[wildcard](client, mode)
    elif wildcard == 'multi':
        for name, emit in series.items():
            print(f'multigraph docker_{name}')
            emit(client, mode)
    else:
        print(f'unknown series ({wildcard})', file=sys.stderr)
        sys.exit(1)
|
||
|
|
||
|
|
||
|
# Run the plugin only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|