From 92eb30ccb3d1179b3de30532c46a95036594f6fd Mon Sep 17 00:00:00 2001
From: Michael Grote <michael.grote@posteo.de>
Date: Fri, 24 Nov 2023 12:54:24 +0100
Subject: [PATCH] remove zfs-health.sh, configure zfs-zed (#609)

Reviewed-on: https://git.mgrote.net/mg/homeserver/pulls/609
Co-authored-by: Michael Grote <michael.grote@posteo.de>
Co-committed-by: Michael Grote <michael.grote@posteo.de>
---
 playbooks/3_service/pbs.yml                   |  1 -
 playbooks/3_service/pve.yml                   |  1 -
 roles/mgrote_zfs_health/README.md             | 12 -----
 roles/mgrote_zfs_health/defaults/main.yml     |  7 ---
 roles/mgrote_zfs_health/tasks/main.yml        | 22 ---------
 roles/mgrote_zfs_health/tasks/user.yml        | 19 --------
 .../mgrote_zfs_health/templates/zfs-health.sh | 48 -------------------
 roles/mgrote_zfs_zed/defaults/main.yml        |  6 +++
 roles/mgrote_zfs_zed/handlers/main.yml        | 11 +++--
 roles/mgrote_zfs_zed/tasks/main.yml           |  7 +--
 .../templates/{zed.rc => zed.rc.j2}           | 38 ++++-----------
 11 files changed, 27 insertions(+), 145 deletions(-)
 delete mode 100644 roles/mgrote_zfs_health/README.md
 delete mode 100644 roles/mgrote_zfs_health/defaults/main.yml
 delete mode 100644 roles/mgrote_zfs_health/tasks/main.yml
 delete mode 100644 roles/mgrote_zfs_health/tasks/user.yml
 delete mode 100644 roles/mgrote_zfs_health/templates/zfs-health.sh
 create mode 100644 roles/mgrote_zfs_zed/defaults/main.yml
 rename roles/mgrote_zfs_zed/templates/{zed.rc => zed.rc.j2} (72%)

diff --git a/playbooks/3_service/pbs.yml b/playbooks/3_service/pbs.yml
index 7abd58fe..d4f25e4d 100644
--- a/playbooks/3_service/pbs.yml
+++ b/playbooks/3_service/pbs.yml
@@ -6,7 +6,6 @@
     - { role: mgrote_zfs_manage_datasets, tags: "datasets" }
     - { role: mgrote_zfs_scrub, tags: "zfs_scrub" }
     - { role: mgrote_zfs_zed, tags: "zfs_zed" }
-    - { role: mgrote_zfs_health, tags: "zfs_health" }
     - { role: mgrote_zfs_sanoid, tags: "sanoid" }
     - { role: mgrote_smart, tags: "smart" }
     - { role: mgrote_pbs_users, tags: "pbs_users" }
diff --git a/playbooks/3_service/pve.yml b/playbooks/3_service/pve.yml
index 96b0b085..2942574e 100644
--- a/playbooks/3_service/pve.yml
+++ b/playbooks/3_service/pve.yml
@@ -6,7 +6,6 @@
     - { role: mgrote_zfs_manage_datasets, tags: "datasets" }
     - { role: mgrote_zfs_scrub, tags: "zfs_scrub" }
     - { role: mgrote_zfs_zed, tags: "zfs_zed" }
-    - { role: mgrote_zfs_health, tags: "zfs_health" }
     - { role: mgrote_zfs_sanoid, tags: "sanoid" }
     - { role: mgrote_smart, tags: "smart" }
     - { role: mgrote_cv4pve_autosnap, tags: "cv4pve" }
diff --git a/roles/mgrote_zfs_health/README.md b/roles/mgrote_zfs_health/README.md
deleted file mode 100644
index 3de1f6d8..00000000
--- a/roles/mgrote_zfs_health/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-## mgrote.zfs_health
-
-### Beschreibung
-Richtet "zfs_health.sh", ein ZFS-Checkscript das auch Mails versendet bei Fehlern.
-
-
-### getestet auf
-- [x] ProxMox 6.1
-
-### Variablen + Defaults
-- see [defaults](./defaults/main.yml)
-- Variablen für ``mgrote.zfs_health/trim/scrub/zed/arc_mem/`` sind zusammengefasst unter zfs_extra_*
diff --git a/roles/mgrote_zfs_health/defaults/main.yml b/roles/mgrote_zfs_health/defaults/main.yml
deleted file mode 100644
index 5b4266fa..00000000
--- a/roles/mgrote_zfs_health/defaults/main.yml
+++ /dev/null
@@ -1,7 +0,0 @@
----
-### when should the script be run
-zfs_extra_cron_minutes_zfs_health: "0"
-zfs_extra_cron_hours_zfs_health: "*"
-### under which user the script is run
-zfs_health_user_group: "root"
-zfs_health_user: "zfs-health"
diff --git a/roles/mgrote_zfs_health/tasks/main.yml b/roles/mgrote_zfs_health/tasks/main.yml
deleted file mode 100644
index 55af11a5..00000000
--- a/roles/mgrote_zfs_health/tasks/main.yml
+++ /dev/null
@@ -1,22 +0,0 @@
----
-- name: include user tasks
-  ansible.builtin.include_tasks: user.yml
-
-- name: template script
-  become: true
-  ansible.builtin.template:
-    src: zfs-health.sh
-    dest: /usr/local/bin/zfs-health.sh
-    mode: "0744"
-    owner: "{{ zfs_health_user }}"
-    group: "{{ zfs_health_user_group }}"
-
-- name: ensure cronjob exists
-  become: true
-  ansible.builtin.cron:
-    name: zfs_health
-    state: present
-    job: "/usr/local/bin/zfs-health.sh"
-    minute: "{{ zfs_extra_cron_minutes_zfs_health }}"
-    hour: "{{ zfs_extra_cron_hours_zfs_health }}"
-    user: "{{ zfs_health_user }}"
diff --git a/roles/mgrote_zfs_health/tasks/user.yml b/roles/mgrote_zfs_health/tasks/user.yml
deleted file mode 100644
index 5d1bf383..00000000
--- a/roles/mgrote_zfs_health/tasks/user.yml
+++ /dev/null
@@ -1,19 +0,0 @@
----
-- name: ensure group exists
-  become: true
-  ansible.builtin.group:
-    name: "{{ zfs_health_user_group }}"
-    state: present
-  when:
-    - zfs_health_user_group is defined
-
-- name: ensure user exists
-  become: true
-  ansible.builtin.user:
-    name: "{{ zfs_health_user }}"
-    group: "{{ zfs_health_user_group }}"
-    shell: /usr/sbin/nologin
-    create_home: false
-  when:
-    - zfs_health_user_group is defined
-    - zfs_health_user is defined
diff --git a/roles/mgrote_zfs_health/templates/zfs-health.sh b/roles/mgrote_zfs_health/templates/zfs-health.sh
deleted file mode 100644
index 340c2f04..00000000
--- a/roles/mgrote_zfs_health/templates/zfs-health.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#! /bin/bash
-{{ file_header | default () }}
-
-problems=0
-emailSubject="`hostname` - ZFS pool - HEALTH check"
-emailMessage=""
-
-# Health - Check if all zfs volumes are in good condition. We are looking for
-# any keyword signifying a degraded or broken array.
-
-condition=$(/sbin/zpool status | egrep -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
-if [ "${condition}" ]; then
-  emailSubject="$emailSubject - fault"
-  problems=1
-fi
-
-
-
-# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
-# on all volumes and all drives using "zpool status". If any non-zero errors
-# are reported an email will be sent out. You should then look to replace the
-# faulty drive and run "zpool scrub" on the affected volume after resilvering.
-
-if [ ${problems} -eq 0 ]; then
-  errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
-  if [ "${errors}" ]; then
-    emailSubject="$emailSubject - Drive Errors"
-    problems=1
-  fi
-fi
-
-
-
-# Notifications - On any problems send email with drive status information and
-# capacities including a helpful subject line to root. Also use logger to write
-# the email subject to the local logs. This is the place you may want to put
-# any other notifications like:
-#
-# + Update an anonymous twitter account with your ZFS status (https://twitter.com/zfsmonitor)
-# + Playing a sound file or beep the internal speaker
-# + Update Nagios, Cacti, Zabbix, Munin or even BigBrother
-
-if [ "$problems" -ne 0 ]; then
-  logger $emailSubject
-        echo -e "$emailSubject \n\n\n `/sbin/zpool list` \n\n\n `/sbin/zpool status`" | mail -s "$emailSubject" {{ my_mail }}
-fi
-
-### EOF ###
diff --git a/roles/mgrote_zfs_zed/defaults/main.yml b/roles/mgrote_zfs_zed/defaults/main.yml
new file mode 100644
index 00000000..3ce80847
--- /dev/null
+++ b/roles/mgrote_zfs_zed/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+zed_time_bettween_warning_s: 3600 # rendered into ZED_NOTIFY_INTERVAL_SECS: minimum number of seconds between notifications for a similar event. NOTE(review): "bettween" is misspelled, but zed.rc.j2 references this exact name - rename both together.
+zed_mail_to: "{{ my_mail }}" # rendered into ZED_EMAIL_ADDR (notification recipient). my_mail is not defined in these defaults - presumably set in inventory/group vars; verify.
+zed_notify_verbosity: "1" # rendered into ZED_NOTIFY_VERBOSE. If set to 0, suppress notification if the pool is healthy. If set to 1, send notification regardless of pool health.
+zed_notify_data: 1 # rendered into ZED_NOTIFY_DATA. Send notifications for 'ereport.fs.zfs.data' events. Disabled by default in zed itself; any non-empty value will enable the feature (NOTE(review): by that wording 0 would not disable it - confirm).
+zed_scrub_after_resilver: 1 # rendered into ZED_SCRUB_AFTER_RESILVER. Run a scrub after every resilver. Disabled by default in zed itself, 1 to enable and 0 to disable.
diff --git a/roles/mgrote_zfs_zed/handlers/main.yml b/roles/mgrote_zfs_zed/handlers/main.yml
index a7a3113c..6c872a05 100644
--- a/roles/mgrote_zfs_zed/handlers/main.yml
+++ b/roles/mgrote_zfs_zed/handlers/main.yml
@@ -1,5 +1,8 @@
 ---
-- name: testmail # noqa no-changed-when
-  ansible.builtin.shell:
-    cmd: "set -o pipefail && echo 'zed ist eingerichtet' | mail -s '{{ ansible_hostname }} - zed' '{{ my_mail }}'"
-    executable: /bin/bash
+- name: Restart zfs-zed.service
+  become: true
+  ansible.builtin.systemd:
+    name: "zfs-zed.service"
+    enabled: true  # also ensure the unit is enabled so it starts at boot
+    masked: false  # ensure the unit is not masked
+    state: restarted  # notified by the zed.rc template task so a changed config takes effect
diff --git a/roles/mgrote_zfs_zed/tasks/main.yml b/roles/mgrote_zfs_zed/tasks/main.yml
index 69e817df..d286af4f 100644
--- a/roles/mgrote_zfs_zed/tasks/main.yml
+++ b/roles/mgrote_zfs_zed/tasks/main.yml
@@ -1,9 +1,10 @@
 ---
-- name: kopiere zed.rc
+- name: Template "zed.rc"
   become: true
   ansible.builtin.template:
     owner: root
+    group: root
     mode: "0600"
-    src: zed.rc
+    src: zed.rc.j2
     dest: /etc/zfs/zed.d/zed.rc
-  notify: testmail
+  notify: Restart zfs-zed.service  # handler runs only when the rendered file changed
diff --git a/roles/mgrote_zfs_zed/templates/zed.rc b/roles/mgrote_zfs_zed/templates/zed.rc.j2
similarity index 72%
rename from roles/mgrote_zfs_zed/templates/zed.rc
rename to roles/mgrote_zfs_zed/templates/zed.rc.j2
index 34687cd7..9f1d6d5e 100644
--- a/roles/mgrote_zfs_zed/templates/zed.rc
+++ b/roles/mgrote_zfs_zed/templates/zed.rc.j2
@@ -16,14 +16,14 @@
 # Email will only be sent if ZED_EMAIL_ADDR is defined.
 # Disabled by default; uncomment to enable.
 #
-ZED_EMAIL_ADDR="{{ my_mail }}"
+ZED_EMAIL_ADDR="{{ zed_mail_to }}"
 
 ##
 # Name or path of executable responsible for sending notifications via email;
 #   the mail program must be capable of reading a message body from stdin.
 # Email will only be sent if ZED_EMAIL_ADDR is defined.
 #
-#ZED_EMAIL_PROG="mail"
+ZED_EMAIL_PROG="mail"
 
 ##
 # Command-line options for ZED_EMAIL_PROG.
@@ -32,57 +32,39 @@ ZED_EMAIL_ADDR="{{ my_mail }}"
 #   this should be protected with quotes to prevent word-splitting.
 # Email will only be sent if ZED_EMAIL_ADDR is defined.
 #
-#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
+ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
 
 ##
 # Default directory for zed lock files.
 #
-#ZED_LOCKDIR="/var/lock"
+ZED_LOCKDIR="/var/lock"
 
 ##
 # Minimum number of seconds between notifications for a similar event.
 #
-ZED_NOTIFY_INTERVAL_SECS=3600
+ZED_NOTIFY_INTERVAL_SECS={{ zed_time_bettween_warning_s }}
 
 ##
 # Notification verbosity.
 #   If set to 0, suppress notification if the pool is healthy.
 #   If set to 1, send notification regardless of pool health.
 #
-ZED_NOTIFY_VERBOSE=1
+ZED_NOTIFY_VERBOSE={{ zed_notify_verbosity }}
 
 ##
 # Send notifications for 'ereport.fs.zfs.data' events.
 # Disabled by default, any non-empty value will enable the feature.
 #
-ZED_NOTIFY_DATA=1
-
-##
-# Pushbullet access token.
-# This grants full access to your account -- protect it accordingly!
-#   <https://www.pushbullet.com/get-started>
-#   <https://www.pushbullet.com/account>
-# Disabled by default; uncomment to enable.
-#
-#ZED_PUSHBULLET_ACCESS_TOKEN=""
-
-##
-# Pushbullet channel tag for push notification feeds that can be subscribed to.
-#   <https://www.pushbullet.com/my-channel>
-# If not defined, push notifications will instead be sent to all devices
-#   associated with the account specified by the access token.
-# Disabled by default; uncomment to enable.
-#
-#ZED_PUSHBULLET_CHANNEL_TAG=""
+ZED_NOTIFY_DATA={{ zed_notify_data }}
 
 ##
 # Default directory for zed state files.
 #
-#ZED_RUNDIR="/var/run"
+ZED_RUNDIR="/var/run"
 
 ##
 # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED.  This works for
-# device mapper and multipath devices as well.  Your enclosure must be
+# device mapper and multipath devices as well. Your enclosure must be
 # supported by the Linux SES driver for this to work.
 #
 ZED_USE_ENCLOSURE_LEDS=1
@@ -90,7 +72,7 @@ ZED_USE_ENCLOSURE_LEDS=1
 ##
 # Run a scrub after every resilver
 # Disabled by default, 1 to enable and 0 to disable.
-ZED_SCRUB_AFTER_RESILVER=1
+ZED_SCRUB_AFTER_RESILVER={{ zed_scrub_after_resilver }}
 
 ##
 # The syslog priority (e.g., specified as a "facility.level" pair).