remove zfs-health.sh, configure zfs-zed (#609)

Reviewed-on: #609
Co-authored-by: Michael Grote <michael.grote@posteo.de>
Co-committed-by: Michael Grote <michael.grote@posteo.de>
This commit is contained in:
Michael Grote 2023-11-24 12:54:24 +01:00 committed by mg
parent b61a027163
commit 92eb30ccb3
11 changed files with 27 additions and 145 deletions

View File

@ -6,7 +6,6 @@
- { role: mgrote_zfs_manage_datasets, tags: "datasets" }
- { role: mgrote_zfs_scrub, tags: "zfs_scrub" }
- { role: mgrote_zfs_zed, tags: "zfs_zed" }
- { role: mgrote_zfs_health, tags: "zfs_health" }
- { role: mgrote_zfs_sanoid, tags: "sanoid" }
- { role: mgrote_smart, tags: "smart" }
- { role: mgrote_pbs_users, tags: "pbs_users" }

View File

@ -6,7 +6,6 @@
- { role: mgrote_zfs_manage_datasets, tags: "datasets" }
- { role: mgrote_zfs_scrub, tags: "zfs_scrub" }
- { role: mgrote_zfs_zed, tags: "zfs_zed" }
- { role: mgrote_zfs_health, tags: "zfs_health" }
- { role: mgrote_zfs_sanoid, tags: "sanoid" }
- { role: mgrote_smart, tags: "smart" }
- { role: mgrote_cv4pve_autosnap, tags: "cv4pve" }

View File

@ -1,12 +0,0 @@
## mgrote.zfs_health
### Beschreibung
Richtet "zfs-health.sh" ein, ein ZFS-Checkscript, das bei Fehlern auch Mails versendet.
### getestet auf
- [x] ProxMox 6.1
### Variablen + Defaults
- see [defaults](./defaults/main.yml)
- Variablen für ``mgrote.zfs_health/trim/scrub/zed/arc_mem/`` sind zusammengefasst unter zfs_extra_*

View File

@ -1,7 +0,0 @@
---
### when should the script be run
zfs_extra_cron_minutes_zfs_health: "0"
zfs_extra_cron_hours_zfs_health: "*"
### under which user the script is run
zfs_health_user_group: "root"
zfs_health_user: "zfs-health"

View File

@ -1,22 +0,0 @@
---
# mgrote.zfs_health role entry point:
# 1. create the dedicated service user/group,
# 2. deploy the templated health-check script,
# 3. schedule it via cron under that user.
- name: include user tasks
  ansible.builtin.include_tasks: user.yml

- name: template script
  become: true
  ansible.builtin.template:
    src: zfs-health.sh
    dest: /usr/local/bin/zfs-health.sh
    # owner may execute; group/world read-only so the cron user can run it
    mode: "0744"
    owner: "{{ zfs_health_user }}"
    group: "{{ zfs_health_user_group }}"

- name: ensure cronjob exists
  become: true
  ansible.builtin.cron:
    name: zfs_health
    state: present
    job: "/usr/local/bin/zfs-health.sh"
    minute: "{{ zfs_extra_cron_minutes_zfs_health }}"
    hour: "{{ zfs_extra_cron_hours_zfs_health }}"
    user: "{{ zfs_health_user }}"

View File

@ -1,19 +0,0 @@
---
# Create the group and unprivileged system user that own and run the
# ZFS health-check script. Both tasks are guarded so the role is a no-op
# when the corresponding variables are not defined.
- name: ensure group exists
  become: true
  ansible.builtin.group:
    name: "{{ zfs_health_user_group }}"
    state: present
  when:
    - zfs_health_user_group is defined

- name: ensure user exists
  become: true
  ansible.builtin.user:
    name: "{{ zfs_health_user }}"
    group: "{{ zfs_health_user_group }}"
    # no interactive login and no home directory: service account only
    shell: /usr/sbin/nologin
    create_home: false
  when:
    - zfs_health_user_group is defined
    - zfs_health_user is defined

View File

@ -1,48 +0,0 @@
#! /bin/bash
{{ file_header | default () }}
# Check ZFS pool health and per-device error counters. On any problem, log
# the subject line via syslog and mail the full pool status to the address
# templated in by Ansible.
problems=0
emailSubject="$(hostname) - ZFS pool - HEALTH check"
# NOTE(review): emailMessage is never used below; kept for compatibility
# with the original script.
emailMessage=""

# Health - Check if all zfs volumes are in good condition. We are looking for
# any keyword signifying a degraded or broken array.
condition=$(/sbin/zpool status | grep -E -i '(DEGRADED|FAULTED|OFFLINE|UNAVAIL|REMOVED|FAIL|DESTROYED|corrupt|cannot|unrecover)')
if [ -n "${condition}" ]; then
    emailSubject="$emailSubject - fault"
    problems=1
fi

# Errors - Check the columns for READ, WRITE and CKSUM (checksum) drive errors
# on all volumes and all drives using "zpool status". If any non-zero errors
# are reported an email will be sent out. You should then look to replace the
# faulty drive and run "zpool scrub" on the affected volume after resilvering.
if [ ${problems} -eq 0 ]; then
    errors=$(/sbin/zpool status | grep ONLINE | grep -v state | awk '{print $3 $4 $5}' | grep -v 000)
    if [ -n "${errors}" ]; then
        emailSubject="$emailSubject - Drive Errors"
        problems=1
    fi
fi

# Notifications - On any problems send email with drive status information and
# capacities including a helpful subject line to root. Also use logger to write
# the email subject to the local logs. This is the place you may want to put
# any other notifications like:
#
# + Update an anonymous twitter account with your ZFS status (https://twitter.com/zfsmonitor)
# + Playing a sound file or beep the internal speaker
# + Update Nagios, Cacti, Zabbix, Munin or even BigBrother
if [ "$problems" -ne 0 ]; then
    # quote the subject so logger receives it as-is, without word splitting
    logger "$emailSubject"
    echo -e "$emailSubject \n\n\n $(/sbin/zpool list) \n\n\n $(/sbin/zpool status)" | mail -s "$emailSubject" {{ my_mail }}
fi
### EOF ###

View File

@ -0,0 +1,6 @@
---
# Defaults for the mgrote_zfs_zed role: values templated into
# /etc/zfs/zed.d/zed.rc (ZFS Event Daemon notification settings).
# NOTE(review): "bettween" is a typo, but the variable is referenced by name
# in the zed.rc.j2 template (ZED_NOTIFY_INTERVAL_SECS) — rename both together.
zed_time_bettween_warning_s: 3600 # in seconds
# recipient address for zed notification mails
zed_mail_to: "{{ my_mail }}"
# NOTE(review): quoting is inconsistent with the two unquoted ints below;
# zed.rc consumes all three as plain text, so align the style when touching this.
zed_notify_verbosity: "1" # If set to 0, suppress notification if the pool is healthy. If set to 1, send notification regardless of pool health.
zed_notify_data: 1 # Send notifications for 'ereport.fs.zfs.data' events. Disabled by default, any non-empty value will enable the feature.
zed_scrub_after_resilver: 1 # Run a scrub after every resilver. Disabled by default, 1 to enable and 0 to disable.

View File

@ -1,5 +1,8 @@
---
- name: testmail # noqa no-changed-when
ansible.builtin.shell:
cmd: "set -o pipefail && echo 'zed ist eingerichtet' | mail -s '{{ ansible_hostname }} - zed' '{{ my_mail }}'"
executable: /bin/bash
- name: Restart zfs-zed.service
become: true
ansible.builtin.systemd:
name: "zfs-zed.service"
enabled: true
masked: false
state: restarted

View File

@ -1,9 +1,10 @@
---
- name: kopiere zed.rc
- name: Template "zed.rc"
become: true
ansible.builtin.template:
owner: root
group: root
mode: "0600"
src: zed.rc
src: zed.rc.j2
dest: /etc/zfs/zed.d/zed.rc
notify: testmail
notify: Restart zfs-zed.service

View File

@ -16,14 +16,14 @@
# Email will only be sent if ZED_EMAIL_ADDR is defined.
# Disabled by default; uncomment to enable.
#
ZED_EMAIL_ADDR="{{ my_mail }}"
ZED_EMAIL_ADDR="{{ zed_mail_to }}"
##
# Name or path of executable responsible for sending notifications via email;
# the mail program must be capable of reading a message body from stdin.
# Email will only be sent if ZED_EMAIL_ADDR is defined.
#
#ZED_EMAIL_PROG="mail"
ZED_EMAIL_PROG="mail"
##
# Command-line options for ZED_EMAIL_PROG.
@ -32,57 +32,39 @@ ZED_EMAIL_ADDR="{{ my_mail }}"
# this should be protected with quotes to prevent word-splitting.
# Email will only be sent if ZED_EMAIL_ADDR is defined.
#
#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
##
# Default directory for zed lock files.
#
#ZED_LOCKDIR="/var/lock"
ZED_LOCKDIR="/var/lock"
##
# Minimum number of seconds between notifications for a similar event.
#
ZED_NOTIFY_INTERVAL_SECS=3600
ZED_NOTIFY_INTERVAL_SECS={{ zed_time_bettween_warning_s }}
##
# Notification verbosity.
# If set to 0, suppress notification if the pool is healthy.
# If set to 1, send notification regardless of pool health.
#
ZED_NOTIFY_VERBOSE=1
ZED_NOTIFY_VERBOSE={{ zed_notify_verbosity }}
##
# Send notifications for 'ereport.fs.zfs.data' events.
# Disabled by default, any non-empty value will enable the feature.
#
ZED_NOTIFY_DATA=1
##
# Pushbullet access token.
# This grants full access to your account -- protect it accordingly!
# <https://www.pushbullet.com/get-started>
# <https://www.pushbullet.com/account>
# Disabled by default; uncomment to enable.
#
#ZED_PUSHBULLET_ACCESS_TOKEN=""
##
# Pushbullet channel tag for push notification feeds that can be subscribed to.
# <https://www.pushbullet.com/my-channel>
# If not defined, push notifications will instead be sent to all devices
# associated with the account specified by the access token.
# Disabled by default; uncomment to enable.
#
#ZED_PUSHBULLET_CHANNEL_TAG=""
ZED_NOTIFY_DATA={{ zed_notify_data }}
##
# Default directory for zed state files.
#
#ZED_RUNDIR="/var/run"
ZED_RUNDIR="/var/run"
##
# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
# device mapper and multipath devices as well. Your enclosure must be
# device mapper and multipath devices as well. Your enclosure must be
# supported by the Linux SES driver for this to work.
#
ZED_USE_ENCLOSURE_LEDS=1
@ -90,7 +72,7 @@ ZED_USE_ENCLOSURE_LEDS=1
##
# Run a scrub after every resilver
# Disabled by default, 1 to enable and 0 to disable.
ZED_SCRUB_AFTER_RESILVER=1
ZED_SCRUB_AFTER_RESILVER={{ zed_scrub_after_resilver }}
##
# The syslog priority (e.g., specified as a "facility.level" pair).