From 97abacccc3105677a587c782bf57e128dbd706b7 Mon Sep 17 00:00:00 2001
From: Jan Schulz-Hofen
Date: Mon, 25 Dec 2017 19:34:33 +0100
Subject: [PATCH] Check hard drive health using S.M.A.R.T.

---
Notes:
    check_smart_status now traps every smartctl call, skips blank and
    commented-out lines of the scan output, reads smartctl's exit status
    as a bit mask, and no longer assumes that the data section header is
    present in the health output. Hunk line counts and the diffstat were
    updated accordingly; the commit id and blob ids were not regenerated.

 management/status_checks.py | 49 ++++++++++++++++++++++++++++++++++++++
 setup/system.sh             | 17 +++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/management/status_checks.py b/management/status_checks.py
index a9d0595c..57599ed2 100755
--- a/management/status_checks.py
+++ b/management/status_checks.py
@@ -165,6 +165,7 @@ def run_system_checks(rounded_values, env, output):
 	check_miab_version(env, output)
 	check_system_aliases(env, output)
 	check_free_disk_space(rounded_values, env, output)
+	check_smart_status(env, output)
 	check_free_memory(rounded_values, env, output)
 
 def check_ufw(env, output):
@@ -250,6 +251,54 @@ def check_free_disk_space(rounded_values, env, output):
 		if rounded_values: disk_msg = "The disk has less than 15% free space."
 		output.print_error(disk_msg)
 
+def check_smart_status(env, output):
+	# Check the health of every hard drive that smartctl (from smartmontools)
+	# can find. The calls below rely on shell(..., trap=True) handing back an
+	# (exit code, output) pair rather than failing on a non-zero exit code.
+	code, devices = shell('check_output', ['smartctl', '--scan-open'], trap=True)
+	if code != 0:
+		output.print_warning('Could not scan for hard drives to check their S.M.A.R.T. status (smartctl exit code %d).' % code)
+		return
+
+	for line in devices.splitlines():
+		# The device path is the first field of each line. Skip blank lines and
+		# lines that are entirely a '#' comment, which have no usable path.
+		fields = line.split('#', 1)[0].split()
+		if not fields:
+			continue
+		device = fields[0]
+
+		# The exit code is ignored here: what matters is what smartctl reports.
+		_, info = shell('check_output', ['smartctl', '-i', device], trap=True)
+		if 'SMART support is: Available' not in info:
+			output.print_ok('Disk %s does not support S.M.A.R.T. Health checks are skipped.' % device)
+			continue
+		if 'SMART support is: Enabled' not in info:
+			output.print_warning('Disk %s supports S.M.A.R.T, but it is disabled. You should activate it using \'sudo smartctl -s on %s\'.' % (device, device))
+			continue
+
+		code, health = shell('check_output', ['smartctl', '-H', device, '--quietmode=errorsonly'], trap=True)
+		if code == 0:
+			output.print_ok('Disk %s passed all S.M.A.R.T. checks and seems healthy.' % device)
+			continue
+
+		# smartctl's exit status is a bit mask (see smartctl(8)): bits 3 and 4
+		# report a failing disk, bits 0-2 mean the check itself could not be
+		# run, and bits 5-7 only report problems logged in the past.
+		if code & 0b00011000:
+			output.print_error('Disk %s failed the S.M.A.R.T health check. Consider replacing the hard drive. Detailed information:' % device)
+		elif code & 0b00000111:
+			output.print_warning('The S.M.A.R.T. health check of disk %s could not be completed (smartctl exit code %d). Detailed information:' % (device, code))
+		else:
+			output.print_warning('Disk %s passed the S.M.A.R.T. health check, but has reported problems in the past (smartctl exit code %d). Detailed information:' % (device, code))
+		output.print_line("")
+
+		# Show only the health section when smartctl printed its header;
+		# otherwise show everything we got instead of failing.
+		details = health.partition('=== START OF READ SMART DATA SECTION ===')[2] or health
+		for line in details.splitlines():
+			output.print_line(line)
+
 def check_free_memory(rounded_values, env, output):
 	# Check free memory.
 	percent_free = 100 - psutil.virtual_memory().percent
diff --git a/setup/system.sh b/setup/system.sh
index 28043b16..6c522750 100755
--- a/setup/system.sh
+++ b/setup/system.sh
@@ -357,3 +357,20 @@ cp -f conf/fail2ban/filter.d/* /etc/fail2ban/filter.d/
 # scripts will ensure the files exist and then fail2ban is given another
 # restart at the very end of setup.
 restart_service fail2ban
+
+# ### S.M.A.R.T. Monitoring Tools
+
+# Install and configure smartmontools so we can check for failing hard drives.
+# We'll perform a first single smartd startup to check if any devices are found
+# which can be monitored. If none are found (virtual machines!), we will configure
+# smartd to not start automatically. Otherwise, smartd will run.
+apt_install smartmontools
+smartd --quit=onecheck > /dev/null 2>&1 || { smart=$? ; }
+if [ "${smart:-0}" -ne 17 ]; then # smartd manpage: 17 means smartd didn't find any devices to monitor.
+	echo "S.M.A.R.T. capable hard drives found, setting up smartd..."
+	tools/editconf.py /etc/default/smartmontools start_smartd=yes
+	restart_service smartmontools
+else
+	echo "No S.M.A.R.T. capable hard drives found, disabling smartd..."
+	tools/editconf.py /etc/default/smartmontools start_smartd=no
+fi