def wait_for_service(port, public, env, timeout):
    """Block until a TCP service accepts connections, or give up.

    Repeatedly attempts a TCP connection to ``port`` until one succeeds or
    roughly ``timeout`` seconds have elapsed.

    Args:
        port: TCP port number to probe.
        public: If true, connect to ``env['PUBLIC_IP']``; otherwise connect
            to the loopback address ``127.0.0.1``.
        env: Mapping that provides ``'PUBLIC_IP'`` (only read when
            ``public`` is true).
        timeout: Overall time budget in seconds. Each connection attempt
            is given ``timeout/3`` seconds, and failed attempts are spaced
            by ``min(timeout/4, 1)`` seconds.

    Returns:
        True as soon as a connection succeeds; False if the deadline has
        passed after a failed attempt.
    """
    import socket
    import time

    host = env['PUBLIC_IP'] if public else "127.0.0.1"
    deadline = time.perf_counter() + timeout
    while True:
        # The context manager closes the probe socket on every path
        # (success, failure, unexpected exception) so that retries do not
        # accumulate open file descriptors.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout/3)
            try:
                s.connect((host, port))
                return True
            except OSError:
                # Refused, unreachable or timed out: fall through and
                # decide below whether to retry.
                pass
        if time.perf_counter() > deadline:
            return False
        # Short pause between attempts so we don't spin while the
        # service is still starting up.
        time.sleep(min(timeout/4, 1))