race condition between backups and status checks - connection refused

At the end of the backup, wait a bit for dovecot and postfix to finish restarting.

Hopefully fixes #381.
This commit is contained in:
Joshua Tauberer 2015-04-29 21:06:38 +00:00
parent c03e00035f
commit febfa72d60
2 changed files with 24 additions and 1 deletions

View File

@ -11,7 +11,7 @@
import os, os.path, shutil, glob, re, datetime
import dateutil.parser, dateutil.relativedelta, dateutil.tz
from utils import exclusive_process, load_environment, shell
from utils import exclusive_process, load_environment, shell, wait_for_service
# Destroy backups when the most recent increment in the chain
# that depends on it is this many days old.
@ -242,6 +242,13 @@ def perform_backup(full_backup):
['su', env['STORAGE_USER'], '-c', post_script],
env=env)
# Our nightly cron job executes system status checks immediately after this
# backup. Since it checks that dovecot and postfix are running, block for a
# bit (maximum of 10 seconds each) to give each a chance to finish restarting
# before the status checks might catch them down. See #381.
wait_for_service(25, True, env, 10)
wait_for_service(993, True, env, 10)
def run_duplicity_verification():
env = load_environment()
backup_root = os.path.join(env["STORAGE_ROOT"], 'backup')

View File

@ -184,3 +184,19 @@ def du(path):
seen.add(stat.st_ino)
total_size += stat.st_size
return total_size
def wait_for_service(port, public, env, timeout):
    """Block until a TCP service on `port` accepts a connection, or give up.

    Arguments:
      port    -- TCP port number to probe.
      public  -- if true, connect to env['PUBLIC_IP']; otherwise connect
                 to the loopback address 127.0.0.1.
      env     -- mapping that must contain 'PUBLIC_IP' when `public` is true.
      timeout -- overall deadline in seconds. Each individual connection
                 attempt is limited to timeout/3 seconds, and failed
                 attempts are separated by a pause of min(timeout/4, 1)
                 seconds.

    Returns True as soon as a connection succeeds. Returns False when an
    attempt fails and more than `timeout` seconds have passed since the
    call started (so the total wait can slightly exceed `timeout`).
    """
    import socket, time

    host = env['PUBLIC_IP'] if public else "127.0.0.1"
    start = time.perf_counter()
    while True:
        # Use the socket as a context manager so the descriptor is closed
        # after every attempt -- both the successful probe connection and
        # each failed retry -- rather than lingering until garbage collection.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout/3)
            try:
                s.connect((host, port))
                return True
            except OSError:
                # Connection refused, timed out, unreachable, etc.
                # Fall through to the deadline check and retry.
                pass
        if time.perf_counter() > start+timeout:
            return False
        time.sleep(min(timeout/4, 1))