#!/usr/bin/python3

import argparse
import calendar
import gzip
import logging
import os
import pickle
import re
import shutil
import tempfile
import time
from collections import OrderedDict, defaultdict
from collections.abc import Iterable
from datetime import datetime, timedelta
from functools import partial, lru_cache
from statistics import mean, stdev

from dateutil import parser
from dateutil.relativedelta import relativedelta

import utils

MONTHS = dict((v, k) for k, v in enumerate(calendar.month_abbr))

KNOWN_SERVICES = (
    "anvil",
    "auth",
    "auth-worker",
    "config",  # Postfix config warning (anvil client limit warning encountered)
    "imap",
    "imap-login",
    "indexer",  # Dovecot restart
    "indexer-worker",  # Dovecot indexer-worker process
    "lmtp",
    "log",  # Dovecot restart
    "managesieve-login",
    "master",  # Dovecot restart
    "opendkim",
    "opendmarc",
    "pop3",
    "pop3-login",
    "postfix/anvil",
    "postfix/bounce",
    "postfix/cleanup",
    "postfix/lmtp",
    "postfix/master",
    "postfix/pickup",
    "postfix/qmgr",
    "postfix/scache",
    "postfix/smtp",
    "postfix/smtpd",
    "postfix/submission/smtpd",
    "postfix/tlsmgr",
    "postgrey",
    "spampd",
    "ssl-params",  # Dovecot restart
)

LOG_DIR = '/var/log/'

LOG_FILES = [
    'mail.log',
    'mail.log.1',
    'mail.log.2.gz',
    'mail.log.3.gz',
    'mail.log.4.gz',
    'mail.log.5.gz',
    'mail.log.6.gz',
]

HISTORY_FILE = os.path.expanduser('~/.cache/logscan.cache')
HISTORY_SIZE = 30  # The number of days of history to remember

# Regular expressions used for log line parsing
MAIN_REGEX = re.compile(r"(\w+[\s]+\d+ \d+:\d+:\d+) ([\w]+ )?([\w\-/]+)[^:]*: (.*)")
SENT_REGEX = re.compile(r"([A-Z0-9]+): client=(\S+), sasl_method=(PLAIN|LOGIN), sasl_username=(\S+)")
RECV_REGEX = re.compile(r"([A-Z0-9]+): to=<(\S+)>, .* Saved")
CHCK_REGEX = re.compile(r"Info: Login: user=<(.*?)>, method=PLAIN, rip=(.*?),")
GREY_REGEX = re.compile(r"action=(greylist|pass), reason=(.*?), (?:delay=\d+, )?client_name=(.*), "
                        r"client_address=(.*), sender=(.*), recipient=(.*)")
RJCT_REGEX = re.compile(r"NOQUEUE: reject: RCPT from .*?: (.*?); from=<(.*?)> to=<(.*?)>")
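

# Illustrative only, not part of the original script: how MAIN_REGEX and SENT_REGEX
# split a typical syslog line. The sample line, host name and address are made up.
def _example_regex_split():
    sample = ("Mar  1 13:05:22 box postfix/submission/smtpd[1234]: "
              "1A2B3C4D5E: client=mail.example.com[203.0.113.5], "
              "sasl_method=PLAIN, sasl_username=alice@example.com")
    # MAIN_REGEX yields the timestamp, host, service name and the service-specific payload
    date, host, service, payload = MAIN_REGEX.match(sample).groups()
    # SENT_REGEX then digs the queue id, client and authenticated user out of the payload
    queue_id, client, method, user = SENT_REGEX.match(payload).groups()
    return service, user  # -> ('postfix/submission/smtpd', 'alice@example.com')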


# Small helper functions, needed for pickling

def dd_list():
    return defaultdict(list)


def dd():
    return defaultdict(dd_list)
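

# Illustrative only, not part of the original script: the module-level factories above
# (rather than lambdas) keep the nested history defaultdicts picklable, which
# load_history() relies on. A minimal sketch with a made-up address:
def _example_pickle_roundtrip():
    history_data = defaultdict(dd)
    history_data['alice@example.com']['sent mail'][9].append(3)
    restored = pickle.loads(pickle.dumps(history_data))
    return restored['alice@example.com']['sent mail'][9]  # -> [3]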


# Functions for extracting data from log lines produced by certain services

def scan_postfix_submission(collector, user_match, date, log):
    """ Parse a postfix submission log line

    Lines containing a sasl_method with the values 'PLAIN' or 'LOGIN' are assumed to indicate a sent email.

    """

    # Match both the 'plain' and 'login' sasl methods, since both authentication methods are allowed by Dovecot
    match = SENT_REGEX.match(log)

    if match:
        _, client, method, user = match.groups()
        user = user.lower()

        if user_match(user):
            # Get the user data, or create it if the user is new
            data = collector.setdefault(
                user,
                OrderedDict([
                    ('sent', 0),
                    ('hosts', 0),
                    ('first', None),
                    ('last', None),
                    ('by hour', defaultdict(int)),
                    ('host addresses', set()),
                ])
            )

            data['sent'] += 1
            data['host addresses'].add(client)
            data['hosts'] = len(data['host addresses'])
            data['by hour'][date.hour] += 1

            if data['last'] is None:
                data['last'] = date
            data['first'] = date
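

# Illustrative only, not part of the original script: a minimal sketch of how the scanner
# above fills its collector for one hypothetical submission payload (no user filter).
def _example_scan_submission():
    sent = {}
    payload = ("1A2B3C4D5E: client=mail.example.com[203.0.113.5], "
               "sasl_method=PLAIN, sasl_username=alice@example.com")
    scan_postfix_submission(sent, partial(filter_match, None), datetime(2024, 3, 1, 13), payload)
    return sent['alice@example.com']['sent']  # -> 1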


def scan_postfix_lmtp(collector, user_match, date, log):
    """ Parse a postfix lmtp log line

    It is assumed that every log of postfix/lmtp indicates an email that was successfully received by Postfix.

    """

    match = RECV_REGEX.match(log)

    if match:
        _, user = match.groups()
        user = user.lower()

        if user_match(user):
            # Get the user data, or create it if the user is new
            data = collector.setdefault(
                user,
                OrderedDict([
                    ('received', 0),
                    ('by hour', defaultdict(int)),
                    ('first', None),
                    ('last', None),
                ])
            )

            data['received'] += 1
            data['by hour'][date.hour] += 1

            if data['last'] is None:
                data['last'] = date
            data['first'] = date


def scan_login(collector, user_match, date, log):
    """ Scan a dovecot log line and extract interesting data """

    match = CHCK_REGEX.match(log)

    if match:
        user, rip = match.groups()
        user = user.lower()

        if user_match(user):
            # Get the user data, or create it if the user is new
            data = collector.setdefault(
                user,
                OrderedDict([
                    ('logins', 0),
                    ('by hour', defaultdict(int)),
                    ('first', None),
                    ('last', None),
                    ('by ip', defaultdict(int)),
                ])
            )

            data['logins'] += 1
            data['by hour'][date.hour] += 1

            if data['last'] is None:
                data['last'] = date
            data['first'] = date

            if rip not in ('127.0.0.1', '::1'):
                data['by ip'][rip] += 1
            else:
                data['by ip']['webmail'] += 1


def scan_greylist(collector, user_match, date, log):
    """ Scan a postgrey log line and extract interesting data """

    match = GREY_REGEX.match(log)

    if match:
        action, reason, sender_domain, sender_ip, sender_address, user = match.groups()
        user = user.lower()

        if user_match(user):
            # Get the user data, or create it if the user is new
            data = collector.setdefault(
                user,
                OrderedDict([
                    ('lost', 0),
                    ('pass', 0),
                    ('first', None),
                    ('last', None),
                    ('grey-list', {}),
                ])
            )

            # It might be useful to group services that send a lot of mail from different
            # servers on subdomains like <sub>1.domain.com

            # if '.' in client_name:
            #     addr = client_name.split('.')
            #     if len(addr) > 2:
            #         client_name = '.'.join(addr[1:])

            if data['last'] is None:
                data['last'] = date
            data['first'] = date

            if len(sender_address) > 36:
                name, domain = sender_address.split('@')
                if len(name) > 12:
                    sender_address = name[:12] + '…@' + domain

            source = "✉ {} ← {}".format(sender_address, sender_ip if sender_domain == 'unknown' else sender_domain)

            if action == 'greylist' and reason == 'new':
                if source not in data['grey-list']:
                    data['lost'] += 1
                    data['grey-list'][source] = "✖ on {:%Y-%m-%d %H:%M:%S}".format(date)
            elif action == 'pass':
                data['pass'] += 1
                data['grey-list'][source] = "✔ on {:%Y-%m-%d %H:%M:%S}".format(date)


def scan_rejects(collector, known_addresses, user_match, date, log):
    """ Parse a postfix smtpd log line and extract interesting data

    Currently we only look for incoming mail that was rejected.

    """

    # Check if the incoming mail was rejected
    match = RJCT_REGEX.match(log)

    if match:
        message, sender, user = match.groups()
        sender = sender or 'no address'
        user = user.lower()

        # Skip this line if it is already covered by the grey-listing report
        if 'Recipient address rejected: Greylisted' in message:
            return

        # Only log mail to known recipients
        if user_match(user):
            if not known_addresses or user in known_addresses:
                data = collector.setdefault(
                    user,
                    OrderedDict([
                        ('blocked', 0),
                        ('from', OrderedDict()),
                        ('first', None),
                        ('last', None),
                    ])
                )
                # Shorten the Spamhaus IP block message
                match = re.search(r"Client host \[(.*?)\] blocked using zen.spamhaus.org; (.*)", message)
                if match:
                    message = "ip blocked: " + match.group(2)
                else:
                    # Shorten the Spamhaus domain block message too
                    match = re.search(r"Sender address \[.*@(.*)\] blocked using dbl.spamhaus.org; (.*)", message)
                    if match:
                        message = "domain blocked: " + match.group(2)

                if data['last'] is None:
                    data['last'] = date
                data['first'] = date
                data['blocked'] += 1
                data['from'][sender] = "✖ on {:%Y-%m-%d %H:%M:%S}: {}".format(date, message)


class Collector(dict):
    """ Custom dictionary class for collecting scan data """

    def __init__(self, start_date=None, end_date=None, filters=None, no_filter=False,
                 sent=True, received=True, imap=False, pop3=False, grey=False, rejected=False):

        super().__init__()

        # Try to get all the email addresses known to this box
        known_addresses = []

        if not no_filter:
            try:
                env_vars = utils.load_environment()
                import mailconfig
                known_addresses = sorted(
                    set(mailconfig.get_mail_users(env_vars)) |
                    set(alias[0] for alias in mailconfig.get_mail_aliases(env_vars)),
                    key=email_sort
                )
            except (FileNotFoundError, ImportError):
                pass

        start_date = start_date or datetime.now()
        end_date = end_date or start_date - timedelta(weeks=52)

        self.update({
            'end_of_file': False,  # Indicates whether the end of the log files was reached
            'start_date': start_date,
            'end_date': end_date,
            'line_count': 0,  # Number of lines scanned
            'parse_count': 0,  # Number of lines parsed (i.e. that had their contents examined)
            'scan_time': time.time(),  # Start timestamp; scan_files() replaces it with the scan duration in seconds
            'unknown services': set(),  # Services encountered that were not recognized
            'known_addresses': known_addresses,  # Addresses handled by MiaB
            'services': {},  # What services to scan for
            'data': OrderedDict(),  # Scan data, per service
        })

        # Caching is only useful with longer filter lists, but doesn't seem to hurt performance in shorter ones
        user_match = lru_cache(maxsize=None)(partial(filter_match, [f.lower() for f in filters] if filters else None))

        if sent:
            data = {}
            self['data']['sent mail'] = {
                'scan': partial(scan_postfix_submission, data, user_match),
                'data': data,
            }
            self['services']['postfix/submission/smtpd'] = self['data']['sent mail']

        if received:
            data = {}
            self['data']['received mail'] = {
                'scan': partial(scan_postfix_lmtp, data, user_match),
                'data': data,
            }
            self['services']['postfix/lmtp'] = self['data']['received mail']

        if imap:
            data = {}
            self['data']['imap login'] = {
                'scan': partial(scan_login, data, user_match),
                'data': data,
            }
            self['services']['imap-login'] = self['data']['imap login']

        if pop3:
            data = {}
            self['data']['pop3 login'] = {
                'scan': partial(scan_login, data, user_match),
                'data': data,
            }
            self['services']['pop3-login'] = self['data']['pop3 login']

        if grey:
            data = {}
            self['data']['grey-listed mail'] = {
                'scan': partial(scan_greylist, data, user_match),
                'data': data,
            }
            self['services']['postgrey'] = self['data']['grey-listed mail']

        if rejected:
            data = {}
            self['data']['blocked mail'] = {
                'scan': partial(scan_rejects, data, self['known_addresses'], user_match),
                'data': data,
            }
            self['services']['postfix/smtpd'] = self['data']['blocked mail']

    def get_addresses(self, complete=False):
        addresses = set()
        for category in self['data']:
            try:
                for address in self['data'][category]['data']:
                    addresses.add(address)
            except KeyError:
                logging.debug("Category %s not found", category)

        if complete:
            addresses.update(self['known_addresses'])
        return sorted(addresses, key=email_sort)

    def group_by_address(self, complete=False):

        addresses = self.get_addresses(complete)

        data = {}

        for address in addresses:
            data[address] = {}
            for category in self['data']:
                data[address][category] = self['data'][category]['data'].get(address, None)

        self['data'] = data


def scan_files(files, collector):
    """ Scan files until they run out or the earliest date is reached """

    logging.info("Scanning from {:%Y-%m-%d %H:%M:%S} back to {:%Y-%m-%d %H:%M:%S}".format(
        collector['start_date'], collector['end_date']
    ))

    for file_name in files:
        scan_file(file_name, collector)

    collector['scan_time'] = time.time() - collector["scan_time"]

    logging.info(
        "{line_count} log lines scanned, {parse_count} lines parsed in {scan_time:.2f} seconds\n".format(**collector)
    )

    return collector


def scan_file(file_name, collector):

    if not os.path.exists(file_name):
        return

    logging.debug("Processing file %s...", file_name)

    collector['end_of_file'] = False

    with tempfile.NamedTemporaryFile() as tmp_file:

        # Copy the log file to a tmp file for scanning
        if file_name[-3:] == '.gz':
            shutil.copyfileobj(gzip.open(file_name), tmp_file)
        else:
            shutil.copyfileobj(open(file_name, 'rb'), tmp_file)

        # Make sure the copy is fully on disk before it is re-opened by name below
        tmp_file.flush()

        file_name = tmp_file.name

        # A weird anomaly was encountered where a single log line had a much earlier date than the surrounding
        # log lines. To prevent such an anomaly from halting the scan, the following variable was introduced.
        stop_scan = False

        for log_line in _reverse_readline(file_name):
            collector['line_count'] += 1

            # If the found date is earlier than the end date, return
            if _scan_mail_log_line(log_line.strip(), collector) is False:
                if stop_scan:
                    return
                stop_scan = True
            else:
                stop_scan = False

        # If we reached this part, the file was scanned completely
        collector['end_of_file'] = True


def parse_log_date(val, year):
    """ Custom log file date parsing, which is much faster than any generic function from the Python lib """

    try:
        return datetime(
            year,
            MONTHS[val[0:3]],
            int(val[4:6]),
            int(val[7:9]),
            int(val[10:12]),
            int(val[13:15])
        )
    except KeyError:
        logging.debug("Unknown month: %s", val)
        return None
    except ValueError:
        logging.debug("Irregular date found: %s", val)
        return None
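

# Illustrative only, not part of the original script: the fixed-offset slicing above relies
# on syslog's column layout, where single-digit days are padded with a space.
def _example_parse_log_date():
    return parse_log_date("Mar  1 13:05:22", 2024)  # -> datetime(2024, 3, 1, 13, 5, 22)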


def _scan_mail_log_line(line, collector):
    """ Scan a log line and extract interesting data

    Return False if the found date is earlier than the end date, True otherwise

    """

    m = MAIN_REGEX.match(line)

    if not m:
        return True

    date, hostname, service, log = m.groups()

    # logging.debug("date: %s, host: %s, service: %s, log: %s", date, hostname, service, log)

    date = parse_log_date(date, collector['start_date'].year)

    # Check if the found date is within the time span we are scanning
    if date is None or date > collector['start_date']:
        # Don't process, but continue
        return True
    elif date < collector['end_date']:
        # Don't process, and halt
        return False

    if service in collector['services']:
        collector['services'][service]['scan'](date, log)
        collector["parse_count"] += 1
    elif service not in KNOWN_SERVICES:
        if service not in collector["unknown services"]:
            collector["unknown services"].add(service)
            logging.debug(" Unknown service '%s':\n %s", service, line)

    return True


def filter_match(filters, user):
    """ Check if the given user matches any of the filters """
    return filters is None or any(u in user for u in filters)
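

# Illustrative only, not part of the original script: Collector builds its user_match
# callable roughly like this, caching results so repeated log lines for the same
# (made-up) address are cheap to check.
def _example_cached_filter():
    user_match = lru_cache(maxsize=None)(partial(filter_match, ['alice', 'bob']))
    return user_match('alice@example.com'), user_match('carol@example.com')  # -> (True, False)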


def email_sort(email):
    """ Split the given email address into a reverse order tuple for sorting, i.e. (domain, name) """
    return tuple(reversed(email.split('@')))
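

# Illustrative only, not part of the original script: sorting with email_sort groups
# addresses by domain first, then by local part (sample addresses are made up).
def _example_email_sort():
    addresses = ["zoe@example.org", "alice@example.com", "bob@example.com"]
    return sorted(addresses, key=email_sort)
    # -> ['alice@example.com', 'bob@example.com', 'zoe@example.org']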


def _reverse_readline(filename, buf_size=8192):
    """ A generator that returns the lines of a file in reverse order

    http://stackoverflow.com/a/23646049/801870

    """

    with open(filename) as fh:
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        file_size = remaining_size = fh.tell()
        while remaining_size > 0:
            offset = min(file_size, offset + buf_size)
            fh.seek(file_size - offset)
            buff = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            lines = buff.split('\n')
            # The first line of the buffer is probably not a complete line, so
            # save it and append it to the last line of the next buffer we read
            if segment is not None:
                # If the previous chunk started right at the beginning of a line,
                # do not concatenate the segment to the last line of the new chunk.
                # Instead, yield the segment first.
                if buff[-1] != '\n':
                    lines[-1] += segment
                else:
                    yield segment
            segment = lines[0]
            for index in range(len(lines) - 1, 0, -1):
                if len(lines[index]):
                    yield lines[index]
        # Don't yield None if the file was empty
        if segment is not None:
            yield segment
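

# Illustrative only, not part of the original script: a minimal round trip through the
# reverse reader, using a small throwaway file.
def _example_reverse_readline():
    with tempfile.NamedTemporaryFile('w', suffix='.log', delete=False) as tmp:
        tmp.write("first\nsecond\nthird\n")
    try:
        return list(_reverse_readline(tmp.name))  # -> ['third', 'second', 'first']
    finally:
        os.remove(tmp.name)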


def load_history(log_files, services, verbose=False):
    """ Load the pickled history dictionary from the cache file, or create it if it doesn't exist yet

    History dictionary structure:

    {
        last_date: date,
        last_mail: date,
        data: {
            <address>: {
                <category>: {
                    <hour>: [count list],
                    <hour>: [count list],
                    <hour>: [count list],
                    ...
                },
                <category>: {
                    <hour>: [count list],
                    <hour>: [count list],
                    <hour>: [count list],
                    ...
                },
            },
        },
    }

    """

    if os.path.exists(HISTORY_FILE):
        try:
            with open(HISTORY_FILE, 'rb') as f:
                history = pickle.load(f)
            last_date = history['last_date']
        except (TypeError, EOFError):
            os.remove(HISTORY_FILE)
            if verbose:
                mail_admin("History Error!", "History has been deleted")
            return None

        start_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)
        end_date = start_date - timedelta(days=1)

        if last_date < start_date:

            history['last_date'] = start_date

            collectors = []

            while last_date < start_date:
                logging.info("Adding history for day %s", start_date)

                collector = scan_files(
                    log_files,
                    Collector(
                        start_date,
                        end_date,
                        **services
                    )
                )

                collectors.append(collector)

                if collector['end_of_file']:
                    break
                else:
                    start_date = end_date
                    end_date = start_date - timedelta(days=1)

            # Add them to the history, oldest first
            for collector in reversed(collectors):
                add_collector_to_history(collector, history)

            logging.debug('History updated')
            with open(HISTORY_FILE, 'wb') as f:
                pickle.dump(history, f)

            if verbose:
                mail_admin("History updated", history_to_str(history))
    else:
        history = {
            'last_date': None,
            'last_mail': None,
            'data': defaultdict(dd)
        }

        start_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)
        end_date = start_date - timedelta(days=1)

        history['last_date'] = start_date

        collectors = []

        # Scan all log files
        while True:
            collector = scan_files(
                log_files,
                Collector(
                    start_date,
                    end_date,
                    **services
                )
            )

            collectors.append(collector)

            if collector['end_of_file']:
                break
            else:
                start_date = end_date
                end_date = start_date - timedelta(days=1)

        # Add them to the history, oldest first
        for collector in reversed(collectors):
            add_collector_to_history(collector, history)

        with open(HISTORY_FILE, 'wb') as f:
            pickle.dump(history, f)

        if verbose:
            mail_admin("History created", history_to_str(history))

    return history


def history_to_str(history):
    content = []
    for address, data in history['data'].items():
        content.append(address)
        for category, counts in data.items():
            content.append(' %s' % category)
            for hour, count in counts.items():
                content.append(' %s: %s' % (hour, count))
    return '\n'.join(content)


def add_collector_to_history(collector, history):

    collector.group_by_address(True)

    for collector_address, collector_data in collector['data'].items():
        # Get the dictionary of user data
        history_user_data = history['data'][collector_address]
        for collector_category in collector_data:
            history_user_category_data = history_user_data[collector_category]
            if collector_data[collector_category] and 'by hour' in collector_data[collector_category]:
                for hour in range(24):
                    history_user_category_data[hour].append(collector_data[collector_category]['by hour'][hour])
                    # Trim to last `HISTORY_SIZE` entries
                    history_user_category_data[hour] = history_user_category_data[hour][-HISTORY_SIZE:]
            else:
                for hour in range(24):
                    history_user_category_data[hour].append(0)
                    # Trim to last `HISTORY_SIZE` entries
                    history_user_category_data[hour] = history_user_category_data[hour][-HISTORY_SIZE:]


def count_is_suspect(count, history, threshold=0):
    """ Use the three-sigma rule to detect anomalous count values

    :param count: The number of emails counted in a certain hour
    :type count: int
    :param history: List of counted emails in a certain hour over a number of days
    :type history: list
    :param threshold: The count value can only be suspect if it is higher than the threshold
    :type threshold: int
    :return: True if suspect, False otherwise
    :rtype: bool

    """

    if len(history) > 1 and count > threshold:
        mu = mean(history)
        std = stdev(history)
        # logging.debug("  mean: %s, std dev: %s", mu, std)
        return abs(count - mu) > 3 * std
    return False
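

# Illustrative only, not part of the original script: with a made-up history of roughly
# ten mails per hour, a burst of sixty falls far outside three standard deviations.
def _example_three_sigma():
    history = [9, 11, 10, 12, 10, 9, 11, 10]
    return count_is_suspect(60, history, threshold=50)  # -> True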


def mail_admin(subject, content):
    import smtplib
    from email.message import Message
    from utils import load_environment

    env = load_environment()
    admin_addr = "administrator@" + env['PRIMARY_HOSTNAME']

    # Create the MIME message
    msg = Message()
    msg['From'] = "\"%s\" <%s>" % (env['PRIMARY_HOSTNAME'], admin_addr)
    msg['To'] = admin_addr
    msg['Subject'] = "[%s] %s" % (env['PRIMARY_HOSTNAME'], subject)
    msg.set_payload(content, "UTF-8")

    smtpclient = smtplib.SMTP('127.0.0.1', 25)
    smtpclient.ehlo()
    smtpclient.sendmail(
        admin_addr,  # MAIL FROM
        admin_addr,  # RCPT TO
        msg.as_string())
    smtpclient.quit()


def print_time_table(label, data):
    lbl_temp = " │ {:<%d}" % max(len(label), 4)
    hour_line = [lbl_temp.format('hour')]
    data_line = [lbl_temp.format(label)]

    lines = [" ┬"]

    for h in range(24):
        max_len = max(len(str(data[h])), 2)
        data_temp = "{:>%s}" % max_len

        hour_line.append(data_temp.format(h))
        data_line.append(data_temp.format(data[h] or '…'))

    lines.append(' '.join(hour_line))
    lines.append(' '.join(data_line))
    lines.append(" └" + (len(lines[-1]) - 3) * "─")

    return lines


def print_service_tables(collector, verbose=False):
    address_width = 24
    col_width = 8
    col_tmp = "{:>%d}" % col_width

    for service, service_data in collector['data'].items():

        # Gather data in a flat table and convert to strings

        if not service_data['data']:
            logging.info("\n✖ No %s data found", service)
            continue
        else:
            table = []

        data = service_data['data'].values()
        min_first = min([u["first"] for u in data])
        max_last = max([u["last"] for u in data])

        title = "{} ({:%Y-%m-%d %H:%M:%S} - {:%Y-%m-%d %H:%M:%S})".format(
            service.capitalize(),
            min_first,
            max_last
        )

        sorted_data = OrderedDict(sorted(service_data['data'].items(), key=lambda t: email_sort(t[0])))

        current_domain = ''

        accum = None

        for address, data in sorted_data.items():

            user, domain = address.split('@')

            if domain != current_domain:
                header = '@%s %s' % (domain, '┄' * (64 - len(domain) - 3))
                offset = 1 + address_width
                num_atomic = len([v for v in data.values() if not isinstance(v, Iterable)])
                offset += (num_atomic - 2) * col_width
                if accum is None:
                    accum = [0] * (num_atomic - 1)
                header = header[:offset] + '┼' + header[offset:]
                table.append([header])
                current_domain = domain

            tmp = " {:<%d}" % (address_width - 2)
            row = [tmp.format(user[:address_width - 3] + "…" if len(user) > address_width else user)]

            # Condense first and last date points into a time span
            first = data.pop("first")
            last = data.pop("last")

            timespan = relativedelta(last, first)

            if timespan.months:
                timespan_str = " │ {:0.1f} months".format(timespan.months + timespan.days / 30.0)
            elif timespan.days:
                timespan_str = " │ {:0.1f} days".format(timespan.days + timespan.hours / 24.0)
            elif (first.hour, first.minute) == (last.hour, last.minute):
                timespan_str = " │ {:%H:%M}".format(first)
            else:
                timespan_str = " │ {:%H:%M} - {:%H:%M}".format(first, last)

            accum[0] += 1

            # Only consider flat data in a flat table
            for name, value in data.items():
                if isinstance(value, (int, float)):
                    accum[len(row)] += value
                    row.append(col_tmp.format(value))

            row.append(timespan_str)
            data[' │ timespan'] = timespan
            table.append(row)

            if verbose:
                for name, value in data.items():
                    if isinstance(value, Iterable):
                        if name == 'by hour':
                            table.extend(print_time_table(service, data['by hour']))
                        else:
                            if name == 'by ip':
                                value = ["{:<16}{:>4}".format(*v) for v in value.items()]

                            table.append(" ┬")
                            table.append(" │ %s" % name)
                            table.append(" ├─%s" % (len(name) * "─"))
                            max_len = 0

                            if isinstance(value, dict):
                                for key, val in value.items():
                                    key_output = str(key)
                                    val_output = str(val)
                                    table.append(" │ %s" % key_output)
                                    table.append(" │ %s" % val_output)
                                    max_len = max(max_len, len(key_output), len(val_output))
                            else:
                                for item in value:
                                    table.append(" │ %s" % str(item))
                                    max_len = max(max_len, len(str(item)))
                            table.append(" └" + (max_len + 1) * "─")

        header = [" " * address_width]
        header.extend([col_tmp.format(k) for k, v in data.items() if not isinstance(v, Iterable)])

        table.insert(0, header)

        # Print table
        print_table = [
            '',
            title,
            "═" * offset + '╤' + "═" * (64 - offset - 1),
        ]

        for row in table:
            print_table.append(''.join(row))

        print_table.append("─" * offset + '┴' + "─" * (64 - offset - 1),)

        accum[0] = tmp.format("Totals: {}".format(accum[0]))
        accum = [col_tmp.format(v) for v in accum]
        print_table.append(''.join(accum))

        logging.info('\n'.join(print_table))
    return


def command_run():

    logger = logging.getLogger()
    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    def valid_date(string):
        """ Validate the given date string fetched from the --startdate argument """
        try:
            date = parser.parse(string)
        except ValueError:
            raise argparse.ArgumentTypeError("Unrecognized date and/or time '%s'" % string)
        return date

    start_date = datetime.now()

    time_deltas = OrderedDict([
        ('all', timedelta(weeks=52)),
        ('month', timedelta(weeks=4)),
        ('2weeks', timedelta(days=14)),
        ('week', timedelta(days=7)),
        ('2days', timedelta(days=2)),
        ('day', timedelta(days=1)),
        ('12hours', timedelta(hours=12)),
        ('6hours', timedelta(hours=6)),
        ('hour', timedelta(hours=1)),
        ('30min', timedelta(minutes=30)),
        ('10min', timedelta(minutes=10)),
        ('5min', timedelta(minutes=5)),
        ('min', timedelta(minutes=1)),
        ('today', start_date - start_date.replace(hour=0, minute=0, second=0))
    ])

    ap = argparse.ArgumentParser(
        description="Scan the mail log files for interesting data. By default, this script "
                    "shows today's incoming and outgoing mail statistics. This script was "
                    "(re)written for the Mail-in-a-box email server. "
                    "https://github.com/mail-in-a-box/mailinabox",
        add_help=False
    )

    # Switches to determine what to parse and what to ignore

    ap.add_argument("-a", "--all", help="Scan for all services.", action="store_true")
    ap.add_argument("-r", "--received", help="Scan for received emails.", action="store_true")
    ap.add_argument("-s", "--sent", help="Scan for sent emails.", action="store_true")
    ap.add_argument("-l", "--logins", help="Scan for IMAP and POP3 logins.", action="store_true")
    ap.add_argument("-i", "--imap", help="Scan for IMAP logins.", action="store_true")
    ap.add_argument("-p", "--pop3", help="Scan for POP3 logins.", action="store_true")
    ap.add_argument("-g", "--grey", help="Scan for greylisted emails.", action="store_true")
    ap.add_argument("-b", "--blocked", help="Scan for blocked emails.", action="store_true")

    ap.add_argument("-f", "--file", help="Path to a log file.", dest="log_files", metavar='<path>', action="append")

    ap.add_argument("-m", "--monitor", nargs='?', const=50, type=int, metavar='<threshold>',
                    help="Mail an alert to the administrator when unusual behaviour is suspected. The optional "
                         "threshold value sets a limit above which the number of emails sent or received per hour by "
                         "a user will be evaluated. The default threshold is 50. It's recommended to use this option "
                         "in a cron job, e.g. '*/5 * * * * <path to>/logscan.py -m', which will run every 5 minutes.")
    ap.add_argument("-t", "--timespan", choices=time_deltas.keys(), default='today', metavar='<time span>',
                    help="Time span to scan, going back from the start date. Possible values: "
                         "{}. Defaults to 'today'.".format(", ".join(list(time_deltas.keys()))))
    ap.add_argument("-d", "--startdate", action="store", dest="startdate", type=valid_date, metavar='<start date>',
                    help="Date and time to start scanning the log file from. If no date is "
                         "provided, scanning will start from the current date and time.")
    ap.add_argument("-u", "--users", action="store", dest="users", metavar='<email1,email2,email...>',
                    help="Comma separated list of (partial) email addresses to filter the output by.")

    ap.add_argument('-n', "--nofilter", help="Don't filter by known email addresses.", action="store_true")
    ap.add_argument('-h', '--help', action='help', help="Print this message and exit.")
    ap.add_argument("-v", "--verbose", help="Output extra data where available.", action="store_true")

    args = ap.parse_args()

    logger.setLevel(logging.DEBUG if args.verbose else logging.INFO)

    # Set a custom start date, but ignore it in monitor mode
    if args.startdate is not None and args.monitor is None:
        start_date = args.startdate
        # Change the 'today' time span to 'day' when a custom start date is set
        if args.timespan == 'today':
            args.timespan = 'day'
        logging.info("Setting start date to {}".format(start_date))

    end_date = start_date - time_deltas[args.timespan]

    filters = None

    if args.users is not None:
        filters = args.users.strip().split(',')
        logging.info("Filtering with '%s'", ", ".join(filters))

    services = {}

    if args.monitor is not None:
        # Set the services that will be checked in monitor mode
        services = {
            'sent': True,
            'received': True,
            'grey': False,
            'rejected': False,
            'imap': False,
            'pop3': False
        }
    elif True in (args.all, args.received, args.sent, args.logins, args.pop3, args.imap, args.grey, args.blocked):
        services = {
            'sent': args.sent or args.all,
            'received': args.received or args.all,
            'grey': args.grey or args.all,
            'rejected': args.blocked or args.all,
            'imap': args.imap or args.logins or args.all,
            'pop3': args.pop3 or args.logins or args.all
        }
    else:
        # No switches given: fall back to the documented default of sent and received mail only
        services = {
            'sent': True,
            'received': True,
            'grey': False,
            'rejected': False,
            'imap': False,
            'pop3': False
        }

    # Print what data is going to be processed

    service_names = []
    logins = []

    if services['sent']:
        service_names.append("sent")

    if services['received']:
        service_names.append("received")

    if services['grey']:
        service_names.append("grey-listed")

    if services['rejected']:
        service_names.append("rejected")

    if services['imap']:
        logins.append("IMAP")

    if services['pop3']:
        logins.append("POP3")

    message = "Scanning for"

    if service_names:
        message = "{} {} emails".format(message, ', '.join(service_names))
        if logins:
            message = "{} and {} logins".format(message, ', '.join(logins))
    elif logins:
        message = "{} {} logins".format(message, ', '.join(logins))

    logging.info(message)

    if args.monitor is not None:
        # Load activity history
        history = load_history(
            args.log_files or [os.path.join(LOG_DIR, f) for f in LOG_FILES],
            services,
            args.verbose
        )

        # for a, d in history['data'].items():
        #     print(a)
        #     print([len(v) for k, v in d['sent mail'].items()])

        # Fetch today's activity
        col = scan_files(
            args.log_files or [os.path.join(LOG_DIR, f) for f in LOG_FILES],
            Collector(
                start_date,
                end_date,
                filters,
                args.nofilter,
                **services
            )
        )
        col.group_by_address(True)

        # Compare today with history
        report = []

        now = datetime.now()
        if history['last_mail'] is None or now - history['last_mail'] > timedelta(hours=0.5):
            for address, data in col['data'].items():
                sub_report = [address]
                for category, cat_data in data.items():
                    # If we have 'by hour' data for the current category *and* the current address has a history
                    if cat_data and 'by hour' in cat_data:
                        # Fetch the count of the latest hour
                        hour, count = max(cat_data['by hour'].items())
                        if count_is_suspect(count, history['data'][address][category][hour], args.monitor):
                            msg = " Found %d %ss at %d:00 where %0.2f is the average"
                            msg %= count, category, hour, mean(history['data'][address][category][hour])
                            sub_report.append(msg)
                if len(sub_report) > 1:
                    report.extend(sub_report)

            if report:
                report.extend([
                    "\nReset the history by deleting the '%s' file" % HISTORY_FILE,
                    "The current limit for warnings is %d emails per hour." % args.monitor,
                ])
                content = '\n'.join(report)
                logging.info("Suspicious email activity!")
                logging.debug(content)
                mail_admin("Suspicious email activity!", content)

            history['last_mail'] = now
            with open(HISTORY_FILE, 'wb') as f:
                pickle.dump(history, f)
    else:
        col = scan_files(
            args.log_files or [os.path.join(LOG_DIR, f) for f in LOG_FILES],
            Collector(
                start_date,
                end_date,
                filters,
                args.nofilter,
                **services
            )
        )

        print_service_tables(col, args.verbose)


if __name__ == "__main__":
    command_run()