#!/usr/bin/env python3 import sys from pprint import pformat import requests from pyquery import PyQuery as pq import ssl import OpenSSL from urllib.parse import urlparse, urljoin from datetime import datetime, timedelta from pytz import UTC import logging #logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.WARNING) log = logging.getLogger() # FIXME: relative url stuff will not work if the url passed in redirects # somewhere else class CertificateProblem(Exception): pass class ReachabilityProblem(Exception): pass class SSLCert(object): def __init__(self,c): self.c = c def decode_ossl_time(self,t): f = '%Y%m%d%H%M%SZ' return datetime.strptime(t.decode('utf-8'), f) def notBefore(self): return self.decode_ossl_time(self.c.get_notBefore()) def notAfter(self): return self.decode_ossl_time(self.c.get_notAfter()) def commonName(self): t = self.c.get_subject().get_components() for x in t: if x[0] == "CN": return x[1] def expired(self): return datetime.utcnow() > self.notAfter() def tooEarly(self): return datetime.utcnow() < self.notBefore() def validTime(self): if not self.expired() and not self.tooEarly(): return True return False def expiresSoon(self): week = timedelta(days=7) then = datetime.utcnow() + week return then > self.notAfter() class Website(object): def __init__(self,url): self.url = urlparse(url) if not self.url.scheme: self.url = urlparse('http://' + url) self.cert = None self.res = {} self.r = requests.get(self.urlstring(),verify=True) def contentType(self): if ';' in self.r.headers['content-type']: return self.r.headers['content-type'].split(';')[0] else: return self.r.headers['content-type'] def resources(self): if self.contentType() != 'text/html': return [] d = pq(self.r.text) #import pdb; pdb.set_trace() res = [] for e in d('link'): if 'openid' in e.attrib.get('rel'): continue res.append(e.attrib.get('href')) for e in d('script'): res.append(e.attrib.get('src')) res = [ urljoin(self.urlstring(),x) if not urlparse(x).netloc else x for x in res ] res = [ self.url.scheme + ':' + x if not urlparse(x).scheme else x for x in res ] res = {x: 1 for x in res} self.res = res.keys() return self.res def resources_by_host(self): out = {} for r in self.res: if not out.get(urlparse(r).netloc): out[urlparse(r).netloc] = [] out[urlparse(r).netloc].append(r) return out def is_tls(self): return self.url.scheme == 'https' def urlstring(self): return self.url.geturl() def check(self): if self.r.status_code is not 200: raise ReachabilityProblem("can't access: '%s'" % self.urlstring()) if self.is_tls(): self._get_cert() if not self.cert.validTime(): raise CertificateProblem( "cert for %s is invalid: %s to %s" % ( self.urlstring(), self.cert.notBefore(), self.cert.notAfter() ) ) if self.cert.expiresSoon(): raise CertificateProblem( "cert for %s expires soon: %s" % ( self.urlstring(), self.cert.notAfter() ) ) def _get_cert(self): if not self.url.port: p = 443 else: p = self.url.port c = ssl.get_server_certificate( (self.url.hostname, p), ssl_version=ssl.PROTOCOL_TLSv1 ) self.cert = SSLCert( OpenSSL.crypto.load_certificate( OpenSSL.crypto.FILETYPE_PEM, c ) ) def main(): if len(sys.argv) < 2: print("usage: %s [url2] [url3] [...]" % sys.argv[0]) sys.exit(1) for site in sys.argv[1:]: s = Website(site) s.check() for u in s.resources(): Website(u).check() if __name__ == '__main__': main()