"""Crawl the HN-blogs newsletter feed and print the RSS/Atom feed URL of
every blog it links to.

Flow: fetch the newsletter feed (SRC) -> collect outbound links from each
entry -> fetch each linked page -> extract its advertised RSS/Atom <link>
-> print the discovered feed URLs.
"""

import re
import signal
import sys
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup
from sanelogging import log

SRC = "https://hnblogs.substack.com/feed"
HTTP_TIMEOUT = 10  # seconds; applied to every outbound HTTP request

# URL validator, adapted from Django's URLValidator. Compiled once at module
# level instead of on every is_valid_url() call.
_URL_REGEX = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IPv4
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE)


def signal_handler(signum, frame):
    """Exit cleanly on Ctrl-C instead of dumping a traceback.

    Note: parameter renamed from `signal` so it no longer shadows the
    `signal` module.
    """
    log.error('SIGINT')
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)


def main():
    """Entry point: discover feed URLs and print them."""
    urls = lookupFeeds()
    displayData(urls)


def lookupFeeds():
    """Return the RSS/Atom feed URL of every blog linked from SRC.

    Pages that fail to load or advertise no feed are skipped; a single
    broken blog must not abort the whole crawl (deliberate best-effort).
    """
    rssUrls = []
    for url in fetchUrls():
        try:
            rssUrl = findRssUrl(url)
        except (SystemExit, KeyboardInterrupt):
            sys.exit(1)
        except Exception:
            # Was a silent bare `except:`; keep the best-effort skip but
            # narrow the clause and leave a trace for debugging.
            log.error("failed to check " + url)
            continue
        if rssUrl is not None:
            rssUrls.append(rssUrl)
            log.info("found RSS: " + rssUrl)
    return rssUrls


def displayData(urls):
    """Print one URL per line."""
    for url in urls:
        print(url)


def findRssUrl(url):
    """Return the first RSS/Atom feed URL advertised by *url*, or None.

    Looks for <link type="application/rss+xml"> / "application/atom+xml"
    tags in the page head. Returns None on a non-200 response or when no
    usable feed link is present.
    """
    log.info("checking " + url)
    r = requests.get(url, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, 'html.parser')
    feeds = (soup.find_all(type='application/rss+xml')
             + soup.find_all(type='application/atom+xml'))
    for tag in feeds:
        href = tag.get('href')
        if href:  # skip tags without an href (urljoin would raise on None)
            # Feed links are often relative; resolve against the page URL.
            return urljoin(url, href)
    return None


def fetchUrls():
    """Return every valid, non-HN link found in the newsletter entries."""
    r = requests.get(SRC, timeout=HTTP_TIMEOUT)
    log.info("status code: " + str(r.status_code))
    # Parse the body we already downloaded; the original passed SRC to
    # feedparser, which fetched the same URL a second time.
    feed = feedparser.parse(r.content)
    output = []
    for entry in feed['entries']:
        html = entry['content'][0]['value']
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            u = link.get('href')
            log.info("found href")
            log.info(u)
            if is_valid_url(u):
                output.append(u)
                log.info(u)
    return output


# from django
def is_valid_url(url):
    """Return True when *url* is a plausible non-HN http(s) URL.

    *url* may be None (anchors without an href yield None from
    `tag.get('href')`); the original sliced `url[0:28]` before the None
    check and crashed with TypeError in that case.
    """
    log.info("validating url")
    if url is None:
        return False
    if url.startswith("https://news.ycombinator.com"):
        log.info("not using HN url")
        return False
    log.info("is not an HN url")
    return _URL_REGEX.search(url) is not None


if __name__ == "__main__":
    main()