feeds/tools/hnblogs.py

99 lines
2.4 KiB
Python

import requests
import sys
import signal
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import feedparser
from sanelogging import log
# Feed whose entries are scanned for links to candidate sites (see fetchUrls).
SRC = "https://hnblogs.substack.com/feed"
# Timeout, in seconds, passed to every requests.get() call in this file.
HTTP_TIMEOUT = 10
def signal_handler(signal, frame):
    """Log the interrupt and terminate the process with exit status 0.

    Installed below as the SIGINT handler; both arguments are supplied by
    the interpreter and are not used.
    """
    log.error('SIGINT')
    # Equivalent to sys.exit(0): unwinds the stack via SystemExit.
    raise SystemExit(0)


signal.signal(signal.SIGINT, signal_handler)
def main():
    """Discover the feed URLs and print them, one per line."""
    displayData(lookupFeeds())
def lookupFeeds():
    """Return the feed URLs discovered for the sites listed by fetchUrls().

    Every candidate site is probed with findRssUrl().  A site that fails to
    load, or that advertises no feed, is skipped so that a single bad site
    cannot abort the whole run.

    Returns:
        list: the feed URLs found, in the order fetchUrls() returned their
        sites.
    """
    rssUrls = []
    for url in fetchUrls():
        try:
            rssUrl = findRssUrl(url)
        except (SystemExit, KeyboardInterrupt):
            # An interrupt (the SIGINT handler exits via SystemExit) must
            # stop the crawl; report the run as failed.
            sys.exit(1)
        except Exception as err:
            # Best-effort crawl: keep going, but record why the site was
            # skipped instead of discarding the failure silently.
            log.error("failed to check " + str(url) + ": " + str(err))
            continue
        if rssUrl is not None:
            rssUrls.append(rssUrl)
            log.info("found RSS: " + rssUrl)
    return rssUrls
def displayData(urls):
    """Write each item of *urls* to standard output on its own line."""
    for feed_url in urls:
        print(feed_url)
def findRssUrl(url):
    """Return the first RSS or Atom feed URL advertised by the page at *url*.

    Args:
        url: address of the page to inspect.

    Returns:
        The absolute feed URL (a relative href is resolved against *url*),
        or None when the response status is not 200 or the page has no
        feed tag carrying an href.  RSS tags are considered before Atom
        tags.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    log.info("checking " + url)
    r = requests.get(url, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, 'html.parser')
    feeds = soup.find_all(type='application/rss+xml') + \
        soup.find_all(type='application/atom+xml')
    for tag in feeds:
        href = tag.get('href')
        if not href:
            # urljoin(url, None) hands back the page URL itself, which
            # would then be reported as a feed; skip tags with no target.
            continue
        return urljoin(url, href)
    return None
def fetchUrls():
    """Collect candidate site URLs linked from the entries of the SRC feed.

    Returns:
        list: every anchor href found in the entries' content that passes
        is_valid_url(), in document order (duplicates are kept).

    Raises:
        requests.RequestException: if the feed cannot be fetched.
    """
    output = []
    r = requests.get(SRC, timeout=HTTP_TIMEOUT)
    log.info("status code: " + str(r.status_code))
    # Parse the body that was just downloaded.  Handing SRC to feedparser
    # would fetch the feed a second time, without HTTP_TIMEOUT applied.
    feed = feedparser.parse(r.content)
    for entry in feed['entries']:
        content = entry.get('content')
        if not content:
            # An entry without a content element has nothing to scan.
            continue
        soup = BeautifulSoup(content[0]['value'], 'html.parser')
        for link in soup.find_all('a'):
            u = link.get('href')
            log.info("found href")
            log.info(u)
            if u is None:
                # An anchor without an href cannot be validated as a URL.
                continue
            if is_valid_url(u):
                output.append(u)
                log.info(u)
    return output
# from django
def is_valid_url(url):
    """Return whether *url* is an http(s) URL that should be probed.

    Args:
        url: the candidate string; None or an empty string is rejected.

    Returns:
        bool: False for a missing value, for anything starting with
        "https://news.ycombinator.com", and for anything that does not
        match the URL pattern below; True otherwise.
    """
    log.info("validating url")
    if not url:
        # Reject a missing value up front; slicing None would raise.
        return False
    if url.startswith("https://news.ycombinator.com"):
        log.info("not using HN url")
        return False
    log.info("is not an HN url")
    import re
    regex = re.compile(
        r'^https?://'  # http:// or https://
        # The top-level label may be up to 63 letters, so that TLDs such
        # as ".software" or ".photography" are accepted.
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)
    return bool(regex.search(url))
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()