# Discover RSS/Atom feed URLs for blogs linked from the HN Blogs newsletter.
import re
import signal
import sys
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup
from sanelogging import log
|
|
|
|
# Newsletter feed whose entries link out to individual blogs.
SRC = "https://hnblogs.substack.com/feed"
# Seconds before any outbound HTTP request is abandoned.
HTTP_TIMEOUT = 10
|
|
|
|
def signal_handler(signum, frame):
    """Log the interrupt and exit cleanly on SIGINT (Ctrl-C).

    The first parameter was renamed from ``signal`` to ``signum`` so it
    no longer shadows the ``signal`` module; the interpreter invokes
    handlers positionally, so the rename is safe for callers.
    """
    log.error('SIGINT')
    sys.exit(0)


# Install the handler so Ctrl-C exits through signal_handler above.
signal.signal(signal.SIGINT, signal_handler)
|
|
|
|
def main():
    """Discover feed URLs for every linked blog and print them."""
    displayData(lookupFeeds())
|
|
|
|
def lookupFeeds():
    """Resolve each blog URL from the newsletter into its feed URL.

    Returns:
        list: absolute RSS/Atom feed URLs, in discovery order.  Blogs
        whose page cannot be fetched or parsed are skipped silently.
    """
    rssUrls = []
    for url in fetchUrls():
        try:
            rssUrl = findRssUrl(url)
        except (SystemExit, KeyboardInterrupt):
            sys.exit(1)
        except Exception:
            # Best-effort: one broken blog must not abort the whole run.
            # Narrowed from a bare ``except:`` so non-Exception exits
            # (e.g. GeneratorExit) are no longer swallowed.
            continue
        if rssUrl is not None:
            rssUrls.append(rssUrl)
            log.info("found RSS: " + rssUrl)
    return rssUrls
|
|
|
|
def displayData(urls):
    """Write each discovered feed URL to stdout on its own line."""
    for feed_url in urls:
        sys.stdout.write(feed_url + "\n")
|
|
|
|
def findRssUrl(url):
    """Fetch ``url`` and return the first advertised RSS/Atom feed URL.

    Scans the page for ``<link type="application/rss+xml">`` /
    ``application/atom+xml`` tags and resolves relative hrefs against
    the page URL.

    Returns:
        The absolute feed URL (str), or None when the page is not
        reachable (non-200 response) or advertises no feed.
    """
    log.info("checking " + url)
    r = requests.get(url, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, 'html.parser')
    # find_all is the current bs4 spelling of the deprecated findAll.
    feeds = soup.find_all(type='application/rss+xml') + \
        soup.find_all(type='application/atom+xml')
    for tag in feeds:
        # Feed links are frequently relative; make them absolute.
        return urljoin(url, tag.get('href'))
    return None
|
|
|
|
def fetchUrls():
    """Return candidate blog URLs linked from the newsletter entries.

    Downloads the SRC feed once and hands the fetched bytes to
    feedparser.  (The original passed the URL to ``feedparser.parse``,
    which downloaded the feed a second time.)

    Returns:
        list: href strings from the entries' HTML that pass
        is_valid_url().
    """
    output = []
    r = requests.get(SRC, timeout=HTTP_TIMEOUT)
    log.info("status code: " + str(r.status_code))
    # Parse the bytes we already downloaded instead of re-fetching SRC.
    feed = feedparser.parse(r.content)
    for entry in feed['entries']:
        html = entry['content'][0]['value']
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            u = link.get('href')
            log.info("found href")
            log.info(u)
            if is_valid_url(u):
                output.append(u)
    return output
|
|
|
|
|
|
# URL validation regex adapted from Django's URLValidator.
|
|
# Compiled once at import time instead of on every call.
_URL_RE = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE)


def is_valid_url(url):
    """Return True for a syntactically valid non-Hacker-News http(s) URL.

    Args:
        url: candidate URL string, or None (``tag.get('href')`` can
            return None); None yields False.
    """
    log.info("validating url")
    # Guard None BEFORE slicing: the original sliced url first and
    # would raise TypeError on a missing href.
    if url is None:
        return False
    if url[0:28] == "https://news.ycombinator.com":
        log.info("not using HN url")
        return False
    log.info("is not an HN url")
    # Explicit bool; callers only rely on truthiness, so this is
    # backward-compatible with returning the match object.
    return _URL_RE.search(url) is not None
|
|
|
|
|
|
# Script entry point: discover feed URLs and print them to stdout.
if __name__ == "__main__":
    main()
|
|
|