# feeds/tools/main.py
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
SRC = "https://jessimekirk.com/blog/hn_users_links/"
HTTP_TIMEOUT = 10
def main():
    """Entry point: discover RSS/Atom feed URLs and print them."""
    displayData(lookupFeeds())
def lookupFeeds():
    """Scan each candidate page for an advertised RSS/Atom feed.

    Returns:
        list[str]: absolute feed URLs, one per page that advertised one.

    Pages that fail to load or parse are skipped best-effort. The original
    bare ``except:`` also swallowed KeyboardInterrupt/SystemExit and hid
    every failure silently; we now catch only ``Exception`` and report
    which URL was skipped.
    """
    rssUrls = []
    for url in fetchUrls():
        try:
            rssUrl = findRssUrl(url)
        except Exception as exc:
            # Best-effort crawl: one unreachable site must not abort the run.
            print(f"skipping {url}: {exc}")
            continue
        if rssUrl is not None:
            rssUrls.append(rssUrl)
            print("found RSS: " + rssUrl)
    return rssUrls
def displayData(urls):
    """Print each collected URL on its own line."""
    for entry in urls:
        print(entry)
def findRssUrl(url):
    """Return the absolute URL of the first RSS/Atom feed advertised by *url*.

    Args:
        url: page to check for feed autodiscovery <link> tags.

    Returns:
        str | None: absolute feed URL, or None when the page is not
        reachable (non-200) or advertises no feed.

    Raises:
        requests.RequestException: on network failure (handled by caller).
    """
    print("checking " + url)
    r = requests.get(url, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, 'html.parser')
    # Feed autodiscovery convention: <link type="application/rss+xml" href=...>
    # (or the Atom MIME type). find_all is the modern bs4 name for findAll.
    feeds = soup.find_all(type='application/rss+xml') + \
        soup.find_all(type='application/atom+xml')
    for tag in feeds:
        href = tag.get('href')
        # Guard against tags with no href: urljoin(url, None) raises TypeError.
        if href:
            # Feed links are often relative; resolve against the page URL.
            return urljoin(url, href)
    return None
def fetchUrls():
    """Fetch the source listing page and return every URL it mentions.

    The page lists URLs as anchor *text* (not href attributes), so we read
    ``get_text()`` from each <a> tag and keep only strings that validate
    as absolute http(s) URLs.

    Returns:
        list[str]: candidate site URLs.

    Raises:
        requests.HTTPError: if the listing page itself is unavailable —
        previously the status was printed but ignored and an error page
        was silently parsed.
    """
    r = requests.get(SRC, timeout=HTTP_TIMEOUT)
    print(r.status_code)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    return [text
            for text in (link.get_text() for link in soup.find_all('a'))
            if is_valid_url(text)]
# URL validation pattern adapted from Django's URLValidator.
# Compiled once at module load instead of on every call.
_URL_REGEX = re.compile(
    r'^https?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE)


def is_valid_url(url):
    """Return True if *url* is an absolute http(s) URL, else False.

    Args:
        url: candidate string, or None (returns False).

    Returns:
        bool: truthiness-compatible with the original Match-or-None return.
    """
    return url is not None and _URL_REGEX.search(url) is not None
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()