# hn_users_rss: scrape a page of Hacker News users' blog links and probe
# each linked site for an RSS/Atom feed <link> tag.
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urljoin
|
|
|
|
SRC = "https://jessimekirk.com/blog/hn_users_links/"
|
|
HTTP_TIMEOUT = 10
|
|
|
|
def main():
    """Collect feed URLs from the source page and print them all."""
    feed_urls = lookupFeeds()
    displayData(feed_urls)
|
|
|
|
|
|
def lookupFeeds():
    """Return a list of RSS/Atom feed URLs discovered on the linked pages.

    Fetches the page list via fetchUrls(), probes each page with
    findRssUrl(), and skips pages whose fetch fails.  Prints a line for
    every feed found.
    """
    rssUrls = []
    urls = fetchUrls()
    for url in urls:
        try:
            rssUrl = findRssUrl(url)
        except requests.RequestException:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs.  Only
            # network/HTTP failures are a reason to skip a page.
            continue
        if rssUrl is not None:
            rssUrls.append(rssUrl)
            print("found RSS: " + rssUrl)
    return rssUrls
|
|
|
|
def displayData(urls):
    """Print each collected URL on its own line."""
    for entry in urls:
        print(entry)
|
|
|
|
def findRssUrl(url):
    """Fetch *url* and return the first RSS/Atom feed link it declares.

    Returns None when the response is not HTTP 200 or no feed <link>
    tags are present.  Network errors from requests propagate to the
    caller (lookupFeeds catches them).
    """
    print("checking " + url)
    r = requests.get(url, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, 'html.parser')
    # find_all is the modern bs4 spelling; findAll is a deprecated alias.
    # Also dropped an unused `output = []` local from the original.
    feeds = soup.find_all(type='application/rss+xml') + \
        soup.find_all(type='application/atom+xml')
    for tag in feeds:
        # Feed hrefs are often relative; resolve against the page URL.
        return urljoin(url, tag.get('href'))
    return None
|
|
|
|
|
|
def fetchUrls():
    """Download the SRC page and return every anchor text that is a URL.

    The anchor *text* (not the href) is taken, matching how the source
    page lists the blog links.  The HTTP status is printed for visibility.
    """
    response = requests.get(SRC, timeout=HTTP_TIMEOUT)
    print(response.status_code)
    page = BeautifulSoup(response.content, 'html.parser')
    return [text
            for text in (anchor.get_text() for anchor in page.find_all('a'))
            if is_valid_url(text)]
|
|
|
|
|
|
# from django
|
|
def is_valid_url(url):
    """Return True if *url* looks like an absolute http(s) URL.

    Regex adapted from Django's URLValidator.  Accepts a domain name,
    ``localhost``, or a dotted-quad IP, with an optional port and path.
    Returns False for None.
    """
    import re
    regex = re.compile(
        r'^https?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)
    # bool() so callers always get True/False; the original leaked a
    # re.Match object / None depending on the input.
    return url is not None and bool(regex.search(url))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|