From 2c64646749e8eb5c0a92ee1d2ae5ec15fc98a2be Mon Sep 17 00:00:00 2001 From: sneak Date: Thu, 4 Aug 2022 08:02:45 +0200 Subject: [PATCH] initial --- README.md | 22 +- feeds.opml | 528 +++++++++++++++++++++++++++++++++++++++++++++ tools/Makefile | 7 + tools/Pipfile | 16 ++ tools/Pipfile.lock | 269 +++++++++++++++++++++++ tools/hnblogs.py | 98 +++++++++ tools/main.py | 75 +++++++ 7 files changed, 1014 insertions(+), 1 deletion(-) create mode 100644 feeds.opml create mode 100644 tools/Makefile create mode 100644 tools/Pipfile create mode 100644 tools/Pipfile.lock create mode 100644 tools/hnblogs.py create mode 100644 tools/main.py diff --git a/README.md b/README.md index 547466a..abee2f5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,23 @@ # feeds -opml list of feeds i read \ No newline at end of file +opml list of feeds i read + +# about + +I scraped https://jessimekirk.com/blog/hn_users_links/ (who in turn scraped +HN profile pages) and fetched `` tags pointing to RSS/Atom feeds. + +I also scraped the https://hnblogs.substack.com/feed RSS feed for +HN-related blogs and then fetched and looked in those for link tags to +feeds, too. + +No endorsement or assertion of quality, express or implied, is the result of +inclusion in this repo. This is just a scrape. + +# license + +WTFPL + +# author + +sneak <[sneak@sneak.berlin](mailto:sneak@sneak.berlin)> diff --git a/feeds.opml b/feeds.opml new file mode 100644 index 0000000..893038c --- /dev/null +++ b/feeds.opml @@ -0,0 +1,528 @@ + + + + sneak's reader feeds + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/Makefile b/tools/Makefile new file mode 100644 index 0000000..7fedccc --- /dev/null +++ b/tools/Makefile @@ -0,0 +1,7 @@ +default: run + +run: + pipenv run python3 main.py | tee output.txt + +other: + pipenv run python3 hnblogs.py | tee output.txt diff --git a/tools/Pipfile b/tools/Pipfile new file mode 100644 index 0000000..ace4847 --- /dev/null +++ b/tools/Pipfile @@ -0,0 +1,16 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] +requests = "*" +bs4 = "*" +lxml = "*" +feedparser = "*" +sanelogging = "*" + +[dev-packages] + +[requires] +python_version = "3.9" diff --git a/tools/Pipfile.lock b/tools/Pipfile.lock new file mode 100644 index 0000000..b90a2fd --- /dev/null +++ b/tools/Pipfile.lock @@ -0,0 +1,269 @@ +{ + "_meta": { + "hash": { + "sha256": "e25af9d23d965cd5371700c95baa1de3028f8f273467e27ed19e3556188a8fae" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.9" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "beautifulsoup4": { + "hashes": [ + "sha256:58d5c3d29f5a36ffeb94f02f0d786cd53014cf9b3b3951d42e0080d8a9498d30", + "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==4.11.1" + }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, + "certifi": { + "hashes": [ + "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d", + "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2022.6.15" + }, + "charset-normalizer": { + "hashes": [ + "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5", + "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.1.0" + }, + "colorlog": { + "hashes": [ + "sha256:344f73204009e4c83c5b6beb00b3c45dc70fcdae3c80db919e0a4171d006fde8", + "sha256:351c51e866c86c3217f08e4b067a7974a678be78f07f85fc2d55b8babde6d94e" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==6.6.0" + }, + "datetime": { + "hashes": [ + "sha256:65416160ff35a564c6cca67ce0c5cd1816eee4e8a2fb324556939689d0a9ee41", + "sha256:ec9894c438cdd54dc31578b1b43c79b11f8111dbeb066e372548a0a78a3bec46" + ], + "version": "==4.5" + }, + "feedparser": { + "hashes": [ + "sha256:27da485f4637ce7163cdeab13a80312b93b7d0c1b775bef4a47629a3110bca51", + "sha256:79c257d526d13b944e965f6095700587f27388e50ea16fd245babe4dfae7024f" + ], + "index": "pypi", + "version": "==6.0.10" + }, + "idna": { + "hashes": [ + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3.5'", + "version": "==3.3" + }, + "lxml": { + "hashes": [ + "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318", + "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c", + "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b", + "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000", + "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73", + "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d", + "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb", + "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8", + "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2", + "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345", + "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94", + "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e", + "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b", + "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc", + "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a", + "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9", + "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc", + "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387", + "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb", + "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7", + "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4", + "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97", + "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67", + "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627", + "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7", + "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd", + "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3", + "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7", + "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130", + "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b", + "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036", + "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785", + "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca", + "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91", + "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc", + "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536", + "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391", + "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3", + "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d", + "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21", + "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3", + "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d", + "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29", + "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715", + "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed", + "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25", + "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c", + "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785", + "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837", + "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4", + "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b", + "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2", + "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067", + "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448", + "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d", + "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2", + "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc", + "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c", + "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5", + "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84", + "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8", + "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf", + "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7", + "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e", + "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb", + "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b", + "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3", + "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad", + "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8", + "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f" + ], + "index": "pypi", + "version": "==4.9.1" + }, + "pytz": { + "hashes": [ + "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7", + "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c" + ], + "version": "==2022.1" + }, + "requests": { + "hashes": [ + "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", + "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" + ], + "index": "pypi", + "version": "==2.28.1" + }, + "sanelogging": { + "hashes": [ + "sha256:2ac0c05986c6faec18eb918129189acebcca626a8a96804fc81e304d9a3a2463", + "sha256:fe56cd15b54a7fd7458b21bdd0120c20fe79312e4aefc4fb7c54a339ab7bdc77" + ], + "index": "pypi", + "version": "==1.1.0" + }, + "setuptools": { + "hashes": [ + "sha256:7c7854ee1429a240090297628dc9f75b35318d193537968e2dc14010ee2f5bca", + "sha256:dc2662692f47d99cb8ae15a784529adeed535bcd7c277fee0beccf961522baf6" + ], + "markers": "python_version >= '3.7'", + "version": "==63.4.1" + }, + "sgmllib3k": { + "hashes": [ + "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9" + ], + "version": "==1.0.0" + }, + "soupsieve": { + "hashes": [ + "sha256:3b2503d3c7084a42b1ebd08116e5f81aadfaea95863628c80a3b774a11b7c759", + "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d" + ], + "markers": "python_full_version >= '3.6.0'", + "version": "==2.3.2.post1" + }, + "urllib3": { + "hashes": [ + "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc", + "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", + "version": "==1.26.11" + }, + "zope.interface": { + "hashes": [ + "sha256:08f9636e99a9d5410181ba0729e0408d3d8748026ea938f3b970a0249daa8192", + "sha256:0b465ae0962d49c68aa9733ba92a001b2a0933c317780435f00be7ecb959c702", + "sha256:0cba8477e300d64a11a9789ed40ee8932b59f9ee05f85276dbb4b59acee5dd09", + "sha256:0cee5187b60ed26d56eb2960136288ce91bcf61e2a9405660d271d1f122a69a4", + "sha256:0ea1d73b7c9dcbc5080bb8aaffb776f1c68e807767069b9ccdd06f27a161914a", + "sha256:0f91b5b948686659a8e28b728ff5e74b1be6bf40cb04704453617e5f1e945ef3", + "sha256:15e7d1f7a6ee16572e21e3576d2012b2778cbacf75eb4b7400be37455f5ca8bf", + "sha256:17776ecd3a1fdd2b2cd5373e5ef8b307162f581c693575ec62e7c5399d80794c", + "sha256:194d0bcb1374ac3e1e023961610dc8f2c78a0f5f634d0c737691e215569e640d", + "sha256:1c0e316c9add0db48a5b703833881351444398b04111188069a26a61cfb4df78", + "sha256:205e40ccde0f37496904572035deea747390a8b7dc65146d30b96e2dd1359a83", + "sha256:273f158fabc5ea33cbc936da0ab3d4ba80ede5351babc4f577d768e057651531", + "sha256:2876246527c91e101184f63ccd1d716ec9c46519cc5f3d5375a3351c46467c46", + "sha256:2c98384b254b37ce50eddd55db8d381a5c53b4c10ee66e1e7fe749824f894021", + "sha256:2e5a26f16503be6c826abca904e45f1a44ff275fdb7e9d1b75c10671c26f8b94", + "sha256:334701327f37c47fa628fc8b8d28c7d7730ce7daaf4bda1efb741679c2b087fc", + "sha256:3748fac0d0f6a304e674955ab1365d515993b3a0a865e16a11ec9d86fb307f63", + "sha256:3c02411a3b62668200910090a0dff17c0b25aaa36145082a5a6adf08fa281e54", + "sha256:3dd4952748521205697bc2802e4afac5ed4b02909bb799ba1fe239f77fd4e117", + "sha256:3f24df7124c323fceb53ff6168da70dbfbae1442b4f3da439cd441681f54fe25", + "sha256:469e2407e0fe9880ac690a3666f03eb4c3c444411a5a5fddfdabc5d184a79f05", + "sha256:4de4bc9b6d35c5af65b454d3e9bc98c50eb3960d5a3762c9438df57427134b8e", + "sha256:5208ebd5152e040640518a77827bdfcc73773a15a33d6644015b763b9c9febc1", + "sha256:52de7fc6c21b419078008f697fd4103dbc763288b1406b4562554bd47514c004", + "sha256:5bb3489b4558e49ad2c5118137cfeaf59434f9737fa9c5deefc72d22c23822e2", + "sha256:5dba5f530fec3f0988d83b78cc591b58c0b6eb8431a85edd1569a0539a8a5a0e", + "sha256:5dd9ca406499444f4c8299f803d4a14edf7890ecc595c8b1c7115c2342cadc5f", + "sha256:5f931a1c21dfa7a9c573ec1f50a31135ccce84e32507c54e1ea404894c5eb96f", + "sha256:63b82bb63de7c821428d513607e84c6d97d58afd1fe2eb645030bdc185440120", + "sha256:66c0061c91b3b9cf542131148ef7ecbecb2690d48d1612ec386de9d36766058f", + "sha256:6f0c02cbb9691b7c91d5009108f975f8ffeab5dff8f26d62e21c493060eff2a1", + "sha256:71aace0c42d53abe6fc7f726c5d3b60d90f3c5c055a447950ad6ea9cec2e37d9", + "sha256:7d97a4306898b05404a0dcdc32d9709b7d8832c0c542b861d9a826301719794e", + "sha256:7df1e1c05304f26faa49fa752a8c690126cf98b40b91d54e6e9cc3b7d6ffe8b7", + "sha256:8270252effc60b9642b423189a2fe90eb6b59e87cbee54549db3f5562ff8d1b8", + "sha256:867a5ad16892bf20e6c4ea2aab1971f45645ff3102ad29bd84c86027fa99997b", + "sha256:877473e675fdcc113c138813a5dd440da0769a2d81f4d86614e5d62b69497155", + "sha256:8892f89999ffd992208754851e5a052f6b5db70a1e3f7d54b17c5211e37a98c7", + "sha256:9a9845c4c6bb56e508651f005c4aeb0404e518c6f000d5a1123ab077ab769f5c", + "sha256:a1e6e96217a0f72e2b8629e271e1b280c6fa3fe6e59fa8f6701bec14e3354325", + "sha256:a8156e6a7f5e2a0ff0c5b21d6bcb45145efece1909efcbbbf48c56f8da68221d", + "sha256:a9506a7e80bcf6eacfff7f804c0ad5350c8c95b9010e4356a4b36f5322f09abb", + "sha256:af310ec8335016b5e52cae60cda4a4f2a60a788cbb949a4fbea13d441aa5a09e", + "sha256:b0297b1e05fd128d26cc2460c810d42e205d16d76799526dfa8c8ccd50e74959", + "sha256:bf68f4b2b6683e52bec69273562df15af352e5ed25d1b6641e7efddc5951d1a7", + "sha256:d0c1bc2fa9a7285719e5678584f6b92572a5b639d0e471bb8d4b650a1a910920", + "sha256:d4d9d6c1a455d4babd320203b918ccc7fcbefe308615c521062bc2ba1aa4d26e", + "sha256:db1fa631737dab9fa0b37f3979d8d2631e348c3b4e8325d6873c2541d0ae5a48", + "sha256:dd93ea5c0c7f3e25335ab7d22a507b1dc43976e1345508f845efc573d3d779d8", + "sha256:f44e517131a98f7a76696a7b21b164bcb85291cee106a23beccce454e1f433a4", + "sha256:f7ee479e96f7ee350db1cf24afa5685a5899e2b34992fb99e1f7c1b0b758d263" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==5.4.0" + } + }, + "develop": {} +} diff --git a/tools/hnblogs.py b/tools/hnblogs.py new file mode 100644 index 0000000..8c0b7a8 --- /dev/null +++ b/tools/hnblogs.py @@ -0,0 +1,98 @@ +import requests +import sys +import signal +from bs4 import BeautifulSoup +from urllib.parse import urljoin +import feedparser +from sanelogging import log + +SRC = "https://hnblogs.substack.com/feed" +HTTP_TIMEOUT = 10 + +def signal_handler(signal, frame): + log.error('SIGINT') + sys.exit(0) + +signal.signal(signal.SIGINT, signal_handler) + +def main(): + urls = lookupFeeds() + displayData(urls) + +def lookupFeeds(): + rssUrls = [] + urls = fetchUrls() + for url in urls: + rssUrl = None + try: + rssUrl = findRssUrl(url) + except(SystemExit, KeyboardInterrupt): + sys.exit(1) + except: + continue + if rssUrl is not None: + rssUrls.append(rssUrl) + log.info("found RSS: " + rssUrl) + return rssUrls + +def displayData(urls): + for url in urls: + print(url) + +def findRssUrl(url): + output = [] + log.info("checking " + url) + r = requests.get(url, timeout=HTTP_TIMEOUT) + if (r.status_code != 200): + return + soup = BeautifulSoup(r.content, 'html.parser') + feeds = soup.findAll(type='application/rss+xml') + \ + soup.findAll(type='application/atom+xml') + for tag in feeds: + u = tag.get('href') + u = urljoin(url, u) + return u + +def fetchUrls(): + output = [] + r = requests.get(SRC, timeout=HTTP_TIMEOUT) + log.info("status code: " + str(r.status_code)) + feed = feedparser.parse(SRC) + for entry in feed['entries']: + x = entry['content'][0]['value'] + soup = BeautifulSoup(x, 'html.parser') + links = soup.find_all('a') + for link in links: + u = link.get('href') + log.info("found href") + log.info(u) + if is_valid_url(u): + output.append(u) + log.info(u) + return output + + +# from django +def is_valid_url(url): + log.info("validating url") + + if url[0:28] == "https://news.ycombinator.com": + log.info("not using HN url") + return False + + log.info("is not an HN url") + import re + regex = re.compile( + r'^https?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain... + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', + re.IGNORECASE) + return url is not None and regex.search(url) + + +if __name__ == "__main__": + main() + diff --git a/tools/main.py b/tools/main.py new file mode 100644 index 0000000..ef05d0c --- /dev/null +++ b/tools/main.py @@ -0,0 +1,75 @@ +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +SRC = "https://jessimekirk.com/blog/hn_users_links/" +HTTP_TIMEOUT = 10 + +def main(): + urls = lookupFeeds() + displayData(urls) + + +def lookupFeeds(): + rssUrls = [] + urls = fetchUrls() + for url in urls: + rssUrl = None + try: + rssUrl = findRssUrl(url) + except: + continue + if rssUrl is not None: + rssUrls.append(rssUrl) + print("found RSS: " + rssUrl) + return rssUrls + +def displayData(urls): + for url in urls: + print(url) + +def findRssUrl(url): + output = [] + print("checking " + url) + r = requests.get(url, timeout=HTTP_TIMEOUT) + if (r.status_code != 200): + return + soup = BeautifulSoup(r.content, 'html.parser') + feeds = soup.findAll(type='application/rss+xml') + \ + soup.findAll(type='application/atom+xml') + for tag in feeds: + u = tag.get('href') + u = urljoin(url, u) + return u + + +def fetchUrls(): + output = [] + r = requests.get(SRC, timeout=HTTP_TIMEOUT) + print(r.status_code) + soup = BeautifulSoup(r.content, 'html.parser') + links = soup.find_all('a') + for link in links: + u = link.get_text() + if is_valid_url(u): + output.append(u) + return output + + +# from django +def is_valid_url(url): + import re + regex = re.compile( + r'^https?://' # http:// or https:// + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain... + r'localhost|' # localhost... + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip + r'(?::\d+)?' # optional port + r'(?:/?|[/?]\S+)$', + re.IGNORECASE) + return url is not None and regex.search(url) + + +if __name__ == "__main__": + main() +