This commit is contained in:
2024-06-03 12:11:29 -07:00
commit 108587b28c
6 changed files with 828 additions and 0 deletions

309
cmd/test/alexa500.txt Normal file
View File

@@ -0,0 +1,309 @@
163.com
1688.com
2ch.net
360.cn
4shared.com
9gag.com
OPENLOAD.co
T.co
about.com
acfun.tv
addthis.com
adf.ly
adnetworkperformance.com
adobe.com
adplxmd.com
agar.io
airbnb.com
alibaba.com
alicdn.com
aliexpress.com
allegro.pl
ameba.jp
americanexpress.com
ancestry.com
apple.com
archive.org
ask.fm
att.com
avito.ru
badoo.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
bestbuy.com
bet365.com
bild.de
bilibili.com
billdesk.com
bing.com
bitauto.com
blackboard.com
blastingnews.com
blkget.com
blog.jp
blogfa.com
blogger.com
blogspot.com
blogspot.com.br
blogspot.in
bloomberg.com
bongacams.com
booking.com
box.com
bukalapak.com
businessinsider.com
buzzlie.com
capitalone.com
chase.com
chaturbate.com
citi.com
ck101.com
cnblogs.com
cnnic.cn
coccoc.com
craigslist.org
dailymotion.com
dell.com
detail.tmall.com.danuoyi.tbcache.com
detik.com
deviantart.com
digikala.com
diply.com
disqus.com
doubleclick.net
doublepimp.com
dropbox.com
dssedc4qxg7o6.cloudfront.net
ebay-kleinanzeigen.de
ebay.co.uk
ebay.com
ebay.de
ebay.in
ebay.it
eksisozluk.com
espn.gns.go.com
etsy.com
ettoday.net
evernote.com
exoclick.com
extratorrent.cc
fbcdn.net
fedex.com
feedly.com
files.wordpress.com
flickr.com
flipkart.com
forbes.com
foxnews.com
freepik.com
gfycat.com
giphy.com
github.com
github.io
gizmodo.com
globo.com
gmanetwork.com
gmx.net
go.com
godaddy.com
goodreads.com
google.com
groupon.com
haosou.com
hatena.ne.jp
hclips.com
hdfcbank.com
hdzog.com
hp.com
hpcc-page.cnc.ccgslb.com.cn
hulu.com
hurriyet.com.tr
icloud.com
imgur.com
impress.co.jp
imzog.com
indeed.com
instagram.com
instructure.com
intuit.com
iqiyi.com
irctc.co.in
jabong.com
jd.com
kaskus.co.id
kat.cr
kickstarter.com
kinogo.co
kinopoisk.ru
kissanime.to
kohls.com
leboncoin.fr
life.tw
lifebuzz.com
likes.com
liputan6.com
liveadexchanger.com
livejournal.com
loading-delivery2.com
lowes.com
macys.com
mail.ru
mailchimp.com
mama.cn
mashable.com
mediab.uy
mediafire.com
medium.com
mega.nz
mercadolibre.com.ar
messenger.com
microsoft.com
mozilla.org
msn.com
nametests.com
naver.jp
nba.com
netflix.com
nih.gov
nyaa.se
nytimes.com
office.com
ok.ru
olx.pl
onclickads.net
onedio.com
onet.pl
onlinesbi.com
opthw.xdwscache.speedcdns.com
oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
pinimg.com
pinterest.com
pixiv.net
popads.net
popcash.net
pornhub.com
ppomppu.co.kr
putlocker.is
quora.com
rambler.ru
rdsa2012.com
realtor.com
rediff.com
reimageplus.com
roblox.com
rt.com
ruten.com.tw
rutracker.org
sabah.com.tr
sahibinden.com
salesforce.com
savefrom.net
sberbank.ru
scribd.com
secureserver.net
seznam.cz
sh.st
shutterstock.com
siteadvisor.com
skype.com
slack.com
slickdeals.net
slideshare.net
slither.io
so.com
sogou.com
sohu.com
soundcloud.com
sourceforge.net
spiegel.de
spotify.com
stackexchange.com
stackoverflow.com
steamcommunity.com
steampowered.com
subscene.com
surveymonkey.com
t-online.de
tabelog.com
taboola.com
taleo.net
taobao.com
target.com
taringa.net
telegram.org
telegraph.co.uk
terraclicks.com
thefreedictionary.com
theladbible.com
themeforest.net
thepiratebay.se
thesportbible.com
thewatchseries.to
tistory.com
tmall.com
tokopedia.com
torrentz.eu
tradeadexchange.com
trello.com
tripadvisor.com
tuberel.com
tumblr.com
twitch.tv
twitter.com
txxx.com
udn.com
upornia.com
ups.com
uptodown.com
upwork.com
usps.com
verizonwireless.com
vice.com
vimeo.com
vk.com
vk.me
walmart.com
wangzhanbao.cc
washingtonpost.com
weather.com
web.de
webtretho.com
weebly.com
weibo.com
wellsfargo.com
wetransfer.com
whatsapp.com
wikia.com
wikihow.com
wikimedia.org
wikipedia.org
wittyfeed.com
wix.com
wordpress.com
wordpress.org
wp.com
wsj.com
xfinity.com
xhamster.com
xuite.net
yahoo.com
yandex.ru
yelp.com
youm7.com
youporn.com
youtube-mp3.org
youtube.com
zendesk.com
zhihu.com
zillow.com
zippyshare.com
zoho.com

175
cmd/test/main.go Normal file
View File

@@ -0,0 +1,175 @@
package main
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"net"
"net/url"
"os"
"strings"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/k0kubun/pp"
"github.com/microcosm-cc/bluemonday"
"sneak.berlin/go/aipagesummary"
"sneak.berlin/go/puppeteerapiclient"
)
// main builds a list of site URLs from a built-in seed list plus the domains
// found in alexa500.txt, shuffles it, and prints a summary for each URL.
//
// alexa500.txt is opened relative to the current working directory and is
// expected to hold whitespace-separated bare domains; each one is prefixed
// with "https://". If the file cannot be read, the error is written to
// stderr and the process exits with status 1.
func main() {
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}

	// Read the extra domains from alexa500.txt.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		// Report on stderr with a non-zero status so scripts can detect the failure.
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Loop variables are deliberately not named "url" so they do not shadow
	// the imported net/url package.
	for _, domain := range strings.Fields(string(list)) {
		bigList = append(bigList, "https://"+domain)
	}

	shuffle(bigList)

	for _, target := range bigList {
		fmt.Printf("Showing summary for %s\n", target)
		summarizeURL(target)
	}
}
// shuffle reorders the elements of slice in place into a pseudo-random
// permutation, using the package-level generator of math/rand.
func shuffle(slice []string) {
	exchange := func(a, b int) {
		held := slice[a]
		slice[a] = slice[b]
		slice[b] = held
	}
	rand.Shuffle(len(slice), exchange)
}
// parseHostnameFromURL returns the lowercased hostname of rawURL with any
// port removed and, for IPv6 literals, without the surrounding brackets.
//
// It returns "" when rawURL cannot be parsed or when its host component is
// rejected by net.SplitHostPort for a reason other than a missing port. A
// URL with no host component (for example a bare path) also yields "".
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	host := parsedURL.Host

	// Strip the port if one is present.
	hostname, _, err := net.SplitHostPort(host)
	if err != nil {
		// A host without a port is reported as an error. Use a checked type
		// assertion so an unexpected error type cannot cause a panic.
		addrErr, ok := err.(*net.AddrError)
		if !ok || addrErr.Err != "missing port in address" {
			return ""
		}
		// Without a port the host is used as-is, but a bracketed IPv6
		// literal ("[::1]") must lose its brackets so the result matches
		// what SplitHostPort returns when a port is present ("::1").
		hostname = strings.TrimSuffix(strings.TrimPrefix(host, "["), "]")
	}

	return strings.ToLower(hostname)
}
// summarizeURL scrapes the page at url through the Puppeteer API client,
// strips the HTML tags from the result, requests a summary of the remaining
// text, pretty-prints that summary, and saves it as "<hostname>.json"
// (a relative path, so it lands in the current working directory).
//
// The OpenAI key is read from OPENAI_API_KEY and the Puppeteer endpoint and
// salt from PUPPETEER_API_URL and PUPPETEER_API_SALT. Any error along the
// way is printed and the function returns early. It panics if either
// Puppeteer variable is unset.
//
// Note: inside this function the url parameter shadows the net/url package.
func summarizeURL(url string) {
	// Initialize the summarizer with the OpenAI API key.
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}

	// Retrieve Puppeteer API URL and salt from environment variables.
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")
	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}

	// Initialize the Puppeteer client.
	client := puppeteerapiclient.NewClient(apiURL, apiSalt)

	// Bound the scrape request to 60 seconds.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	// Scrape the content from the given URL.
	// NOTE(review): "body" is presumably a selector for the element to extract — confirm against the client API.
	scraped, err := client.Scrape(ctx, url, "body")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Use bluemonday to strip HTML tags from the scraped content.
	p := bluemonday.StripTagsPolicy()
	strippedContent := p.Sanitize(scraped.Content)

	// Content longer than 15000 bytes is cut down to (at most) 10000 bytes.
	// NOTE(review): the gap between the two limits is kept from the original; confirm it is intended.
	if len(strippedContent) > 15000 {
		cut := 10000
		// Back up to the start of a UTF-8 sequence: bytes of the form
		// 10xxxxxx are continuation bytes, and cutting in front of one would
		// leave a truncated, invalid rune at the end of the text.
		for cut > 0 && strippedContent[cut]&0xC0 == 0x80 {
			cut--
		}
		strippedContent = strippedContent[:cut]
	}

	// Get the summary of the stripped content.
	res, err := s.GetSummary(url, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Pretty-print the result.
	pp.Print(res)
	fmt.Println()

	hn := parseHostnameFromURL(url)
	fmt.Printf("Hostname: %s\n", hn)

	// Persist the summary; report a failed write instead of dropping it.
	fn := fmt.Sprintf("%s.json", hn)
	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating the file or truncating an existing one.
//
// It returns a non-nil error if data cannot be marshaled or if the file
// cannot be created, written, or closed. The underlying error is wrapped
// with %w, so callers can inspect it with errors.Is / errors.As.
func writePrettyJSONToFile(data interface{}, filename string) error {
	// Serialize the data to pretty-printed JSON.
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}

	// Create or truncate the file.
	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("error creating file: %w", err)
	}

	// Write the JSON data to the file.
	if _, err := file.Write(jsonData); err != nil {
		// Best-effort close: the write error is the one worth reporting.
		_ = file.Close()
		return fmt.Errorf("error writing to file: %w", err)
	}

	// Close explicitly rather than via defer: a failed close can mean the
	// data never reached disk, so it must be surfaced to the caller.
	if err := file.Close(); err != nil {
		return fmt.Errorf("error closing file: %w", err)
	}
	return nil
}