initial
This commit is contained in:
309
cmd/test/alexa500.txt
Normal file
309
cmd/test/alexa500.txt
Normal file
@@ -0,0 +1,309 @@
|
||||
163.com
|
||||
1688.com
|
||||
2ch.net
|
||||
360.cn
|
||||
4shared.com
|
||||
9gag.com
|
||||
OPENLOAD.co
|
||||
T.co
|
||||
about.com
|
||||
acfun.tv
|
||||
addthis.com
|
||||
adf.ly
|
||||
adnetworkperformance.com
|
||||
adobe.com
|
||||
adplxmd.com
|
||||
agar.io
|
||||
airbnb.com
|
||||
alibaba.com
|
||||
alicdn.com
|
||||
aliexpress.com
|
||||
allegro.pl
|
||||
ameba.jp
|
||||
americanexpress.com
|
||||
ancestry.com
|
||||
apple.com
|
||||
archive.org
|
||||
ask.fm
|
||||
att.com
|
||||
avito.ru
|
||||
badoo.com
|
||||
bankofamerica.com
|
||||
battle.net
|
||||
bbc.co.uk
|
||||
bbc.com
|
||||
bestbuy.com
|
||||
bet365.com
|
||||
bild.de
|
||||
bilibili.com
|
||||
billdesk.com
|
||||
bing.com
|
||||
bitauto.com
|
||||
blackboard.com
|
||||
blastingnews.com
|
||||
blkget.com
|
||||
blog.jp
|
||||
blogfa.com
|
||||
blogger.com
|
||||
blogspot.com
|
||||
blogspot.com.br
|
||||
blogspot.in
|
||||
bloomberg.com
|
||||
bongacams.com
|
||||
booking.com
|
||||
box.com
|
||||
bukalapak.com
|
||||
businessinsider.com
|
||||
buzzlie.com
|
||||
capitalone.com
|
||||
chase.com
|
||||
chaturbate.com
|
||||
citi.com
|
||||
ck101.com
|
||||
cnblogs.com
|
||||
cnnic.cn
|
||||
coccoc.com
|
||||
craigslist.org
|
||||
dailymotion.com
|
||||
dell.com
|
||||
detail.tmall.com.danuoyi.tbcache.com
|
||||
detik.com
|
||||
deviantart.com
|
||||
digikala.com
|
||||
diply.com
|
||||
disqus.com
|
||||
doubleclick.net
|
||||
doublepimp.com
|
||||
dropbox.com
|
||||
dssedc4qxg7o6.cloudfront.net
|
||||
ebay-kleinanzeigen.de
|
||||
ebay.co.uk
|
||||
ebay.com
|
||||
ebay.de
|
||||
ebay.in
|
||||
ebay.it
|
||||
eksisozluk.com
|
||||
espn.gns.go.com
|
||||
etsy.com
|
||||
ettoday.net
|
||||
evernote.com
|
||||
exoclick.com
|
||||
extratorrent.cc
|
||||
fbcdn.net
|
||||
fedex.com
|
||||
feedly.com
|
||||
files.wordpress.com
|
||||
flickr.com
|
||||
flipkart.com
|
||||
forbes.com
|
||||
foxnews.com
|
||||
freepik.com
|
||||
gfycat.com
|
||||
giphy.com
|
||||
github.com
|
||||
github.io
|
||||
gizmodo.com
|
||||
globo.com
|
||||
gmanetwork.com
|
||||
gmx.net
|
||||
go.com
|
||||
godaddy.com
|
||||
goodreads.com
|
||||
google.com
|
||||
groupon.com
|
||||
haosou.com
|
||||
hatena.ne.jp
|
||||
hclips.com
|
||||
hdfcbank.com
|
||||
hdzog.com
|
||||
hp.com
|
||||
hpcc-page.cnc.ccgslb.com.cn
|
||||
hulu.com
|
||||
hurriyet.com.tr
|
||||
icloud.com
|
||||
imgur.com
|
||||
impress.co.jp
|
||||
imzog.com
|
||||
indeed.com
|
||||
instagram.com
|
||||
instructure.com
|
||||
intuit.com
|
||||
iqiyi.com
|
||||
irctc.co.in
|
||||
jabong.com
|
||||
jd.com
|
||||
kaskus.co.id
|
||||
kat.cr
|
||||
kickstarter.com
|
||||
kinogo.co
|
||||
kinopoisk.ru
|
||||
kissanime.to
|
||||
kohls.com
|
||||
leboncoin.fr
|
||||
life.tw
|
||||
lifebuzz.com
|
||||
likes.com
|
||||
liputan6.com
|
||||
liveadexchanger.com
|
||||
livejournal.com
|
||||
loading-delivery2.com
|
||||
lowes.com
|
||||
macys.com
|
||||
mail.ru
|
||||
mailchimp.com
|
||||
mama.cn
|
||||
mashable.com
|
||||
mediab.uy
|
||||
mediafire.com
|
||||
medium.com
|
||||
mega.nz
|
||||
mercadolibre.com.ar
|
||||
messenger.com
|
||||
microsoft.com
|
||||
mozilla.org
|
||||
msn.com
|
||||
nametests.com
|
||||
naver.jp
|
||||
nba.com
|
||||
netflix.com
|
||||
nih.gov
|
||||
nyaa.se
|
||||
nytimes.com
|
||||
office.com
|
||||
ok.ru
|
||||
olx.pl
|
||||
onclickads.net
|
||||
onedio.com
|
||||
onet.pl
|
||||
onlinesbi.com
|
||||
opthw.xdwscache.speedcdns.com
|
||||
oracle.com
|
||||
orange.fr
|
||||
ouo.io
|
||||
outbrain.com
|
||||
ozock.com
|
||||
pandora.com
|
||||
paypal.com
|
||||
paytm.com
|
||||
pinimg.com
|
||||
pinterest.com
|
||||
pixiv.net
|
||||
popads.net
|
||||
popcash.net
|
||||
pornhub.com
|
||||
ppomppu.co.kr
|
||||
putlocker.is
|
||||
quora.com
|
||||
rambler.ru
|
||||
rdsa2012.com
|
||||
realtor.com
|
||||
rediff.com
|
||||
reimageplus.com
|
||||
roblox.com
|
||||
rt.com
|
||||
ruten.com.tw
|
||||
rutracker.org
|
||||
sabah.com.tr
|
||||
sahibinden.com
|
||||
salesforce.com
|
||||
savefrom.net
|
||||
sberbank.ru
|
||||
scribd.com
|
||||
secureserver.net
|
||||
seznam.cz
|
||||
sh.st
|
||||
shutterstock.com
|
||||
siteadvisor.com
|
||||
skype.com
|
||||
slack.com
|
||||
slickdeals.net
|
||||
slideshare.net
|
||||
slither.io
|
||||
so.com
|
||||
sogou.com
|
||||
sohu.com
|
||||
soundcloud.com
|
||||
sourceforge.net
|
||||
spiegel.de
|
||||
spotify.com
|
||||
stackexchange.com
|
||||
stackoverflow.com
|
||||
steamcommunity.com
|
||||
steampowered.com
|
||||
subscene.com
|
||||
surveymonkey.com
|
||||
t-online.de
|
||||
tabelog.com
|
||||
taboola.com
|
||||
taleo.net
|
||||
taobao.com
|
||||
target.com
|
||||
taringa.net
|
||||
telegram.org
|
||||
telegraph.co.uk
|
||||
terraclicks.com
|
||||
thefreedictionary.com
|
||||
theladbible.com
|
||||
themeforest.net
|
||||
thepiratebay.se
|
||||
thesportbible.com
|
||||
thewatchseries.to
|
||||
tistory.com
|
||||
tmall.com
|
||||
tokopedia.com
|
||||
torrentz.eu
|
||||
tradeadexchange.com
|
||||
trello.com
|
||||
tripadvisor.com
|
||||
tuberel.com
|
||||
tumblr.com
|
||||
twitch.tv
|
||||
twitter.com
|
||||
txxx.com
|
||||
udn.com
|
||||
upornia.com
|
||||
ups.com
|
||||
uptodown.com
|
||||
upwork.com
|
||||
usps.com
|
||||
verizonwireless.com
|
||||
vice.com
|
||||
vimeo.com
|
||||
vk.com
|
||||
vk.me
|
||||
walmart.com
|
||||
wangzhanbao.cc
|
||||
washingtonpost.com
|
||||
weather.com
|
||||
web.de
|
||||
webtretho.com
|
||||
weebly.com
|
||||
weibo.com
|
||||
wellsfargo.com
|
||||
wetransfer.com
|
||||
whatsapp.com
|
||||
wikia.com
|
||||
wikihow.com
|
||||
wikimedia.org
|
||||
wikipedia.org
|
||||
wittyfeed.com
|
||||
wix.com
|
||||
wordpress.com
|
||||
wordpress.org
|
||||
wp.com
|
||||
wsj.com
|
||||
xfinity.com
|
||||
xhamster.com
|
||||
xuite.net
|
||||
yahoo.com
|
||||
yandex.ru
|
||||
yelp.com
|
||||
youm7.com
|
||||
youporn.com
|
||||
youtube-mp3.org
|
||||
youtube.com
|
||||
zendesk.com
|
||||
zhihu.com
|
||||
zillow.com
|
||||
zippyshare.com
|
||||
zoho.com
|
||||
175
cmd/test/main.go
Normal file
175
cmd/test/main.go
Normal file
@@ -0,0 +1,175 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/joho/godotenv/autoload"
|
||||
"github.com/k0kubun/pp"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"sneak.berlin/go/aipagesummary"
|
||||
"sneak.berlin/go/puppeteerapiclient"
|
||||
)
|
||||
|
||||
func main() {
|
||||
bigList := []string{
|
||||
"https://www.bbc.com",
|
||||
"https://www.cnn.com",
|
||||
"https://news.ycombinator.com",
|
||||
"https://www.reddit.com",
|
||||
"https://www.wikipedia.org",
|
||||
"https://www.ford.com",
|
||||
"https://www.tesla.com",
|
||||
"https://www.apple.com",
|
||||
"https://www.microsoft.com",
|
||||
"https://www.google.com",
|
||||
"https://medium.com",
|
||||
"https://www.nytimes.com",
|
||||
"https://sneak.berlin",
|
||||
}
|
||||
|
||||
// now read urls from alexa500.txt
|
||||
list, err := os.ReadFile("alexa500.txt")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
urls := strings.Fields(string(list))
|
||||
for _, url := range urls {
|
||||
url = "https://" + url
|
||||
bigList = append(bigList, url)
|
||||
|
||||
}
|
||||
|
||||
shuffle(bigList)
|
||||
|
||||
for _, url := range bigList {
|
||||
fmt.Printf("Showing summary for %s\n", url)
|
||||
summarizeURL(url)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// shuffle randomly permutes slice in place using math/rand's Shuffle.
func shuffle(slice []string) {
	// swap exchanges the elements at positions a and b.
	swap := func(a, b int) {
		tmp := slice[a]
		slice[a] = slice[b]
		slice[b] = tmp
	}
	rand.Shuffle(len(slice), swap)
}
|
||||
|
||||
// parseHostnameFromURL returns the lowercased hostname of rawURL with any
// port removed. IPv6 literals are returned without their surrounding
// brackets, whether or not a port is present. It returns "" when rawURL
// cannot be parsed or its host part is malformed.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	host := parsedURL.Host

	// Strip the port if there is one.
	hostname, _, err := net.SplitHostPort(host)
	if err != nil {
		// Only a missing port is acceptable. The checked assertion avoids a
		// panic should the error ever be something other than *net.AddrError.
		addrErr, ok := err.(*net.AddrError)
		if !ok || addrErr.Err != "missing port in address" {
			return ""
		}
		// No port: take the hostname from the parsed URL, which also drops
		// the brackets around an IPv6 literal so both forms agree.
		hostname = parsedURL.Hostname()
	}

	return strings.ToLower(hostname)
}
|
||||
|
||||
func summarizeURL(url string) {
|
||||
|
||||
// Initialize the summarizer with the OpenAI API key
|
||||
s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
// Retrieve Puppeteer API URL and salt from environment variables
|
||||
apiURL := os.Getenv("PUPPETEER_API_URL")
|
||||
apiSalt := os.Getenv("PUPPETEER_API_SALT")
|
||||
|
||||
if apiURL == "" || apiSalt == "" {
|
||||
panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
|
||||
}
|
||||
|
||||
// Initialize the Puppeteer client
|
||||
client := puppeteerapiclient.NewClient(apiURL, apiSalt)
|
||||
|
||||
// Create a context with a timeout for the scrape request
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Scrape the content from the given URL
|
||||
|
||||
scraped, err := client.Scrape(ctx, url, "body")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
// Use bluemonday to strip HTML tags from the scraped content
|
||||
p := bluemonday.StripTagsPolicy()
|
||||
strippedContent := p.Sanitize(scraped.Content)
|
||||
|
||||
if len(strippedContent) > 15000 {
|
||||
// truncate if too big
|
||||
strippedContent = strippedContent[:10000]
|
||||
}
|
||||
|
||||
// Get the summary of the stripped content
|
||||
res, err := s.GetSummary(url, strippedContent)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
// pretty print the result:
|
||||
pp.Print(res)
|
||||
fmt.Println()
|
||||
|
||||
hn := parseHostnameFromURL(url)
|
||||
fmt.Printf("Hostname: %s\n", hn)
|
||||
|
||||
fn := fmt.Sprintf("%s.json", hn)
|
||||
|
||||
writePrettyJSONToFile(res, fn)
|
||||
|
||||
}
|
||||
|
||||
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating the file or truncating it if it already exists.
//
// It returns an error if data cannot be marshaled or if the file cannot be
// created, written, or closed. The underlying error is wrapped, so callers
// can inspect it with errors.Is / errors.As.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}

	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("error creating file: %w", err)
	}

	if _, err := file.Write(jsonData); err != nil {
		// Best-effort close; the write failure is the error worth reporting.
		file.Close()
		return fmt.Errorf("error writing to file: %w", err)
	}

	// Close explicitly rather than via defer: a failed close can mean the
	// data never reached the disk, so it must not be ignored.
	if err := file.Close(); err != nil {
		return fmt.Errorf("error closing file: %w", err)
	}

	return nil
}
|
||||
Reference in New Issue
Block a user