initial

commit 108587b28c
@@ -0,0 +1,5 @@
cmd/test/*.json
cmd/test/.env
cmd/test/test
.env
logfile.txt
@@ -0,0 +1,309 @@
163.com
1688.com
2ch.net
360.cn
4shared.com
9gag.com
OPENLOAD.co
T.co
about.com
acfun.tv
addthis.com
adf.ly
adnetworkperformance.com
adobe.com
adplxmd.com
agar.io
airbnb.com
alibaba.com
alicdn.com
aliexpress.com
allegro.pl
ameba.jp
americanexpress.com
ancestry.com
apple.com
archive.org
ask.fm
att.com
avito.ru
badoo.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
bestbuy.com
bet365.com
bild.de
bilibili.com
billdesk.com
bing.com
bitauto.com
blackboard.com
blastingnews.com
blkget.com
blog.jp
blogfa.com
blogger.com
blogspot.com
blogspot.com.br
blogspot.in
bloomberg.com
bongacams.com
booking.com
box.com
bukalapak.com
businessinsider.com
buzzlie.com
capitalone.com
chase.com
chaturbate.com
citi.com
ck101.com
cnblogs.com
cnnic.cn
coccoc.com
craigslist.org
dailymotion.com
dell.com
detail.tmall.com.danuoyi.tbcache.com
detik.com
deviantart.com
digikala.com
diply.com
disqus.com
doubleclick.net
doublepimp.com
dropbox.com
dssedc4qxg7o6.cloudfront.net
ebay-kleinanzeigen.de
ebay.co.uk
ebay.com
ebay.de
ebay.in
ebay.it
eksisozluk.com
espn.gns.go.com
etsy.com
ettoday.net
evernote.com
exoclick.com
extratorrent.cc
fbcdn.net
fedex.com
feedly.com
files.wordpress.com
flickr.com
flipkart.com
forbes.com
foxnews.com
freepik.com
gfycat.com
giphy.com
github.com
github.io
gizmodo.com
globo.com
gmanetwork.com
gmx.net
go.com
godaddy.com
goodreads.com
google.com
groupon.com
haosou.com
hatena.ne.jp
hclips.com
hdfcbank.com
hdzog.com
hp.com
hpcc-page.cnc.ccgslb.com.cn
hulu.com
hurriyet.com.tr
icloud.com
imgur.com
impress.co.jp
imzog.com
indeed.com
instagram.com
instructure.com
intuit.com
iqiyi.com
irctc.co.in
jabong.com
jd.com
kaskus.co.id
kat.cr
kickstarter.com
kinogo.co
kinopoisk.ru
kissanime.to
kohls.com
leboncoin.fr
life.tw
lifebuzz.com
likes.com
liputan6.com
liveadexchanger.com
livejournal.com
loading-delivery2.com
lowes.com
macys.com
mail.ru
mailchimp.com
mama.cn
mashable.com
mediab.uy
mediafire.com
medium.com
mega.nz
mercadolibre.com.ar
messenger.com
microsoft.com
mozilla.org
msn.com
nametests.com
naver.jp
nba.com
netflix.com
nih.gov
nyaa.se
nytimes.com
office.com
ok.ru
olx.pl
onclickads.net
onedio.com
onet.pl
onlinesbi.com
opthw.xdwscache.speedcdns.com
oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
pinimg.com
pinterest.com
pixiv.net
popads.net
popcash.net
pornhub.com
ppomppu.co.kr
putlocker.is
quora.com
rambler.ru
rdsa2012.com
realtor.com
rediff.com
reimageplus.com
roblox.com
rt.com
ruten.com.tw
rutracker.org
sabah.com.tr
sahibinden.com
salesforce.com
savefrom.net
sberbank.ru
scribd.com
secureserver.net
seznam.cz
sh.st
shutterstock.com
siteadvisor.com
skype.com
slack.com
slickdeals.net
slideshare.net
slither.io
so.com
sogou.com
sohu.com
soundcloud.com
sourceforge.net
spiegel.de
spotify.com
stackexchange.com
stackoverflow.com
steamcommunity.com
steampowered.com
subscene.com
surveymonkey.com
t-online.de
tabelog.com
taboola.com
taleo.net
taobao.com
target.com
taringa.net
telegram.org
telegraph.co.uk
terraclicks.com
thefreedictionary.com
theladbible.com
themeforest.net
thepiratebay.se
thesportbible.com
thewatchseries.to
tistory.com
tmall.com
tokopedia.com
torrentz.eu
tradeadexchange.com
trello.com
tripadvisor.com
tuberel.com
tumblr.com
twitch.tv
twitter.com
txxx.com
udn.com
upornia.com
ups.com
uptodown.com
upwork.com
usps.com
verizonwireless.com
vice.com
vimeo.com
vk.com
vk.me
walmart.com
wangzhanbao.cc
washingtonpost.com
weather.com
web.de
webtretho.com
weebly.com
weibo.com
wellsfargo.com
wetransfer.com
whatsapp.com
wikia.com
wikihow.com
wikimedia.org
wikipedia.org
wittyfeed.com
wix.com
wordpress.com
wordpress.org
wp.com
wsj.com
xfinity.com
xhamster.com
xuite.net
yahoo.com
yandex.ru
yelp.com
youm7.com
youporn.com
youtube-mp3.org
youtube.com
zendesk.com
zhihu.com
zillow.com
zippyshare.com
zoho.com
@@ -0,0 +1,175 @@
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"math/rand"
	"net"
	"net/url"
	"os"
	"strings"
	"time"

	_ "github.com/joho/godotenv/autoload"
	"github.com/k0kubun/pp"
	"github.com/microcosm-cc/bluemonday"
	"sneak.berlin/go/aipagesummary"
	"sneak.berlin/go/puppeteerapiclient"
)

func main() {
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}

	// Read additional hostnames from alexa500.txt, one per line.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		fmt.Println(err)
		return
	}

	urls := strings.Fields(string(list))
	for _, url := range urls {
		url = "https://" + url
		bigList = append(bigList, url)
	}

	shuffle(bigList)

	for _, url := range bigList {
		fmt.Printf("Showing summary for %s\n", url)
		summarizeURL(url)
	}
}

func shuffle(slice []string) {
	rand.Shuffle(len(slice), func(i, j int) {
		slice[i], slice[j] = slice[j], slice[i]
	})
}

func parseHostnameFromURL(rawURL string) string {
	// Parse the URL
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	// Extract the host part
	host := parsedURL.Host

	// If the host contains a port, strip it
	hostname, _, err := net.SplitHostPort(host)
	if err != nil {
		// If there is no port, SplitHostPort returns an error; in that
		// case the host is already the bare hostname.
		if addrErr, ok := err.(*net.AddrError); ok && addrErr.Err == "missing port in address" {
			hostname = host
		} else {
			return ""
		}
	}

	// Convert hostname to lowercase
	hostname = strings.ToLower(hostname)

	return hostname
}

func summarizeURL(url string) {
	// Initialize the summarizer with the OpenAI API key
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}

	// Retrieve Puppeteer API URL and salt from environment variables
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")

	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}

	// Initialize the Puppeteer client
	client := puppeteerapiclient.NewClient(apiURL, apiSalt)

	// Create a context with a timeout for the scrape request
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	// Scrape the content from the given URL
	scraped, err := client.Scrape(ctx, url, "body")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Use bluemonday to strip HTML tags from the scraped content
	p := bluemonday.StripTagsPolicy()
	strippedContent := p.Sanitize(scraped.Content)

	if len(strippedContent) > 15000 {
		// Truncate overly long content before summarizing
		strippedContent = strippedContent[:10000]
	}

	// Get the summary of the stripped content
	res, err := s.GetSummary(url, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Pretty-print the result
	pp.Print(res)
	fmt.Println()

	hn := parseHostnameFromURL(url)
	fmt.Printf("Hostname: %s\n", hn)

	fn := fmt.Sprintf("%s.json", hn)

	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}

func writePrettyJSONToFile(data interface{}, filename string) error {
	// Serialize the data to pretty-printed JSON
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %v", err)
	}

	// Create or truncate the file
	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("error creating file: %v", err)
	}
	defer file.Close()

	// Write the JSON data to the file
	_, err = file.Write(jsonData)
	if err != nil {
		return fmt.Errorf("error writing to file: %v", err)
	}

	return nil
}
@@ -0,0 +1,26 @@
module sneak.berlin/go/aipagesummary

go 1.22.2

require (
	github.com/joho/godotenv v1.5.1
	github.com/k0kubun/pp v3.0.1+incompatible
	github.com/microcosm-cc/bluemonday v1.0.26
	github.com/sashabaranov/go-openai v1.24.1
	sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607
)

require (
	github.com/aymerick/douceur v0.2.0 // indirect
	github.com/emvi/iso-639-1 v1.1.0 // indirect
	github.com/gorilla/css v1.0.1 // indirect
	github.com/k0kubun/pp/v3 v3.2.0 // indirect
	github.com/kr/pretty v0.3.1 // indirect
	github.com/kr/text v0.2.0 // indirect
	github.com/mattn/go-colorable v0.1.13 // indirect
	github.com/mattn/go-isatty v0.0.20 // indirect
	github.com/rogpeppe/go-internal v1.9.0 // indirect
	golang.org/x/net v0.25.0 // indirect
	golang.org/x/sys v0.20.0 // indirect
	golang.org/x/text v0.15.0 // indirect
)
@@ -0,0 +1,48 @@
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/emvi/iso-639-1 v1.1.0 h1:EhZiYVA+ysa/b7+0T2DD9hcX7E/5sh4o1KyDAIPu7VE=
github.com/emvi/iso-639-1 v1.1.0/go.mod h1:CSA53/Tx0xF9bk2DEA0Mr0wTdIxq7pqoVZgBOfoL5GI=
github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/k0kubun/pp v3.0.1+incompatible h1:3tqvf7QgUnZ5tXO6pNAZlrvHgl6DvifjDrd9g2S9Z40=
github.com/k0kubun/pp v3.0.1+incompatible/go.mod h1:GWse8YhT0p8pT4ir3ZgBbfZild3tgzSScAn6HmfYukg=
github.com/k0kubun/pp/v3 v3.2.0 h1:h33hNTZ9nVFNP3u2Fsgz8JXiF5JINoZfFq4SvKJwNcs=
github.com/k0kubun/pp/v3 v3.2.0/go.mod h1:ODtJQbQcIRfAD3N+theGCV1m/CBxweERz2dapdz1EwA=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z04DQDJ1MI=
github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290 h1:HpQBFKNCdyJjjJLEhUpjEzEh945JUtx2ifdbINU5jgY=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56 h1:KzPfR0+SyVBr1yHnbdXCCOPPTq95a4cdUp45gqB3VbM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 h1:nJwEfj/BU1O4caRFt0UWCE09JLpr7/bTuI5pIo1h5lM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
@@ -0,0 +1,265 @@
package aipagesummary

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"time"

	iso6391 "github.com/emvi/iso-639-1"
	openai "github.com/sashabaranov/go-openai"
)

const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"

type AIResponse struct {
	Success bool        `json:"success"`
	Error   string      `json:"error,omitempty"`
	Result  PageSummary `json:"result"`
}

type PageSummary struct {
	ID             string `json:"$id"`
	FetchedAt      time.Time
	URL            string
	Tags           []string
	Categories     []string
	ContentExcerpt string
	Language       string
	Summary100     string
	Summary500     string
}

const model = openai.GPT3Dot5Turbo

const summarizerSystemPrompt = `
Your job is to summarize a webpage.

You will receive a JSON object containing the url and body tag content with
HTML tags stripped out. The key "url" will be a string representing a URL.
The key "content" will be the body tag and its rendered contents.

You will parse and interpret the "content" value and respond with a JSON
object such as:

{
  "success": true,
  "error": null,
  "result": { ... }
}

Set success to false and provide a descriptive string in "error" describing
the error if you are unable to complete the task for any reason, such as a
failure to parse the input content. Your response should be JSON and only
JSON, and your response should always, always parse as valid JSON. Do not
provide any other output except JSON, no description.

Result should be the following:

type PageSummary struct {
	URL            string
	Tags           []string
	Categories     []string
	ContentExcerpt string
	Language       string
	Summary100     string
	Summary500     string
}

If the page content primarily describes an error condition or failure,
please do not return a summary. Instead, set the error field to a
descriptive string and set success to false. Omit the result field.

Try to identify whether the provided URL and content are a main page on a
site, or a specific article, post, product, or other type of content on a
site. If it is a main page, you should provide a summary of the site itself,
and if it is a specific article, post, product, or other type of content,
you should provide a summary of that specific article, post, item, or
product. The tags and categories should reflect this distinction.

The output URL should be a canonicalized version of the input URL.

The output tags field is a list of strings, each a short one-, two-, or
three-word tag, using underscores instead of spaces, to indicate page
attributes, such as 'blogpost', 'documentation', 'article', 'listicle', et
cetera. You can use tags not listed in these instructions; come up with your
own. You should endeavor to come up with at least 5 tags for each URL, up to
a maximum of 20. For example, you might tag a personal blog site that
contains a lot of posts about the javascript programming language with
"javascript" and "programming". Tags should be lowercase. Important tags
that might apply to sites are things like "defunct" for sites or services
that have been shut down, or "cdn" for domains or sites that are primarily
content delivery networks and not directly accessed by users, such as
fbcdn.net. Sites that are open to the public for signup and posting of
user-generated content should receive tags of "public" or "ugc".
Image-focused sites such as image hosting or photo sharing or art should
receive the "images" tag. Photography-focused sites should receive the tag
"photography". Blogs should receive "blog", and specific blog posts should
receive both "blog" and "blogpost". Product pages should receive the tag
"product". Try to keep tags as single words. For example, a payment
processing company site should receive tags "corporate", "payments",
"service_provider", among others.

The output field categories should be a list of broad categories that a
website can fit into. "personal_blog" is a good example.
"corporate_website" is another. "tech_startup" is another.
"automobile_retailer" is another. These are just examples, not an inclusive
list. "news_aggregator" and "discussion_forum" are other examples. You may
include up to 10 categories, with a minimum of one. Categories should be
lowercase. Please try to provide at least three categories in your response.

The output field "ContentExcerpt" should be a substring of the input
content, with whitespace collapsed. Try to find the main or most relevant
section of the page, such as the first paragraph of an article. Do not
include site header or footer boilerplate text. Truncate this field at 2
kilobytes of text. It should be taken directly from the input. Do not
summarize in this field, but include a snippet of the provided content. It
should be a substring of the input "content" field.

Language is the two-letter ISO 639-1 language code representing the main
language of the content input.

The Summary100 and Summary500 output fields are English-language summaries
of the type of page, with a maximum of 100 words and 500 words respectively.
These fields should not contain summaries of the page content! They should
contain a short description of the page itself, such as "A personal blog
about programming and technology" or "A news site covering the latest in
technology" or "An article about the history of the internet". For the tone
of these Summary fields, you should be neutral and informative. Please
don't repeat imperative statements or calls to action from the source site,
such as instructions to sign up, purchase, or download. You should not
include any opinions or subjective statements in the summaries. Avoid using
overly effusive descriptions or adjectives. Unless the site is focused on a
specific technical topic, avoid discussing technical things like logging in,
cookies, or preference settings that are common to many websites.

Given that the Summary fields are in English and for an English-speaking
audience, if the page's language is not English, the page's language and
country should be mentioned in the summaries. For example, "A Japanese news
site covering current events in Japan" for a site in Japanese. If the
language is not English, always add the name of the language to the tags
list. If the site is aimed at a non-English-speaking country, such as a
site in French for a French audience, you should include the country name in
the tags list, such as "france" or "japan".

Editorially, you should not be summarizing the specific page content, but
the type of page. For example, if you are summarizing a news site, you
should not summarize the news items displayed currently, but the news site
itself, indicating the type of news being covered and style of coverage. If
you are summarizing a blog post, you should summarize the blog site itself.
If you are summarizing a corporate site, you should provide general
information about the company, and a high-level overview of what type of
information the company provides on the site. If you are summarizing a
software-as-a-service site, you should provide a high-level overview of the
service's features and target audience.

`
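
// Illustrative sketch (not part of the upstream prompt or code): the prompt
// above asks the model to answer a user message such as
//
//	{"url": "https://example.com/", "content": "Example Domain ..."}
//
// with a JSON document shaped like the following. All field values here are
// hypothetical and are shown only to document the expected response shape;
// ID and FetchedAt are filled in by GetSummary afterwards, not by the model.
//
//	{
//	  "success": true,
//	  "error": null,
//	  "result": {
//	    "URL": "https://example.com/",
//	    "Tags": ["documentation", "english", "placeholder", "reference", "technical"],
//	    "Categories": ["reference_site", "technical_documentation", "corporate_website"],
//	    "ContentExcerpt": "Example Domain This domain is for use in illustrative examples in documents.",
//	    "Language": "en",
//	    "Summary100": "A placeholder site reserved for use in documentation and examples.",
//	    "Summary500": "A single-page placeholder site maintained so that documentation authors can reference a stable example domain."
//	  }
//	}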

type Summarizer struct{ APIKey string }

func NewSummarizer(apiKey string) (*Summarizer, error) {
	if apiKey == "" {
		return nil, fmt.Errorf("API key is required")
	}

	return &Summarizer{
		APIKey: apiKey,
	}, nil
}

// UniqueSortedList returns the unique elements of input in sorted order.
func UniqueSortedList(input []string) []string {
	// Create a map to track unique elements
	uniqueMap := make(map[string]struct{})
	for _, item := range input {
		uniqueMap[item] = struct{}{}
	}

	// Create a slice from the keys of the map
	uniqueList := make([]string, 0, len(uniqueMap))
	for key := range uniqueMap {
		uniqueList = append(uniqueList, key)
	}

	// Sort the unique list
	sort.Strings(uniqueList)

	return uniqueList
}

// GetSummary sends the page URL and stripped content to the chat completion
// API along with the summarizer prompt and returns the parsed PageSummary.
func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
	// The API key is supplied via NewSummarizer.
	if s.APIKey == "" {
		return nil, fmt.Errorf("OpenAI API key is not set")
	}

	type Request struct {
		URL     string `json:"url"`
		Content string `json:"content"`
	}

	req := Request{
		URL:     url,
		Content: content,
	}

	reqJSON, err := json.Marshal(req)
	if err != nil {
		return nil, err
	}

	client := openai.NewClient(s.APIKey)

	ctx := context.Background()

	sumReq := openai.ChatCompletionRequest{
		Model: model,
		Messages: []openai.ChatCompletionMessage{
			{
				Role:    openai.ChatMessageRoleSystem,
				Content: summarizerSystemPrompt,
			},
			{
				Role:    openai.ChatMessageRoleUser,
				Content: string(reqJSON),
			},
		},
	}

	resp, err := client.CreateChatCompletion(ctx, sumReq)
	if err != nil {
		return nil, err
	}

	jsonResp := resp.Choices[0].Message.Content
	var aiResp AIResponse
	err = json.Unmarshal([]byte(jsonResp), &aiResp)
	if err != nil {
		fmt.Printf("Error unmarshalling response: %v\n", err)
		fmt.Printf("Response: '%s'\n", jsonResp)
		return nil, err
	}

	if aiResp.Error != "" {
		return nil, fmt.Errorf("%s", aiResp.Error)
	}

	if aiResp.Result.ContentExcerpt == "" {
		return nil, fmt.Errorf("no content excerpt found")
	}

	// Tag the result with both the ISO code and the English language name.
	langNameInEnglish := iso6391.FromCode(aiResp.Result.Language).Name
	aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(aiResp.Result.Language))
	aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(langNameInEnglish))

	aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
	aiResp.Result.ID = schemaURL
	aiResp.Result.FetchedAt = time.Now().UTC()

	return &aiResp.Result, nil
}