initial
This commit is contained in:
commit
108587b28c
|
@ -0,0 +1,5 @@
|
|||
cmd/test/*.json
|
||||
cmd/test/.env
|
||||
cmd/test/test
|
||||
.env
|
||||
logfile.txt
|
|
@ -0,0 +1,309 @@
|
|||
163.com
|
||||
1688.com
|
||||
2ch.net
|
||||
360.cn
|
||||
4shared.com
|
||||
9gag.com
|
||||
OPENLOAD.co
|
||||
T.co
|
||||
about.com
|
||||
acfun.tv
|
||||
addthis.com
|
||||
adf.ly
|
||||
adnetworkperformance.com
|
||||
adobe.com
|
||||
adplxmd.com
|
||||
agar.io
|
||||
airbnb.com
|
||||
alibaba.com
|
||||
alicdn.com
|
||||
aliexpress.com
|
||||
allegro.pl
|
||||
ameba.jp
|
||||
americanexpress.com
|
||||
ancestry.com
|
||||
apple.com
|
||||
archive.org
|
||||
ask.fm
|
||||
att.com
|
||||
avito.ru
|
||||
badoo.com
|
||||
bankofamerica.com
|
||||
battle.net
|
||||
bbc.co.uk
|
||||
bbc.com
|
||||
bestbuy.com
|
||||
bet365.com
|
||||
bild.de
|
||||
bilibili.com
|
||||
billdesk.com
|
||||
bing.com
|
||||
bitauto.com
|
||||
blackboard.com
|
||||
blastingnews.com
|
||||
blkget.com
|
||||
blog.jp
|
||||
blogfa.com
|
||||
blogger.com
|
||||
blogspot.com
|
||||
blogspot.com.br
|
||||
blogspot.in
|
||||
bloomberg.com
|
||||
bongacams.com
|
||||
booking.com
|
||||
box.com
|
||||
bukalapak.com
|
||||
businessinsider.com
|
||||
buzzlie.com
|
||||
capitalone.com
|
||||
chase.com
|
||||
chaturbate.com
|
||||
citi.com
|
||||
ck101.com
|
||||
cnblogs.com
|
||||
cnnic.cn
|
||||
coccoc.com
|
||||
craigslist.org
|
||||
dailymotion.com
|
||||
dell.com
|
||||
detail.tmall.com.danuoyi.tbcache.com
|
||||
detik.com
|
||||
deviantart.com
|
||||
digikala.com
|
||||
diply.com
|
||||
disqus.com
|
||||
doubleclick.net
|
||||
doublepimp.com
|
||||
dropbox.com
|
||||
dssedc4qxg7o6.cloudfront.net
|
||||
ebay-kleinanzeigen.de
|
||||
ebay.co.uk
|
||||
ebay.com
|
||||
ebay.de
|
||||
ebay.in
|
||||
ebay.it
|
||||
eksisozluk.com
|
||||
espn.gns.go.com
|
||||
etsy.com
|
||||
ettoday.net
|
||||
evernote.com
|
||||
exoclick.com
|
||||
extratorrent.cc
|
||||
fbcdn.net
|
||||
fedex.com
|
||||
feedly.com
|
||||
files.wordpress.com
|
||||
flickr.com
|
||||
flipkart.com
|
||||
forbes.com
|
||||
foxnews.com
|
||||
freepik.com
|
||||
gfycat.com
|
||||
giphy.com
|
||||
github.com
|
||||
github.io
|
||||
gizmodo.com
|
||||
globo.com
|
||||
gmanetwork.com
|
||||
gmx.net
|
||||
go.com
|
||||
godaddy.com
|
||||
goodreads.com
|
||||
google.com
|
||||
groupon.com
|
||||
haosou.com
|
||||
hatena.ne.jp
|
||||
hclips.com
|
||||
hdfcbank.com
|
||||
hdzog.com
|
||||
hp.com
|
||||
hpcc-page.cnc.ccgslb.com.cn
|
||||
hulu.com
|
||||
hurriyet.com.tr
|
||||
icloud.com
|
||||
imgur.com
|
||||
impress.co.jp
|
||||
imzog.com
|
||||
indeed.com
|
||||
instagram.com
|
||||
instructure.com
|
||||
intuit.com
|
||||
iqiyi.com
|
||||
irctc.co.in
|
||||
jabong.com
|
||||
jd.com
|
||||
kaskus.co.id
|
||||
kat.cr
|
||||
kickstarter.com
|
||||
kinogo.co
|
||||
kinopoisk.ru
|
||||
kissanime.to
|
||||
kohls.com
|
||||
leboncoin.fr
|
||||
life.tw
|
||||
lifebuzz.com
|
||||
likes.com
|
||||
liputan6.com
|
||||
liveadexchanger.com
|
||||
livejournal.com
|
||||
loading-delivery2.com
|
||||
lowes.com
|
||||
macys.com
|
||||
mail.ru
|
||||
mailchimp.com
|
||||
mama.cn
|
||||
mashable.com
|
||||
mediab.uy
|
||||
mediafire.com
|
||||
medium.com
|
||||
mega.nz
|
||||
mercadolibre.com.ar
|
||||
messenger.com
|
||||
microsoft.com
|
||||
mozilla.org
|
||||
msn.com
|
||||
nametests.com
|
||||
naver.jp
|
||||
nba.com
|
||||
netflix.com
|
||||
nih.gov
|
||||
nyaa.se
|
||||
nytimes.com
|
||||
office.com
|
||||
ok.ru
|
||||
olx.pl
|
||||
onclickads.net
|
||||
onedio.com
|
||||
onet.pl
|
||||
onlinesbi.com
|
||||
opthw.xdwscache.speedcdns.com
|
||||
oracle.com
|
||||
orange.fr
|
||||
ouo.io
|
||||
outbrain.com
|
||||
ozock.com
|
||||
pandora.com
|
||||
paypal.com
|
||||
paytm.com
|
||||
pinimg.com
|
||||
pinterest.com
|
||||
pixiv.net
|
||||
popads.net
|
||||
popcash.net
|
||||
pornhub.com
|
||||
ppomppu.co.kr
|
||||
putlocker.is
|
||||
quora.com
|
||||
rambler.ru
|
||||
rdsa2012.com
|
||||
realtor.com
|
||||
rediff.com
|
||||
reimageplus.com
|
||||
roblox.com
|
||||
rt.com
|
||||
ruten.com.tw
|
||||
rutracker.org
|
||||
sabah.com.tr
|
||||
sahibinden.com
|
||||
salesforce.com
|
||||
savefrom.net
|
||||
sberbank.ru
|
||||
scribd.com
|
||||
secureserver.net
|
||||
seznam.cz
|
||||
sh.st
|
||||
shutterstock.com
|
||||
siteadvisor.com
|
||||
skype.com
|
||||
slack.com
|
||||
slickdeals.net
|
||||
slideshare.net
|
||||
slither.io
|
||||
so.com
|
||||
sogou.com
|
||||
sohu.com
|
||||
soundcloud.com
|
||||
sourceforge.net
|
||||
spiegel.de
|
||||
spotify.com
|
||||
stackexchange.com
|
||||
stackoverflow.com
|
||||
steamcommunity.com
|
||||
steampowered.com
|
||||
subscene.com
|
||||
surveymonkey.com
|
||||
t-online.de
|
||||
tabelog.com
|
||||
taboola.com
|
||||
taleo.net
|
||||
taobao.com
|
||||
target.com
|
||||
taringa.net
|
||||
telegram.org
|
||||
telegraph.co.uk
|
||||
terraclicks.com
|
||||
thefreedictionary.com
|
||||
theladbible.com
|
||||
themeforest.net
|
||||
thepiratebay.se
|
||||
thesportbible.com
|
||||
thewatchseries.to
|
||||
tistory.com
|
||||
tmall.com
|
||||
tokopedia.com
|
||||
torrentz.eu
|
||||
tradeadexchange.com
|
||||
trello.com
|
||||
tripadvisor.com
|
||||
tuberel.com
|
||||
tumblr.com
|
||||
twitch.tv
|
||||
twitter.com
|
||||
txxx.com
|
||||
udn.com
|
||||
upornia.com
|
||||
ups.com
|
||||
uptodown.com
|
||||
upwork.com
|
||||
usps.com
|
||||
verizonwireless.com
|
||||
vice.com
|
||||
vimeo.com
|
||||
vk.com
|
||||
vk.me
|
||||
walmart.com
|
||||
wangzhanbao.cc
|
||||
washingtonpost.com
|
||||
weather.com
|
||||
web.de
|
||||
webtretho.com
|
||||
weebly.com
|
||||
weibo.com
|
||||
wellsfargo.com
|
||||
wetransfer.com
|
||||
whatsapp.com
|
||||
wikia.com
|
||||
wikihow.com
|
||||
wikimedia.org
|
||||
wikipedia.org
|
||||
wittyfeed.com
|
||||
wix.com
|
||||
wordpress.com
|
||||
wordpress.org
|
||||
wp.com
|
||||
wsj.com
|
||||
xfinity.com
|
||||
xhamster.com
|
||||
xuite.net
|
||||
yahoo.com
|
||||
yandex.ru
|
||||
yelp.com
|
||||
youm7.com
|
||||
youporn.com
|
||||
youtube-mp3.org
|
||||
youtube.com
|
||||
zendesk.com
|
||||
zhihu.com
|
||||
zillow.com
|
||||
zippyshare.com
|
||||
zoho.com
|
|
@ -0,0 +1,175 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
_ "github.com/joho/godotenv/autoload"
|
||||
"github.com/k0kubun/pp"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"sneak.berlin/go/aipagesummary"
|
||||
"sneak.berlin/go/puppeteerapiclient"
|
||||
)
|
||||
|
||||
func main() {
|
||||
bigList := []string{
|
||||
"https://www.bbc.com",
|
||||
"https://www.cnn.com",
|
||||
"https://news.ycombinator.com",
|
||||
"https://www.reddit.com",
|
||||
"https://www.wikipedia.org",
|
||||
"https://www.ford.com",
|
||||
"https://www.tesla.com",
|
||||
"https://www.apple.com",
|
||||
"https://www.microsoft.com",
|
||||
"https://www.google.com",
|
||||
"https://medium.com",
|
||||
"https://www.nytimes.com",
|
||||
"https://sneak.berlin",
|
||||
}
|
||||
|
||||
// now read urls from alexa500.txt
|
||||
list, err := os.ReadFile("alexa500.txt")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
urls := strings.Fields(string(list))
|
||||
for _, url := range urls {
|
||||
url = "https://" + url
|
||||
bigList = append(bigList, url)
|
||||
|
||||
}
|
||||
|
||||
shuffle(bigList)
|
||||
|
||||
for _, url := range bigList {
|
||||
fmt.Printf("Showing summary for %s\n", url)
|
||||
summarizeURL(url)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// shuffle reorders the elements of slice in place into a random permutation,
// drawing randomness from the math/rand package-level source.
func shuffle(slice []string) {
	swap := func(a, b int) {
		slice[a], slice[b] = slice[b], slice[a]
	}
	rand.Shuffle(len(slice), swap)
}
|
||||
|
||||
// parseHostnameFromURL returns the lowercase hostname of rawURL with any port
// removed. IPv6 literals are returned without their surrounding brackets
// whether or not a port is present.
//
// It returns "" when rawURL cannot be parsed or its host part is malformed.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	hostname, _, err := net.SplitHostPort(parsedURL.Host)
	if err != nil {
		// A host without a port is reported as an error by SplitHostPort; that
		// one case is expected. The checked assertion keeps any other error
		// type from panicking.
		addrErr, ok := err.(*net.AddrError)
		if !ok || addrErr.Err != "missing port in address" {
			return ""
		}
		// Hostname strips the brackets of an IPv6 literal, matching what
		// SplitHostPort returns on the with-port path.
		hostname = parsedURL.Hostname()
	}

	return strings.ToLower(hostname)
}
|
||||
|
||||
func summarizeURL(url string) {
|
||||
|
||||
// Initialize the summarizer with the OpenAI API key
|
||||
s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
// Retrieve Puppeteer API URL and salt from environment variables
|
||||
apiURL := os.Getenv("PUPPETEER_API_URL")
|
||||
apiSalt := os.Getenv("PUPPETEER_API_SALT")
|
||||
|
||||
if apiURL == "" || apiSalt == "" {
|
||||
panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
|
||||
}
|
||||
|
||||
// Initialize the Puppeteer client
|
||||
client := puppeteerapiclient.NewClient(apiURL, apiSalt)
|
||||
|
||||
// Create a context with a timeout for the scrape request
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Scrape the content from the given URL
|
||||
|
||||
scraped, err := client.Scrape(ctx, url, "body")
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
|
||||
// Use bluemonday to strip HTML tags from the scraped content
|
||||
p := bluemonday.StripTagsPolicy()
|
||||
strippedContent := p.Sanitize(scraped.Content)
|
||||
|
||||
if len(strippedContent) > 15000 {
|
||||
// truncate if too big
|
||||
strippedContent = strippedContent[:10000]
|
||||
}
|
||||
|
||||
// Get the summary of the stripped content
|
||||
res, err := s.GetSummary(url, strippedContent)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
return
|
||||
}
|
||||
// pretty print the result:
|
||||
pp.Print(res)
|
||||
fmt.Println()
|
||||
|
||||
hn := parseHostnameFromURL(url)
|
||||
fmt.Printf("Hostname: %s\n", hn)
|
||||
|
||||
fn := fmt.Sprintf("%s.json", hn)
|
||||
|
||||
writePrettyJSONToFile(res, fn)
|
||||
|
||||
}
|
||||
|
||||
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating the file or truncating an existing one.
//
// It returns an error wrapping the underlying cause when data cannot be
// marshaled or the file cannot be written; nothing is written when
// marshaling fails.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}

	// os.WriteFile creates or truncates the file and also reports a failed
	// Close, which a deferred file.Close() would silently drop. 0o666 (before
	// umask) is the same mode os.Create uses.
	if err := os.WriteFile(filename, jsonData, 0o666); err != nil {
		return fmt.Errorf("error writing to file: %w", err)
	}

	return nil
}
|
|
@ -0,0 +1,26 @@
|
|||
module sneak.berlin/go/aipagesummary
|
||||
|
||||
go 1.22.2
|
||||
|
||||
require (
|
||||
github.com/joho/godotenv v1.5.1
|
||||
github.com/k0kubun/pp v3.0.1+incompatible
|
||||
github.com/microcosm-cc/bluemonday v1.0.26
|
||||
github.com/sashabaranov/go-openai v1.24.1
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/aymerick/douceur v0.2.0 // indirect
|
||||
github.com/emvi/iso-639-1 v1.1.0 // indirect
|
||||
github.com/gorilla/css v1.0.1 // indirect
|
||||
github.com/k0kubun/pp/v3 v3.2.0 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/rogpeppe/go-internal v1.9.0 // indirect
|
||||
golang.org/x/net v0.25.0 // indirect
|
||||
golang.org/x/sys v0.20.0 // indirect
|
||||
golang.org/x/text v0.15.0 // indirect
|
||||
)
|
|
@ -0,0 +1,48 @@
|
|||
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
|
||||
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/emvi/iso-639-1 v1.1.0 h1:EhZiYVA+ysa/b7+0T2DD9hcX7E/5sh4o1KyDAIPu7VE=
|
||||
github.com/emvi/iso-639-1 v1.1.0/go.mod h1:CSA53/Tx0xF9bk2DEA0Mr0wTdIxq7pqoVZgBOfoL5GI=
|
||||
github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
|
||||
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
|
||||
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
|
||||
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/k0kubun/pp v3.0.1+incompatible h1:3tqvf7QgUnZ5tXO6pNAZlrvHgl6DvifjDrd9g2S9Z40=
|
||||
github.com/k0kubun/pp v3.0.1+incompatible/go.mod h1:GWse8YhT0p8pT4ir3ZgBbfZild3tgzSScAn6HmfYukg=
|
||||
github.com/k0kubun/pp/v3 v3.2.0 h1:h33hNTZ9nVFNP3u2Fsgz8JXiF5JINoZfFq4SvKJwNcs=
|
||||
github.com/k0kubun/pp/v3 v3.2.0/go.mod h1:ODtJQbQcIRfAD3N+theGCV1m/CBxweERz2dapdz1EwA=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
|
||||
github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
|
||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z04DQDJ1MI=
|
||||
github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
|
||||
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
|
||||
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
|
||||
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
|
||||
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290 h1:HpQBFKNCdyJjjJLEhUpjEzEh945JUtx2ifdbINU5jgY=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56 h1:KzPfR0+SyVBr1yHnbdXCCOPPTq95a4cdUp45gqB3VbM=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 h1:nJwEfj/BU1O4caRFt0UWCE09JLpr7/bTuI5pIo1h5lM=
|
||||
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
|
|
@ -0,0 +1,265 @@
|
|||
package aipagesummary
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
iso6391 "github.com/emvi/iso-639-1"
|
||||
openai "github.com/sashabaranov/go-openai"
|
||||
)
|
||||
|
||||
// schemaURL is the value GetSummary stores in PageSummary.ID, which is
// serialized under the JSON key "$id".
const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"
|
||||
|
||||
// AIResponse is the JSON envelope the model is instructed to reply with:
// a success flag, an optional error description, and the page summary.
type AIResponse struct {
	Success bool `json:"success"`
	// Error is omitted from marshaled output when empty. The tag must not
	// contain a space after the comma, or encoding/json ignores the option.
	Error  string      `json:"error,omitempty"`
	Result PageSummary `json:"result"`
}

// PageSummary describes one summarized page. ID and FetchedAt are filled in by
// GetSummary; the remaining fields are decoded from the model's reply, whose
// keys are the Go field names.
type PageSummary struct {
	// ID is serialized as "$id"; GetSummary sets it to schemaURL.
	ID string `json:"$id"`
	// FetchedAt is the UTC time at which GetSummary produced the summary.
	FetchedAt time.Time
	// URL is the page URL as returned by the model.
	URL string
	// Tags are short lowercase labels; GetSummary adds the language code and
	// name, then de-duplicates and sorts them.
	Tags []string
	// Categories are broad site categories chosen by the model.
	Categories []string
	// ContentExcerpt is a snippet of the submitted page content.
	ContentExcerpt string
	// Language is the language code reported by the model.
	Language string
	// Summary100 and Summary500 are English descriptions of the page, which
	// the prompt limits to 100 and 500 words respectively.
	Summary100 string
	Summary500 string
}
|
||||
|
||||
// model is the OpenAI chat model GetSummary requests completions from.
const model = openai.GPT3Dot5Turbo
|
||||
|
||||
// summarizerSystemPrompt is the system message GetSummary sends ahead of the
// user message. It tells the model to answer with a JSON object holding
// "success", "error" and "result", and describes each PageSummary field.
const summarizerSystemPrompt = `
|
||||
|
||||
Your job is to summarize a webpage.
|
||||
|
||||
You will receive a json object containing the url and body tag content with
|
||||
HTML tags stripped out. The key "url" will be a string representing a URL.
|
||||
The key "content" will be the body tag and its rendered contents.
|
||||
|
||||
You will parse and interpret the "content" value and respond with a json
|
||||
object such as:
|
||||
|
||||
{
|
||||
"success": true,
|
||||
"error": null,
|
||||
"result": { ... }
|
||||
}
|
||||
|
||||
Set success to false and provide a descriptive string in "error" describing
|
||||
the error if you are unable to complete the task for any reason, such as a
|
||||
failure to parse the input content. Your response should be json and only
|
||||
json, and your response should always, always parse as valid json. Do not
|
||||
provide any other output except json, no description.
|
||||
|
||||
|
||||
Result should be the following:
|
||||
|
||||
type PageSummary struct {
|
||||
URL string
|
||||
Tags []string
|
||||
Categories []string
|
||||
ContentExcerpt string
|
||||
Language string
|
||||
Summary100 string
|
||||
Summary500 string
|
||||
}
|
||||
|
||||
If the page content primarily describes an error condition or failure,
|
||||
please do not return a summary. Instead, set the error field to a
|
||||
descriptive string and set success to false. Omit the result field.
|
||||
|
||||
Try to identify if the provided URL and content are a main page on a site,
|
||||
or a specific article, post, product, or other type of content on a site.
|
||||
If it is a main page, you should provide a summary of the site itself, and
|
||||
if it is a specific article, post, product, or other type of content, you
|
||||
should provide a summary of that specific article, post, item, or product.
|
||||
The tags and category should reflect this distinction.
|
||||
|
||||
The output URL should be a canonicalized version of the input URL.
|
||||
|
||||
The output tags is a list of strings, each a short one or two or three word
|
||||
tag, using underscores instead of spaces, to indicate page attributes, such
|
||||
as 'blogpost', 'documentation', 'article', 'listicle', et cetera. You
|
||||
can use tags not listed in these instructions, come up with your own. You
|
||||
should endeavor to come up with at least 5 tags for each URL, up to a
|
||||
maximum of 20. For example, you might tag a personal blog site that
|
||||
contains a lot of posts about the javascript programming language with
|
||||
"javascript" and "programming". Tags should be lowercase. Important tags
|
||||
that might apply to sites are things like "defunct" for sites or services
|
||||
that have been shut down, or "cdn" for domains or sites that are primarily
|
||||
content delivery networks and not directly accessed by users, such as
|
||||
fbcdn.net. Sites that are open to the public for signup and posting of
|
||||
user-generated content should receive tags of "public" or "ugc".
|
||||
Image-focused sites such as image hosting or photo sharing or art should
|
||||
receive the "images" tag. Photography focused sites should receive the tag
|
||||
"photography". Blogs should receive "blog", and specific blog posts should
|
||||
receive both "blog" and "blogpost". Product pages should receive the tag
|
||||
"product". Try to keep tags as single words. For example, a payment
|
||||
processing company site should receive tags "corporate", "payments",
|
||||
"service_provider", among others.
|
||||
|
||||
The output field categories should be a list of broad categories that a
|
||||
website can fit into. "personal_blog" is a good example.
|
||||
"corporate_website" is another. "tech_startup" is another.
|
||||
"automobile_retailer" is another. These are just examples, not an inclusive
|
||||
list. "news_aggregator" and "discussion_forum" are other examples. You may
|
||||
include up to 10 categories, with a minimum of one. Categories should be
|
||||
lowercase. Please try to provide at least three categories in your response.
|
||||
|
||||
The output field "ContentExcerpt" should be a substring of the input
|
||||
content, with whitespace collapsed. Try to find the main or most relevant
|
||||
section of the page, such as the first paragraph of an article. Do not
|
||||
include site header or footer boilerplate text. Truncate this field at 2
|
||||
kilobytes of text. It should be taken directly from the input. Do not
|
||||
summarize in this field, but include a snippet of the provided content. It
|
||||
should be a substring of the input "content" field.
|
||||
|
||||
Language is the two digit ISO language code representing the main language
|
||||
of the content input.
|
||||
|
||||
Summary100, and Summary500 output fields are english language summaries of
|
||||
the type of page, with a maximum of 100 words and 500 words respectively.
|
||||
These fields should not contain summaries of the page content! They should
|
||||
contain a short description of the page itself, such as "A personal blog
|
||||
about programming and technology" or "A news site covering the latest in
|
||||
technology" or "An article about the history of the internet". For the tone
|
||||
of these Summary fields, you should be neutral and informative. Please
|
||||
don't repeat imperative statements or calls to action from the source site,
|
||||
such as instructions to sign up, purchase, or download. You should not
|
||||
include any opinions or subjective statements in the summaries. Avoid using
|
||||
overly effusive descriptions or adjectives. Unless the site is focused on a
|
||||
specific technical topic, avoid discussing technical things like logging in,
|
||||
cookies, or preference settings that are common to many websites.
|
||||
|
||||
Given that the Summary fields are in English and for an English-speaking
|
||||
audience, if the page's language is not English, the page's language and
|
||||
country should mentioned in the summaries. For example, "A Japanese news
|
||||
site covering current events in Japan" for a site in Japanese. If the
|
||||
language is not English, always add the name of the language to the tags
|
||||
list. If the site is focused at a non-english-speaking country, such as a
|
||||
site in French for a French audience, you should include the country name in
|
||||
the tags list, such as "france" or "japan".
|
||||
|
||||
Editorially, you should not be summarizing the specific page content, but
|
||||
the type of page. For example, if you are summarizing a news site, you
|
||||
should not summarize the news items displayed currently, but the news site
|
||||
itself, indicating the type of news being covered and style of coverage. If
|
||||
you are summarizing a blog post, you should summarize the blog site itself.
|
||||
If you are summarizing a corporate site, you should provide general
|
||||
information about the company, and a high level overview of what type of
|
||||
information the company provides on the site. If you are summarizing a
|
||||
software-as-a-service site, you should provide a high level overview of the
|
||||
service's features and target audience.
|
||||
|
||||
`
|
||||
|
||||
// Summarizer produces page summaries through its GetSummary method.
type Summarizer struct {
	// APIKey is the key that was passed to NewSummarizer.
	APIKey string
}

// NewSummarizer returns a Summarizer holding apiKey. It returns a nil
// Summarizer and an error when apiKey is empty.
func NewSummarizer(apiKey string) (*Summarizer, error) {
	if len(apiKey) == 0 {
		return nil, fmt.Errorf("API key is required")
	}

	summarizer := new(Summarizer)
	summarizer.APIKey = apiKey
	return summarizer, nil
}
|
||||
|
||||
// UniqueSortedList returns a new slice holding each distinct string of input
// exactly once, in ascending order. The input slice is not modified, and the
// result is non-nil even when input is empty.
func UniqueSortedList(input []string) []string {
	// Sort a private copy so duplicates end up next to each other.
	ordered := make([]string, len(input))
	copy(ordered, input)
	sort.Strings(ordered)

	// Keep only the first element of every run of equal values.
	distinct := make([]string, 0, len(ordered))
	for idx, val := range ordered {
		if idx == 0 || val != ordered[idx-1] {
			distinct = append(distinct, val)
		}
	}

	// Clamp the capacity to the number of distinct values.
	return distinct[:len(distinct):len(distinct)]
}
|
||||
|
||||
func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
|
||||
apiKey := os.Getenv("OPENAI_API_KEY")
|
||||
if apiKey == "" {
|
||||
log.Fatal("OPENAI_API_KEY environment variable is not set")
|
||||
}
|
||||
|
||||
type Request struct {
|
||||
URL string `json:"url"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
req := Request{
|
||||
URL: url,
|
||||
Content: content,
|
||||
}
|
||||
|
||||
reqJSON, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
client := openai.NewClient(apiKey)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
sumReq := openai.ChatCompletionRequest{
|
||||
Model: model,
|
||||
Messages: []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: openai.ChatMessageRoleSystem,
|
||||
Content: summarizerSystemPrompt,
|
||||
},
|
||||
{
|
||||
Role: openai.ChatMessageRoleUser,
|
||||
Content: string(reqJSON),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
resp, err := client.CreateChatCompletion(ctx, sumReq)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
jsonResp := resp.Choices[0].Message.Content
|
||||
var aiResp AIResponse
|
||||
err = json.Unmarshal([]byte(jsonResp), &aiResp)
|
||||
if err != nil {
|
||||
fmt.Printf("Error unmarshalling response: %v\n", err)
|
||||
fmt.Printf("Response: '%s'\n", jsonResp)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if aiResp.Error != "" {
|
||||
return nil, fmt.Errorf(aiResp.Error)
|
||||
}
|
||||
|
||||
if aiResp.Result.ContentExcerpt == "" {
|
||||
return nil, fmt.Errorf("No content excerpt found")
|
||||
}
|
||||
|
||||
langNameInEnglish := iso6391.FromCode(aiResp.Result.Language).Name
|
||||
aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(aiResp.Result.Language))
|
||||
aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(langNameInEnglish))
|
||||
|
||||
aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
|
||||
aiResp.Result.ID = schemaURL
|
||||
aiResp.Result.FetchedAt = time.Now().UTC()
|
||||
|
||||
return &aiResp.Result, nil
|
||||
}
|
Loading…
Reference in New Issue