This commit is contained in:
Jeffrey Paul 2024-06-03 12:11:29 -07:00
commit 108587b28c
6 changed files with 828 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
cmd/test/*.json
cmd/test/.env
cmd/test/test
.env
logfile.txt

309
cmd/test/alexa500.txt Normal file
View File

@ -0,0 +1,309 @@
163.com
1688.com
2ch.net
360.cn
4shared.com
9gag.com
OPENLOAD.co
T.co
about.com
acfun.tv
addthis.com
adf.ly
adnetworkperformance.com
adobe.com
adplxmd.com
agar.io
airbnb.com
alibaba.com
alicdn.com
aliexpress.com
allegro.pl
ameba.jp
americanexpress.com
ancestry.com
apple.com
archive.org
ask.fm
att.com
avito.ru
badoo.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
bestbuy.com
bet365.com
bild.de
bilibili.com
billdesk.com
bing.com
bitauto.com
blackboard.com
blastingnews.com
blkget.com
blog.jp
blogfa.com
blogger.com
blogspot.com
blogspot.com.br
blogspot.in
bloomberg.com
bongacams.com
booking.com
box.com
bukalapak.com
businessinsider.com
buzzlie.com
capitalone.com
chase.com
chaturbate.com
citi.com
ck101.com
cnblogs.com
cnnic.cn
coccoc.com
craigslist.org
dailymotion.com
dell.com
detail.tmall.com.danuoyi.tbcache.com
detik.com
deviantart.com
digikala.com
diply.com
disqus.com
doubleclick.net
doublepimp.com
dropbox.com
dssedc4qxg7o6.cloudfront.net
ebay-kleinanzeigen.de
ebay.co.uk
ebay.com
ebay.de
ebay.in
ebay.it
eksisozluk.com
espn.gns.go.com
etsy.com
ettoday.net
evernote.com
exoclick.com
extratorrent.cc
fbcdn.net
fedex.com
feedly.com
files.wordpress.com
flickr.com
flipkart.com
forbes.com
foxnews.com
freepik.com
gfycat.com
giphy.com
github.com
github.io
gizmodo.com
globo.com
gmanetwork.com
gmx.net
go.com
godaddy.com
goodreads.com
google.com
groupon.com
haosou.com
hatena.ne.jp
hclips.com
hdfcbank.com
hdzog.com
hp.com
hpcc-page.cnc.ccgslb.com.cn
hulu.com
hurriyet.com.tr
icloud.com
imgur.com
impress.co.jp
imzog.com
indeed.com
instagram.com
instructure.com
intuit.com
iqiyi.com
irctc.co.in
jabong.com
jd.com
kaskus.co.id
kat.cr
kickstarter.com
kinogo.co
kinopoisk.ru
kissanime.to
kohls.com
leboncoin.fr
life.tw
lifebuzz.com
likes.com
liputan6.com
liveadexchanger.com
livejournal.com
loading-delivery2.com
lowes.com
macys.com
mail.ru
mailchimp.com
mama.cn
mashable.com
mediab.uy
mediafire.com
medium.com
mega.nz
mercadolibre.com.ar
messenger.com
microsoft.com
mozilla.org
msn.com
nametests.com
naver.jp
nba.com
netflix.com
nih.gov
nyaa.se
nytimes.com
office.com
ok.ru
olx.pl
onclickads.net
onedio.com
onet.pl
onlinesbi.com
opthw.xdwscache.speedcdns.com
oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
pinimg.com
pinterest.com
pixiv.net
popads.net
popcash.net
pornhub.com
ppomppu.co.kr
putlocker.is
quora.com
rambler.ru
rdsa2012.com
realtor.com
rediff.com
reimageplus.com
roblox.com
rt.com
ruten.com.tw
rutracker.org
sabah.com.tr
sahibinden.com
salesforce.com
savefrom.net
sberbank.ru
scribd.com
secureserver.net
seznam.cz
sh.st
shutterstock.com
siteadvisor.com
skype.com
slack.com
slickdeals.net
slideshare.net
slither.io
so.com
sogou.com
sohu.com
soundcloud.com
sourceforge.net
spiegel.de
spotify.com
stackexchange.com
stackoverflow.com
steamcommunity.com
steampowered.com
subscene.com
surveymonkey.com
t-online.de
tabelog.com
taboola.com
taleo.net
taobao.com
target.com
taringa.net
telegram.org
telegraph.co.uk
terraclicks.com
thefreedictionary.com
theladbible.com
themeforest.net
thepiratebay.se
thesportbible.com
thewatchseries.to
tistory.com
tmall.com
tokopedia.com
torrentz.eu
tradeadexchange.com
trello.com
tripadvisor.com
tuberel.com
tumblr.com
twitch.tv
twitter.com
txxx.com
udn.com
upornia.com
ups.com
uptodown.com
upwork.com
usps.com
verizonwireless.com
vice.com
vimeo.com
vk.com
vk.me
walmart.com
wangzhanbao.cc
washingtonpost.com
weather.com
web.de
webtretho.com
weebly.com
weibo.com
wellsfargo.com
wetransfer.com
whatsapp.com
wikia.com
wikihow.com
wikimedia.org
wikipedia.org
wittyfeed.com
wix.com
wordpress.com
wordpress.org
wp.com
wsj.com
xfinity.com
xhamster.com
xuite.net
yahoo.com
yandex.ru
yelp.com
youm7.com
youporn.com
youtube-mp3.org
youtube.com
zendesk.com
zhihu.com
zillow.com
zippyshare.com
zoho.com

175
cmd/test/main.go Normal file
View File

@ -0,0 +1,175 @@
package main
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"net"
"net/url"
"os"
"strings"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/k0kubun/pp"
"github.com/microcosm-cc/bluemonday"
"sneak.berlin/go/aipagesummary"
"sneak.berlin/go/puppeteerapiclient"
)
// main builds a URL list from a hard-coded seed set plus the hostnames in
// alexa500.txt, shuffles it, and prints an AI-generated summary for each URL.
func main() {
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}
	// Read additional hostnames (whitespace-separated) from alexa500.txt.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	hosts := strings.Fields(string(list))
	// Pre-size the append, and avoid naming the loop variable "url", which
	// previously shadowed the imported net/url package.
	bigList = append(make([]string, 0, len(bigList)+len(hosts)), bigList...)
	for _, host := range hosts {
		bigList = append(bigList, "https://"+host)
	}
	shuffle(bigList)
	for _, target := range bigList {
		fmt.Printf("Showing summary for %s\n", target)
		summarizeURL(target)
	}
}
// shuffle randomly permutes slice in place using math/rand's package-level
// source.
func shuffle(slice []string) {
	swap := func(a, b int) {
		slice[a], slice[b] = slice[b], slice[a]
	}
	rand.Shuffle(len(slice), swap)
}
// parseHostnameFromURL returns the lowercased hostname portion of rawURL,
// with any port (and brackets around IPv6 literals) removed. It returns ""
// when rawURL cannot be parsed or contains no host component.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	// url.URL.Hostname already strips the port and IPv6 brackets. The
	// previous implementation used net.SplitHostPort and an unchecked
	// err.(*net.AddrError) type assertion, which would panic if the
	// returned error had any other concrete type.
	return strings.ToLower(parsedURL.Hostname())
}
// summarizeURL scrapes the page at url via the Puppeteer API, strips its
// HTML, asks OpenAI for a summary, pretty-prints the result, and writes it
// to <hostname>.json. Errors are printed and the URL is skipped; missing
// Puppeteer configuration panics because nothing can proceed without it.
func summarizeURL(url string) {
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")
	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}
	client := puppeteerapiclient.NewClient(apiURL, apiSalt)
	// Bound the scrape; remote pages can hang indefinitely.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	scraped, err := client.Scrape(ctx, url, "body")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Strip all HTML tags so only plain text is sent to the model.
	p := bluemonday.StripTagsPolicy()
	strippedContent := p.Sanitize(scraped.Content)
	// Truncate oversized pages to keep the prompt within model limits.
	// Previously the code checked len > 15000 but cut at 10000; use one
	// consistent limit. (Byte-based slicing may split a multi-byte rune at
	// the boundary — TODO confirm the summarizer tolerates that.)
	const maxContentBytes = 15000
	if len(strippedContent) > maxContentBytes {
		strippedContent = strippedContent[:maxContentBytes]
	}
	res, err := s.GetSummary(url, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Pretty-print the result for the console.
	pp.Print(res)
	fmt.Println()
	hn := parseHostnameFromURL(url)
	fmt.Printf("Hostname: %s\n", hn)
	fn := fmt.Sprintf("%s.json", hn)
	// The write error was previously discarded; report it.
	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating or truncating the file. It returns a wrapped error if
// marshalling or writing fails.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", "  ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}
	// os.WriteFile replaces the manual Create/Write/Close sequence; it
	// cannot leak the file handle and reports write errors on close too.
	// 0o666 matches the mode os.Create used (subject to umask).
	if err := os.WriteFile(filename, jsonData, 0o666); err != nil {
		return fmt.Errorf("error writing to file: %w", err)
	}
	return nil
}

26
go.mod Normal file
View File

@ -0,0 +1,26 @@
module sneak.berlin/go/aipagesummary
go 1.22.2
require (
github.com/joho/godotenv v1.5.1
github.com/k0kubun/pp v3.0.1+incompatible
github.com/microcosm-cc/bluemonday v1.0.26
github.com/sashabaranov/go-openai v1.24.1
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607
)
require (
github.com/aymerick/douceur v0.2.0 // indirect
github.com/emvi/iso-639-1 v1.1.0 // indirect
github.com/gorilla/css v1.0.1 // indirect
github.com/k0kubun/pp/v3 v3.2.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/rogpeppe/go-internal v1.9.0 // indirect
golang.org/x/net v0.25.0 // indirect
golang.org/x/sys v0.20.0 // indirect
golang.org/x/text v0.15.0 // indirect
)

48
go.sum Normal file
View File

@ -0,0 +1,48 @@
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/emvi/iso-639-1 v1.1.0 h1:EhZiYVA+ysa/b7+0T2DD9hcX7E/5sh4o1KyDAIPu7VE=
github.com/emvi/iso-639-1 v1.1.0/go.mod h1:CSA53/Tx0xF9bk2DEA0Mr0wTdIxq7pqoVZgBOfoL5GI=
github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/k0kubun/pp v3.0.1+incompatible h1:3tqvf7QgUnZ5tXO6pNAZlrvHgl6DvifjDrd9g2S9Z40=
github.com/k0kubun/pp v3.0.1+incompatible/go.mod h1:GWse8YhT0p8pT4ir3ZgBbfZild3tgzSScAn6HmfYukg=
github.com/k0kubun/pp/v3 v3.2.0 h1:h33hNTZ9nVFNP3u2Fsgz8JXiF5JINoZfFq4SvKJwNcs=
github.com/k0kubun/pp/v3 v3.2.0/go.mod h1:ODtJQbQcIRfAD3N+theGCV1m/CBxweERz2dapdz1EwA=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z04DQDJ1MI=
github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290 h1:HpQBFKNCdyJjjJLEhUpjEzEh945JUtx2ifdbINU5jgY=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56 h1:KzPfR0+SyVBr1yHnbdXCCOPPTq95a4cdUp45gqB3VbM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 h1:nJwEfj/BU1O4caRFt0UWCE09JLpr7/bTuI5pIo1h5lM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=

265
summary.go Normal file
View File

@ -0,0 +1,265 @@
package aipagesummary
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"sort"
"strings"
"time"
iso6391 "github.com/emvi/iso-639-1"
openai "github.com/sashabaranov/go-openai"
)
const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"
type AIResponse struct {
Success bool `json:"success"`
Error string `json:"error, omitempty"`
Result PageSummary `json:"result"`
}
type PageSummary struct {
ID string `json:"$id"`
FetchedAt time.Time
URL string
Tags []string
Categories []string
ContentExcerpt string
Language string
Summary100 string
Summary500 string
}
// model selects the OpenAI chat model used for every summarization request.
const model = openai.GPT3Dot5Turbo

// summarizerSystemPrompt is the system message sent with every chat
// completion. It instructs the model to accept a {"url","content"} JSON
// object and reply with a JSON AIResponse envelope (success/error/result),
// and spells out the expected semantics of each PageSummary field. It is a
// raw string literal sent verbatim to the API, so no comments can appear
// inside it.
const summarizerSystemPrompt = `
Your job is to summarize a webpage.
You will receive a json object containing the url and body tag content with
HTML tags stripped out. The key "url" will be a string representing a URL.
The key "content" will be the body tag and its rendered contents.
You will parse and interpret the "content" value and respond with a json
object such as:
{
"success": true,
"error": null,
"result": { ... }
}
Set success to false and provide a descriptive string in "error" describing
the error if you are unable to complete the task for any reason, such as a
failure to parse the input content. Your response should be json and only
json, and your response should always, always parse as valid json. Do not
provide any other output except json, no description.
Result should be the following:
type PageSummary struct {
URL string
Tags []string
Categories []string
ContentExcerpt string
Language string
Summary100 string
Summary500 string
}
If the page content primarily describes an error condition or failure,
please do not return a summary. Instead, set the error field to a
descriptive string and set success to false. Omit the result field.
Try to identify if the provided URL and content are a main page on a site,
or a specific article, post, product, or other type of content on a site.
If it is a main page, you should provide a summary of the site itself, and
if it is a specific article, post, product, or other type of content, you
should provide a summary of that specific article, post, item, or product.
The tags and category should reflect this distinction.
The output URL should be a canonicalized version of the input URL.
The output tags is a list of strings, each a short one or two or three word
tag, using underscores instead of spaces, to indicate page attributes, such
as 'blogpost', 'documentation', 'article', 'listicle', et cetera. You
can use tags not listed in these instructions, come up with your own. You
should endeavor to come up with at least 5 tags for each URL, up to a
maximum of 20. For example, you might tag a personal blog site that
contains a lot of posts about the javascript programming language with
"javascript" and "programming". Tags should be lowercase. Important tags
that might apply to sites are things like "defunct" for sites or services
that have been shut down, or "cdn" for domains or sites that are primarily
content delivery networks and not directly accessed by users, such as
fbcdn.net. Sites that are open to the public for signup and posting of
user-generated content should receive tags of "public" or "ugc".
Image-focused sites such as image hosting or photo sharing or art should
receive the "images" tag. Photography focused sites should receive the tag
"photography". Blogs should receive "blog", and specific blog posts should
receive both "blog" and "blogpost". Product pages should receive the tag
"product". Try to keep tags as single words. For example, a payment
processing company site should receive tags "corporate", "payments",
"service_provider", among others.
The output field categories should be a list of broad categories that a
website can fit into. "personal_blog" is a good example.
"corporate_website" is another. "tech_startup" is another.
"automobile_retailer" is another. These are just examples, not an inclusive
list. "news_aggregator" and "discussion_forum" are other examples. You may
include up to 10 categories, with a minimum of one. Categories should be
lowercase. Please try to provide at least three categories in your response.
The output field "ContentExcerpt" should be a substring of the input
content, with whitespace collapsed. Try to find the main or most relevant
section of the page, such as the first paragraph of an article. Do not
include site header or footer boilerplate text. Truncate this field at 2
kilobytes of text. It should be taken directly from the input. Do not
summarize in this field, but include a snippet of the provided content. It
should be a substring of the input "content" field.
Language is the two digit ISO language code representing the main language
of the content input.
Summary100, and Summary500 output fields are english language summaries of
the type of page, with a maximum of 100 words and 500 words respectively.
These fields should not contain summaries of the page content! They should
contain a short description of the page itself, such as "A personal blog
about programming and technology" or "A news site covering the latest in
technology" or "An article about the history of the internet". For the tone
of these Summary fields, you should be neutral and informative. Please
don't repeat imperative statements or calls to action from the source site,
such as instructions to sign up, purchase, or download. You should not
include any opinions or subjective statements in the summaries. Avoid using
overly effusive descriptions or adjectives. Unless the site is focused on a
specific technical topic, avoid discussing technical things like logging in,
cookies, or preference settings that are common to many websites.
Given that the Summary fields are in English and for an English-speaking
audience, if the page's language is not English, the page's language and
country should mentioned in the summaries. For example, "A Japanese news
site covering current events in Japan" for a site in Japanese. If the
language is not English, always add the name of the language to the tags
list. If the site is focused at a non-english-speaking country, such as a
site in French for a French audience, you should include the country name in
the tags list, such as "france" or "japan".
Editorially, you should not be summarizing the specific page content, but
the type of page. For example, if you are summarizing a news site, you
should not summarize the news items displayed currently, but the news site
itself, indicating the type of news being covered and style of coverage. If
you are summarizing a blog post, you should summarize the blog site itself.
If you are summarizing a corporate site, you should provide general
information about the company, and a high level overview of what type of
information the company provides on the site. If you are summarizing a
software-as-a-service site, you should provide a high level overview of the
service's features and target audience.
`
// Summarizer produces AI page summaries using the configured OpenAI API key.
type Summarizer struct{ APIKey string }

// NewSummarizer returns a Summarizer for the given OpenAI API key; an empty
// key is rejected with an error.
func NewSummarizer(apiKey string) (*Summarizer, error) {
	if apiKey == "" {
		return nil, fmt.Errorf("API key is required")
	}
	s := &Summarizer{APIKey: apiKey}
	return s, nil
}
// UniqueSortedList returns the distinct elements of input in ascending
// lexicographic order. The input slice is not modified.
func UniqueSortedList(input []string) []string {
	seen := make(map[string]struct{}, len(input))
	out := make([]string, 0, len(input))
	for _, s := range input {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		out = append(out, s)
	}
	sort.Strings(out)
	return out
}
// GetSummary sends url and its stripped page content to the OpenAI chat API
// and returns the parsed PageSummary. The model is instructed (via
// summarizerSystemPrompt) to reply with a JSON AIResponse envelope; a
// model-reported error or a missing content excerpt is returned as an error.
func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
	// Prefer the key supplied to NewSummarizer; previously this ignored
	// s.APIKey, re-read the environment, and called log.Fatal, which is
	// inappropriate in library code — return an error instead.
	apiKey := s.APIKey
	if apiKey == "" {
		apiKey = os.Getenv("OPENAI_API_KEY")
	}
	if apiKey == "" {
		return nil, fmt.Errorf("OpenAI API key is not set")
	}
	// The user message is the JSON object the system prompt describes.
	type request struct {
		URL     string `json:"url"`
		Content string `json:"content"`
	}
	reqJSON, err := json.Marshal(request{URL: url, Content: content})
	if err != nil {
		return nil, err
	}
	client := openai.NewClient(apiKey)
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model: model,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleSystem, Content: summarizerSystemPrompt},
			{Role: openai.ChatMessageRoleUser, Content: string(reqJSON)},
		},
	})
	if err != nil {
		return nil, err
	}
	jsonResp := resp.Choices[0].Message.Content
	var aiResp AIResponse
	if err := json.Unmarshal([]byte(jsonResp), &aiResp); err != nil {
		// Log the raw model output to aid debugging of malformed replies.
		log.Printf("error unmarshalling response: %v", err)
		log.Printf("response: '%s'", jsonResp)
		return nil, err
	}
	if aiResp.Error != "" {
		// %s avoids treating the model-supplied string as a format string:
		// the previous fmt.Errorf(aiResp.Error) fails go vet and would
		// misinterpret any '%' verbs in the message.
		return nil, fmt.Errorf("%s", aiResp.Error)
	}
	if aiResp.Result.ContentExcerpt == "" {
		return nil, fmt.Errorf("no content excerpt found")
	}
	// Tag the result with the ISO code and the English language name,
	// skipping empty values (iso6391.FromCode yields a zero value for
	// unknown codes, which previously added an empty-string tag).
	if code := strings.ToLower(aiResp.Result.Language); code != "" {
		aiResp.Result.Tags = append(aiResp.Result.Tags, code)
	}
	if name := iso6391.FromCode(aiResp.Result.Language).Name; name != "" {
		aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(name))
	}
	aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
	aiResp.Result.ID = schemaURL
	aiResp.Result.FetchedAt = time.Now().UTC()
	return &aiResp.Result, nil
}