commit 108587b28c7f9f47ae760461ec21ec86467b4178 Author: sneak Date: Mon Jun 3 12:11:29 2024 -0700 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..25a5ff4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +cmd/test/*.json +cmd/test/.env +cmd/test/test +.env +logfile.txt diff --git a/cmd/test/alexa500.txt b/cmd/test/alexa500.txt new file mode 100644 index 0000000..5f2890d --- /dev/null +++ b/cmd/test/alexa500.txt @@ -0,0 +1,309 @@ +163.com +1688.com +2ch.net +360.cn +4shared.com +9gag.com +OPENLOAD.co +T.co +about.com +acfun.tv +addthis.com +adf.ly +adnetworkperformance.com +adobe.com +adplxmd.com +agar.io +airbnb.com +alibaba.com +alicdn.com +aliexpress.com +allegro.pl +ameba.jp +americanexpress.com +ancestry.com +apple.com +archive.org +ask.fm +att.com +avito.ru +badoo.com +bankofamerica.com +battle.net +bbc.co.uk +bbc.com +bestbuy.com +bet365.com +bild.de +bilibili.com +billdesk.com +bing.com +bitauto.com +blackboard.com +blastingnews.com +blkget.com +blog.jp +blogfa.com +blogger.com +blogspot.com +blogspot.com.br +blogspot.in +bloomberg.com +bongacams.com +booking.com +box.com +bukalapak.com +businessinsider.com +buzzlie.com +capitalone.com +chase.com +chaturbate.com +citi.com +ck101.com +cnblogs.com +cnnic.cn +coccoc.com +craigslist.org +dailymotion.com +dell.com +detail.tmall.com.danuoyi.tbcache.com +detik.com +deviantart.com +digikala.com +diply.com +disqus.com +doubleclick.net +doublepimp.com +dropbox.com +dssedc4qxg7o6.cloudfront.net +ebay-kleinanzeigen.de +ebay.co.uk +ebay.com +ebay.de +ebay.in +ebay.it +eksisozluk.com +espn.gns.go.com +etsy.com +ettoday.net +evernote.com +exoclick.com +extratorrent.cc +fbcdn.net +fedex.com +feedly.com +files.wordpress.com +flickr.com +flipkart.com +forbes.com +foxnews.com +freepik.com +gfycat.com +giphy.com +github.com +github.io +gizmodo.com +globo.com +gmanetwork.com +gmx.net +go.com +godaddy.com +goodreads.com +google.com +groupon.com +haosou.com +hatena.ne.jp +hclips.com +hdfcbank.com +hdzog.com +hp.com +hpcc-page.cnc.ccgslb.com.cn +hulu.com +hurriyet.com.tr +icloud.com +imgur.com +impress.co.jp +imzog.com +indeed.com +instagram.com +instructure.com +intuit.com +iqiyi.com +irctc.co.in +jabong.com +jd.com +kaskus.co.id +kat.cr +kickstarter.com +kinogo.co +kinopoisk.ru +kissanime.to +kohls.com +leboncoin.fr +life.tw +lifebuzz.com +likes.com +liputan6.com +liveadexchanger.com +livejournal.com +loading-delivery2.com +lowes.com +macys.com +mail.ru +mailchimp.com +mama.cn +mashable.com +mediab.uy +mediafire.com +medium.com +mega.nz +mercadolibre.com.ar +messenger.com +microsoft.com +mozilla.org +msn.com +nametests.com +naver.jp +nba.com +netflix.com +nih.gov +nyaa.se +nytimes.com +office.com +ok.ru +olx.pl +onclickads.net +onedio.com +onet.pl +onlinesbi.com +opthw.xdwscache.speedcdns.com +oracle.com +orange.fr +ouo.io +outbrain.com +ozock.com +pandora.com +paypal.com +paytm.com +pinimg.com +pinterest.com +pixiv.net +popads.net +popcash.net +pornhub.com +ppomppu.co.kr +putlocker.is +quora.com +rambler.ru +rdsa2012.com +realtor.com +rediff.com +reimageplus.com +roblox.com +rt.com +ruten.com.tw +rutracker.org +sabah.com.tr +sahibinden.com +salesforce.com +savefrom.net +sberbank.ru +scribd.com +secureserver.net +seznam.cz +sh.st +shutterstock.com +siteadvisor.com +skype.com +slack.com +slickdeals.net +slideshare.net +slither.io +so.com +sogou.com +sohu.com +soundcloud.com +sourceforge.net +spiegel.de +spotify.com +stackexchange.com +stackoverflow.com +steamcommunity.com +steampowered.com +subscene.com +surveymonkey.com 
+t-online.de +tabelog.com +taboola.com +taleo.net +taobao.com +target.com +taringa.net +telegram.org +telegraph.co.uk +terraclicks.com +thefreedictionary.com +theladbible.com +themeforest.net +thepiratebay.se +thesportbible.com +thewatchseries.to +tistory.com +tmall.com +tokopedia.com +torrentz.eu +tradeadexchange.com +trello.com +tripadvisor.com +tuberel.com +tumblr.com +twitch.tv +twitter.com +txxx.com +udn.com +upornia.com +ups.com +uptodown.com +upwork.com +usps.com +verizonwireless.com +vice.com +vimeo.com +vk.com +vk.me +walmart.com +wangzhanbao.cc +washingtonpost.com +weather.com +web.de +webtretho.com +weebly.com +weibo.com +wellsfargo.com +wetransfer.com +whatsapp.com +wikia.com +wikihow.com +wikimedia.org +wikipedia.org +wittyfeed.com +wix.com +wordpress.com +wordpress.org +wp.com +wsj.com +xfinity.com +xhamster.com +xuite.net +yahoo.com +yandex.ru +yelp.com +youm7.com +youporn.com +youtube-mp3.org +youtube.com +zendesk.com +zhihu.com +zillow.com +zippyshare.com +zoho.com diff --git a/cmd/test/main.go b/cmd/test/main.go new file mode 100644 index 0000000..6e8668a --- /dev/null +++ b/cmd/test/main.go @@ -0,0 +1,175 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + "net" + "net/url" + "os" + "strings" + "time" + + _ "github.com/joho/godotenv/autoload" + "github.com/k0kubun/pp" + "github.com/microcosm-cc/bluemonday" + "sneak.berlin/go/aipagesummary" + "sneak.berlin/go/puppeteerapiclient" +) + +func main() { + bigList := []string{ + "https://www.bbc.com", + "https://www.cnn.com", + "https://news.ycombinator.com", + "https://www.reddit.com", + "https://www.wikipedia.org", + "https://www.ford.com", + "https://www.tesla.com", + "https://www.apple.com", + "https://www.microsoft.com", + "https://www.google.com", + "https://medium.com", + "https://www.nytimes.com", + "https://sneak.berlin", + } + + // now read urls from alexa500.txt + list, err := os.ReadFile("alexa500.txt") + if err != nil { + fmt.Println(err) + return + } + + urls := strings.Fields(string(list)) + for _, url := range urls { + url = "https://" + url + bigList = append(bigList, url) + + } + + shuffle(bigList) + + for _, url := range bigList { + fmt.Printf("Showing summary for %s\n", url) + summarizeURL(url) + } + +} + +func shuffle(slice []string) { + rand.Shuffle(len(slice), func(i, j int) { + slice[i], slice[j] = slice[j], slice[i] + }) +} + +func parseHostnameFromURL(rawURL string) string { + // Parse the URL + parsedURL, err := url.Parse(rawURL) + if err != nil { + return "" + } + + // Extract the host part + host := parsedURL.Host + + // If the host contains a port, strip it + hostname, _, err := net.SplitHostPort(host) + if err != nil { + // If there's no port, it might return an error, in which case, host is the hostname + if err.(*net.AddrError).Err == "missing port in address" { + hostname = host + } else { + return "" + } + } + + // Convert hostname to lowercase + hostname = strings.ToLower(hostname) + + return hostname +} + +func summarizeURL(url string) { + + // Initialize the summarizer with the OpenAI API key + s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY")) + if err != nil { + fmt.Println(err) + return + } + + // Retrieve Puppeteer API URL and salt from environment variables + apiURL := os.Getenv("PUPPETEER_API_URL") + apiSalt := os.Getenv("PUPPETEER_API_SALT") + + if apiURL == "" || apiSalt == "" { + panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set") + } + + // Initialize the Puppeteer client + client := puppeteerapiclient.NewClient(apiURL, 
apiSalt) + + // Create a context with a timeout for the scrape request + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + // Scrape the content from the given URL + + scraped, err := client.Scrape(ctx, url, "body") + if err != nil { + fmt.Println(err) + return + } + + // Use bluemonday to strip HTML tags from the scraped content + p := bluemonday.StripTagsPolicy() + strippedContent := p.Sanitize(scraped.Content) + + if len(strippedContent) > 15000 { + // truncate if too big + strippedContent = strippedContent[:10000] + } + + // Get the summary of the stripped content + res, err := s.GetSummary(url, strippedContent) + if err != nil { + fmt.Println(err) + return + } + // pretty print the result: + pp.Print(res) + fmt.Println() + + hn := parseHostnameFromURL(url) + fmt.Printf("Hostname: %s\n", hn) + + fn := fmt.Sprintf("%s.json", hn) + + writePrettyJSONToFile(res, fn) + +} + +func writePrettyJSONToFile(data interface{}, filename string) error { + // Serialize the data to pretty-printed JSON + jsonData, err := json.MarshalIndent(data, "", " ") + if err != nil { + return fmt.Errorf("error serializing to JSON: %v", err) + } + + // Create or truncate the file + file, err := os.Create(filename) + if err != nil { + return fmt.Errorf("error creating file: %v", err) + } + defer file.Close() + + // Write the JSON data to the file + _, err = file.Write(jsonData) + if err != nil { + return fmt.Errorf("error writing to file: %v", err) + } + + return nil +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..bee93d3 --- /dev/null +++ b/go.mod @@ -0,0 +1,26 @@ +module sneak.berlin/go/aipagesummary + +go 1.22.2 + +require ( + github.com/joho/godotenv v1.5.1 + github.com/k0kubun/pp v3.0.1+incompatible + github.com/microcosm-cc/bluemonday v1.0.26 + github.com/sashabaranov/go-openai v1.24.1 + sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 +) + +require ( + github.com/aymerick/douceur v0.2.0 // indirect + github.com/emvi/iso-639-1 v1.1.0 // indirect + github.com/gorilla/css v1.0.1 // indirect + github.com/k0kubun/pp/v3 v3.2.0 // indirect + github.com/kr/pretty v0.3.1 // indirect + github.com/kr/text v0.2.0 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/rogpeppe/go-internal v1.9.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/sys v0.20.0 // indirect + golang.org/x/text v0.15.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..29a17b2 --- /dev/null +++ b/go.sum @@ -0,0 +1,48 @@ +github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= +github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/emvi/iso-639-1 v1.1.0 h1:EhZiYVA+ysa/b7+0T2DD9hcX7E/5sh4o1KyDAIPu7VE= +github.com/emvi/iso-639-1 v1.1.0/go.mod h1:CSA53/Tx0xF9bk2DEA0Mr0wTdIxq7pqoVZgBOfoL5GI= +github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY= +github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c= +github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= +github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= 
+github.com/k0kubun/pp v3.0.1+incompatible h1:3tqvf7QgUnZ5tXO6pNAZlrvHgl6DvifjDrd9g2S9Z40= +github.com/k0kubun/pp v3.0.1+incompatible/go.mod h1:GWse8YhT0p8pT4ir3ZgBbfZild3tgzSScAn6HmfYukg= +github.com/k0kubun/pp/v3 v3.2.0 h1:h33hNTZ9nVFNP3u2Fsgz8JXiF5JINoZfFq4SvKJwNcs= +github.com/k0kubun/pp/v3 v3.2.0/go.mod h1:ODtJQbQcIRfAD3N+theGCV1m/CBxweERz2dapdz1EwA= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58= +github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z04DQDJ1MI= +github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290 h1:HpQBFKNCdyJjjJLEhUpjEzEh945JUtx2ifdbINU5jgY= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56 h1:KzPfR0+SyVBr1yHnbdXCCOPPTq95a4cdUp45gqB3VbM= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 h1:nJwEfj/BU1O4caRFt0UWCE09JLpr7/bTuI5pIo1h5lM= +sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI= diff --git a/summary.go b/summary.go new file mode 100644 index 0000000..08adf03 --- /dev/null +++ b/summary.go @@ -0,0 +1,265 
@@
+package aipagesummary
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"time"
+
+	iso6391 "github.com/emvi/iso-639-1"
+	openai "github.com/sashabaranov/go-openai"
+)
+
+const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"
+
+type AIResponse struct {
+	Success bool        `json:"success"`
+	Error   string      `json:"error,omitempty"`
+	Result  PageSummary `json:"result"`
+}
+
+type PageSummary struct {
+	ID             string `json:"$id"`
+	FetchedAt      time.Time
+	URL            string
+	Tags           []string
+	Categories     []string
+	ContentExcerpt string
+	Language       string
+	Summary100     string
+	Summary500     string
+}
+
+const model = openai.GPT3Dot5Turbo
+
+const summarizerSystemPrompt = `
+
+Your job is to summarize a webpage.
+
+You will receive a json object containing the url and body tag content with
+HTML tags stripped out. The key "url" will be a string representing a URL.
+The key "content" will be the body tag and its rendered contents.
+
+You will parse and interpret the "content" value and respond with a json
+object such as:
+
+{
+"success": true,
+"error": null,
+"result": { ... }
+}
+
+Set success to false and provide a descriptive string in "error" describing
+the error if you are unable to complete the task for any reason, such as a
+failure to parse the input content. Your response should be json and only
+json, and your response should always, always parse as valid json. Do not
+provide any other output except json, no description.
+
+
+Result should be the following:
+
+type PageSummary struct {
+	URL            string
+	Tags           []string
+	Categories     []string
+	ContentExcerpt string
+	Language       string
+	Summary100     string
+	Summary500     string
+}
+
+If the page content primarily describes an error condition or failure,
+please do not return a summary. Instead, set the error field to a
+descriptive string and set success to false. Omit the result field.
+
+Try to identify whether the provided URL and content are a main page on a
+site, or a specific article, post, product, or other type of content on a
+site. If it is a main page, you should provide a summary of the site
+itself, and if it is a specific article, post, product, or other type of
+content, you should provide a summary of that specific article, post, item,
+or product. The tags and categories should reflect this distinction.
+
+The output URL should be a canonicalized version of the input URL.
+
+The output tags field is a list of strings, each a short one-, two-, or
+three-word tag, using underscores instead of spaces, to indicate page
+attributes, such as 'blogpost', 'documentation', 'article', 'listicle', et
+cetera. You can use tags not listed in these instructions; come up with
+your own. You should endeavor to come up with at least 5 tags for each URL,
+up to a maximum of 20. For example, you might tag a personal blog site that
+contains a lot of posts about the javascript programming language with
+"javascript" and "programming". Tags should be lowercase. Important tags
+that might apply to sites are things like "defunct" for sites or services
+that have been shut down, or "cdn" for domains or sites that are primarily
+content delivery networks and not directly accessed by users, such as
+fbcdn.net. Sites that are open to the public for signup and posting of
+user-generated content should receive tags of "public" or "ugc".
+Image-focused sites such as image hosting or photo sharing or art should
+receive the "images" tag. Photography-focused sites should receive the tag
+"photography". Blogs should receive "blog", and specific blog posts should
+receive both "blog" and "blogpost". Product pages should receive the tag
+"product". Try to keep tags as single words. For example, a payment
+processing company site should receive tags "corporate", "payments", and
+"service_provider", among others.
+
+The output field categories should be a list of broad categories that a
+website can fit into. "personal_blog" is a good example.
+"corporate_website" is another. "tech_startup" is another.
+"automobile_retailer" is another. These are just examples, not an inclusive
+list. "news_aggregator" and "discussion_forum" are other examples. You may
+include up to 10 categories, with a minimum of one. Categories should be
+lowercase. Please try to provide at least three categories in your response.
+
+The output field "ContentExcerpt" should be a substring of the input
+content, with whitespace collapsed. Try to find the main or most relevant
+section of the page, such as the first paragraph of an article. Do not
+include site header or footer boilerplate text. Truncate this field at 2
+kilobytes of text. It should be taken directly from the input. Do not
+summarize in this field, but include a snippet of the provided content. It
+should be a substring of the input "content" field.
+
+Language is the two-letter ISO 639-1 language code representing the main
+language of the content input.
+
+The Summary100 and Summary500 output fields are English-language summaries
+of the type of page, with a maximum of 100 words and 500 words
+respectively. These fields should not contain summaries of the page
+content! They should contain a short description of the page itself, such
+as "A personal blog about programming and technology" or "A news site
+covering the latest in technology" or "An article about the history of the
+internet". For the tone of these Summary fields, you should be neutral and
+informative. Please don't repeat imperative statements or calls to action
+from the source site, such as instructions to sign up, purchase, or
+download. You should not include any opinions or subjective statements in
+the summaries. Avoid using overly effusive descriptions or adjectives.
+Unless the site is focused on a specific technical topic, avoid discussing
+technical things like logging in, cookies, or preference settings that are
+common to many websites.
+
+Given that the Summary fields are in English and for an English-speaking
+audience, if the page's language is not English, the page's language and
+country should be mentioned in the summaries. For example, "A Japanese news
+site covering current events in Japan" for a site in Japanese. If the
+language is not English, always add the name of the language to the tags
+list. If the site is aimed at a non-English-speaking country, such as a
+site in French for a French audience, you should include the country name
+in the tags list, such as "france" or "japan".
+
+Editorially, you should not be summarizing the specific page content, but
+the type of page. For example, if you are summarizing a news site, you
+should not summarize the news items displayed currently, but the news site
+itself, indicating the type of news being covered and the style of
+coverage. If you are summarizing a blog post, you should summarize the blog
+site itself. If you are summarizing a corporate site, you should provide
+general information about the company, and a high-level overview of what
+type of information the company provides on the site. If you are
+summarizing a software-as-a-service site, you should provide a high-level
+overview of the service's features and target audience.
+
+`
+
+type Summarizer struct{ APIKey string }
+
+func NewSummarizer(apiKey string) (*Summarizer, error) {
+	if apiKey == "" {
+		return nil, fmt.Errorf("API key is required")
+	}
+
+	return &Summarizer{
+		APIKey: apiKey,
+	}, nil
+}
+
+// UniqueSortedList returns the unique elements of input in sorted order.
+func UniqueSortedList(input []string) []string {
+	// Create a map to track unique elements
+	uniqueMap := make(map[string]struct{})
+	for _, item := range input {
+		uniqueMap[item] = struct{}{}
+	}
+
+	// Create a slice from the keys of the map
+	uniqueList := make([]string, 0, len(uniqueMap))
+	for key := range uniqueMap {
+		uniqueList = append(uniqueList, key)
+	}
+
+	// Sort the unique list
+	sort.Strings(uniqueList)
+
+	return uniqueList
+}
+
+func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
+	// Prefer the key provided to NewSummarizer, falling back to the
+	// environment; return an error rather than terminating the caller's
+	// process from library code.
+	apiKey := s.APIKey
+	if apiKey == "" {
+		apiKey = os.Getenv("OPENAI_API_KEY")
+	}
+	if apiKey == "" {
+		return nil, fmt.Errorf("OpenAI API key is not set")
+	}
+
+	type Request struct {
+		URL     string `json:"url"`
+		Content string `json:"content"`
+	}
+
+	req := Request{
+		URL:     url,
+		Content: content,
+	}
+
+	reqJSON, err := json.Marshal(req)
+	if err != nil {
+		return nil, err
+	}
+
+	client := openai.NewClient(apiKey)
+
+	ctx := context.Background()
+
+	sumReq := openai.ChatCompletionRequest{
+		Model: model,
+		Messages: []openai.ChatCompletionMessage{
+			{
+				Role:    openai.ChatMessageRoleSystem,
+				Content: summarizerSystemPrompt,
+			},
+			{
+				Role:    openai.ChatMessageRoleUser,
+				Content: string(reqJSON),
+			},
+		},
+	}
+
+	resp, err := client.CreateChatCompletion(ctx, sumReq)
+	if err != nil {
+		return nil, err
+	}
+
+	jsonResp := resp.Choices[0].Message.Content
+	var aiResp AIResponse
+	err = json.Unmarshal([]byte(jsonResp), &aiResp)
+	if err != nil {
+		fmt.Printf("Error unmarshalling response: %v\n", err)
+		fmt.Printf("Response: '%s'\n", jsonResp)
+		return nil, err
+	}
+
+	if aiResp.Error != "" {
+		return nil, fmt.Errorf("%s", aiResp.Error)
+	}
+
+	if aiResp.Result.ContentExcerpt == "" {
+		return nil, fmt.Errorf("no content excerpt found")
+	}
+
+	// Tag the result with both the ISO 639-1 code and the English name of
+	// the detected language, then deduplicate and sort the tag list.
+	langNameInEnglish := iso6391.FromCode(aiResp.Result.Language).Name
+	aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(aiResp.Result.Language))
+	aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(langNameInEnglish))
+
+	aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
+	aiResp.Result.ID = schemaURL
+	aiResp.Result.FetchedAt = time.Now().UTC()
+
+	return &aiResp.Result, nil
+}
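For reference, a minimal sketch of how a consumer of these two packages might wire them together, following the same scrape, strip, and summarize flow as cmd/test/main.go above. The target URL is arbitrary, the environment variable names are the ones the test harness reads, and error handling is reduced to panics for brevity.

package main

import (
	"context"
	"fmt"
	"os"
	"time"

	"github.com/microcosm-cc/bluemonday"
	"sneak.berlin/go/aipagesummary"
	"sneak.berlin/go/puppeteerapiclient"
)

func main() {
	// Same environment variables that cmd/test/main.go uses.
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		panic(err)
	}
	client := puppeteerapiclient.NewClient(
		os.Getenv("PUPPETEER_API_URL"),
		os.Getenv("PUPPETEER_API_SALT"),
	)

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	// Fetch the rendered body of the page, then strip the remaining HTML
	// tags before handing the text to the summarizer.
	scraped, err := client.Scrape(ctx, "https://sneak.berlin", "body")
	if err != nil {
		panic(err)
	}
	text := bluemonday.StripTagsPolicy().Sanitize(scraped.Content)

	summary, err := s.GetSummary("https://sneak.berlin", text)
	if err != nil {
		panic(err)
	}
	fmt.Println(summary.Summary100)
}

As in the test harness, the Puppeteer service does the page rendering and bluemonday reduces the scraped body to plain text before it is sent to the model.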
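The system prompt specifies the JSON envelope that GetSummary unmarshals into AIResponse. A small test-style sketch of a well-formed reply follows; all field values are invented for illustration.

package aipagesummary

import (
	"encoding/json"
	"testing"
)

// TestAIResponseShape documents the envelope the system prompt asks the
// model to produce. The field values are illustrative only.
func TestAIResponseShape(t *testing.T) {
	const reply = `{
	  "success": true,
	  "error": null,
	  "result": {
	    "URL": "https://sneak.berlin/",
	    "Tags": ["blog", "en", "english", "personal_blog", "technology"],
	    "Categories": ["personal_blog"],
	    "ContentExcerpt": "example excerpt taken from the page body",
	    "Language": "en",
	    "Summary100": "A personal blog about technology.",
	    "Summary500": "A personal blog covering technology and privacy topics."
	  }
	}`

	var resp AIResponse
	if err := json.Unmarshal([]byte(reply), &resp); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if !resp.Success || resp.Result.Language != "en" {
		t.Fatalf("unexpected response: %+v", resp)
	}
}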
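GetSummary appends the ISO 639-1 code and the English language name to the tag list and then normalizes it with UniqueSortedList. A sketch of that dedupe-and-sort behavior, with made-up tag values:

package aipagesummary

import (
	"reflect"
	"testing"
)

// TestUniqueSortedList mirrors the tag post-processing in GetSummary:
// duplicates are removed and the result is sorted lexicographically.
func TestUniqueSortedList(t *testing.T) {
	in := []string{"news", "japan", "ja", "japanese", "news", "ja"}
	want := []string{"ja", "japan", "japanese", "news"}
	if got := UniqueSortedList(in); !reflect.DeepEqual(got, want) {
		t.Fatalf("got %v, want %v", got, want)
	}
}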