This commit is contained in:
Jeffrey Paul 2024-06-03 12:11:29 -07:00
commit 108587b28c
6 changed files with 828 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
cmd/test/*.json
cmd/test/.env
cmd/test/test
.env
logfile.txt

309
cmd/test/alexa500.txt Normal file
View File

@ -0,0 +1,309 @@
163.com
1688.com
2ch.net
360.cn
4shared.com
9gag.com
OPENLOAD.co
T.co
about.com
acfun.tv
addthis.com
adf.ly
adnetworkperformance.com
adobe.com
adplxmd.com
agar.io
airbnb.com
alibaba.com
alicdn.com
aliexpress.com
allegro.pl
ameba.jp
americanexpress.com
ancestry.com
apple.com
archive.org
ask.fm
att.com
avito.ru
badoo.com
bankofamerica.com
battle.net
bbc.co.uk
bbc.com
bestbuy.com
bet365.com
bild.de
bilibili.com
billdesk.com
bing.com
bitauto.com
blackboard.com
blastingnews.com
blkget.com
blog.jp
blogfa.com
blogger.com
blogspot.com
blogspot.com.br
blogspot.in
bloomberg.com
bongacams.com
booking.com
box.com
bukalapak.com
businessinsider.com
buzzlie.com
capitalone.com
chase.com
chaturbate.com
citi.com
ck101.com
cnblogs.com
cnnic.cn
coccoc.com
craigslist.org
dailymotion.com
dell.com
detail.tmall.com.danuoyi.tbcache.com
detik.com
deviantart.com
digikala.com
diply.com
disqus.com
doubleclick.net
doublepimp.com
dropbox.com
dssedc4qxg7o6.cloudfront.net
ebay-kleinanzeigen.de
ebay.co.uk
ebay.com
ebay.de
ebay.in
ebay.it
eksisozluk.com
espn.gns.go.com
etsy.com
ettoday.net
evernote.com
exoclick.com
extratorrent.cc
fbcdn.net
fedex.com
feedly.com
files.wordpress.com
flickr.com
flipkart.com
forbes.com
foxnews.com
freepik.com
gfycat.com
giphy.com
github.com
github.io
gizmodo.com
globo.com
gmanetwork.com
gmx.net
go.com
godaddy.com
goodreads.com
google.com
groupon.com
haosou.com
hatena.ne.jp
hclips.com
hdfcbank.com
hdzog.com
hp.com
hpcc-page.cnc.ccgslb.com.cn
hulu.com
hurriyet.com.tr
icloud.com
imgur.com
impress.co.jp
imzog.com
indeed.com
instagram.com
instructure.com
intuit.com
iqiyi.com
irctc.co.in
jabong.com
jd.com
kaskus.co.id
kat.cr
kickstarter.com
kinogo.co
kinopoisk.ru
kissanime.to
kohls.com
leboncoin.fr
life.tw
lifebuzz.com
likes.com
liputan6.com
liveadexchanger.com
livejournal.com
loading-delivery2.com
lowes.com
macys.com
mail.ru
mailchimp.com
mama.cn
mashable.com
mediab.uy
mediafire.com
medium.com
mega.nz
mercadolibre.com.ar
messenger.com
microsoft.com
mozilla.org
msn.com
nametests.com
naver.jp
nba.com
netflix.com
nih.gov
nyaa.se
nytimes.com
office.com
ok.ru
olx.pl
onclickads.net
onedio.com
onet.pl
onlinesbi.com
opthw.xdwscache.speedcdns.com
oracle.com
orange.fr
ouo.io
outbrain.com
ozock.com
pandora.com
paypal.com
paytm.com
pinimg.com
pinterest.com
pixiv.net
popads.net
popcash.net
pornhub.com
ppomppu.co.kr
putlocker.is
quora.com
rambler.ru
rdsa2012.com
realtor.com
rediff.com
reimageplus.com
roblox.com
rt.com
ruten.com.tw
rutracker.org
sabah.com.tr
sahibinden.com
salesforce.com
savefrom.net
sberbank.ru
scribd.com
secureserver.net
seznam.cz
sh.st
shutterstock.com
siteadvisor.com
skype.com
slack.com
slickdeals.net
slideshare.net
slither.io
so.com
sogou.com
sohu.com
soundcloud.com
sourceforge.net
spiegel.de
spotify.com
stackexchange.com
stackoverflow.com
steamcommunity.com
steampowered.com
subscene.com
surveymonkey.com
t-online.de
tabelog.com
taboola.com
taleo.net
taobao.com
target.com
taringa.net
telegram.org
telegraph.co.uk
terraclicks.com
thefreedictionary.com
theladbible.com
themeforest.net
thepiratebay.se
thesportbible.com
thewatchseries.to
tistory.com
tmall.com
tokopedia.com
torrentz.eu
tradeadexchange.com
trello.com
tripadvisor.com
tuberel.com
tumblr.com
twitch.tv
twitter.com
txxx.com
udn.com
upornia.com
ups.com
uptodown.com
upwork.com
usps.com
verizonwireless.com
vice.com
vimeo.com
vk.com
vk.me
walmart.com
wangzhanbao.cc
washingtonpost.com
weather.com
web.de
webtretho.com
weebly.com
weibo.com
wellsfargo.com
wetransfer.com
whatsapp.com
wikia.com
wikihow.com
wikimedia.org
wikipedia.org
wittyfeed.com
wix.com
wordpress.com
wordpress.org
wp.com
wsj.com
xfinity.com
xhamster.com
xuite.net
yahoo.com
yandex.ru
yelp.com
youm7.com
youporn.com
youtube-mp3.org
youtube.com
zendesk.com
zhihu.com
zillow.com
zippyshare.com
zoho.com

175
cmd/test/main.go Normal file
View File

@ -0,0 +1,175 @@
package main
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"net"
"net/url"
"os"
"strings"
"time"
_ "github.com/joho/godotenv/autoload"
"github.com/k0kubun/pp"
"github.com/microcosm-cc/bluemonday"
"sneak.berlin/go/aipagesummary"
"sneak.berlin/go/puppeteerapiclient"
)
// main builds a URL list from a hard-coded seed set plus the hostnames in
// alexa500.txt, shuffles it, and prints an AI-generated summary for each URL.
func main() {
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}
	// Read additional hostnames (whitespace-separated) from alexa500.txt.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		fmt.Println(err)
		return
	}
	hosts := strings.Fields(string(list))
	// Pre-size the append, and avoid naming the loop variable "url", which
	// previously shadowed the imported net/url package.
	bigList = append(make([]string, 0, len(bigList)+len(hosts)), bigList...)
	for _, host := range hosts {
		bigList = append(bigList, "https://"+host)
	}
	shuffle(bigList)
	for _, target := range bigList {
		fmt.Printf("Showing summary for %s\n", target)
		summarizeURL(target)
	}
}
// shuffle randomly permutes slice in place using math/rand's package-level
// source.
func shuffle(slice []string) {
	swap := func(a, b int) {
		slice[a], slice[b] = slice[b], slice[a]
	}
	rand.Shuffle(len(slice), swap)
}
// parseHostnameFromURL returns the lowercased hostname portion of rawURL,
// with any port (and brackets around IPv6 literals) removed. It returns ""
// when rawURL cannot be parsed or contains no host component.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	// url.URL.Hostname already strips the port and IPv6 brackets. The
	// previous implementation used net.SplitHostPort and an unchecked
	// err.(*net.AddrError) type assertion, which would panic if the
	// returned error had any other concrete type.
	return strings.ToLower(parsedURL.Hostname())
}
// summarizeURL scrapes the page at url via the Puppeteer API, strips its
// HTML, asks OpenAI for a summary, pretty-prints the result, and writes it
// to <hostname>.json. Errors are printed and the URL is skipped; missing
// Puppeteer configuration panics because nothing can proceed without it.
func summarizeURL(url string) {
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")
	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}
	client := puppeteerapiclient.NewClient(apiURL, apiSalt)
	// Bound the scrape; remote pages can hang indefinitely.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	scraped, err := client.Scrape(ctx, url, "body")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Strip all HTML tags so only plain text is sent to the model.
	p := bluemonday.StripTagsPolicy()
	strippedContent := p.Sanitize(scraped.Content)
	// Truncate oversized pages to keep the prompt within model limits.
	// Previously the code checked len > 15000 but cut at 10000; use one
	// consistent limit. (Byte-based slicing may split a multi-byte rune at
	// the boundary — TODO confirm the summarizer tolerates that.)
	const maxContentBytes = 15000
	if len(strippedContent) > maxContentBytes {
		strippedContent = strippedContent[:maxContentBytes]
	}
	res, err := s.GetSummary(url, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Pretty-print the result for the console.
	pp.Print(res)
	fmt.Println()
	hn := parseHostnameFromURL(url)
	fmt.Printf("Hostname: %s\n", hn)
	fn := fmt.Sprintf("%s.json", hn)
	// The write error was previously discarded; report it.
	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating or truncating the file. It returns a wrapped error if
// marshalling or writing fails.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", "  ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}
	// os.WriteFile replaces the manual Create/Write/Close sequence; it
	// cannot leak the file handle and reports write errors on close too.
	// 0o666 matches the mode os.Create used (subject to umask).
	if err := os.WriteFile(filename, jsonData, 0o666); err != nil {
		return fmt.Errorf("error writing to file: %w", err)
	}
	return nil
}

26
go.mod Normal file
View File

@ -0,0 +1,26 @@
module sneak.berlin/go/aipagesummary
go 1.22.2
require (
github.com/joho/godotenv v1.5.1
github.com/k0kubun/pp v3.0.1+incompatible
github.com/microcosm-cc/bluemonday v1.0.26
github.com/sashabaranov/go-openai v1.24.1
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607
)
require (
github.com/aymerick/douceur v0.2.0 // indirect
github.com/emvi/iso-639-1 v1.1.0 // indirect
github.com/gorilla/css v1.0.1 // indirect
github.com/k0kubun/pp/v3 v3.2.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/rogpeppe/go-internal v1.9.0 // indirect
golang.org/x/net v0.25.0 // indirect
golang.org/x/sys v0.20.0 // indirect
golang.org/x/text v0.15.0 // indirect
)

48
go.sum Normal file
View File

@ -0,0 +1,48 @@
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/emvi/iso-639-1 v1.1.0 h1:EhZiYVA+ysa/b7+0T2DD9hcX7E/5sh4o1KyDAIPu7VE=
github.com/emvi/iso-639-1 v1.1.0/go.mod h1:CSA53/Tx0xF9bk2DEA0Mr0wTdIxq7pqoVZgBOfoL5GI=
github.com/gorilla/css v1.0.0 h1:BQqNyPTi50JCFMTw/b67hByjMVXZRwGha6wxVGkeihY=
github.com/gorilla/css v1.0.0/go.mod h1:Dn721qIggHpt4+EFCcTLTU/vk5ySda2ReITrtgBl60c=
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/k0kubun/pp v3.0.1+incompatible h1:3tqvf7QgUnZ5tXO6pNAZlrvHgl6DvifjDrd9g2S9Z40=
github.com/k0kubun/pp v3.0.1+incompatible/go.mod h1:GWse8YhT0p8pT4ir3ZgBbfZild3tgzSScAn6HmfYukg=
github.com/k0kubun/pp/v3 v3.2.0 h1:h33hNTZ9nVFNP3u2Fsgz8JXiF5JINoZfFq4SvKJwNcs=
github.com/k0kubun/pp/v3 v3.2.0/go.mod h1:ODtJQbQcIRfAD3N+theGCV1m/CBxweERz2dapdz1EwA=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/microcosm-cc/bluemonday v1.0.26 h1:xbqSvqzQMeEHCqMi64VAs4d8uy6Mequs3rQ0k/Khz58=
github.com/microcosm-cc/bluemonday v1.0.26/go.mod h1:JyzOCs9gkyQyjs+6h10UEVSe02CGwkhd72Xdqh78TWs=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sashabaranov/go-openai v1.24.1 h1:DWK95XViNb+agQtuzsn+FyHhn3HQJ7Va8z04DQDJ1MI=
github.com/sashabaranov/go-openai v1.24.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290 h1:HpQBFKNCdyJjjJLEhUpjEzEh945JUtx2ifdbINU5jgY=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602183725-421b3d8f1290/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56 h1:KzPfR0+SyVBr1yHnbdXCCOPPTq95a4cdUp45gqB3VbM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602190643-e496a4b65b56/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607 h1:nJwEfj/BU1O4caRFt0UWCE09JLpr7/bTuI5pIo1h5lM=
sneak.berlin/go/puppeteerapiclient v0.0.0-20240602195637-92dcfdcd7607/go.mod h1:5pUwe/o7D++G7W8EACuO56NhtMgmSzkad7XkyBGeZcI=

265
summary.go Normal file
View File

@ -0,0 +1,265 @@
package aipagesummary
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"sort"
"strings"
"time"
iso6391 "github.com/emvi/iso-639-1"
openai "github.com/sashabaranov/go-openai"
)
const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"
type AIResponse struct {
Success bool `json:"success"`
Error string `json:"error, omitempty"`
Result PageSummary `json:"result"`
}
type PageSummary struct {
ID string `json:"$id"`
FetchedAt time.Time
URL string
Tags []string
Categories []string
ContentExcerpt string
Language string
Summary100 string
Summary500 string
}
// model selects the OpenAI chat model used for every summarization request.
const model = openai.GPT3Dot5Turbo

// summarizerSystemPrompt is the system message sent with every chat
// completion. It instructs the model to accept a {"url","content"} JSON
// object and reply with a JSON AIResponse envelope (success/error/result),
// and spells out the expected semantics of each PageSummary field. It is a
// raw string literal sent verbatim to the API, so no comments can appear
// inside it.
const summarizerSystemPrompt = `
Your job is to summarize a webpage.
You will receive a json object containing the url and body tag content with
HTML tags stripped out. The key "url" will be a string representing a URL.
The key "content" will be the body tag and its rendered contents.
You will parse and interpret the "content" value and respond with a json
object such as:
{
"success": true,
"error": null,
"result": { ... }
}
Set success to false and provide a descriptive string in "error" describing
the error if you are unable to complete the task for any reason, such as a
failure to parse the input content. Your response should be json and only
json, and your response should always, always parse as valid json. Do not
provide any other output except json, no description.
Result should be the following:
type PageSummary struct {
URL string
Tags []string
Categories []string
ContentExcerpt string
Language string
Summary100 string
Summary500 string
}
If the page content primarily describes an error condition or failure,
please do not return a summary. Instead, set the error field to a
descriptive string and set success to false. Omit the result field.
Try to identify if the provided URL and content are a main page on a site,
or a specific article, post, product, or other type of content on a site.
If it is a main page, you should provide a summary of the site itself, and
if it is a specific article, post, product, or other type of content, you
should provide a summary of that specific article, post, item, or product.
The tags and category should reflect this distinction.
The output URL should be a canonicalized version of the input URL.
The output tags is a list of strings, each a short one or two or three word
tag, using underscores instead of spaces, to indicate page attributes, such
as 'blogpost', 'documentation', 'article', 'listicle', et cetera. You
can use tags not listed in these instructions, come up with your own. You
should endeavor to come up with at least 5 tags for each URL, up to a
maximum of 20. For example, you might tag a personal blog site that
contains a lot of posts about the javascript programming language with
"javascript" and "programming". Tags should be lowercase. Important tags
that might apply to sites are things like "defunct" for sites or services
that have been shut down, or "cdn" for domains or sites that are primarily
content delivery networks and not directly accessed by users, such as
fbcdn.net. Sites that are open to the public for signup and posting of
user-generated content should receive tags of "public" or "ugc".
Image-focused sites such as image hosting or photo sharing or art should
receive the "images" tag. Photography focused sites should receive the tag
"photography". Blogs should receive "blog", and specific blog posts should
receive both "blog" and "blogpost". Product pages should receive the tag
"product". Try to keep tags as single words. For example, a payment
processing company site should receive tags "corporate", "payments",
"service_provider", among others.
The output field categories should be a list of broad categories that a
website can fit into. "personal_blog" is a good example.
"corporate_website" is another. "tech_startup" is another.
"automobile_retailer" is another. These are just examples, not an inclusive
list. "news_aggregator" and "discussion_forum" are other examples. You may
include up to 10 categories, with a minimum of one. Categories should be
lowercase. Please try to provide at least three categories in your response.
The output field "ContentExcerpt" should be a substring of the input
content, with whitespace collapsed. Try to find the main or most relevant
section of the page, such as the first paragraph of an article. Do not
include site header or footer boilerplate text. Truncate this field at 2
kilobytes of text. It should be taken directly from the input. Do not
summarize in this field, but include a snippet of the provided content. It
should be a substring of the input "content" field.
Language is the two digit ISO language code representing the main language
of the content input.
Summary100, and Summary500 output fields are english language summaries of
the type of page, with a maximum of 100 words and 500 words respectively.
These fields should not contain summaries of the page content! They should
contain a short description of the page itself, such as "A personal blog
about programming and technology" or "A news site covering the latest in
technology" or "An article about the history of the internet". For the tone
of these Summary fields, you should be neutral and informative. Please
don't repeat imperative statements or calls to action from the source site,
such as instructions to sign up, purchase, or download. You should not
include any opinions or subjective statements in the summaries. Avoid using
overly effusive descriptions or adjectives. Unless the site is focused on a
specific technical topic, avoid discussing technical things like logging in,
cookies, or preference settings that are common to many websites.
Given that the Summary fields are in English and for an English-speaking
audience, if the page's language is not English, the page's language and
country should mentioned in the summaries. For example, "A Japanese news
site covering current events in Japan" for a site in Japanese. If the
language is not English, always add the name of the language to the tags
list. If the site is focused at a non-english-speaking country, such as a
site in French for a French audience, you should include the country name in
the tags list, such as "france" or "japan".
Editorially, you should not be summarizing the specific page content, but
the type of page. For example, if you are summarizing a news site, you
should not summarize the news items displayed currently, but the news site
itself, indicating the type of news being covered and style of coverage. If
you are summarizing a blog post, you should summarize the blog site itself.
If you are summarizing a corporate site, you should provide general
information about the company, and a high level overview of what type of
information the company provides on the site. If you are summarizing a
software-as-a-service site, you should provide a high level overview of the
service's features and target audience.
`
// Summarizer produces AI page summaries using the configured OpenAI API key.
type Summarizer struct{ APIKey string }

// NewSummarizer returns a Summarizer for the given OpenAI API key; an empty
// key is rejected with an error.
func NewSummarizer(apiKey string) (*Summarizer, error) {
	if apiKey == "" {
		return nil, fmt.Errorf("API key is required")
	}
	s := &Summarizer{APIKey: apiKey}
	return s, nil
}
// UniqueSortedList returns the distinct elements of input in ascending
// lexicographic order. The input slice is not modified.
func UniqueSortedList(input []string) []string {
	seen := make(map[string]struct{}, len(input))
	out := make([]string, 0, len(input))
	for _, s := range input {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		out = append(out, s)
	}
	sort.Strings(out)
	return out
}
// GetSummary sends url and its stripped page content to the OpenAI chat API
// and returns the parsed PageSummary. The model is instructed (via
// summarizerSystemPrompt) to reply with a JSON AIResponse envelope; a
// model-reported error or a missing content excerpt is returned as an error.
func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
	// Prefer the key supplied to NewSummarizer; previously this ignored
	// s.APIKey, re-read the environment, and called log.Fatal, which is
	// inappropriate in library code — return an error instead.
	apiKey := s.APIKey
	if apiKey == "" {
		apiKey = os.Getenv("OPENAI_API_KEY")
	}
	if apiKey == "" {
		return nil, fmt.Errorf("OpenAI API key is not set")
	}
	// The user message is the JSON object the system prompt describes.
	type request struct {
		URL     string `json:"url"`
		Content string `json:"content"`
	}
	reqJSON, err := json.Marshal(request{URL: url, Content: content})
	if err != nil {
		return nil, err
	}
	client := openai.NewClient(apiKey)
	resp, err := client.CreateChatCompletion(context.Background(), openai.ChatCompletionRequest{
		Model: model,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleSystem, Content: summarizerSystemPrompt},
			{Role: openai.ChatMessageRoleUser, Content: string(reqJSON)},
		},
	})
	if err != nil {
		return nil, err
	}
	jsonResp := resp.Choices[0].Message.Content
	var aiResp AIResponse
	if err := json.Unmarshal([]byte(jsonResp), &aiResp); err != nil {
		// Log the raw model output to aid debugging of malformed replies.
		log.Printf("error unmarshalling response: %v", err)
		log.Printf("response: '%s'", jsonResp)
		return nil, err
	}
	if aiResp.Error != "" {
		// %s avoids treating the model-supplied string as a format string:
		// the previous fmt.Errorf(aiResp.Error) fails go vet and would
		// misinterpret any '%' verbs in the message.
		return nil, fmt.Errorf("%s", aiResp.Error)
	}
	if aiResp.Result.ContentExcerpt == "" {
		return nil, fmt.Errorf("no content excerpt found")
	}
	// Tag the result with the ISO code and the English language name,
	// skipping empty values (iso6391.FromCode yields a zero value for
	// unknown codes, which previously added an empty-string tag).
	if code := strings.ToLower(aiResp.Result.Language); code != "" {
		aiResp.Result.Tags = append(aiResp.Result.Tags, code)
	}
	if name := iso6391.FromCode(aiResp.Result.Language).Name; name != "" {
		aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(name))
	}
	aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
	aiResp.Result.ID = schemaURL
	aiResp.Result.FetchedAt = time.Now().UTC()
	return &aiResp.Result, nil
}