aipagesummary/cmd/test/main.go

176 lines
3.7 KiB
Go
Raw Permalink Normal View History

2024-06-03 19:11:29 +00:00
package main
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"net/url"
	"os"
	"strings"
	"time"

	_ "github.com/joho/godotenv/autoload"
	"github.com/k0kubun/pp"
	"github.com/microcosm-cc/bluemonday"
	"sneak.berlin/go/aipagesummary"
	"sneak.berlin/go/puppeteerapiclient"
)
// main builds a list of URLs (a hard-coded seed set plus hostnames read
// from alexa500.txt), shuffles it, and prints an AI-generated summary for
// each URL in turn.
func main() {
	// Seed list of sites to summarize; extended from alexa500.txt below.
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}
	// Read additional hostnames (whitespace-separated) from alexa500.txt.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		// Add context so the bare I/O error is attributable to this file.
		fmt.Printf("reading alexa500.txt: %v\n", err)
		return
	}
	// NOTE: renamed the loop variable from `url` — it shadowed the
	// imported net/url package.
	for _, host := range strings.Fields(string(list)) {
		// The file contains bare hostnames; prefix a scheme so they
		// are usable as URLs.
		bigList = append(bigList, "https://"+host)
	}
	shuffle(bigList)
	for _, u := range bigList {
		fmt.Printf("Showing summary for %s\n", u)
		summarizeURL(u)
	}
}
// shuffle randomizes the order of the elements of items in place, using
// the package-level math/rand source.
func shuffle(items []string) {
	swap := func(a, b int) {
		items[a], items[b] = items[b], items[a]
	}
	rand.Shuffle(len(items), swap)
}
// parseHostnameFromURL extracts the lowercased hostname (with any port
// stripped) from rawURL. It returns "" when the URL cannot be parsed, or
// when the host:port split fails for any reason other than a missing port.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	host := parsedURL.Host
	// If the host contains a port, strip it.
	hostname, _, err := net.SplitHostPort(host)
	if err != nil {
		// SplitHostPort fails on a bare hostname with "missing port in
		// address"; in that case host is already the hostname. The
		// original code used an unchecked type assertion
		// err.(*net.AddrError), which panics if the error is ever a
		// different concrete type — errors.As is the safe equivalent.
		var addrErr *net.AddrError
		if errors.As(err, &addrErr) && addrErr.Err == "missing port in address" {
			hostname = host
		} else {
			return ""
		}
	}
	return strings.ToLower(hostname)
}
// summarizeURL scrapes the page at url via the Puppeteer API, strips its
// HTML to plain text, asks the summarizer for a summary, pretty-prints the
// result, and writes it to <hostname>.json. Errors are printed and the
// function returns early; missing Puppeteer configuration panics.
func summarizeURL(url string) {
	// A fresh summarizer per call; OPENAI_API_KEY comes from the
	// environment (loaded by the godotenv autoload import).
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}
	// Puppeteer API endpoint and salt are required configuration.
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")
	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}
	client := puppeteerapiclient.NewClient(apiURL, apiSalt)
	// Bound the scrape so one hung page cannot stall the whole run.
	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	// Scrape the rendered <body> of the page.
	scraped, err := client.Scrape(ctx, url, "body")
	if err != nil {
		fmt.Println(err)
		return
	}
	// Strip all HTML tags so only text is sent to the summarizer.
	p := bluemonday.StripTagsPolicy()
	strippedContent := p.Sanitize(scraped.Content)
	if len(strippedContent) > 15000 {
		// NOTE(review): the threshold (15000) and the truncation length
		// (10000) differ — presumably deliberate headroom, but confirm.
		// Byte-level truncation can also split a multi-byte UTF-8 rune.
		strippedContent = strippedContent[:10000]
	}
	res, err := s.GetSummary(url, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}
	// Pretty-print the result to stdout.
	pp.Print(res)
	fmt.Println()
	hn := parseHostnameFromURL(url)
	fmt.Printf("Hostname: %s\n", hn)
	fn := fmt.Sprintf("%s.json", hn)
	// Bug fix: this error was previously discarded silently.
	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}
func writePrettyJSONToFile(data interface{}, filename string) error {
// Serialize the data to pretty-printed JSON
jsonData, err := json.MarshalIndent(data, "", " ")
if err != nil {
return fmt.Errorf("error serializing to JSON: %v", err)
}
// Create or truncate the file
file, err := os.Create(filename)
if err != nil {
return fmt.Errorf("error creating file: %v", err)
}
defer file.Close()
// Write the JSON data to the file
_, err = file.Write(jsonData)
if err != nil {
return fmt.Errorf("error writing to file: %v", err)
}
return nil
}