// Command main fetches a list of web pages through a Puppeteer scraping API,
// asks aipagesummary for a summary of each page, and stores every summary as
// pretty-printed JSON in "<hostname>.json" in the working directory.
package main

import (
	"context"
	"encoding/json"
	"fmt"
	"math/rand"
	"net/url"
	"os"
	"strings"
	"time"
	"unicode/utf8"

	_ "github.com/joho/godotenv/autoload"
	"github.com/k0kubun/pp"
	"github.com/microcosm-cc/bluemonday"
	"sneak.berlin/go/aipagesummary"
	"sneak.berlin/go/puppeteerapiclient"
)

// maxContentBytes is the size above which stripped page text is truncated
// before it is handed to the summarizer.
const maxContentBytes = 15000

// truncatedContentBytes is the size oversized text is cut down to. It is
// deliberately kept at the original value, which is lower than
// maxContentBytes: text between the two sizes is passed through unchanged.
const truncatedContentBytes = 10000

// scrapeTimeout bounds a single scrape request.
const scrapeTimeout = 60 * time.Second

// main builds the list of URLs (a fixed set plus every whitespace-separated
// entry of alexa500.txt prefixed with "https://"), shuffles it, and
// summarizes each URL in turn.
func main() {
	bigList := []string{
		"https://www.bbc.com",
		"https://www.cnn.com",
		"https://news.ycombinator.com",
		"https://www.reddit.com",
		"https://www.wikipedia.org",
		"https://www.ford.com",
		"https://www.tesla.com",
		"https://www.apple.com",
		"https://www.microsoft.com",
		"https://www.google.com",
		"https://medium.com",
		"https://www.nytimes.com",
		"https://sneak.berlin",
	}

	// Now read the additional hosts from alexa500.txt.
	list, err := os.ReadFile("alexa500.txt")
	if err != nil {
		fmt.Println(err)
		return
	}

	// The loop variables are not called "url" so they do not shadow the
	// imported net/url package.
	for _, domain := range strings.Fields(string(list)) {
		bigList = append(bigList, "https://"+domain)
	}

	shuffle(bigList)

	for _, target := range bigList {
		fmt.Printf("Showing summary for %s\n", target)
		summarizeURL(target)
	}
}

// shuffle randomly permutes slice in place using the global math/rand source.
//
// NOTE(review): with a Go toolchain older than 1.20 the global source is
// seeded deterministically unless rand.Seed is called, so the order would
// repeat on every run — confirm the toolchain version in use.
func shuffle(slice []string) {
	rand.Shuffle(len(slice), func(i, j int) {
		slice[i], slice[j] = slice[j], slice[i]
	})
}

// parseHostnameFromURL returns the lower-cased hostname of rawURL with any
// port removed, or "" when rawURL cannot be parsed.
//
// URL.Hostname is used instead of net.SplitHostPort plus an unchecked
// *net.AddrError type assertion: it cannot panic, and it also strips the
// square brackets from IPv6 literals that carry no port.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	return strings.ToLower(parsedURL.Hostname())
}

// truncateUTF8 returns s cut down to at most limit bytes. When the cut would
// land inside a multi-byte UTF-8 sequence it is moved back to the start of
// that sequence, so a valid string never becomes invalid by truncation.
func truncateUTF8(s string, limit int) string {
	if limit < 0 {
		limit = 0
	}
	if len(s) <= limit {
		return s
	}

	// Step back over continuation bytes until the cut sits on a rune start.
	for limit > 0 && !utf8.RuneStart(s[limit]) {
		limit--
	}

	return s[:limit]
}

// summarizeURL scrapes rawURL, strips the HTML tags from the result, asks the
// summarizer for a summary, pretty-prints it, and writes it as JSON to
// "<hostname>.json".
//
// Any error is printed to standard output and the URL is abandoned. The
// function panics when PUPPETEER_API_URL or PUPPETEER_API_SALT is not set.
func summarizeURL(rawURL string) {
	// Initialize the summarizer with the OpenAI API key.
	s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
	if err != nil {
		fmt.Println(err)
		return
	}

	// Retrieve the Puppeteer API URL and salt from the environment.
	apiURL := os.Getenv("PUPPETEER_API_URL")
	apiSalt := os.Getenv("PUPPETEER_API_SALT")
	if apiURL == "" || apiSalt == "" {
		panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
	}

	client := puppeteerapiclient.NewClient(apiURL, apiSalt)

	// Bound the scrape request so one slow site cannot stall the whole run.
	ctx, cancel := context.WithTimeout(context.Background(), scrapeTimeout)
	defer cancel()

	// "body" is passed through to the scrape API unchanged
	// (presumably a selector — TODO confirm against puppeteerapiclient).
	scraped, err := client.Scrape(ctx, rawURL, "body")
	if err != nil {
		fmt.Println(err)
		return
	}

	// Use bluemonday to strip HTML tags from the scraped content.
	strippedContent := bluemonday.StripTagsPolicy().Sanitize(scraped.Content)

	// Truncate if too big, without cutting a multi-byte character in half.
	if len(strippedContent) > maxContentBytes {
		strippedContent = truncateUTF8(strippedContent, truncatedContentBytes)
	}

	res, err := s.GetSummary(rawURL, strippedContent)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Pretty print the result.
	pp.Print(res)
	fmt.Println()

	hn := parseHostnameFromURL(rawURL)
	fmt.Printf("Hostname: %s\n", hn)

	// Without a hostname the output file would be named just ".json".
	if hn == "" {
		fmt.Printf("not writing summary: no hostname in %q\n", rawURL)
		return
	}

	fn := fmt.Sprintf("%s.json", hn)
	if err := writePrettyJSONToFile(res, fn); err != nil {
		fmt.Println(err)
	}
}

// writePrettyJSONToFile serializes data as JSON indented with a single space
// and writes it to filename, creating the file or truncating an existing one.
//
// It returns a wrapped error when serialization, file creation, writing, or
// closing the file fails.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}

	file, err := os.Create(filename)
	if err != nil {
		return fmt.Errorf("error creating file: %w", err)
	}

	if _, err := file.Write(jsonData); err != nil {
		// The write error is the one worth reporting; the close result is
		// intentionally ignored on this failure path.
		_ = file.Close()
		return fmt.Errorf("error writing to file: %w", err)
	}

	// Close is checked explicitly because a failed close can mean the data
	// never reached the disk.
	if err := file.Close(); err != nil {
		return fmt.Errorf("error closing file: %w", err)
	}

	return nil
}