176 lines
3.7 KiB
Go
176 lines
3.7 KiB
Go
|
package main
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"math/rand"
|
||
|
"net"
|
||
|
"net/url"
|
||
|
"os"
|
||
|
"strings"
|
||
|
"time"
|
||
|
|
||
|
_ "github.com/joho/godotenv/autoload"
|
||
|
"github.com/k0kubun/pp"
|
||
|
"github.com/microcosm-cc/bluemonday"
|
||
|
"sneak.berlin/go/aipagesummary"
|
||
|
"sneak.berlin/go/puppeteerapiclient"
|
||
|
)
|
||
|
|
||
|
func main() {
|
||
|
bigList := []string{
|
||
|
"https://www.bbc.com",
|
||
|
"https://www.cnn.com",
|
||
|
"https://news.ycombinator.com",
|
||
|
"https://www.reddit.com",
|
||
|
"https://www.wikipedia.org",
|
||
|
"https://www.ford.com",
|
||
|
"https://www.tesla.com",
|
||
|
"https://www.apple.com",
|
||
|
"https://www.microsoft.com",
|
||
|
"https://www.google.com",
|
||
|
"https://medium.com",
|
||
|
"https://www.nytimes.com",
|
||
|
"https://sneak.berlin",
|
||
|
}
|
||
|
|
||
|
// now read urls from alexa500.txt
|
||
|
list, err := os.ReadFile("alexa500.txt")
|
||
|
if err != nil {
|
||
|
fmt.Println(err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
urls := strings.Fields(string(list))
|
||
|
for _, url := range urls {
|
||
|
url = "https://" + url
|
||
|
bigList = append(bigList, url)
|
||
|
|
||
|
}
|
||
|
|
||
|
shuffle(bigList)
|
||
|
|
||
|
for _, url := range bigList {
|
||
|
fmt.Printf("Showing summary for %s\n", url)
|
||
|
summarizeURL(url)
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
// shuffle randomly reorders the elements of slice in place, using the
// package-level math/rand generator.
func shuffle(slice []string) {
	// swap exchanges the elements at positions a and b.
	swap := func(a, b int) {
		tmp := slice[a]
		slice[a] = slice[b]
		slice[b] = tmp
	}
	rand.Shuffle(len(slice), swap)
}
|
||
|
|
||
|
// parseHostnameFromURL returns the lowercased hostname of rawURL with any port
// and any IPv6 brackets removed. It returns "" when rawURL cannot be parsed or
// when its host part is malformed.
func parseHostnameFromURL(rawURL string) string {
	parsedURL, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}

	host := parsedURL.Host

	hostname, _, err := net.SplitHostPort(host)
	if err != nil {
		// net exports no sentinel for "no port present", so the AddrError
		// message is the only way to tell that case apart. The comma-ok form
		// keeps an unexpected error type from panicking.
		addrErr, ok := err.(*net.AddrError)
		if !ok || addrErr.Err != "missing port in address" {
			return ""
		}

		// No port: the whole host is the hostname. Strip IPv6 brackets so the
		// result matches what SplitHostPort yields when a port is present
		// ("[::1]:80" -> "::1").
		hostname = strings.TrimSuffix(strings.TrimPrefix(host, "["), "]")
	}

	return strings.ToLower(hostname)
}
|
||
|
|
||
|
func summarizeURL(url string) {
|
||
|
|
||
|
// Initialize the summarizer with the OpenAI API key
|
||
|
s, err := aipagesummary.NewSummarizer(os.Getenv("OPENAI_API_KEY"))
|
||
|
if err != nil {
|
||
|
fmt.Println(err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// Retrieve Puppeteer API URL and salt from environment variables
|
||
|
apiURL := os.Getenv("PUPPETEER_API_URL")
|
||
|
apiSalt := os.Getenv("PUPPETEER_API_SALT")
|
||
|
|
||
|
if apiURL == "" || apiSalt == "" {
|
||
|
panic("PUPPETEER_API_URL and PUPPETEER_API_SALT must be set")
|
||
|
}
|
||
|
|
||
|
// Initialize the Puppeteer client
|
||
|
client := puppeteerapiclient.NewClient(apiURL, apiSalt)
|
||
|
|
||
|
// Create a context with a timeout for the scrape request
|
||
|
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||
|
defer cancel()
|
||
|
|
||
|
// Scrape the content from the given URL
|
||
|
|
||
|
scraped, err := client.Scrape(ctx, url, "body")
|
||
|
if err != nil {
|
||
|
fmt.Println(err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// Use bluemonday to strip HTML tags from the scraped content
|
||
|
p := bluemonday.StripTagsPolicy()
|
||
|
strippedContent := p.Sanitize(scraped.Content)
|
||
|
|
||
|
if len(strippedContent) > 15000 {
|
||
|
// truncate if too big
|
||
|
strippedContent = strippedContent[:10000]
|
||
|
}
|
||
|
|
||
|
// Get the summary of the stripped content
|
||
|
res, err := s.GetSummary(url, strippedContent)
|
||
|
if err != nil {
|
||
|
fmt.Println(err)
|
||
|
return
|
||
|
}
|
||
|
// pretty print the result:
|
||
|
pp.Print(res)
|
||
|
fmt.Println()
|
||
|
|
||
|
hn := parseHostnameFromURL(url)
|
||
|
fmt.Printf("Hostname: %s\n", hn)
|
||
|
|
||
|
fn := fmt.Sprintf("%s.json", hn)
|
||
|
|
||
|
writePrettyJSONToFile(res, fn)
|
||
|
|
||
|
}
|
||
|
|
||
|
// writePrettyJSONToFile serializes data as indented JSON and writes it to
// filename, creating the file if needed and truncating it otherwise.
//
// It returns a wrapped error if data cannot be marshaled or if the file cannot
// be created, written or closed. Nothing is written when marshaling fails.
func writePrettyJSONToFile(data interface{}, filename string) error {
	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return fmt.Errorf("error serializing to JSON: %w", err)
	}

	// os.WriteFile creates/truncates, writes and closes in one call and
	// reports a failed Close, which a deferred file.Close() would discard.
	// Mode 0666 (before umask) matches what os.Create uses.
	if err := os.WriteFile(filename, jsonData, 0666); err != nil {
		return fmt.Errorf("error writing to file: %w", err)
	}

	return nil
}
|