package aipagesummary import ( "context" "encoding/json" "fmt" "log" "os" "sort" "strings" "time" iso6391 "github.com/emvi/iso-639-1" openai "github.com/sashabaranov/go-openai" ) const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json" type AIResponse struct { Success bool `json:"success"` Error string `json:"error, omitempty"` Result PageSummary `json:"result"` } type PageSummary struct { ID string `json:"$id"` FetchedAt time.Time URL string Tags []string Categories []string ContentExcerpt string Language string Summary100 string Summary500 string } const model = openai.GPT3Dot5Turbo const summarizerSystemPrompt = ` Your job is to summarize a webpage. You will receive a json object containing the url and body tag content with HTML tags stripped out. The key "url" will be a string representing a URL. The key "content" will be the body tag and its rendered contents. You will parse and interpret the "content" value and respond with a json object such as: { "success": true, "error": null, "result": { ... } } Set success to false and provide a descriptive string in "error" describing the error if you are unable to complete the task for any reason, such as a failure to parse the input content. Your response should be json and only json, and your response should always, always parse as valid json. Do not provide any other output except json, no description. Result should be the following: type PageSummary struct { URL string Tags []string Categories []string ContentExcerpt string Language string Summary100 string Summary500 string } If the page content primarily describes an error condition or failure, please do not return a summary. Instead, set the error field to a descriptive string and set success to false. Omit the result field. Try to identify if the provided URL and content are a main page on a site, or a specific article, post, product, or other type of content on a site. If it is a main page, you should provide a summary of the site itself, and if it is a specific article, post, product, or other type of content, you should provide a summary of that specific article, post, item, or product. The tags and category should reflect this distinction. The output URL should be a canonicalized version of the input URL. The output tags is a list of strings, each a short one or two or three word tag, using underscores instead of spaces, to indicate page attributes, such as 'blogpost', 'documentation', 'article', 'listicle', et cetera. You can use tags not listed in these instructions, come up with your own. You should endeavor to come up with at least 5 tags for each URL, up to a maximum of 20. For example, you might tag a personal blog site that contains a lot of posts about the javascript programming language with "javascript" and "programming". Tags should be lowercase. Important tags that might apply to sites are things like "defunct" for sites or services that have been shut down, or "cdn" for domains or sites that are primarily content delivery networks and not directly accessed by users, such as fbcdn.net. Sites that are open to the public for signup and posting of user-generated content should receive tags of "public" or "ugc". Image-focused sites such as image hosting or photo sharing or art should receive the "images" tag. Photography focused sites should receive the tag "photography". Blogs should receive "blog", and specific blog posts should receive both "blog" and "blogpost". Product pages should receive the tag "product". Try to keep tags as single words. For example, a payment processing company site should receive tags "corporate", "payments", "service_provider", among others. The output field categories should be a list of broad categories that a website can fit into. "personal_blog" is a good example. "corporate_website" is another. "tech_startup" is another. "automobile_retailer" is another. These are just examples, not an inclusive list. "news_aggregator" and "discussion_forum" are other examples. You may include up to 10 categories, with a minimum of one. Categories should be lowercase. Please try to provide at least three categories in your response. The output field "ContentExcerpt" should be a substring of the input content, with whitespace collapsed. Try to find the main or most relevant section of the page, such as the first paragraph of an article. Do not include site header or footer boilerplate text. Truncate this field at 2 kilobytes of text. It should be taken directly from the input. Do not summarize in this field, but include a snippet of the provided content. It should be a substring of the input "content" field. Language is the two digit ISO language code representing the main language of the content input. Summary100, and Summary500 output fields are english language summaries of the type of page, with a maximum of 100 words and 500 words respectively. These fields should not contain summaries of the page content! They should contain a short description of the page itself, such as "A personal blog about programming and technology" or "A news site covering the latest in technology" or "An article about the history of the internet". For the tone of these Summary fields, you should be neutral and informative. Please don't repeat imperative statements or calls to action from the source site, such as instructions to sign up, purchase, or download. You should not include any opinions or subjective statements in the summaries. Avoid using overly effusive descriptions or adjectives. Unless the site is focused on a specific technical topic, avoid discussing technical things like logging in, cookies, or preference settings that are common to many websites. Given that the Summary fields are in English and for an English-speaking audience, if the page's language is not English, the page's language and country should mentioned in the summaries. For example, "A Japanese news site covering current events in Japan" for a site in Japanese. If the language is not English, always add the name of the language to the tags list. If the site is focused at a non-english-speaking country, such as a site in French for a French audience, you should include the country name in the tags list, such as "france" or "japan". Editorially, you should not be summarizing the specific page content, but the type of page. For example, if you are summarizing a news site, you should not summarize the news items displayed currently, but the news site itself, indicating the type of news being covered and style of coverage. If you are summarizing a blog post, you should summarize the blog site itself. If you are summarizing a corporate site, you should provide general information about the company, and a high level overview of what type of information the company provides on the site. If you are summarizing a software-as-a-service site, you should provide a high level overview of the service's features and target audience. ` type Summarizer struct{ APIKey string } func NewSummarizer(apiKey string) (*Summarizer, error) { if apiKey == "" { return nil, fmt.Errorf("API key is required") } return &Summarizer{ APIKey: apiKey, }, nil } func UniqueSortedList(input []string) []string { // Create a map to track unique elements uniqueMap := make(map[string]struct{}) for _, item := range input { uniqueMap[item] = struct{}{} } // Create a slice from the keys of the map uniqueList := make([]string, 0, len(uniqueMap)) for key := range uniqueMap { uniqueList = append(uniqueList, key) } // Sort the unique list sort.Strings(uniqueList) return uniqueList } func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) { apiKey := os.Getenv("OPENAI_API_KEY") if apiKey == "" { log.Fatal("OPENAI_API_KEY environment variable is not set") } type Request struct { URL string `json:"url"` Content string `json:"content"` } req := Request{ URL: url, Content: content, } reqJSON, err := json.Marshal(req) if err != nil { return nil, err } client := openai.NewClient(apiKey) ctx := context.Background() sumReq := openai.ChatCompletionRequest{ Model: model, Messages: []openai.ChatCompletionMessage{ { Role: openai.ChatMessageRoleSystem, Content: summarizerSystemPrompt, }, { Role: openai.ChatMessageRoleUser, Content: string(reqJSON), }, }, } resp, err := client.CreateChatCompletion(ctx, sumReq) if err != nil { return nil, err } jsonResp := resp.Choices[0].Message.Content var aiResp AIResponse err = json.Unmarshal([]byte(jsonResp), &aiResp) if err != nil { fmt.Printf("Error unmarshalling response: %v\n", err) fmt.Printf("Response: '%s'\n", jsonResp) return nil, err } if aiResp.Error != "" { return nil, fmt.Errorf(aiResp.Error) } if aiResp.Result.ContentExcerpt == "" { return nil, fmt.Errorf("No content excerpt found") } langNameInEnglish := iso6391.FromCode(aiResp.Result.Language).Name aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(aiResp.Result.Language)) aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(langNameInEnglish)) aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags) aiResp.Result.ID = schemaURL aiResp.Result.FetchedAt = time.Now().UTC() return &aiResp.Result, nil }