266 lines
9.3 KiB
Go
266 lines
9.3 KiB
Go
package aipagesummary
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
iso6391 "github.com/emvi/iso-639-1"
|
|
openai "github.com/sashabaranov/go-openai"
|
|
)
|
|
|
|
// schemaURL identifies the JSON schema of a PageSummary document. GetSummary
// copies it into PageSummary.ID, which is serialized under the "$id" key.
const schemaURL = "https://static.sneak.cloud/2024/2024-06-02/2024-06-02.aipagesummary.schema.json"
|
|
|
|
type AIResponse struct {
|
|
Success bool `json:"success"`
|
|
Error string `json:"error, omitempty"`
|
|
Result PageSummary `json:"result"`
|
|
}
|
|
|
|
type PageSummary struct {
|
|
ID string `json:"$id"`
|
|
FetchedAt time.Time
|
|
URL string
|
|
Tags []string
|
|
Categories []string
|
|
ContentExcerpt string
|
|
Language string
|
|
Summary100 string
|
|
Summary500 string
|
|
}
|
|
|
|
// model is the chat model named in the completion request built by GetSummary.
const model = openai.GPT3Dot5Turbo
|
|
|
|
// summarizerSystemPrompt is the system message sent with every summary
// request. It tells the model to answer with a JSON envelope matching
// AIResponse whose "result" matches PageSummary, and lays out the editorial
// rules for tags, categories, the content excerpt, the language code and the
// two summaries. The text is sent verbatim, so it must not contain anything
// other than instructions for the model.
const summarizerSystemPrompt = `

Your job is to summarize a webpage.

You will receive a json object containing the url and body tag content with
HTML tags stripped out. The key "url" will be a string representing a URL.
The key "content" will be the body tag and its rendered contents.

You will parse and interpret the "content" value and respond with a json
object such as:

{
    "success": true,
    "error": null,
    "result": { ... }
}

Set success to false and provide a descriptive string in "error" describing
the error if you are unable to complete the task for any reason, such as a
failure to parse the input content. Your response should be json and only
json, and your response should always, always parse as valid json. Do not
provide any other output except json, no description.


Result should be the following:

type PageSummary struct {
    URL string
    Tags []string
    Categories []string
    ContentExcerpt string
    Language string
    Summary100 string
    Summary500 string
}

If the page content primarily describes an error condition or failure,
please do not return a summary. Instead, set the error field to a
descriptive string and set success to false. Omit the result field.

Try to identify if the provided URL and content are a main page on a site,
or a specific article, post, product, or other type of content on a site.
If it is a main page, you should provide a summary of the site itself, and
if it is a specific article, post, product, or other type of content, you
should provide a summary of that specific article, post, item, or product.
The tags and category should reflect this distinction.

The output URL should be a canonicalized version of the input URL.

The output tags is a list of strings, each a short one or two or three word
tag, using underscores instead of spaces, to indicate page attributes, such
as 'blogpost', 'documentation', 'article', 'listicle', et cetera. You
can use tags not listed in these instructions, come up with your own. You
should endeavor to come up with at least 5 tags for each URL, up to a
maximum of 20. For example, you might tag a personal blog site that
contains a lot of posts about the javascript programming language with
"javascript" and "programming". Tags should be lowercase. Important tags
that might apply to sites are things like "defunct" for sites or services
that have been shut down, or "cdn" for domains or sites that are primarily
content delivery networks and not directly accessed by users, such as
fbcdn.net. Sites that are open to the public for signup and posting of
user-generated content should receive tags of "public" or "ugc".
Image-focused sites such as image hosting or photo sharing or art should
receive the "images" tag. Photography focused sites should receive the tag
"photography". Blogs should receive "blog", and specific blog posts should
receive both "blog" and "blogpost". Product pages should receive the tag
"product". Try to keep tags as single words. For example, a payment
processing company site should receive tags "corporate", "payments",
"service_provider", among others.

The output field categories should be a list of broad categories that a
website can fit into. "personal_blog" is a good example.
"corporate_website" is another. "tech_startup" is another.
"automobile_retailer" is another. These are just examples, not an inclusive
list. "news_aggregator" and "discussion_forum" are other examples. You may
include up to 10 categories, with a minimum of one. Categories should be
lowercase. Please try to provide at least three categories in your response.

The output field "ContentExcerpt" should be a substring of the input
content, with whitespace collapsed. Try to find the main or most relevant
section of the page, such as the first paragraph of an article. Do not
include site header or footer boilerplate text. Truncate this field at 2
kilobytes of text. It should be taken directly from the input. Do not
summarize in this field, but include a snippet of the provided content. It
should be a substring of the input "content" field.

Language is the two-letter ISO 639-1 code representing the main language
of the content input.

Summary100, and Summary500 output fields are english language summaries of
the type of page, with a maximum of 100 words and 500 words respectively.
These fields should not contain summaries of the page content! They should
contain a short description of the page itself, such as "A personal blog
about programming and technology" or "A news site covering the latest in
technology" or "An article about the history of the internet". For the tone
of these Summary fields, you should be neutral and informative. Please
don't repeat imperative statements or calls to action from the source site,
such as instructions to sign up, purchase, or download. You should not
include any opinions or subjective statements in the summaries. Avoid using
overly effusive descriptions or adjectives. Unless the site is focused on a
specific technical topic, avoid discussing technical things like logging in,
cookies, or preference settings that are common to many websites.

Given that the Summary fields are in English and for an English-speaking
audience, if the page's language is not English, the page's language and
country should be mentioned in the summaries. For example, "A Japanese news
site covering current events in Japan" for a site in Japanese. If the
language is not English, always add the name of the language to the tags
list. If the site is focused at a non-english-speaking country, such as a
site in French for a French audience, you should include the country name in
the tags list, such as "france" or "japan".

Editorially, you should not be summarizing the specific page content, but
the type of page. For example, if you are summarizing a news site, you
should not summarize the news items displayed currently, but the news site
itself, indicating the type of news being covered and style of coverage. If
you are summarizing a blog post, you should summarize the blog site itself.
If you are summarizing a corporate site, you should provide general
information about the company, and a high level overview of what type of
information the company provides on the site. If you are summarizing a
software-as-a-service site, you should provide a high level overview of the
service's features and target audience.

`
|
|
|
|
// Summarizer produces page summaries. It carries the API key handed to
// NewSummarizer.
type Summarizer struct {
	// APIKey is the caller-supplied credential.
	APIKey string
}

// NewSummarizer returns a Summarizer holding apiKey. An empty apiKey is
// rejected with an error and a nil Summarizer.
func NewSummarizer(apiKey string) (*Summarizer, error) {
	if len(apiKey) == 0 {
		return nil, fmt.Errorf("API key is required")
	}

	summarizer := new(Summarizer)
	summarizer.APIKey = apiKey

	return summarizer, nil
}
|
|
|
|
// UniqueSortedList returns the distinct strings of input in ascending order.
// The input slice is left untouched and the result is never nil, even when
// input is empty.
func UniqueSortedList(input []string) []string {
	// Work on a private copy so the caller's slice keeps its order.
	sorted := make([]string, len(input))
	copy(sorted, input)
	sort.Strings(sorted)

	// After sorting, duplicates are adjacent: keep an element only when it
	// differs from the one kept before it. The kept prefix reuses sorted's
	// backing array and never overtakes the read position.
	distinct := sorted[:0]
	for idx, item := range sorted {
		if idx > 0 && item == distinct[len(distinct)-1] {
			continue
		}
		distinct = append(distinct, item)
	}

	return distinct
}
|
|
|
|
func (s *Summarizer) GetSummary(url, content string) (*PageSummary, error) {
|
|
apiKey := os.Getenv("OPENAI_API_KEY")
|
|
if apiKey == "" {
|
|
log.Fatal("OPENAI_API_KEY environment variable is not set")
|
|
}
|
|
|
|
type Request struct {
|
|
URL string `json:"url"`
|
|
Content string `json:"content"`
|
|
}
|
|
|
|
req := Request{
|
|
URL: url,
|
|
Content: content,
|
|
}
|
|
|
|
reqJSON, err := json.Marshal(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
client := openai.NewClient(apiKey)
|
|
|
|
ctx := context.Background()
|
|
|
|
sumReq := openai.ChatCompletionRequest{
|
|
Model: model,
|
|
Messages: []openai.ChatCompletionMessage{
|
|
{
|
|
Role: openai.ChatMessageRoleSystem,
|
|
Content: summarizerSystemPrompt,
|
|
},
|
|
{
|
|
Role: openai.ChatMessageRoleUser,
|
|
Content: string(reqJSON),
|
|
},
|
|
},
|
|
}
|
|
|
|
resp, err := client.CreateChatCompletion(ctx, sumReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
jsonResp := resp.Choices[0].Message.Content
|
|
var aiResp AIResponse
|
|
err = json.Unmarshal([]byte(jsonResp), &aiResp)
|
|
if err != nil {
|
|
fmt.Printf("Error unmarshalling response: %v\n", err)
|
|
fmt.Printf("Response: '%s'\n", jsonResp)
|
|
return nil, err
|
|
}
|
|
|
|
if aiResp.Error != "" {
|
|
return nil, fmt.Errorf(aiResp.Error)
|
|
}
|
|
|
|
if aiResp.Result.ContentExcerpt == "" {
|
|
return nil, fmt.Errorf("No content excerpt found")
|
|
}
|
|
|
|
langNameInEnglish := iso6391.FromCode(aiResp.Result.Language).Name
|
|
aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(aiResp.Result.Language))
|
|
aiResp.Result.Tags = append(aiResp.Result.Tags, strings.ToLower(langNameInEnglish))
|
|
|
|
aiResp.Result.Tags = UniqueSortedList(aiResp.Result.Tags)
|
|
aiResp.Result.ID = schemaURL
|
|
aiResp.Result.FetchedAt = time.Now().UTC()
|
|
|
|
return &aiResp.Result, nil
|
|
}
|