commit 60a35c8a6caa9ef7ba1f0c31e1431870f271d39e Author: sneak Date: Sun Jun 2 11:32:29 2024 -0700 initial diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..30bd623 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.env + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ee7d6a5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,14 @@ + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + Version 2, December 2004 + + Copyright (C) 2004 Sam Hocevar + + Everyone is permitted to copy and distribute verbatim or modified + copies of this license document, and changing it is allowed as long + as the name is changed. + + DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. You just DO WHAT THE FUCK YOU WANT TO. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ce89afa --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +.PHONY: all test build clean + +# Load environment variables from .env file +include .env +export $(shell sed 's/=.*//' .env) + +all: build + +# Build the Go project +build: + go build -o myapp ./... + +# Run tests with .env variables loaded and disable caching +test: + go test -v -count=1 ./... + +# Clean the build output +clean: + go clean + rm -f myapp diff --git a/README.md b/README.md new file mode 100644 index 0000000..41db0b1 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# puppeteer api client + +This is a golang client for the api exposed by the puppeteer docker container. 
// Client is a client for the HTTP API exposed by the puppeteer docker
// container (see https://github.com/l0co/docker-puppeteer-api).
type Client struct {
	// BaseURL is the root URL of the API server; the "/scrape" path is
	// appended to it. Trailing slashes are ignored.
	BaseURL string

	// Salt is the secret that is combined with the target URL to build the
	// hash sent with each request.
	Salt string

	// HTTPClient is the client used to send requests. When nil,
	// http.DefaultClient is used. Set it to customise timeouts or transport.
	HTTPClient *http.Client
}

// ScrapeRequest is the JSON body posted to the scrape endpoint.
type ScrapeRequest struct {
	URL      string `json:"url"`
	Selector string `json:"selector"`
	Hash     string `json:"hash"`
}

// ScrapeResponse is the result of a successful Scrape call. URL and Selector
// echo the arguments passed to Scrape; Content is the raw response body.
type ScrapeResponse struct {
	URL      string `json:"url"`
	Selector string `json:"selector"`
	Content  string `json:"content"`
}

// NewClient returns a Client for the API server at baseURL that signs its
// requests with salt. The returned client sends requests through
// http.DefaultClient unless HTTPClient is set afterwards.
func NewClient(baseURL, salt string) *Client {
	return &Client{
		BaseURL: baseURL,
		Salt:    salt,
	}
}

// CalculateHash returns the lowercase hex MD5 digest of url and the client's
// salt joined by a colon ("url:salt").
func (c *Client) CalculateHash(url string) string {
	// MD5 is kept as-is: presumably the server computes the same digest to
	// authenticate the request, so the algorithm cannot be changed here alone.
	sum := md5.Sum([]byte(url + ":" + c.Salt))
	return hex.EncodeToString(sum[:])
}

// httpClient returns the configured HTTP client, or http.DefaultClient when
// none has been set.
func (c *Client) httpClient() *http.Client {
	if c.HTTPClient != nil {
		return c.HTTPClient
	}
	return http.DefaultClient
}

// endpoint joins BaseURL and path, dropping trailing slashes from BaseURL so
// the result never contains a double slash at the join.
func (c *Client) endpoint(path string) string {
	base := c.BaseURL
	for len(base) > 0 && base[len(base)-1] == '/' {
		base = base[:len(base)-1]
	}
	return base + path
}

// Scrape asks the API server to load url and returns the content matched by
// selector.
//
// Both url and selector must be non-empty. ctx controls cancellation and the
// deadline of the HTTP request. An error is returned when the request cannot
// be built or sent, when the body cannot be read, or when the server answers
// with a non-2xx status; in every error case the returned ScrapeResponse is
// the zero value. Underlying errors are wrapped, so errors.Is and errors.As
// work on the result.
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
	if selector == "" {
		return ScrapeResponse{}, fmt.Errorf("selector is required")
	}
	if url == "" {
		return ScrapeResponse{}, fmt.Errorf("url is required")
	}

	requestBody, err := json.Marshal(ScrapeRequest{
		URL:      url,
		Selector: selector,
		Hash:     c.CalculateHash(url),
	})
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.endpoint("/scrape"), bytes.NewReader(requestBody))
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient().Do(req)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	// Read the body before looking at the status so the connection is
	// drained and can be reused by the transport.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to read response body: %w", err)
	}

	// A non-2xx answer carries an error page, not scraped content; do not
	// hand it back to the caller as a successful result.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return ScrapeResponse{}, fmt.Errorf("unexpected response status %d %s", resp.StatusCode, http.StatusText(resp.StatusCode))
	}

	// The response is HTML, so it is returned verbatim rather than being
	// decoded as JSON.
	return ScrapeResponse{
		URL:      url,
		Selector: selector,
		Content:  string(body),
	}, nil
}
v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=