This commit is contained in:
Jeffrey Paul 2024-06-02 11:32:29 -07:00
commit 60a35c8a6c
8 changed files with 229 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.env

14
LICENSE Normal file
View File

@ -0,0 +1,14 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

20
Makefile Normal file
View File

@ -0,0 +1,20 @@
# These targets name commands rather than files, so they are declared phony.
.PHONY: all test build clean
# Load environment variables from .env file
# .env is read as a makefile fragment, so each KEY=value line becomes a make
# variable. The sed expression strips everything from the first '=' onward,
# leaving only the variable names, which are then exported to recipe commands.
include .env
export $(shell sed 's/=.*//' .env)
# Default target: a plain `make` runs the build.
all: build
# Build the Go project
# NOTE(review): the Go sources visible in this commit form a library package,
# not a main package -- confirm that `-o myapp` produces a useful artifact.
build:
go build -o myapp ./...
# Run tests with .env variables loaded and disable caching
# (-count=1 forces the tests to run again instead of reusing a cached result.)
test:
go test -v -count=1 ./...
# Clean the build output
clean:
go clean
rm -f myapp

36
README.md Normal file
View File

@ -0,0 +1,36 @@
# puppeteer api client
This is a Go client for the HTTP API exposed by the puppeteer Docker container:
https://github.com/l0co/docker-puppeteer-api
# usage
```go
package main
import (
"context"
"fmt"
"log"
"os"
"time"
"sneak.berlin/go/puppeteerapiclient"
)
func main() {
apiURL := os.Getenv("API_URL")
apiSalt := os.Getenv("API_SALT")
client := puppeteerapiclient.NewClient(apiURL, apiSalt)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
response, err := client.Scrape(ctx, "https://news.ycombinator.com", ".athing .title")
if err != nil {
log.Fatal(err)
}
fmt.Println(response.Content)
}
```
# License
WTFPL

87
client.go Normal file
View File

@ -0,0 +1,87 @@
package puppeteerapiclient
import (
"bytes"
"context"
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
)
// Client holds the settings needed to talk to a puppeteer API instance.
type Client struct {
	// BaseURL is the address of the API; Scrape appends "/scrape" to it.
	BaseURL string

	// Salt is the shared secret mixed into the per-request hash
	// computed by CalculateHash.
	Salt string
}
// ScrapeRequest is the JSON payload that Scrape sends to the API.
type ScrapeRequest struct {
	// URL is the page to load.
	URL string `json:"url"`

	// Selector is the selector whose matching content is requested.
	Selector string `json:"selector"`

	// Hash is the value produced by Client.CalculateHash for URL.
	Hash string `json:"hash"`
}
// ScrapeResponse is the result returned by Client.Scrape.
type ScrapeResponse struct {
	// URL echoes the url argument that was passed to Scrape.
	URL string `json:"url"`

	// Selector echoes the selector argument that was passed to Scrape.
	Selector string `json:"selector"`

	// Content is the response body returned by the API, as a string.
	Content string `json:"content"`
}
// NewClient returns a Client that talks to the API at baseURL and signs
// requests with salt.
func NewClient(baseURL, salt string) *Client {
	c := new(Client)
	c.BaseURL = baseURL
	c.Salt = salt
	return c
}
// CalculateHash returns the hex-encoded MD5 digest of url, a colon, and the
// client's salt, in that order. Scrape sends this value as the request's
// "hash" field.
func (c *Client) CalculateHash(url string) string {
	digest := md5.New()
	// Feed "<url>:<salt>" piecewise instead of building the joined string.
	digest.Write([]byte(url))
	digest.Write([]byte(":"))
	digest.Write([]byte(c.Salt))
	return hex.EncodeToString(digest.Sum(nil))
}
// Scrape asks the API to load url and return the content matched by selector.
//
// It POSTs a JSON ScrapeRequest (url, selector and the hash produced by
// CalculateHash) to BaseURL + "/scrape" and returns the raw response body as
// ScrapeResponse.Content, with URL and Selector echoing the arguments.
//
// An error is returned when selector is empty, when the request cannot be
// built or sent, when the body cannot be read, or when the API answers with a
// non-2xx status. Underlying errors are wrapped, so callers can test for
// context.Canceled or context.DeadlineExceeded with errors.Is. Cancellation
// and timeouts are controlled through ctx.
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
	if selector == "" {
		return ScrapeResponse{}, fmt.Errorf("selector is required")
	}

	requestBody, err := json.Marshal(ScrapeRequest{
		URL:      url,
		Selector: selector,
		Hash:     c.CalculateHash(url),
	})
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/scrape", bytes.NewBuffer(requestBody))
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// Use the shared default client instead of allocating one per call;
	// the deadline comes from ctx.
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return ScrapeResponse{}, fmt.Errorf("failed to read response body: %w", err)
	}

	// Do only reports transport-level failures; an HTTP error status still
	// arrives with a nil error, so it has to be rejected explicitly or the
	// error page would be handed back as scraped content.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		const maxErrBody = 512 // keep error messages bounded
		snippet := body
		if len(snippet) > maxErrBody {
			snippet = snippet[:maxErrBody]
		}
		return ScrapeResponse{}, fmt.Errorf("scrape request failed with status %d: %s", resp.StatusCode, snippet)
	}

	// The body is returned as-is rather than being decoded as JSON.
	return ScrapeResponse{
		URL:      url,
		Selector: selector,
		Content:  string(body),
	}, nil
}

63
client_test.go Normal file
View File

@ -0,0 +1,63 @@
package puppeteerapiclient
import (
"context"
"log"
"os"
"testing"
"time"
"github.com/joho/godotenv"
)
// TestCase pairs a page URL with the selector to scrape from it.
// Field order matters: TestScrape may build values positionally.
type TestCase struct {
	URL, Selector string
}
// TestMain loads environment variables through godotenv before running the
// package's tests, and exits with the status returned by the test run.
// A failure to load aborts the run and reports the underlying error.
func TestMain(m *testing.M) {
	if err := godotenv.Load(); err != nil {
		// Include the cause: a missing file and a malformed file need
		// different fixes, and the bare message hid which one it was.
		log.Fatalf("Error loading .env file: %v", err)
	}
	os.Exit(m.Run())
}
// TestScrape is an integration test: it calls the API named by the API_URL
// and API_SALT environment variables and fails if any case returns an error
// or empty content. Each URL/selector pair runs as its own subtest with a
// 10-second timeout.
func TestScrape(t *testing.T) {
	baseURL, salt := os.Getenv("API_URL"), os.Getenv("API_SALT")
	if baseURL == "" || salt == "" {
		t.Fatal("API_URL or API_SALT environment variables not set")
	}

	api := NewClient(baseURL, salt)

	cases := []TestCase{
		{URL: "https://sneak.berlin", Selector: "body"},
		{URL: "https://nytimes.com", Selector: "h1"},
		{URL: "https://sneak.berlin", Selector: "#quoteCycler"},
		{URL: "https://news.ycombinator.com", Selector: ".athing .title"},
	}

	for i := range cases {
		tc := cases[i]
		name := tc.URL + " " + tc.Selector

		t.Run(name, func(t *testing.T) {
			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer cancel()

			result, err := api.Scrape(ctx, tc.URL, tc.Selector)
			switch {
			case err != nil:
				t.Fatalf("Error scraping content: %v", err)
			case result.Content == "":
				t.Fatal("No content fetched")
			}

			t.Logf("Scraped Content: %s", result.Content)
			t.Logf("URL: %s", result.URL)
			t.Logf("Selector: %s", result.Selector)
		})
	}
}

5
go.mod Normal file
View File

@ -0,0 +1,5 @@
module sneak.berlin/go/puppeteerapiclient
go 1.22.2
require github.com/joho/godotenv v1.5.1

2
go.sum Normal file
View File

@ -0,0 +1,2 @@
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=