initial
This commit is contained in:
commit
60a35c8a6c
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.env
|
||||
|
14
LICENSE
Normal file
14
LICENSE
Normal file
@ -0,0 +1,14 @@
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
Version 2, December 2004
|
||||
|
||||
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim or modified
|
||||
copies of this license document, and changing it is allowed as long
|
||||
as the name is changed.
|
||||
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. You just DO WHAT THE FUCK YOU WANT TO.
|
||||
|
20
Makefile
Normal file
20
Makefile
Normal file
@ -0,0 +1,20 @@
|
||||
.PHONY: all test build clean
|
||||
|
||||
# Load environment variables from .env file
|
||||
include .env
|
||||
export $(shell sed 's/=.*//' .env)
|
||||
|
||||
all: build
|
||||
|
||||
# Build the Go project
|
||||
build:
|
||||
go build -o myapp ./...
|
||||
|
||||
# Run tests with .env variables loaded and disable caching
|
||||
test:
|
||||
go test -v -count=1 ./...
|
||||
|
||||
# Clean the build output
|
||||
clean:
|
||||
go clean
|
||||
rm -f myapp
|
36
README.md
Normal file
36
README.md
Normal file
@ -0,0 +1,36 @@
|
||||
# puppeteer api client
|
||||
|
||||
This is a Go client for the HTTP API exposed by the puppeteer Docker container.
|
||||
|
||||
https://github.com/l0co/docker-puppeteer-api
|
||||
|
||||
# usage
|
||||
|
||||
```go
|
||||
package main
|
||||
|
||||
import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"sneak.berlin/go/puppeteerapiclient"
)

func main() {
	apiURL := os.Getenv("API_URL")
	apiSalt := os.Getenv("API_SALT")

	client := puppeteerapiclient.NewClient(apiURL, apiSalt)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
|
||||
defer cancel()
|
||||
|
||||
response, err := client.Scrape(ctx, "https://news.ycombinator.com", ".athing .title")
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Println(response.Content)
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
# License
|
||||
|
||||
WTFPL
|
87
client.go
Normal file
87
client.go
Normal file
@ -0,0 +1,87 @@
|
||||
package puppeteerapiclient
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/md5"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// Client holds the settings needed to call the puppeteer scrape API.
type Client struct {
	// BaseURL is the root of the API; Scrape appends "/scrape" to it.
	BaseURL string
	// Salt is combined with the target URL by CalculateHash to produce the
	// hash sent with every request.
	Salt string
}
|
||||
|
||||
// ScrapeRequest is the payload that Client.Scrape marshals to JSON and POSTs
// to the scrape endpoint.
type ScrapeRequest struct {
	// URL is the page to scrape.
	URL string `json:"url"`
	// Selector picks the part of the page to return (the tests in this
	// package pass CSS-style selectors such as "h1" or ".athing .title").
	Selector string `json:"selector"`
	// Hash is the value produced by Client.CalculateHash for URL.
	Hash string `json:"hash"`
}
|
||||
|
||||
// ScrapeResponse is the result returned by Client.Scrape. Scrape fills it
// from its own arguments and the raw response body; it is not decoded from
// JSON sent by the server.
type ScrapeResponse struct {
	// URL is the URL that was passed to Scrape.
	URL string `json:"url"`
	// Selector is the selector that was passed to Scrape.
	Selector string `json:"selector"`
	// Content is the response body as a string.
	Content string `json:"content"`
}
|
||||
|
||||
func NewClient(baseURL, salt string) *Client {
|
||||
return &Client{
|
||||
BaseURL: baseURL,
|
||||
Salt: salt,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) CalculateHash(url string) string {
|
||||
// The hash should be calculated by appending the salt to the URL with a colon separator.
|
||||
data := url + ":" + c.Salt
|
||||
hash := md5.Sum([]byte(data))
|
||||
return hex.EncodeToString(hash[:])
|
||||
}
|
||||
|
||||
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
|
||||
if selector == "" {
|
||||
return ScrapeResponse{}, fmt.Errorf("selector is required")
|
||||
}
|
||||
|
||||
hash := c.CalculateHash(url)
|
||||
|
||||
requestBody, err := json.Marshal(ScrapeRequest{
|
||||
URL: url,
|
||||
Selector: selector,
|
||||
Hash: hash,
|
||||
})
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %v", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/scrape", bytes.NewBuffer(requestBody))
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("failed to create request: %v", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("failed to send request: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return ScrapeResponse{}, fmt.Errorf("failed to read response body: %v", err)
|
||||
}
|
||||
|
||||
// Since the response is HTML, we don't need to unmarshal JSON.
|
||||
content := string(body)
|
||||
|
||||
return ScrapeResponse{
|
||||
URL: url,
|
||||
Selector: selector,
|
||||
Content: content,
|
||||
}, nil
|
||||
}
|
63
client_test.go
Normal file
63
client_test.go
Normal file
@ -0,0 +1,63 @@
|
||||
package puppeteerapiclient
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/joho/godotenv"
|
||||
)
|
||||
|
||||
// TestCase is one scrape scenario: the page to fetch and the selector to
// extract from it.
type TestCase struct {
	URL, Selector string
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
err := godotenv.Load()
|
||||
if err != nil {
|
||||
log.Fatalf("Error loading .env file")
|
||||
}
|
||||
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestScrape(t *testing.T) {
|
||||
apiURL := os.Getenv("API_URL")
|
||||
apiSalt := os.Getenv("API_SALT")
|
||||
|
||||
if apiURL == "" || apiSalt == "" {
|
||||
t.Fatal("API_URL or API_SALT environment variables not set")
|
||||
}
|
||||
|
||||
client := NewClient(apiURL, apiSalt)
|
||||
|
||||
testCases := []TestCase{
|
||||
{"https://sneak.berlin", "body"},
|
||||
{"https://nytimes.com", "h1"},
|
||||
{"https://sneak.berlin", "#quoteCycler"},
|
||||
{"https://news.ycombinator.com", ".athing .title"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.URL+" "+tc.Selector, func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
response, err := client.Scrape(ctx, tc.URL, tc.Selector)
|
||||
if err != nil {
|
||||
t.Fatalf("Error scraping content: %v", err)
|
||||
}
|
||||
|
||||
if response.Content == "" {
|
||||
t.Fatalf("No content fetched")
|
||||
}
|
||||
|
||||
t.Logf("Scraped Content: %s", response.Content)
|
||||
t.Logf("URL: %s", response.URL)
|
||||
t.Logf("Selector: %s", response.Selector)
|
||||
})
|
||||
}
|
||||
}
|
5
go.mod
Normal file
5
go.mod
Normal file
@ -0,0 +1,5 @@
|
||||
module sneak.berlin/go/puppeteerapiclient
|
||||
|
||||
go 1.22.2
|
||||
|
||||
require github.com/joho/godotenv v1.5.1
|
Loading…
Reference in New Issue
Block a user