initial
This commit is contained in:
commit
60a35c8a6c
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
.env
|
||||||
|
|
14
LICENSE
Normal file
14
LICENSE
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
Version 2, December 2004
|
||||||
|
|
||||||
|
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||||
|
|
||||||
|
Everyone is permitted to copy and distribute verbatim or modified
|
||||||
|
copies of this license document, and changing it is allowed as long
|
||||||
|
as the name is changed.
|
||||||
|
|
||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||||
|
|
||||||
|
0. You just DO WHAT THE FUCK YOU WANT TO.
|
||||||
|
|
20
Makefile
Normal file
20
Makefile
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
.PHONY: all test build clean

# Load environment variables from the .env file and export them to recipes.
# .env is git-ignored, so a fresh checkout may not have one; guard on its
# existence so that build and clean still work without it.
ifneq (,$(wildcard .env))
include .env
# Export only lines of the form NAME=value; comments and blank lines in .env
# are skipped instead of being passed to export as variable names.
export $(shell sed -n '/^[A-Za-z_][A-Za-z0-9_]*=/s/=.*//p' .env)
endif

all: build

# Build the Go project
build:
	go build -o myapp ./...

# Run tests with .env variables loaded and disable caching
test:
	go test -v -count=1 ./...

# Clean the build output
clean:
	go clean
	rm -f myapp
|
36
README.md
Normal file
36
README.md
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# puppeteer api client
|
||||||
|
|
||||||
|
This is a Go client for the API exposed by the puppeteer Docker container.
|
||||||
|
|
||||||
|
https://github.com/l0co/docker-puppeteer-api
|
||||||
|
|
||||||
|
# usage
|
||||||
|
|
||||||
|
```go
|
||||||
|
package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"sneak.berlin/go/puppeteerapiclient"
)

func main() {
	apiURL := os.Getenv("API_URL")
	apiSalt := os.Getenv("API_SALT")

	client := puppeteerapiclient.NewClient(apiURL, apiSalt)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	response, err := client.Scrape(ctx, "https://news.ycombinator.com", ".athing .title")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(response.Content)
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
# License
|
||||||
|
|
||||||
|
WTFPL
|
87
client.go
Normal file
87
client.go
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
package puppeteerapiclient
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"crypto/md5"
|
||||||
|
"encoding/hex"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io/ioutil"
|
||||||
|
"net/http"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Client holds the settings needed to call a puppeteer API server.
//
// BaseURL is the server address; Scrape appends "/scrape" to it verbatim.
// Salt is the value CalculateHash appends to the URL before hashing.
type Client struct {
	BaseURL, Salt string
}
|
||||||
|
|
||||||
|
// ScrapeRequest is the JSON payload that Scrape POSTs to the /scrape endpoint.
type ScrapeRequest struct {
	// URL is the address of the page to scrape.
	URL string `json:"url"`
	// Selector is forwarded to the server unchanged; presumably a CSS
	// selector such as ".athing .title".
	Selector string `json:"selector"`
	// Hash is the hex digest that CalculateHash produces for URL.
	Hash string `json:"hash"`
}
|
||||||
|
|
||||||
|
// ScrapeResponse is the result returned by Scrape.
type ScrapeResponse struct {
	// URL echoes the url argument given to Scrape.
	URL string `json:"url"`
	// Selector echoes the selector argument given to Scrape.
	Selector string `json:"selector"`
	// Content is the server's response body converted to a string; Scrape
	// does not parse it.
	Content string `json:"content"`
}
|
||||||
|
|
||||||
|
func NewClient(baseURL, salt string) *Client {
|
||||||
|
return &Client{
|
||||||
|
BaseURL: baseURL,
|
||||||
|
Salt: salt,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Client) CalculateHash(url string) string {
|
||||||
|
// The hash should be calculated by appending the salt to the URL with a colon separator.
|
||||||
|
data := url + ":" + c.Salt
|
||||||
|
hash := md5.Sum([]byte(data))
|
||||||
|
return hex.EncodeToString(hash[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
|
||||||
|
if selector == "" {
|
||||||
|
return ScrapeResponse{}, fmt.Errorf("selector is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
hash := c.CalculateHash(url)
|
||||||
|
|
||||||
|
requestBody, err := json.Marshal(ScrapeRequest{
|
||||||
|
URL: url,
|
||||||
|
Selector: selector,
|
||||||
|
Hash: hash,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/scrape", bytes.NewBuffer(requestBody))
|
||||||
|
if err != nil {
|
||||||
|
return ScrapeResponse{}, fmt.Errorf("failed to create request: %v", err)
|
||||||
|
}
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
|
||||||
|
client := &http.Client{}
|
||||||
|
resp, err := client.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
return ScrapeResponse{}, fmt.Errorf("failed to send request: %v", err)
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
body, err := ioutil.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
return ScrapeResponse{}, fmt.Errorf("failed to read response body: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Since the response is HTML, we don't need to unmarshal JSON.
|
||||||
|
content := string(body)
|
||||||
|
|
||||||
|
return ScrapeResponse{
|
||||||
|
URL: url,
|
||||||
|
Selector: selector,
|
||||||
|
Content: content,
|
||||||
|
}, nil
|
||||||
|
}
|
63
client_test.go
Normal file
63
client_test.go
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
package puppeteerapiclient
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/joho/godotenv"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestCase pairs a page URL with the selector that TestScrape requests from it.
type TestCase struct {
	URL, Selector string
}
|
||||||
|
|
||||||
|
func TestMain(m *testing.M) {
|
||||||
|
err := godotenv.Load()
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Error loading .env file")
|
||||||
|
}
|
||||||
|
|
||||||
|
os.Exit(m.Run())
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestScrape(t *testing.T) {
|
||||||
|
apiURL := os.Getenv("API_URL")
|
||||||
|
apiSalt := os.Getenv("API_SALT")
|
||||||
|
|
||||||
|
if apiURL == "" || apiSalt == "" {
|
||||||
|
t.Fatal("API_URL or API_SALT environment variables not set")
|
||||||
|
}
|
||||||
|
|
||||||
|
client := NewClient(apiURL, apiSalt)
|
||||||
|
|
||||||
|
testCases := []TestCase{
|
||||||
|
{"https://sneak.berlin", "body"},
|
||||||
|
{"https://nytimes.com", "h1"},
|
||||||
|
{"https://sneak.berlin", "#quoteCycler"},
|
||||||
|
{"https://news.ycombinator.com", ".athing .title"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.URL+" "+tc.Selector, func(t *testing.T) {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
response, err := client.Scrape(ctx, tc.URL, tc.Selector)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Error scraping content: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if response.Content == "" {
|
||||||
|
t.Fatalf("No content fetched")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Scraped Content: %s", response.Content)
|
||||||
|
t.Logf("URL: %s", response.URL)
|
||||||
|
t.Logf("Selector: %s", response.Selector)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
5
go.mod
Normal file
5
go.mod
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
module sneak.berlin/go/puppeteerapiclient
|
||||||
|
|
||||||
|
go 1.22.2
|
||||||
|
|
||||||
|
require github.com/joho/godotenv v1.5.1
|
Loading…
Reference in New Issue
Block a user