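// Package puppeteerapiclient provides a small HTTP client that POSTs scrape
// requests (URL, selector and a salted MD5 hash of the URL) to a "/scrape"
// endpoint and returns the response body.
//
// File: puppeteerapiclient/client.go (web-view metadata: 113 lines, 2.6 KiB,
// Go; blame timestamp 2024-06-02 18:32:29 +00:00).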
package puppeteerapiclient
import (
	"bytes"
	"context"
	"crypto/md5"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"net/http"
	"time"
)
const (
	// MaxRetries is the total number of attempts Scrape makes before giving
	// up. Despite the name, the first attempt counts toward this limit.
	MaxRetries = 5
)
// Client holds the settings needed to talk to the scrape service.
type Client struct {
	// BaseURL is the service root; Scrape appends "/scrape" to it.
	BaseURL string
	// Salt is the secret that CalculateHash mixes into the request hash.
	Salt string
}
// ScrapeRequest is the JSON payload that Scrape POSTs to the service.
type ScrapeRequest struct {
	// URL is the page to scrape.
	URL string `json:"url"`
	// Selector identifies the part of the page to extract.
	Selector string `json:"selector"`
	// Hash is the value produced by Client.CalculateHash for URL.
	Hash string `json:"hash"`
}
// ScrapeResponse is the result that Scrape hands back to its caller.
type ScrapeResponse struct {
	// URL echoes the URL that was requested.
	URL string `json:"url"`
	// Selector echoes the selector that was requested.
	Selector string `json:"selector"`
	// Content is the response body returned by the service, as a string.
	Content string `json:"content"`
}
// NewClient returns a freshly allocated Client configured with the given
// service base URL and hashing salt. Neither argument is validated.
func NewClient(baseURL, salt string) *Client {
	c := new(Client)
	c.BaseURL = baseURL
	c.Salt = salt
	return c
}
// CalculateHash returns the lowercase hexadecimal MD5 digest of the string
// url + ":" + c.Salt.
func (c *Client) CalculateHash(url string) string {
	// Feed the three parts into the hash one after another; the digest is the
	// same as hashing their concatenation.
	digest := md5.New()
	digest.Write([]byte(url))
	digest.Write([]byte(":"))
	digest.Write([]byte(c.Salt))
	return hex.EncodeToString(digest.Sum(nil))
}
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
if selector == "" {
return ScrapeResponse{}, fmt.Errorf("selector is required")
}
hash := c.CalculateHash(url)
requestBody, err := json.Marshal(ScrapeRequest{
URL: url,
Selector: selector,
Hash: hash,
})
if err != nil {
return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %v", err)
}
client := &http.Client{}
var resp *http.Response
var body []byte
startTime := time.Now()
2024-06-02 18:32:29 +00:00
for attempt := 0; attempt < MaxRetries; attempt++ {
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/scrape", bytes.NewBuffer(requestBody))
if err != nil {
return ScrapeResponse{}, fmt.Errorf("failed to create request: %v", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err = client.Do(req)
if err == nil && resp.StatusCode == http.StatusOK {
defer resp.Body.Close()
body, err = ioutil.ReadAll(resp.Body)
if err != nil {
return ScrapeResponse{}, fmt.Errorf("failed to read response body: %v", err)
}
content := string(body)
return ScrapeResponse{
URL: url,
Selector: selector,
Content: content,
}, nil
}
if resp != nil {
resp.Body.Close()
}
select {
case <-ctx.Done():
totalDuration := time.Since(startTime)
return ScrapeResponse{}, fmt.Errorf("context cancelled after %d retries and %v: %v", attempt+1, totalDuration, ctx.Err())
case <-time.After(time.Duration(math.Pow(2, float64(attempt))) * time.Second):
// continue to next retry
}
2024-06-02 19:06:43 +00:00
}
totalDuration := time.Since(startTime)
2024-06-02 18:32:29 +00:00
if err != nil {
return ScrapeResponse{}, fmt.Errorf("failed to send request after %d retries and %v: %v", MaxRetries, totalDuration, err)
2024-06-02 18:32:29 +00:00
}
return ScrapeResponse{}, fmt.Errorf("received non-OK response after %d retries and %v: %s", MaxRetries, totalDuration, resp.Status)
2024-06-02 18:32:29 +00:00
}