113 lines
2.6 KiB
Go
113 lines
2.6 KiB
Go
package puppeteerapiclient
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"crypto/md5"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"math"
|
|
"net/http"
|
|
"time"
|
|
)
|
|
|
|
// MaxRetries is the upper bound on the number of HTTP attempts Scrape makes
// for a single call before it gives up and returns an error.
const MaxRetries = 5
|
|
|
|
// Client carries the configuration used to talk to the scraping service.
//
// BaseURL is the service root to which Scrape appends "/scrape". Salt is the
// value CalculateHash appends to the target URL before hashing.
type Client struct {
	BaseURL, Salt string
}
|
|
|
|
// ScrapeRequest is the JSON payload that Scrape posts to the service.
type ScrapeRequest struct {
	// URL is the page to scrape.
	URL string `json:"url"`

	// Selector is the selector string passed through to the service.
	Selector string `json:"selector"`

	// Hash is the value produced by Client.CalculateHash for URL.
	Hash string `json:"hash"`
}
|
|
|
|
// ScrapeResponse is the result returned by Client.Scrape.
type ScrapeResponse struct {
	// URL echoes the url argument given to Scrape.
	URL string `json:"url"`

	// Selector echoes the selector argument given to Scrape.
	Selector string `json:"selector"`

	// Content is the response body returned by the service, as a string.
	Content string `json:"content"`
}
|
|
|
|
func NewClient(baseURL, salt string) *Client {
|
|
return &Client{
|
|
BaseURL: baseURL,
|
|
Salt: salt,
|
|
}
|
|
}
|
|
|
|
func (c *Client) CalculateHash(url string) string {
|
|
data := url + ":" + c.Salt
|
|
hash := md5.Sum([]byte(data))
|
|
return hex.EncodeToString(hash[:])
|
|
}
|
|
|
|
func (c *Client) Scrape(ctx context.Context, url, selector string) (ScrapeResponse, error) {
|
|
if selector == "" {
|
|
return ScrapeResponse{}, fmt.Errorf("selector is required")
|
|
}
|
|
|
|
hash := c.CalculateHash(url)
|
|
|
|
requestBody, err := json.Marshal(ScrapeRequest{
|
|
URL: url,
|
|
Selector: selector,
|
|
Hash: hash,
|
|
})
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("failed to marshal request: %v", err)
|
|
}
|
|
|
|
client := &http.Client{}
|
|
var resp *http.Response
|
|
var body []byte
|
|
startTime := time.Now()
|
|
|
|
for attempt := 0; attempt < MaxRetries; attempt++ {
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/scrape", bytes.NewBuffer(requestBody))
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("failed to create request: %v", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err = client.Do(req)
|
|
if err == nil && resp.StatusCode == http.StatusOK {
|
|
defer resp.Body.Close()
|
|
body, err = ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("failed to read response body: %v", err)
|
|
}
|
|
content := string(body)
|
|
return ScrapeResponse{
|
|
URL: url,
|
|
Selector: selector,
|
|
Content: content,
|
|
}, nil
|
|
}
|
|
|
|
if resp != nil {
|
|
resp.Body.Close()
|
|
}
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
totalDuration := time.Since(startTime)
|
|
return ScrapeResponse{}, fmt.Errorf("context cancelled after %d retries and %v: %v", attempt+1, totalDuration, ctx.Err())
|
|
case <-time.After(time.Duration(math.Pow(2, float64(attempt))) * time.Second):
|
|
// continue to next retry
|
|
}
|
|
}
|
|
|
|
totalDuration := time.Since(startTime)
|
|
if err != nil {
|
|
return ScrapeResponse{}, fmt.Errorf("failed to send request after %d retries and %v: %v", MaxRetries, totalDuration, err)
|
|
}
|
|
|
|
return ScrapeResponse{}, fmt.Errorf("received non-OK response after %d retries and %v: %s", MaxRetries, totalDuration, resp.Status)
|
|
}
|