120 lines
2.4 KiB
Go
120 lines
2.4 KiB
Go
|
package alexatop
|
||
|
|
||
|
import (
|
||
|
"archive/zip"
|
||
|
"bytes"
|
||
|
_ "embed"
|
||
|
"encoding/csv"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"math/rand"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
)
|
||
|
|
||
|
// alexaZip holds the raw bytes of backup/alexa.zip, which the go:embed
// directive below compiles into the binary.
//
//go:embed backup/alexa.zip
var alexaZip []byte

// Package-level cache written by FetchAlexaTop1M inside once.Do.
var (
	// urls is the list of "http://"-prefixed sites produced by a successful load.
	urls []string
	// fetchError is the error recorded when the load fails; nil otherwise.
	fetchError error
	// once ensures the embedded archive is decoded a single time.
	once sync.Once
)
|
||
|
|
||
|
// FetchAlexaTop1M fetches the Alexa Top 1M websites from the embedded zip file and returns a list of URLs
|
||
|
func FetchAlexaTop1M() ([]string, error) {
|
||
|
once.Do(func() {
|
||
|
// Step 1: Open the zip file from the embedded data
|
||
|
r, err := zip.NewReader(bytes.NewReader(alexaZip), int64(len(alexaZip)))
|
||
|
if err != nil {
|
||
|
fetchError = fmt.Errorf("failed to open zip file: %v", err)
|
||
|
return
|
||
|
}
|
||
|
|
||
|
// Step 2: Locate and read the CSV file within the zip
|
||
|
var csvFile *zip.File
|
||
|
for _, f := range r.File {
|
||
|
if strings.HasSuffix(f.Name, ".csv") {
|
||
|
csvFile = f
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
if csvFile == nil {
|
||
|
fetchError = fmt.Errorf("csv file not found in zip archive")
|
||
|
return
|
||
|
}
|
||
|
|
||
|
rc, err := csvFile.Open()
|
||
|
if err != nil {
|
||
|
fetchError = fmt.Errorf("failed to open csv file: %v", err)
|
||
|
return
|
||
|
}
|
||
|
defer rc.Close()
|
||
|
|
||
|
// Step 3: Parse the CSV file and extract URLs
|
||
|
csvReader := csv.NewReader(rc)
|
||
|
var fetchedUrls []string
|
||
|
for {
|
||
|
record, err := csvReader.Read()
|
||
|
if err == io.EOF {
|
||
|
break
|
||
|
}
|
||
|
if err != nil {
|
||
|
fetchError = fmt.Errorf("error reading csv: %v", err)
|
||
|
return
|
||
|
}
|
||
|
if len(record) < 2 {
|
||
|
continue
|
||
|
}
|
||
|
fetchedUrls = append(fetchedUrls, fmt.Sprintf("http://%s", record[1]))
|
||
|
}
|
||
|
|
||
|
urls = fetchedUrls
|
||
|
})
|
||
|
|
||
|
return urls, fetchError
|
||
|
}
|
||
|
|
||
|
// RandomSite returns a random site from the list of Alexa Top 1M URLs
|
||
|
func RandomSite() (string, error) {
|
||
|
urls, err := FetchAlexaTop1M()
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
|
||
|
return urls[rand.Intn(len(urls))], nil
|
||
|
}
|
||
|
|
||
|
// NthSite returns the nth site from the list of Alexa Top 1M URLs
|
||
|
func NthSite(n int) (string, error) {
|
||
|
urls, err := FetchAlexaTop1M()
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
|
||
|
if n < 0 || n >= len(urls) {
|
||
|
return "", fmt.Errorf("index out of range")
|
||
|
}
|
||
|
|
||
|
return urls[n], nil
|
||
|
}
|
||
|
|
||
|
// RandomSites returns n random sites from the list of Alexa Top 1M URLs
|
||
|
func RandomSites(n int) ([]string, error) {
|
||
|
urls, err := FetchAlexaTop1M()
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
if n < 0 || n > len(urls) {
|
||
|
return nil, fmt.Errorf("invalid number of sites requested")
|
||
|
}
|
||
|
|
||
|
selected := make([]string, n)
|
||
|
for i := range selected {
|
||
|
selected[i] = urls[rand.Intn(len(urls))]
|
||
|
}
|
||
|
|
||
|
return selected, nil
|
||
|
}
|