orangesite/hn/fetcher.go

202 lines
4.6 KiB
Go
Raw Normal View History

2020-03-25 00:49:19 +00:00
package hn
import (
"net/http"
2020-03-25 03:55:03 +00:00
"os"
2020-03-25 00:49:19 +00:00
"time"
"github.com/jinzhu/gorm"
_ "github.com/jinzhu/gorm/dialects/sqlite"
"github.com/peterhellberg/hn"
2020-03-25 03:55:03 +00:00
2020-03-25 00:49:19 +00:00
"github.com/rs/zerolog"
)
// NewFetcher returns a Fetcher that scrapes the HN front page every 60
// seconds and persists results in db. Call AddLogger before starting the
// fetch loop: f.log is dereferenced unconditionally by run/StoreFrontPage.
func NewFetcher(db *gorm.DB) *Fetcher {
	f := new(Fetcher)
	f.db = db
	f.fetchIntervalSecs = 60
	f.hn = hn.NewClient(&http.Client{
		// 5 * time.Second is already a time.Duration; the original's extra
		// time.Duration(...) conversion was redundant.
		Timeout: 5 * time.Second,
	})
	return f
}
// Fetcher periodically scrapes the Hacker News front page and records
// story appearance, rank, and score history via gorm.
type Fetcher struct {
nextFetch time.Time // when the next scrape is due; set by run before each fetch
fetchIntervalSecs uint // seconds between fetches; 60 as set by NewFetcher
db *gorm.DB // destination database; tables are auto-migrated by run
hn *hn.Client // HN API client with a 5s HTTP timeout (see NewFetcher)
log *zerolog.Logger // set via AddLogger; dereferenced without nil checks elsewhere
}
// AddLogger sets the zerolog logger used by the fetcher. It must be called
// before run/StoreFrontPage, which dereference f.log unconditionally.
func (f *Fetcher) AddLogger(l *zerolog.Logger) {
f.log = l
}
func (f *Fetcher) run() {
2020-03-25 03:55:03 +00:00
if os.Getenv("DEBUG") != "" {
f.db.LogMode(true)
}
2020-03-25 00:49:19 +00:00
f.db.AutoMigrate(&HNStoryRank{})
f.db.AutoMigrate(&FrontPageCache{})
2020-03-25 03:55:03 +00:00
f.db.AutoMigrate(&HNFrontPage{})
2020-03-25 00:49:19 +00:00
for {
f.log.Info().
Msg("fetching top stories from HN")
f.nextFetch = time.Now().Add(time.Duration(f.fetchIntervalSecs) * time.Second)
err := f.StoreFrontPage()
if err != nil {
panic(err)
}
until := time.Until(f.nextFetch)
countdown := time.NewTimer(until)
f.log.Info().Msgf("waiting %s until next fetch", until)
<-countdown.C
}
}
func (f *Fetcher) StoreFrontPage() error {
// FIXME set fetchid
2020-03-25 01:32:40 +00:00
//r, err := f.db.Table("hn_story_rank").Select("MAX(FetchID)").Rows()
//pp.Print(r)
//Select("max(FetchID)").Find(&HNStoryRank)
2020-03-25 00:49:19 +00:00
ids, err := f.hn.TopStories()
t := time.Now()
if err != nil {
return err
}
2020-03-25 01:32:40 +00:00
// 30 items on HN frontpage.
2020-03-25 00:49:19 +00:00
for i, id := range ids[:30] {
item, err := f.hn.Item(id)
if err != nil {
return (err)
}
2020-03-25 04:21:09 +00:00
/*
s := HNStoryRank{
HNID: uint(id),
Rank: uint(i + 1),
URL: item.URL,
Title: item.Title,
Score: item.Score,
2020-03-25 04:21:09 +00:00
FetchedAt: t,
}
*/
2020-03-25 03:55:03 +00:00
//f.log.Debug().Msgf("storing story with rank %d in db", (i + 1))
// FIXME this will grow unbounded and make the file too big if
// I don't clean this up or otherwise limit the data in here
2020-03-25 04:15:40 +00:00
// disabled for now
//f.db.Create(&s)
2020-03-25 01:32:40 +00:00
2020-03-25 16:11:31 +00:00
//FIXME check to see if the same HNID was already on the frontpage
//or not so we don't spam the db
2020-03-25 01:32:40 +00:00
// check to see if the item was on the frontpage already or not
var c int
2020-03-25 16:11:31 +00:00
f.db.Model(&HNFrontPage{}).Where("hn_id = ?", id).Count(&c)
2020-03-25 01:32:40 +00:00
if c == 0 {
// first appearance on frontpage
r := HNFrontPage{
HNID: uint(id),
Appeared: t,
2020-03-25 16:11:31 +00:00
Disappeared: time.Time{},
2020-03-25 01:32:40 +00:00
HighestRank: uint(i + 1),
2020-03-25 03:55:03 +00:00
Rank: uint(i + 1),
2020-03-25 01:32:40 +00:00
Title: item.Title,
Score: uint(item.Score),
2020-03-25 01:32:40 +00:00
URL: item.URL,
}
f.db.Create(&r)
f.log.Info().
Uint("hnid", uint(id)).
Uint("rank", uint(i+1)).
Str("title", item.Title).
Int("score", item.Score).
2020-03-25 01:32:40 +00:00
Str("url", item.URL).
Msg("HN new story on frontpage")
} else {
2020-03-25 16:11:31 +00:00
// it's still here, (or back)
2020-03-25 01:32:40 +00:00
var old HNFrontPage
2020-03-25 16:11:31 +00:00
f.db.Model(&HNFrontPage{}).Where("hn_id = ?", id).First(&old)
2020-03-25 01:32:40 +00:00
if old.Rank != uint(i+1) {
f.log.Info().
Uint("hnid", uint(id)).
Uint("oldrank", old.Rank).
Uint("newrank", uint(i+1)).
Int("score", item.Score).
2020-03-25 01:32:40 +00:00
Str("title", item.Title).
Str("url", item.URL).
Msg("HN story rank changed, recording new rank")
old.Rank = uint(i + 1)
old.Score = uint(item.Score)
2020-03-25 01:32:40 +00:00
}
if old.HighestRank > uint(i+1) {
f.log.Info().
Uint("hnid", uint(id)).
2020-03-25 03:55:03 +00:00
Uint("oldrecord", old.HighestRank).
Uint("newrecord", uint(i+1)).
2020-03-25 01:32:40 +00:00
Msg("recording new record high rank for story")
2020-03-25 03:55:03 +00:00
old.HighestRank = uint(i + 1)
2020-03-25 01:32:40 +00:00
}
if old.Score != uint(item.Score) {
old.Score = uint(item.Score)
}
2020-03-25 16:11:31 +00:00
// in any case it's here now
old.Disappeared = time.Time{}
f.db.Save(&old)
2020-03-25 01:32:40 +00:00
}
}
// FIXME iterate over frontpage items still active in DB and note any
// that are no longer on the scrape
2020-03-25 16:11:31 +00:00
fpitems, err := f.db.Model(&HNFrontPage{}).Where("disappeared is ?", time.Time{}).Rows()
2020-03-25 03:55:03 +00:00
if err != nil {
f.log.Error().
Err(err)
}
var toupdate []uint
2020-03-25 01:32:40 +00:00
for fpitems.Next() {
var item HNFrontPage
f.db.ScanRows(fpitems, &item)
2020-03-25 03:55:03 +00:00
//pp.Print(item)
2020-03-25 01:32:40 +00:00
exitedFrontPage := true
for _, xd := range ids[:30] {
if item.HNID == uint(xd) {
exitedFrontPage = false
}
}
if exitedFrontPage {
2020-03-25 03:55:03 +00:00
toupdate = append(toupdate, item.HNID)
//item.Disappeared = t
dur := t.Sub(item.Appeared).String()
//f.db.Save(&item)
2020-03-25 01:32:40 +00:00
f.log.Info().
Uint("hnid", item.HNID).
Uint("HighestRank", item.HighestRank).
Str("title", item.Title).
2020-03-25 03:55:03 +00:00
Str("time_on_frontpage", dur).
2020-03-25 01:32:40 +00:00
Str("url", item.URL).
Msg("HN story exited frontpage")
}
2020-03-25 00:49:19 +00:00
}
2020-03-25 16:11:31 +00:00
fpitems.Close() // close result before we do the update
f.db.Model(&HNFrontPage{}).Where("disappeared is ? and hn_id in (?)", time.Time{}, toupdate).Update("Disappeared", t)
2020-03-25 03:55:03 +00:00
2020-03-25 00:49:19 +00:00
return nil
}