gomeshalerter/rss.go

package main

import (
	"fmt"
	"net/http"
	"os"
	"sync"
	"time"

	// gofeed parses RSS, Atom, and JSON feeds behind a single interface.
	"github.com/mmcdole/gofeed"
)

// sourceAbbreviations maps source names to their display abbreviations.
var sourceAbbreviations = map[string]string{
	"BBC":             "BBC",
	"CNN":             "CNN",
	"NYTimes":         "NYT",
	"Guardian":        "Grd",
	"Al Jazeera":      "AlJ",
	"NBC":             "NBC",
	"ABC":             "ABC",
	"CBS":             "CBS",
	"Sky News":        "Sky",
	"Time":            "Time",
	"NPR":             "NPR",
	"Deutsche Welle":  "DW",
	"France 24":       "F24",
	"The Independent": "Ind",
	"Washington Post": "WaPo",
	"WSJ":             "WSJ",
}
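
// feeds maps each source name to the URL of its RSS feed.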
var feeds = map[string]string{
	"BBC":             "https://feeds.bbci.co.uk/news/world/rss.xml",
	"CNN":             "http://rss.cnn.com/rss/edition.rss",
	"NYTimes":         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
	"Guardian":        "https://www.theguardian.com/world/rss",
	"Al Jazeera":      "https://www.aljazeera.com/xml/rss/all.xml",
	"NBC":             "http://feeds.nbcnews.com/nbcnews/public/news",
	"ABC":             "https://abcnews.go.com/abcnews/topstories",
	"CBS":             "https://www.cbsnews.com/latest/rss/world",
	"Sky News":        "https://feeds.skynews.com/feeds/rss/world.xml",
	"Time":            "https://time.com/feed/",
	"NPR":             "https://feeds.npr.org/1001/rss.xml",
	"Deutsche Welle":  "https://rss.dw.com/rdf/rss-en-world",
	"France 24":       "https://www.france24.com/en/rss",
	"The Independent": "https://www.independent.co.uk/news/world/rss",
	"Washington Post": "https://feeds.washingtonpost.com/rss/world",
	"WSJ":             "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
}

// getMaxAbbreviationLength returns the length of the longest abbreviation
// in sourceAbbreviations.
func getMaxAbbreviationLength() int {
	maxLen := 0
	for _, abbr := range sourceAbbreviations {
		if len(abbr) > maxLen {
			maxLen = len(abbr)
		}
	}
	return maxLen
}
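
// Hypothetical use (the real call sites live elsewhere in the package):
// the maximum length lets a formatter pad abbreviations into a fixed-width
// column, e.g. fmt.Sprintf("%-*s", getMaxAbbreviationLength(), abbr).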

// rssFeedChecker polls every feed on RSS_CHECK_INTERVAL and adds new
// articles to the database.
func rssFeedChecker(shutdown chan struct{}, ollamaURL, ollamaModel string) {
	fmt.Fprintf(os.Stderr, "[rss] Starting RSS feed checker (interval: %s)\n", RSS_CHECK_INTERVAL)

	// Run once immediately on startup, then on every tick.
	checkRSSFeeds()

	ticker := time.NewTicker(RSS_CHECK_INTERVAL)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			checkRSSFeeds()
		case <-shutdown:
			fmt.Fprintf(os.Stderr, "[rss] Shutting down RSS feed checker\n")
			return
		}
	}
}
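
// A minimal launch sketch (hypothetical; the real call site is in main).
// Note that ollamaURL and ollamaModel are accepted but not used in this file:
//
//	shutdown := make(chan struct{})
//	go rssFeedChecker(shutdown, ollamaURL, ollamaModel)
//	// ... on exit:
//	close(shutdown)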

// checkRSSFeeds fetches all RSS feeds and adds new articles to the database.
func checkRSSFeeds() {
	articles := loadArticles()
	oldCount := len(articles)
	logInfo("rss", "Checking RSS feeds", map[string]interface{}{
		"time":                time.Now().Format("15:04:05"),
		"articlesBeforeFetch": oldCount,
	})

	now := time.Now()
	newArticles := fetchAllFeedsParallel(now)

	newCount := 0
	for _, a := range newArticles {
		if _, ok := articles[a.Link]; !ok {
			if a.ID == "" {
				a.ID = generateID(a.Link)
			}
			articles[a.Link] = a
			saveArticle(a)
			newCount++
			logInfo("new", fmt.Sprintf("Found new article: %s", a.Title), map[string]interface{}{
				"id":        a.ID,
				"source":    a.Source,
				"published": a.Published.Format(time.RFC3339),
			})
		}
	}

	logInfo("rss", "Completed RSS check", map[string]interface{}{
		"articlesBeforeFetch": oldCount,
		"articlesAfterFetch":  oldCount + newCount,
		"newArticles":         newCount,
	})
}
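
// fetchAllFeedsParallel fetches every feed concurrently, logs the outcome
// of each fetch, and returns the articles found across all feeds.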
func fetchAllFeedsParallel(now time.Time) []Article {
	type fetchResult struct {
		Source     string
		URL        string
		Feed       *gofeed.Feed
		Err        error
		HTTPStatus int
		Duration   time.Duration
	}

	var wg sync.WaitGroup
	// Buffered to len(feeds) so every goroutine can send its result without
	// blocking; the channel is only drained after wg.Wait() returns.
	results := make(chan fetchResult, len(feeds))

	for source, url := range feeds {
		wg.Add(1)
		go func(source, url string) {
			defer wg.Done()
			start := time.Now()
			fp := gofeed.NewParser()
			client := &http.Client{Timeout: 20 * time.Second}

			var httpStatus int
			var feed *gofeed.Feed
			resp, err := client.Get(url)
			if err == nil {
				httpStatus = resp.StatusCode
				defer resp.Body.Close()
				feed, err = fp.Parse(resp.Body)
			}
			duration := time.Since(start)

			details := map[string]interface{}{
				"source":   source,
				"url":      url,
				"status":   httpStatus,
				"duration": duration.Seconds(),
			}
			if err != nil {
				details["error"] = err.Error()
			}
			logEvent("rss_fetch_result", details)

			if err != nil {
				fmt.Fprintf(os.Stderr, "[rss] FAIL %-15s (%s) [%.2fs] ERR: %v\n", source, url, duration.Seconds(), err)
				results <- fetchResult{Source: source, URL: url, Err: err, Duration: duration, HTTPStatus: httpStatus}
				return
			}
			fmt.Fprintf(os.Stderr, "[rss] OK %-15s (%s) [%.2fs] HTTP %d, items: %d\n",
				source, url, duration.Seconds(), httpStatus, len(feed.Items))
			results <- fetchResult{
				Source:     source,
				URL:        url,
				Feed:       feed,
				Duration:   duration,
				HTTPStatus: httpStatus,
			}
		}(source, url)
	}

	wg.Wait()
	close(results)

	var all []Article
	for result := range results {
		if result.Err != nil || result.Feed == nil {
			continue
		}
		for _, item := range result.Feed.Items {
			// Use the feed's publication date when available; otherwise
			// fall back to the fetch time.
			published := now
			if item.PublishedParsed != nil {
				published = *item.PublishedParsed
			}
			// Keep the feed's own date separately, since Published below
			// records when we first saw the article.
			originalDate := published

			// Skip articles whose publication date is more than 7 days old.
			cutoffDate := now.AddDate(0, 0, -7)
			if published.Before(cutoffDate) {
				continue
			}
			all = append(all, Article{
				Title:        item.Title,
				Description:  item.Description,
				Link:         item.Link,
				Published:    now,          // when we first saw the article
				OriginalDate: originalDate, // original publication date from the feed
				Source:       result.Source,
				FirstSeen:    now,
				ID:           generateID(item.Link),
			})
		}
	}

	return all
}