// Package main aggregates world-news articles from multiple RSS feeds.
package main
import (
	"fmt"
	"net/http"
	"sync"
	"time"

	"github.com/mmcdole/gofeed"
)
// sourceAbbreviations maps each feed source's full name to the short
// label used when presenting articles from that source.
var sourceAbbreviations = map[string]string{
	"ABC":             "ABC",
	"Al Jazeera":      "AlJ",
	"BBC":             "BBC",
	"CBS":             "CBS",
	"CNN":             "CNN",
	"Deutsche Welle":  "DW",
	"France 24":       "F24",
	"Guardian":        "Grd",
	"NBC":             "NBC",
	"NPR":             "NPR",
	"NYTimes":         "NYT",
	"Sky News":        "Sky",
	"The Independent": "Ind",
	"Time":            "Time",
	"Washington Post": "WaPo",
	"WSJ":             "WSJ",
}
// feeds maps each source name to the URL of its RSS feed. The keys here
// correspond to the keys of sourceAbbreviations.
var feeds = map[string]string{
	"ABC":             "https://abcnews.go.com/abcnews/topstories",
	"Al Jazeera":      "https://www.aljazeera.com/xml/rss/all.xml",
	"BBC":             "https://feeds.bbci.co.uk/news/world/rss.xml",
	"CBS":             "https://www.cbsnews.com/latest/rss/world",
	"CNN":             "http://rss.cnn.com/rss/edition.rss",
	"Deutsche Welle":  "https://rss.dw.com/rdf/rss-en-world",
	"France 24":       "https://www.france24.com/en/rss",
	"Guardian":        "https://www.theguardian.com/world/rss",
	"NBC":             "http://feeds.nbcnews.com/nbcnews/public/news",
	"NPR":             "https://feeds.npr.org/1001/rss.xml",
	"NYTimes":         "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
	"Sky News":        "https://feeds.skynews.com/feeds/rss/world.xml",
	"The Independent": "https://www.independent.co.uk/news/world/rss",
	"Time":            "https://time.com/feed/",
	"Washington Post": "https://feeds.washingtonpost.com/rss/world",
	"WSJ":             "https://feeds.a.dj.com/rss/RSSWorldNews.xml",
}

// NOTE: an unused helper function was removed here to satisfy the linter.
// rssFeedChecker checks RSS feeds every 15 minutes and adds new articles to the database
|
|
func rssFeedChecker(shutdown chan struct{}, ollamaURL, ollamaModel string) {
|
|
logInfo("rss", "Starting RSS feed checker", map[string]interface{}{
|
|
"interval": RSS_CHECK_INTERVAL.String(),
|
|
})
|
|
|
|
// Run immediately on startup
|
|
checkRSSFeeds()
|
|
|
|
// Then run on interval
|
|
ticker := time.NewTicker(RSS_CHECK_INTERVAL)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
checkRSSFeeds()
|
|
case <-shutdown:
|
|
logInfo("rss", "Shutting down RSS feed checker", nil)
|
|
return
|
|
}
|
|
}
|
|
}
// checkRSSFeeds fetches all RSS feeds and adds new articles to the database
|
|
func checkRSSFeeds() {
|
|
articles := loadArticles()
|
|
oldCount := len(articles)
|
|
|
|
logInfo("rss", "Checking RSS feeds", map[string]interface{}{
|
|
"time": time.Now().Format("15:04:05"),
|
|
"articlesBeforeFetch": oldCount,
|
|
})
|
|
|
|
now := time.Now()
|
|
newArticles := fetchAllFeedsParallel(now)
|
|
newCount := 0
|
|
|
|
for _, a := range newArticles {
|
|
if _, ok := articles[a.Link]; !ok {
|
|
if a.ID == "" {
|
|
a.ID = generateID(a.Link)
|
|
}
|
|
articles[a.Link] = a
|
|
if err := saveArticle(a); err != nil {
|
|
logInfo("rss", "Error saving article", map[string]interface{}{
|
|
"id": a.ID,
|
|
"error": err.Error(),
|
|
})
|
|
}
|
|
newCount++
|
|
logInfo("new", "Found new article", map[string]interface{}{
|
|
"id": a.ID,
|
|
"title": a.Title,
|
|
"source": a.Source,
|
|
"published": a.Published.Format(time.RFC3339),
|
|
})
|
|
}
|
|
}
|
|
|
|
logInfo("rss", "Completed RSS check", map[string]interface{}{
|
|
"articlesBeforeFetch": oldCount,
|
|
"articlesAfterFetch": oldCount + newCount,
|
|
"newArticles": newCount,
|
|
})
|
|
}
func fetchAllFeedsParallel(now time.Time) []Article {
|
|
type fetchResult struct {
|
|
Source string
|
|
URL string
|
|
Feed *gofeed.Feed
|
|
Err error
|
|
HTTPStatus int
|
|
Duration time.Duration
|
|
}
|
|
|
|
var wg sync.WaitGroup
|
|
results := make(chan fetchResult, len(feeds))
|
|
|
|
for source, url := range feeds {
|
|
wg.Add(1)
|
|
go func(source, url string) {
|
|
defer wg.Done()
|
|
|
|
start := time.Now()
|
|
fp := gofeed.NewParser()
|
|
client := &http.Client{Timeout: 20 * time.Second}
|
|
var httpStatus int
|
|
|
|
resp, err := client.Get(url)
|
|
var feed *gofeed.Feed
|
|
if err == nil {
|
|
httpStatus = resp.StatusCode
|
|
defer resp.Body.Close()
|
|
feed, err = fp.Parse(resp.Body)
|
|
}
|
|
|
|
duration := time.Since(start)
|
|
|
|
details := map[string]interface{}{
|
|
"source": source,
|
|
"url": url,
|
|
"status": httpStatus,
|
|
"duration": duration.Seconds(),
|
|
}
|
|
if err != nil {
|
|
details["error"] = err.Error()
|
|
}
|
|
logEvent("rss_fetch_result", details)
|
|
|
|
if err != nil {
|
|
logInfo("rss", "Feed fetch failed", map[string]interface{}{
|
|
"source": source,
|
|
"url": url,
|
|
"duration": duration.Seconds(),
|
|
"error": err.Error(),
|
|
})
|
|
results <- fetchResult{Source: source, URL: url, Err: err, Duration: duration, HTTPStatus: httpStatus}
|
|
return
|
|
}
|
|
|
|
logInfo("rss", "Feed fetch succeeded", map[string]interface{}{
|
|
"source": source,
|
|
"url": url,
|
|
"duration": duration.Seconds(),
|
|
"status": httpStatus,
|
|
"items": len(feed.Items),
|
|
})
|
|
|
|
results <- fetchResult{
|
|
Source: source,
|
|
URL: url,
|
|
Feed: feed,
|
|
Duration: duration,
|
|
HTTPStatus: httpStatus,
|
|
}
|
|
}(source, url)
|
|
}
|
|
|
|
wg.Wait()
|
|
close(results)
|
|
|
|
var all []Article
|
|
for result := range results {
|
|
if result.Err != nil || result.Feed == nil {
|
|
continue
|
|
}
|
|
for _, item := range result.Feed.Items {
|
|
published := now
|
|
// Set published to the current time if not available from feed
|
|
if item.PublishedParsed != nil {
|
|
published = *item.PublishedParsed
|
|
}
|
|
|
|
// Set originalDate to the feed's publication date if available
|
|
originalDate := published
|
|
|
|
// Skip articles older than 7 days based on the feed's publication date
|
|
cutoffDate := now.AddDate(0, 0, -7) // 7 days ago
|
|
if published.Before(cutoffDate) {
|
|
// Skip this article as it's older than 7 days
|
|
continue
|
|
}
|
|
|
|
all = append(all, Article{
|
|
Title: item.Title,
|
|
Description: item.Description,
|
|
Link: item.Link,
|
|
Published: now, // When we first saw the article
|
|
OriginalDate: originalDate, // Original publication date from the feed
|
|
Source: result.Source,
|
|
FirstSeen: now,
|
|
ID: generateID(item.Link),
|
|
})
|
|
}
|
|
}
|
|
return all
|
|
}