Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions api/apnews.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
package api

import (
"fmt"
"net/http"
"strings"
"sync"
"time"

"github.com/PuerkitoBio/goquery"
)

// AP News doesn't have public RSS feeds, so we scrape their website
// AP provides straight wire news with minimal opinion

// apBaseURL is the scheme and host of the AP News site. Feed paths from
// apFeedPaths are appended to it to build page URLs, and site-relative links
// found while scraping are made absolute by prefixing it.
const apBaseURL = "https://apnews.com"

// Feed tables for the AP News source.
var (
	// apFeedPaths maps a feed key to the site path fetched for that feed.
	// The "top" feed uses the empty path, i.e. the site root.
	apFeedPaths = map[string]string{
		"top":       "",
		"world":     "/world-news",
		"us":        "/us-news",
		"politics":  "/politics",
		"business":  "/business",
		"tech":      "/technology",
		"science":   "/science",
		"health":    "/health",
		"sports":    "/sports",
		"entertain": "/entertainment",
	}

	// apFeedNames lists the feed keys in presentation order; every entry is
	// a key of apFeedPaths.
	apFeedNames = []string{
		"top", "world", "us", "politics", "business",
		"tech", "science", "health", "sports", "entertain",
	}

	// apFeedLabels lists the display labels in the same order as apFeedNames.
	apFeedLabels = []string{
		"Top", "World", "US", "Politics", "Business",
		"Tech", "Science", "Health", "Sports", "Arts",
	}
)

// APNewsClient scrapes AP News listing pages over HTTP and serves the scraped
// stories from an in-memory cache. It contains mutexes, so it must be used
// through a pointer and never copied.
type APNewsClient struct {
// http is the client used for every page request.
http *http.Client
// storyCache holds the stories from the most recent FetchStoryIDs call,
// keyed by the ID that call assigned to each story.
storyCache map[int]*Item
// cacheMu guards storyCache.
cacheMu sync.RWMutex
// lastRequest is the time at which throttle last let a request proceed.
lastRequest time.Time
// requestMu guards lastRequest and serializes callers of throttle.
requestMu sync.Mutex
}

// NewAPNewsClient returns an APNewsClient with an empty story cache and an
// HTTP client whose requests time out after 15 seconds.
func NewAPNewsClient() *APNewsClient {
	httpClient := &http.Client{Timeout: 15 * time.Second}

	client := new(APNewsClient)
	client.http = httpClient
	client.storyCache = map[int]*Item{}
	return client
}

// throttle blocks until at least 500ms have passed since the previous call
// was let through, then records the current time as the new reference point.
// The mutex is held while sleeping, so concurrent callers are released one at
// a time.
func (c *APNewsClient) throttle() {
	const minGap = 500 * time.Millisecond

	c.requestMu.Lock()
	defer c.requestMu.Unlock()

	// A positive remainder means the previous request was too recent.
	if wait := minGap - time.Since(c.lastRequest); wait > 0 {
		time.Sleep(wait)
	}
	c.lastRequest = time.Now()
}

// Name reports the human-readable name of this source.
func (c *APNewsClient) Name() string {
	const sourceName = "AP News"
	return sourceName
}

// FeedNames returns the feed keys accepted by FetchStoryIDs. The result is
// the package-level apFeedNames slice itself, not a copy, so callers must
// treat it as read-only.
func (c *APNewsClient) FeedNames() []string {
	return apFeedNames
}

// FeedLabels returns the display labels for the feeds. The result is the
// package-level apFeedLabels slice itself, not a copy, so callers must treat
// it as read-only.
func (c *APNewsClient) FeedLabels() []string {
	return apFeedLabels
}

// StoryURL returns the link for item, which is item.URL unchanged.
// item must be non-nil.
func (c *APNewsClient) StoryURL(item *Item) string {
	link := item.URL
	return link
}

// FetchStoryIDs scrapes the page for feed and returns one ID per story found.
//
// IDs are the 1-based positions of the stories on the scraped page. Each call
// replaces the whole story cache, so IDs returned by an earlier call (for any
// feed) no longer resolve to the same stories afterwards.
//
// It returns an error when feed is not a key of apFeedPaths or when fetching
// the page fails; the cache is left untouched in both cases.
func (c *APNewsClient) FetchStoryIDs(feed string) ([]int, error) {
	path, known := apFeedPaths[feed]
	if !known {
		return nil, fmt.Errorf("unknown feed: %s", feed)
	}

	stories, err := c.fetchStories(path)
	if err != nil {
		return nil, err
	}

	// Build the replacement cache first so the lock is held only for the swap.
	fresh := make(map[int]*Item, len(stories))
	ids := make([]int, 0, len(stories))
	for pos, story := range stories {
		id := pos + 1
		fresh[id] = story
		ids = append(ids, id)
	}

	c.cacheMu.Lock()
	c.storyCache = fresh
	c.cacheMu.Unlock()

	return ids, nil
}

// fetchStories downloads the AP News page at apBaseURL+path and returns the
// stories parseStories finds in it.
//
// The call is rate-limited through throttle. An error is returned when the
// request cannot be built or sent, when the response status is not 200, or
// when the body cannot be parsed as HTML.
func (c *APNewsClient) fetchStories(path string) ([]*Item, error) {
	c.throttle()

	req, err := http.NewRequest(http.MethodGet, apBaseURL+path, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; feedme/1.0)")

	resp, err := c.http.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch AP News: %w", err)
	}
	defer resp.Body.Close()

	if status := resp.StatusCode; status != http.StatusOK {
		return nil, fmt.Errorf("AP News returned status %d", status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %w", err)
	}
	return c.parseStories(doc)
}

// parseStories extracts story links from an AP News listing page.
//
// It runs a fixed list of CSS selectors over doc, in order, and turns every
// matching anchor that has both a non-empty href and non-empty text into an
// Item. Site-relative hrefs are made absolute with apBaseURL, each URL is
// reported at most once, and at most maxStories items are returned, in the
// order they were discovered. The returned error is always nil.
//
// Nothing but the link and its text is read from the page, so By is the fixed
// string "AP" and Time is the moment of parsing.
func (c *APNewsClient) parseStories(doc *goquery.Document) ([]*Item, error) {
	// maxStories caps the result to a reasonable number of entries.
	const maxStories = 50

	// AP News uses various selectors for story cards; broader ones come later
	// and mostly re-match links already collected by the earlier ones.
	selectors := []string{
		"a[data-key='card-headline']",
		"a.Link[href*='/article/']",
		"div.PagePromo a[href*='/article/']",
		"h2 a[href*='/article/']",
		"h3 a[href*='/article/']",
	}

	var stories []*Item
	seen := make(map[string]bool)
	fetchedAt := time.Now().Unix()

	for _, selector := range selectors {
		doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
			href, ok := s.Attr("href")
			if !ok || href == "" {
				return
			}

			// Make URL absolute.
			if strings.HasPrefix(href, "/") {
				href = apBaseURL + href
			}
			if seen[href] {
				return
			}

			// Text already includes the text of all descendants, so there is
			// no separate fallback for nested elements.
			title := strings.TrimSpace(s.Text())
			if title == "" {
				// Do not mark the URL as seen: a text-less anchor (such as an
				// image link) must not hide a later headline anchor that
				// points at the same story.
				return
			}
			seen[href] = true

			stories = append(stories, &Item{
				Title: title,
				URL:   href,
				By:    "AP",
				Time:  fetchedAt,
				// Type is the item kind, not a link; the link lives in URL.
				// NOTE(review): "story" assumes Item uses Hacker News item
				// kinds - confirm against the Item definition.
				Type: "story",
			})
		})
	}

	if len(stories) > maxStories {
		stories = stories[:maxStories]
	}
	return stories, nil
}

// FetchItem looks up a story in the cache filled by FetchStoryIDs. It never
// touches the network; an ID that is not in the cache yields an error.
func (c *APNewsClient) FetchItem(id int) (*Item, error) {
	c.cacheMu.RLock()
	defer c.cacheMu.RUnlock()

	if cached, found := c.storyCache[id]; found {
		return cached, nil
	}
	return nil, fmt.Errorf("item %d not found in cache", id)
}

// FetchItems looks up several stories in the cache at once. The result has
// the same length and order as ids; an ID that is not cached leaves a nil
// entry at its position. The returned error is always nil.
func (c *APNewsClient) FetchItems(ids []int) ([]*Item, error) {
	c.cacheMu.RLock()
	defer c.cacheMu.RUnlock()

	found := make([]*Item, len(ids))
	for pos := range ids {
		// Indexing a missing key yields the zero value, a nil *Item.
		found[pos] = c.storyCache[ids[pos]]
	}
	return found, nil
}

// FetchCommentTree always returns an empty, non-nil comment list and a nil
// error: news sites don't have integrated comments. Both arguments are
// ignored.
func (c *APNewsClient) FetchCommentTree(item *Item, maxDepth int) ([]*Comment, error) {
	none := make([]*Comment, 0)
	return none, nil
}
35 changes: 35 additions & 0 deletions api/bbc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package api

// BBC News RSS feeds
// BBC provides excellent RSS coverage across different topics

// Feed tables for the BBC News source.
var (
	// bbcFeeds maps a feed key to the URL of its RSS document.
	bbcFeeds = map[string]string{
		"top":       "https://feeds.bbci.co.uk/news/rss.xml",
		"world":     "https://feeds.bbci.co.uk/news/world/rss.xml",
		"uk":        "https://feeds.bbci.co.uk/news/uk/rss.xml",
		"business":  "https://feeds.bbci.co.uk/news/business/rss.xml",
		"tech":      "https://feeds.bbci.co.uk/news/technology/rss.xml",
		"science":   "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml",
		"health":    "https://feeds.bbci.co.uk/news/health/rss.xml",
		"entertain": "https://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml",
	}

	// bbcFeedNames lists the feed keys in presentation order; every entry is
	// a key of bbcFeeds.
	bbcFeedNames = []string{
		"top", "world", "uk", "business",
		"tech", "science", "health", "entertain",
	}

	// bbcFeedLabels lists the display labels in the same order as bbcFeedNames.
	bbcFeedLabels = []string{
		"Top", "World", "UK", "Business",
		"Tech", "Science", "Health", "Arts",
	}
)

// BBCClient is a client for BBC News RSS feeds.
// It embeds *RSSClient, so the exported methods of the embedded client are
// promoted onto BBCClient; StoryURL is additionally declared on BBCClient
// itself.
type BBCClient struct {
*RSSClient
}

// NewBBCClient returns a BBCClient wrapping the RSSClient that NewRSSClient
// builds from the name "BBC", the URL "https://www.bbc.com" and the bbcFeeds,
// bbcFeedNames and bbcFeedLabels tables.
func NewBBCClient() *BBCClient {
	rss := NewRSSClient("BBC", "https://www.bbc.com", bbcFeeds, bbcFeedNames, bbcFeedLabels)
	return &BBCClient{RSSClient: rss}
}

// StoryURL returns the link for a BBC story, which is item.URL unchanged.
// item must be non-nil.
func (c *BBCClient) StoryURL(item *Item) string {
	link := item.URL
	return link
}
36 changes: 36 additions & 0 deletions api/google_news.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package api

// Google News RSS feeds
// Google News provides topic-based RSS feeds

// Feed tables for the Google News source.
var (
	// googleNewsFeeds maps a feed key to the URL of its RSS document. Every
	// URL carries the same hl=en-US, gl=US, ceid=US:en query parameters.
	googleNewsFeeds = map[string]string{
		"top":       "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en",
		"world":     "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
		"us":        "https://news.google.com/rss/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US:en",
		"business":  "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
		"tech":      "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
		"science":   "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp0Y1RjU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
		"health":    "https://news.google.com/rss/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNR3QwTlRFU0FtVnVLQUFQAQ?hl=en-US&gl=US&ceid=US:en",
		"sports":    "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp1ZEdvU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
		"entertain": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNREpxYW5RU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
	}

	// googleNewsFeedNames lists the feed keys in presentation order; every
	// entry is a key of googleNewsFeeds.
	googleNewsFeedNames = []string{
		"top", "world", "us", "business", "tech",
		"science", "health", "sports", "entertain",
	}

	// googleNewsFeedLabels lists the display labels in the same order as
	// googleNewsFeedNames.
	googleNewsFeedLabels = []string{
		"Top", "World", "US", "Business", "Tech",
		"Science", "Health", "Sports", "Arts",
	}
)

// GoogleNewsClient is a client for Google News RSS feeds.
// It embeds *RSSClient, so the exported methods of the embedded client are
// promoted onto GoogleNewsClient; StoryURL is additionally declared on
// GoogleNewsClient itself.
type GoogleNewsClient struct {
*RSSClient
}

// NewGoogleNewsClient returns a GoogleNewsClient wrapping the RSSClient that
// NewRSSClient builds from the name "Google News", the URL
// "https://news.google.com" and the googleNewsFeeds, googleNewsFeedNames and
// googleNewsFeedLabels tables.
func NewGoogleNewsClient() *GoogleNewsClient {
	rss := NewRSSClient("Google News", "https://news.google.com", googleNewsFeeds, googleNewsFeedNames, googleNewsFeedLabels)
	return &GoogleNewsClient{RSSClient: rss}
}

// StoryURL returns the link for a Google News story, which is item.URL
// unchanged. item must be non-nil.
func (c *GoogleNewsClient) StoryURL(item *Item) string {
	link := item.URL
	return link
}
36 changes: 36 additions & 0 deletions api/npr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package api

// NPR RSS feeds
// NPR provides RSS feeds for news and culture

// Feed tables for the NPR source.
var (
	// nprFeeds maps a feed key to the URL of its RSS document; the trailing
	// comment on each entry names the NPR section behind the numeric ID.
	nprFeeds = map[string]string{
		"news":     "https://feeds.npr.org/1001/rss.xml", // News
		"world":    "https://feeds.npr.org/1004/rss.xml", // World
		"us":       "https://feeds.npr.org/1003/rss.xml", // National/US
		"politics": "https://feeds.npr.org/1014/rss.xml", // Politics
		"business": "https://feeds.npr.org/1006/rss.xml", // Business
		"tech":     "https://feeds.npr.org/1019/rss.xml", // Technology
		"science":  "https://feeds.npr.org/1007/rss.xml", // Science
		"health":   "https://feeds.npr.org/1128/rss.xml", // Health
		"culture":  "https://feeds.npr.org/1008/rss.xml", // Arts & Life
	}

	// nprFeedNames lists the feed keys in presentation order; every entry is
	// a key of nprFeeds.
	nprFeedNames = []string{
		"news", "world", "us", "politics", "business",
		"tech", "science", "health", "culture",
	}

	// nprFeedLabels lists the display labels in the same order as nprFeedNames.
	nprFeedLabels = []string{
		"News", "World", "US", "Politics", "Business",
		"Tech", "Science", "Health", "Culture",
	}
)

// NPRClient is a client for NPR RSS feeds.
// It embeds *RSSClient, so the exported methods of the embedded client are
// promoted onto NPRClient; StoryURL is additionally declared on NPRClient
// itself.
type NPRClient struct {
*RSSClient
}

// NewNPRClient returns an NPRClient wrapping the RSSClient that NewRSSClient
// builds from the name "NPR", the URL "https://www.npr.org" and the nprFeeds,
// nprFeedNames and nprFeedLabels tables.
func NewNPRClient() *NPRClient {
	rss := NewRSSClient("NPR", "https://www.npr.org", nprFeeds, nprFeedNames, nprFeedLabels)
	return &NPRClient{RSSClient: rss}
}

// StoryURL returns the link for an NPR story, which is item.URL unchanged.
// item must be non-nil.
func (c *NPRClient) StoryURL(item *Item) string {
	link := item.URL
	return link
}
32 changes: 32 additions & 0 deletions api/reuters.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package api

// Reuters RSS feeds
// Reuters wire news - straight news, less opinion

// Feed tables for the Reuters source.
var (
	// reutersFeeds maps a feed key to the URL of its RSS document; the
	// entries differ only in the best-topics query parameter.
	reutersFeeds = map[string]string{
		"world":    "https://www.reutersagency.com/feed/?best-topics=world&post_type=best",
		"business": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
		"tech":     "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
		"sports":   "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
		"life":     "https://www.reutersagency.com/feed/?best-topics=lifestyle&post_type=best",
	}

	// reutersFeedNames lists the feed keys in presentation order; every entry
	// is a key of reutersFeeds.
	reutersFeedNames = []string{"world", "business", "tech", "sports", "life"}

	// reutersFeedLabels lists the display labels in the same order as
	// reutersFeedNames.
	reutersFeedLabels = []string{"World", "Business", "Tech", "Sports", "Life"}
)

// ReutersClient is a client for Reuters RSS feeds.
// It embeds *RSSClient, so the exported methods of the embedded client are
// promoted onto ReutersClient; StoryURL is additionally declared on
// ReutersClient itself.
type ReutersClient struct {
*RSSClient
}

// NewReutersClient returns a ReutersClient wrapping the RSSClient that
// NewRSSClient builds from the name "Reuters", the URL
// "https://www.reuters.com" and the reutersFeeds, reutersFeedNames and
// reutersFeedLabels tables.
func NewReutersClient() *ReutersClient {
	rss := NewRSSClient("Reuters", "https://www.reuters.com", reutersFeeds, reutersFeedNames, reutersFeedLabels)
	return &ReutersClient{RSSClient: rss}
}

// StoryURL returns the link for a Reuters story, which is item.URL unchanged.
// item must be non-nil.
func (c *ReutersClient) StoryURL(item *Item) string {
	link := item.URL
	return link
}
Loading