diff --git a/api/apnews.go b/api/apnews.go new file mode 100644 index 0000000..a30fec6 --- /dev/null +++ b/api/apnews.go @@ -0,0 +1,232 @@ +package api + +import ( + "fmt" + "net/http" + "strings" + "sync" + "time" + + "github.com/PuerkitoBio/goquery" +) + +// AP News doesn't have public RSS feeds, so we scrape their website +// AP provides straight wire news with minimal opinion + +const apBaseURL = "https://apnews.com" + +var apFeedPaths = map[string]string{ + "top": "", + "world": "/world-news", + "us": "/us-news", + "politics": "/politics", + "business": "/business", + "tech": "/technology", + "science": "/science", + "health": "/health", + "sports": "/sports", + "entertain": "/entertainment", +} + +var apFeedNames = []string{"top", "world", "us", "politics", "business", "tech", "science", "health", "sports", "entertain"} +var apFeedLabels = []string{"Top", "World", "US", "Politics", "Business", "Tech", "Science", "Health", "Sports", "Arts"} + +// APNewsClient scrapes AP News +type APNewsClient struct { + http *http.Client + storyCache map[int]*Item + cacheMu sync.RWMutex + lastRequest time.Time + requestMu sync.Mutex +} + +// NewAPNewsClient creates a new AP News scraping client +func NewAPNewsClient() *APNewsClient { + return &APNewsClient{ + http: &http.Client{ + Timeout: 15 * time.Second, + }, + storyCache: make(map[int]*Item), + } +} + +// throttle ensures we don't make requests too quickly +func (c *APNewsClient) throttle() { + c.requestMu.Lock() + defer c.requestMu.Unlock() + + minDelay := 500 * time.Millisecond + elapsed := time.Since(c.lastRequest) + if elapsed < minDelay { + time.Sleep(minDelay - elapsed) + } + c.lastRequest = time.Now() +} + +// Name returns the display name of the source +func (c *APNewsClient) Name() string { + return "AP News" +} + +// FeedNames returns the available feed names +func (c *APNewsClient) FeedNames() []string { + return apFeedNames +} + +// FeedLabels returns the display labels for feeds +func (c *APNewsClient) FeedLabels() []string { + return apFeedLabels +} + +// StoryURL returns the URL for viewing a story on AP News +func (c *APNewsClient) StoryURL(item *Item) string { + return item.URL +} + +// FetchStoryIDs fetches story IDs for a feed +func (c *APNewsClient) FetchStoryIDs(feed string) ([]int, error) { + path, ok := apFeedPaths[feed] + if !ok { + return nil, fmt.Errorf("unknown feed: %s", feed) + } + + stories, err := c.fetchStories(path) + if err != nil { + return nil, err + } + + // Cache stories and return IDs + ids := make([]int, len(stories)) + c.cacheMu.Lock() + c.storyCache = make(map[int]*Item) + for i, story := range stories { + id := i + 1 + c.storyCache[id] = story + ids[i] = id + } + c.cacheMu.Unlock() + + return ids, nil +} + +// fetchStories fetches stories from AP News +func (c *APNewsClient) fetchStories(path string) ([]*Item, error) { + c.throttle() + + url := apBaseURL + path + + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; feedme/1.0)") + + resp, err := c.http.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to fetch AP News: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return nil, fmt.Errorf("AP News returned status %d", resp.StatusCode) + } + + doc, err := goquery.NewDocumentFromReader(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + return c.parseStories(doc) +} + +// parseStories extracts stories from the HTML document +func (c 
*APNewsClient) parseStories(doc *goquery.Document) ([]*Item, error) {
+	var stories []*Item
+	seen := make(map[string]bool)
+
+	// AP News uses various selectors for story cards
+	selectors := []string{
+		"a[data-key='card-headline']",
+		"a.Link[href*='/article/']",
+		"div.PagePromo a[href*='/article/']",
+		"h2 a[href*='/article/']",
+		"h3 a[href*='/article/']",
+	}
+
+	for _, selector := range selectors {
+		doc.Find(selector).Each(func(i int, s *goquery.Selection) {
+			href, exists := s.Attr("href")
+			if !exists {
+				return
+			}
+
+			// Make URL absolute
+			if strings.HasPrefix(href, "/") {
+				href = apBaseURL + href
+			}
+
+			// Skip if already seen
+			if seen[href] {
+				return
+			}
+			seen[href] = true
+
+			// Get title
+			title := strings.TrimSpace(s.Text())
+			if title == "" {
+				title = strings.TrimSpace(s.Find("span").Text())
+			}
+			if title == "" {
+				return
+			}
+
+			story := &Item{
+				Title: title,
+				URL:   href,
+				By:    "AP",
+				Time:  time.Now().Unix(), // listing pages don't expose timestamps; fall back to fetch time
+				Type:  href,
+			}
+
+			stories = append(stories, story)
+		})
+	}
+
+	// Limit to a reasonable number
+	if len(stories) > 50 {
+		stories = stories[:50]
+	}
+
+	return stories, nil
+}
+
+// FetchItem fetches a cached item by ID
+func (c *APNewsClient) FetchItem(id int) (*Item, error) {
+	c.cacheMu.RLock()
+	item, ok := c.storyCache[id]
+	c.cacheMu.RUnlock()
+
+	if !ok {
+		return nil, fmt.Errorf("item %d not found in cache", id)
+	}
+
+	return item, nil
+}
+
+// FetchItems fetches multiple cached items by ID
+func (c *APNewsClient) FetchItems(ids []int) ([]*Item, error) {
+	items := make([]*Item, len(ids))
+	c.cacheMu.RLock()
+	for i, id := range ids {
+		if item, ok := c.storyCache[id]; ok {
+			items[i] = item
+		}
+	}
+	c.cacheMu.RUnlock()
+	return items, nil
+}
+
+// FetchCommentTree returns empty comments (news sites don't have integrated comments)
+func (c *APNewsClient) FetchCommentTree(item *Item, maxDepth int) ([]*Comment, error) {
+	return []*Comment{}, nil
+}
diff --git a/api/bbc.go b/api/bbc.go
new file mode 100644
index 0000000..1a8cc35
--- /dev/null
+++ b/api/bbc.go
@@ -0,0 +1,35 @@
+package api
+
+// BBC News RSS feeds
+// BBC provides excellent RSS coverage across different topics
+
+var bbcFeeds = map[string]string{
+	"top":       "https://feeds.bbci.co.uk/news/rss.xml",
+	"world":     "https://feeds.bbci.co.uk/news/world/rss.xml",
+	"uk":        "https://feeds.bbci.co.uk/news/uk/rss.xml",
+	"business":  "https://feeds.bbci.co.uk/news/business/rss.xml",
+	"tech":      "https://feeds.bbci.co.uk/news/technology/rss.xml",
+	"science":   "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml",
+	"health":    "https://feeds.bbci.co.uk/news/health/rss.xml",
+	"entertain": "https://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml",
+}
+
+var bbcFeedNames = []string{"top", "world", "uk", "business", "tech", "science", "health", "entertain"}
+var bbcFeedLabels = []string{"Top", "World", "UK", "Business", "Tech", "Science", "Health", "Arts"}
+
+// BBCClient is a client for BBC News RSS feeds
+type BBCClient struct {
+	*RSSClient
+}
+
+// NewBBCClient creates a new BBC News client
+func NewBBCClient() *BBCClient {
+	return &BBCClient{
+		RSSClient: NewRSSClient("BBC", "https://www.bbc.com", bbcFeeds, bbcFeedNames, bbcFeedLabels),
+	}
+}
+
+// StoryURL returns the BBC article URL
+func (c *BBCClient) StoryURL(item *Item) string {
+	return item.URL
+}
diff --git a/api/google_news.go b/api/google_news.go
new file mode 100644
index 0000000..24c200c
--- /dev/null
+++ b/api/google_news.go
@@ -0,0 +1,36 @@
+package api
+
+// Google News RSS feeds
+// Google News provides topic-based 
RSS feeds + +var googleNewsFeeds = map[string]string{ + "top": "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en", + "world": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", + "us": "https://news.google.com/rss/topics/CAAqIggKIhxDQkFTRHdvSkwyMHZNRGxqTjNjd0VnSmxiaWdBUAE?hl=en-US&gl=US&ceid=US:en", + "business": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", + "tech": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", + "science": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp0Y1RjU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", + "health": "https://news.google.com/rss/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNR3QwTlRFU0FtVnVLQUFQAQ?hl=en-US&gl=US&ceid=US:en", + "sports": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp1ZEdvU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", + "entertain": "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNREpxYW5RU0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en", +} + +var googleNewsFeedNames = []string{"top", "world", "us", "business", "tech", "science", "health", "sports", "entertain"} +var googleNewsFeedLabels = []string{"Top", "World", "US", "Business", "Tech", "Science", "Health", "Sports", "Arts"} + +// GoogleNewsClient is a client for Google News RSS feeds +type GoogleNewsClient struct { + *RSSClient +} + +// NewGoogleNewsClient creates a new Google News client +func NewGoogleNewsClient() *GoogleNewsClient { + return &GoogleNewsClient{ + RSSClient: NewRSSClient("Google News", "https://news.google.com", googleNewsFeeds, googleNewsFeedNames, googleNewsFeedLabels), + } +} + +// StoryURL returns the article URL +func (c *GoogleNewsClient) StoryURL(item *Item) string { + return item.URL +} diff --git a/api/npr.go b/api/npr.go new file mode 100644 index 0000000..3dd7d63 --- /dev/null +++ b/api/npr.go @@ -0,0 +1,36 @@ +package api + +// NPR RSS feeds +// NPR provides RSS feeds for news and culture + +var nprFeeds = map[string]string{ + "news": "https://feeds.npr.org/1001/rss.xml", // News + "world": "https://feeds.npr.org/1004/rss.xml", // World + "us": "https://feeds.npr.org/1003/rss.xml", // National/US + "politics": "https://feeds.npr.org/1014/rss.xml", // Politics + "business": "https://feeds.npr.org/1006/rss.xml", // Business + "tech": "https://feeds.npr.org/1019/rss.xml", // Technology + "science": "https://feeds.npr.org/1007/rss.xml", // Science + "health": "https://feeds.npr.org/1128/rss.xml", // Health + "culture": "https://feeds.npr.org/1008/rss.xml", // Arts & Life +} + +var nprFeedNames = []string{"news", "world", "us", "politics", "business", "tech", "science", "health", "culture"} +var nprFeedLabels = []string{"News", "World", "US", "Politics", "Business", "Tech", "Science", "Health", "Culture"} + +// NPRClient is a client for NPR RSS feeds +type NPRClient struct { + *RSSClient +} + +// NewNPRClient creates a new NPR client +func NewNPRClient() *NPRClient { + return &NPRClient{ + RSSClient: NewRSSClient("NPR", "https://www.npr.org", nprFeeds, nprFeedNames, nprFeedLabels), + } +} + +// StoryURL returns the NPR article URL +func (c *NPRClient) StoryURL(item *Item) string { + return item.URL +} diff --git a/api/reuters.go b/api/reuters.go new file mode 100644 index 0000000..a8e5e81 --- /dev/null +++ b/api/reuters.go @@ -0,0 +1,32 @@ +package api + +// Reuters RSS feeds +// Reuters wire 
news: straight reporting with minimal opinion content
+
+var reutersFeeds = map[string]string{
+	"world":    "https://www.reutersagency.com/feed/?best-topics=world&post_type=best",
+	"business": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
+	"tech":     "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
+	"sports":   "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
+	"life":     "https://www.reutersagency.com/feed/?best-topics=lifestyle&post_type=best",
+}
+
+var reutersFeedNames = []string{"world", "business", "tech", "sports", "life"}
+var reutersFeedLabels = []string{"World", "Business", "Tech", "Sports", "Life"}
+
+// ReutersClient is a client for Reuters RSS feeds
+type ReutersClient struct {
+	*RSSClient
+}
+
+// NewReutersClient creates a new Reuters client
+func NewReutersClient() *ReutersClient {
+	return &ReutersClient{
+		RSSClient: NewRSSClient("Reuters", "https://www.reuters.com", reutersFeeds, reutersFeedNames, reutersFeedLabels),
+	}
+}
+
+// StoryURL returns the Reuters article URL
+func (c *ReutersClient) StoryURL(item *Item) string {
+	return item.URL
+}
diff --git a/api/rss.go b/api/rss.go
new file mode 100644
index 0000000..d2ba502
--- /dev/null
+++ b/api/rss.go
@@ -0,0 +1,326 @@
+package api
+
+import (
+	"bytes"
+	"encoding/xml"
+	"fmt"
+	"html"
+	"io"
+	"net/http"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// RSS feed structures for parsing XML
+
+// RSSFeed represents an RSS 2.0 feed
+type RSSFeed struct {
+	XMLName xml.Name   `xml:"rss"`
+	Channel RSSChannel `xml:"channel"`
+}
+
+// RSSChannel represents an RSS channel
+type RSSChannel struct {
+	Title       string    `xml:"title"`
+	Link        string    `xml:"link"`
+	Description string    `xml:"description"`
+	Items       []RSSItem `xml:"item"`
+}
+
+// RSSItem represents an RSS item
+type RSSItem struct {
+	Title       string `xml:"title"`
+	Link        string `xml:"link"`
+	Description string `xml:"description"`
+	PubDate     string `xml:"pubDate"`
+	GUID        string `xml:"guid"`
+	Author      string `xml:"author"`
+	Creator     string `xml:"creator"` // Dublin Core dc:creator
+	Source      string `xml:"source"`
+}
+
+// AtomFeed represents an Atom feed
+type AtomFeed struct {
+	XMLName xml.Name    `xml:"feed"`
+	Title   string      `xml:"title"`
+	Entries []AtomEntry `xml:"entry"`
+}
+
+// AtomEntry represents an Atom entry
+type AtomEntry struct {
+	Title     string     `xml:"title"`
+	Link      AtomLink   `xml:"link"`
+	Published string     `xml:"published"`
+	Updated   string     `xml:"updated"`
+	ID        string     `xml:"id"`
+	Author    AtomAuthor `xml:"author"`
+	Summary   string     `xml:"summary"`
+	Content   string     `xml:"content"`
+}
+
+// AtomLink represents an Atom link
+type AtomLink struct {
+	Href string `xml:"href,attr"`
+	Rel  string `xml:"rel,attr"`
+	Type string `xml:"type,attr"`
+}
+
+// AtomAuthor represents an Atom author
+type AtomAuthor struct {
+	Name string `xml:"name"`
+}
+
+// RSSClient is a base client for RSS-based news sources
+type RSSClient struct {
+	http        *http.Client
+	name        string
+	feedURLs    map[string]string // feed name -> URL
+	feedNames   []string
+	feedLabels  []string
+	storyCache  map[int]*Item
+	urlToID     map[string]int
+	nextID      int
+	baseURL     string
+	lastRequest time.Time
+}
+
+// NewRSSClient creates a new RSS client
+func NewRSSClient(name string, baseURL string, feeds map[string]string, feedNames, feedLabels []string) *RSSClient {
+	return &RSSClient{
+		http: &http.Client{
+			Timeout: 15 * time.Second,
+		},
+		name:       name,
+		baseURL:    baseURL,
+		feedURLs:   feeds,
+		feedNames:  feedNames,
+		feedLabels: feedLabels,
+		storyCache: make(map[int]*Item),
+		urlToID:    make(map[string]int),
+		nextID:     1,
+	}
+}
+
+// Name returns the display name of the source
+func (c *RSSClient) Name() string {
+	return c.name
+}
+
+// FeedNames returns the available feed names
+func (c *RSSClient) FeedNames() []string {
+	return c.feedNames
+}
+
+// FeedLabels returns the display labels for feeds
+func (c *RSSClient) FeedLabels() []string {
+	return c.feedLabels
+}
+
+// StoryURL returns the URL for viewing a story
+func (c *RSSClient) StoryURL(item *Item) string {
+	return item.URL
+}
+
+// FetchStoryIDs fetches story IDs for a feed
+func (c *RSSClient) FetchStoryIDs(feed string) ([]int, error) {
+	feedURL, ok := c.feedURLs[feed]
+	if !ok {
+		return nil, fmt.Errorf("unknown feed: %s", feed)
+	}
+
+	items, err := c.fetchRSSFeed(feedURL)
+	if err != nil {
+		return nil, err
+	}
+
+	// Clear cache for new fetch
+	c.storyCache = make(map[int]*Item)
+	c.urlToID = make(map[string]int)
+	c.nextID = 1
+
+	ids := make([]int, len(items))
+	for i, item := range items {
+		id := c.nextID
+		c.nextID++
+		c.storyCache[id] = item
+		c.urlToID[item.URL] = id
+		ids[i] = id
+	}
+
+	return ids, nil
+}
+
+// fetchRSSFeed fetches and parses an RSS feed
+func (c *RSSClient) fetchRSSFeed(url string) ([]*Item, error) {
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; feedme/1.0)")
+
+	resp, err := c.http.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to fetch RSS feed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != 200 {
+		return nil, fmt.Errorf("RSS feed returned status %d", resp.StatusCode)
+	}
+
+	// Read the body once so both formats can be tried without a second request
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read feed: %w", err)
+	}
+
+	// Try parsing as RSS 2.0 first
+	var rssFeed RSSFeed
+	decoder := xml.NewDecoder(bytes.NewReader(body))
+	decoder.Strict = false
+	if err := decoder.Decode(&rssFeed); err == nil && len(rssFeed.Channel.Items) > 0 {
+		return c.convertRSSItems(rssFeed.Channel.Items), nil
+	}
+
+	// Fall back to parsing the same bytes as Atom
+	var atomFeed AtomFeed
+	decoder2 := xml.NewDecoder(bytes.NewReader(body))
+	decoder2.Strict = false
+	if err := decoder2.Decode(&atomFeed); err == nil && len(atomFeed.Entries) > 0 {
+		return c.convertAtomEntries(atomFeed.Entries), nil
+	}
+
+	return nil, fmt.Errorf("failed to parse feed as RSS or Atom")
+}
+
+// convertRSSItems converts RSS items to our Item type
+func (c *RSSClient) convertRSSItems(rssItems []RSSItem) []*Item {
+	items := make([]*Item, 0, len(rssItems))
+	for _, rss := range rssItems {
+		item := &Item{
+			Title: html.UnescapeString(strings.TrimSpace(rss.Title)),
+			URL:   strings.TrimSpace(rss.Link),
+			By:    c.extractAuthor(rss),
+			Time:  c.parseRSSDate(rss.PubDate),
+			Text:  cleanDescription(rss.Description),
+			Type:  rss.GUID,
+		}
+		if item.Title != "" && item.URL != "" {
+			items = append(items, item)
+		}
+	}
+	return items
+}
+
+// convertAtomEntries converts Atom entries to our Item type
+func (c *RSSClient) convertAtomEntries(entries []AtomEntry) []*Item {
+	items := make([]*Item, 0, len(entries))
+	for _, entry := range entries {
+		pubTime := entry.Published
+		if pubTime == "" {
+			pubTime = entry.Updated
+		}
+
+		item := &Item{
+			Title: html.UnescapeString(strings.TrimSpace(entry.Title)),
+			URL:   strings.TrimSpace(entry.Link.Href),
+			By:    entry.Author.Name,
+			Time:  c.parseRSSDate(pubTime),
+			Text:  cleanDescription(entry.Summary),
+			Type:  entry.ID,
+		}
+		if item.Title != "" && item.URL != "" {
+			items = append(items, 
item)
+		}
+	}
+	return items
+}
+
+// extractAuthor extracts author from RSS item
+func (c *RSSClient) extractAuthor(rss RSSItem) string {
+	if rss.Author != "" {
+		return rss.Author
+	}
+	if rss.Creator != "" {
+		return rss.Creator
+	}
+	if rss.Source != "" {
+		return rss.Source
+	}
+	return ""
+}
+
+// parseRSSDate parses various RSS date formats
+func (c *RSSClient) parseRSSDate(dateStr string) int64 {
+	if dateStr == "" {
+		return time.Now().Unix()
+	}
+
+	formats := []string{
+		time.RFC1123Z,
+		time.RFC1123,
+		time.RFC822Z,
+		time.RFC822,
+		time.RFC3339,
+		"2006-01-02T15:04:05Z",
+		"2006-01-02T15:04:05-07:00",
+		"Mon, 2 Jan 2006 15:04:05 -0700",
+		"Mon, 2 Jan 2006 15:04:05 MST",
+		"2 Jan 2006 15:04:05 -0700",
+		"2006-01-02 15:04:05",
+	}
+
+	dateStr = strings.TrimSpace(dateStr)
+	for _, format := range formats {
+		if t, err := time.Parse(format, dateStr); err == nil {
+			return t.Unix()
+		}
+	}
+
+	return time.Now().Unix() // fall back to fetch time when no format matches
+}
+
+// FetchItem fetches a cached item by ID
+func (c *RSSClient) FetchItem(id int) (*Item, error) {
+	if item, ok := c.storyCache[id]; ok {
+		return item, nil
+	}
+	return nil, fmt.Errorf("item %d not found in cache", id)
+}
+
+// FetchItems fetches multiple cached items by ID
+func (c *RSSClient) FetchItems(ids []int) ([]*Item, error) {
+	items := make([]*Item, len(ids))
+	for i, id := range ids {
+		if item, ok := c.storyCache[id]; ok {
+			items[i] = item
+		}
+	}
+	return items, nil
+}
+
+// FetchCommentTree returns empty comments (news sites don't have integrated comments)
+func (c *RSSClient) FetchCommentTree(item *Item, maxDepth int) ([]*Comment, error) {
+	// Most news sites don't have comments in their RSS feeds
+	return []*Comment{}, nil
+}
+
+// cleanDescription removes HTML tags and cleans up description text
+func cleanDescription(desc string) string {
+	if desc == "" {
+		return ""
+	}
+	// Remove HTML tags
+	re := regexp.MustCompile(`<[^>]*>`)
+	text := re.ReplaceAllString(desc, "")
+	// Unescape HTML entities
+	text = html.UnescapeString(text)
+	// Clean up whitespace
+	text = strings.Join(strings.Fields(text), " ")
+	// Truncate if too long (by rune, so multi-byte characters aren't split)
+	if runes := []rune(text); len(runes) > 300 {
+		text = string(runes[:297]) + "..." 
+ } + return text +} diff --git a/main.go b/main.go index 8383879..4adca6a 100644 --- a/main.go +++ b/main.go @@ -17,7 +17,7 @@ var version = "dev" func main() { var sourceFlag string var showVersion bool - flag.StringVar(&sourceFlag, "source", "hn", "News source: hn, lobsters, or r/subreddit (e.g., r/golang)") + flag.StringVar(&sourceFlag, "source", "hn", "News source: hn, lobsters, r/subreddit, bbc, npr, google, reuters, ap") flag.StringVar(&sourceFlag, "s", "hn", "News source (shorthand)") flag.BoolVar(&showVersion, "version", false, "Show version information") flag.BoolVar(&showVersion, "v", false, "Show version information (shorthand)") @@ -44,9 +44,19 @@ func main() { source = api.NewLobstersClient() case strings.HasPrefix(sourceLower, "r/") || strings.HasPrefix(sourceLower, "/r/"): source = api.NewRedditClient(sourceFlag) + case sourceLower == "bbc" || sourceLower == "bbcnews" || sourceLower == "bbc-news": + source = api.NewBBCClient() + case sourceLower == "npr": + source = api.NewNPRClient() + case sourceLower == "google" || sourceLower == "googlenews" || sourceLower == "google-news": + source = api.NewGoogleNewsClient() + case sourceLower == "reuters": + source = api.NewReutersClient() + case sourceLower == "ap" || sourceLower == "apnews" || sourceLower == "ap-news": + source = api.NewAPNewsClient() default: fmt.Fprintf(os.Stderr, "Unknown source: %s\n", sourceFlag) - fmt.Fprintf(os.Stderr, "Valid sources: hn, lobsters, r/subreddit\n") + fmt.Fprintf(os.Stderr, "Valid sources: hn, lobsters, r/subreddit, bbc, npr, google, reuters, ap\n") os.Exit(1) }
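
Every RSS-backed source above follows the same extension pattern: a feed map, parallel name/label slices, a thin struct embedding *RSSClient, and one case in main.go's source switch. A minimal sketch of one more source under that pattern (the Guardian name and feed URLs below are illustrative assumptions, not part of this change):

	package api

	// Hypothetical example only; verify the real feed URLs before wiring this up.
	var guardianFeeds = map[string]string{
		"top":   "https://www.theguardian.com/international/rss", // assumed URL
		"world": "https://www.theguardian.com/world/rss",         // assumed URL
	}

	var guardianFeedNames = []string{"top", "world"}
	var guardianFeedLabels = []string{"Top", "World"}

	// GuardianClient is a client for Guardian RSS feeds
	type GuardianClient struct {
		*RSSClient
	}

	// NewGuardianClient creates a new Guardian client
	func NewGuardianClient() *GuardianClient {
		return &GuardianClient{
			RSSClient: NewRSSClient("Guardian", "https://www.theguardian.com", guardianFeeds, guardianFeedNames, guardianFeedLabels),
		}
	}

plus one more case in main.go's switch:

	case sourceLower == "guardian":
		source = api.NewGuardianClient()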