const Parser = require('rss-parser'); const config = require('../config'); const { fetchWithPolicy } = require('../http'); const parser = new Parser({ timeout: 10000, headers: { 'User-Agent': 'Mozilla/5.0', Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8', }, }); const blockedFeedDomains = [ 'arabnews.com', 'arabianbusiness.com', 'business-standard.com', 'cityam.com', 'eleconomista.com.mx', 'eleconomista.es', 'moneycontrol.com', 'thisismoney.co.uk', ]; const invalidFeedLabels = new Set([ 'ABC Business AU', 'Australian Fin Review', 'Business Daily Africa', 'BusinessLive SA', 'Caixin Global', 'Cinco Dias', 'El Comercio Peru', 'FD.nl', 'Gulf News Business', 'Il Sole 24 Ore', 'Infobae Economia AR', 'Japan Times Business', 'Korea JoongAng Daily', 'Les Echos', 'Live Mint', 'NZ Herald Business', 'Portafolio Colombia', 'The Star Malaysia', 'Xinhua Business', ]); const malformedFeedLabels = new Set([ 'BFM Business', 'Business Daily Africa', 'Nation News Barbados', ]); const loggedBlockedFeeds = new Set(); const loggedInvalidFeeds = new Set(); const loggedUpstreamFeedSkips = new Set(); function getHostname(url) { try { return new URL(url).hostname.toLowerCase(); } catch { return ''; } } function isBlockedFeed(feed) { const hostname = getHostname(feed.url); return blockedFeedDomains.some((domain) => hostname === domain || hostname.endsWith(`.${domain}`)); } function isMalformedFeedError(error) { const message = String(error && error.message || ''); return message.includes('Invalid character in entity name') || message.includes('Attribute without value'); } function getErrorStatus(error) { if (error && Number.isInteger(error.status)) { return error.status; } const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/); return match ? Number(match[1]) : null; } async function parseFeed(feedUrl) { const response = await fetchWithPolicy(feedUrl, { timeout: 10000, retries: 1, headers: { Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8', }, }); if (!response.ok) { const error = new Error(`Status code ${response.status}`); error.status = response.status; throw error; } const xml = await response.text(); return parser.parseString(xml); } async function fetchRssArticles() { const articles = []; for (const feed of config.rssFeeds || []) { const label = feed.label || feed.url; if (invalidFeedLabels.has(label)) { if (!loggedInvalidFeeds.has(label)) { loggedInvalidFeeds.add(label); console.warn(`RSS feed skipped for invalid endpoint ${label}`); } continue; } if (isBlockedFeed(feed)) { const hostname = getHostname(feed.url); if (!loggedBlockedFeeds.has(hostname)) { loggedBlockedFeeds.add(hostname); console.warn(`RSS feed skipped for blocked domain ${hostname}`); } continue; } try { const parsed = await parseFeed(feed.url); for (const item of parsed.items || []) { const title = String(item.title || '').trim(); const url = String(item.link || item.guid || '').trim(); if (!title || !url) { continue; } articles.push({ title, description: item.contentSnippet || item.content || item.summary || null, url, source: feed.label ? `rss:${feed.label}` : 'rss', pubDate: item.isoDate || item.pubDate || null, }); } } catch (error) { if (malformedFeedLabels.has(label) && isMalformedFeedError(error)) { if (!loggedInvalidFeeds.has(label)) { loggedInvalidFeeds.add(label); console.warn(`RSS feed skipped for malformed XML ${label}`); } continue; } const status = getErrorStatus(error); if (status === 401 || status === 403 || status === 404 || status === 429) { const key = `${label}:${status}`; if (!loggedUpstreamFeedSkips.has(key)) { loggedUpstreamFeedSkips.add(key); console.warn(`RSS feed skipped for ${label}: upstream returned ${status}`); } continue; } console.error(`Failed to fetch RSS feed: ${label}`, error); } } return articles; } module.exports = { fetchRssArticles, };