// Listing metadata (from the source viewer): 168 lines, 4.3 KiB, JavaScript.
const Parser = require('rss-parser');
|
|
const config = require('../config');
|
|
const { fetchWithPolicy } = require('../http');
|
|
|
|
// Shared rss-parser instance used to turn feed XML into parsed feed objects.
// NOTE(review): in the code visible here the instance is only used through
// parseString(), i.e. on XML that was already downloaded elsewhere. The
// timeout/headers options presumably only matter when rss-parser performs the
// HTTP request itself — confirm before relying on them.
const parser = new Parser({
  timeout: 10000,
  headers: {
    'User-Agent': 'Mozilla/5.0',
    Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
  },
});
|
|
|
|
// Domains whose feeds are never requested. A feed is considered blocked when
// its hostname equals one of these entries or is a subdomain of one.
const blockedFeedDomains = [
  'arabnews.com',
  'arabianbusiness.com',
  'business-standard.com',
  'cityam.com',
  'eleconomista.com.mx',
  'eleconomista.es',
  'moneycontrol.com',
  'thisismoney.co.uk',
];
|
|
// Feed labels that are skipped outright, before any request is made; the skip
// is reported as an "invalid endpoint". Matching is by exact label string.
const invalidFeedLabels = new Set([
  'ABC Business AU',
  'Australian Fin Review',
  'Business Daily Africa',
  'BusinessLive SA',
  'Caixin Global',
  'Cinco Dias',
  'El Comercio Peru',
  'FD.nl',
  'Gulf News Business',
  'Il Sole 24 Ore',
  'Infobae Economia AR',
  'Japan Times Business',
  'Korea JoongAng Daily',
  'Les Echos',
  'Live Mint',
  'NZ Herald Business',
  'Portafolio Colombia',
  'The Star Malaysia',
  'Xinhua Business',
]);
|
|
// Feed labels known to serve broken XML. For these, a recognised XML parse
// error is reported once as a warning instead of being logged as a failure.
// Note: 'Business Daily Africa' is also listed in invalidFeedLabels, so that
// feed is skipped before it is fetched and its entry here only takes effect
// if it is removed from that set.
const malformedFeedLabels = new Set([
  'BFM Business',
  'Business Daily Africa',
  'Nation News Barbados',
]);
|
|
// Module-level memory of which skip warnings were already printed, so each
// distinct skip is logged only once for the lifetime of the module.
// Keys: blocked hostname.
const loggedBlockedFeeds = new Set();
// Keys: feed label (used for both "invalid endpoint" and "malformed XML" skips).
const loggedInvalidFeeds = new Set();
// Keys: `${label}:${status}` for skips caused by an upstream HTTP status.
const loggedUpstreamFeedSkips = new Set();
|
|
|
|
/**
 * Extracts the lower-cased hostname from a URL.
 *
 * A single trailing dot (the fully-qualified spelling, e.g. "example.com.")
 * is removed: the URL parser keeps it, which would otherwise make
 * "example.com." compare unequal to "example.com" in domain checks.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} The hostname without port, or '' when `url` cannot be
 *   parsed as a URL.
 */
function getHostname(url) {
  try {
    return new URL(url).hostname.toLowerCase().replace(/\.$/, '');
  } catch {
    // Unparseable input is treated as "no hostname" rather than an error.
    return '';
  }
}
|
|
|
|
/**
 * Reports whether a feed is hosted on a blocked domain.
 *
 * A feed matches when its hostname is exactly a blocked domain or ends with
 * ".<domain>" (i.e. is a subdomain). A feed whose URL cannot be parsed has an
 * empty hostname and therefore never matches.
 *
 * @param {{url: string}} feed - Feed entry; only `url` is read.
 * @returns {boolean} True when the feed's host is on the block list.
 */
function isBlockedFeed(feed) {
  const hostname = getHostname(feed.url);
  return blockedFeedDomains.some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
}
|
|
|
|
/**
 * Reports whether an error looks like one of the known XML syntax failures.
 *
 * Detection is by substring match on the error message; a missing error or
 * missing message is treated as an empty message.
 *
 * @param {*} error - Caught value, typically an Error.
 * @returns {boolean} True when the message contains one of the recognised
 *   XML parse-failure phrases.
 */
function isMalformedFeedError(error) {
  const message = String((error && error.message) || '');
  return (
    message.includes('Invalid character in entity name') ||
    message.includes('Attribute without value')
  );
}
|
|
|
|
/**
 * Derives an HTTP status code from an error.
 *
 * An integer `error.status` is returned as-is. Otherwise the error message is
 * searched for a standalone 401, 403, 404, 408, 429 or 5xx number, which
 * covers errors that only carry the status in their text
 * (e.g. "Status code 403").
 *
 * @param {*} error - Caught value, typically an Error.
 * @returns {number|null} The status code, or null when none can be found.
 */
function getErrorStatus(error) {
  if (error && Number.isInteger(error.status)) {
    return error.status;
  }

  const message = String((error && error.message) || '');
  const match = message.match(/\b(401|403|404|408|429|5\d\d)\b/);
  return match ? Number(match[1]) : null;
}
|
|
|
|
/**
 * Downloads a feed and parses its XML.
 *
 * The request goes through fetchWithPolicy with a 10000 timeout, one retry
 * and an RSS/XML Accept header; the response body is then handed to the
 * shared parser via parseString().
 *
 * @param {string} feedUrl - URL of the feed to download.
 * @returns {Promise<object>} The parsed feed as produced by parser.parseString.
 * @throws {Error} With a numeric `status` property and the message
 *   "Status code <status>" when the response is not ok. Errors raised by
 *   fetchWithPolicy or by the parser propagate unchanged.
 */
async function parseFeed(feedUrl) {
  const response = await fetchWithPolicy(feedUrl, {
    timeout: 10000,
    retries: 1,
    headers: {
      Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
    },
  });

  if (!response.ok) {
    // Attach the status so callers can branch on it without parsing the text.
    const error = new Error(`Status code ${response.status}`);
    error.status = response.status;
    throw error;
  }

  const xml = await response.text();
  return parser.parseString(xml);
}
|
|
|
|
/**
 * Logs a warning the first time `key` is seen in `seen`; later calls with the
 * same key are silent.
 *
 * @param {Set<string>} seen - Keys that have already been reported.
 * @param {string} key - Identity of this warning.
 * @param {string} message - Text passed to console.warn.
 */
function warnOnce(seen, key, message) {
  if (!seen.has(key)) {
    seen.add(key);
    console.warn(message);
  }
}

/**
 * Converts one parsed feed item into an article record.
 *
 * @param {object} item - Item from the parsed feed.
 * @param {{label?: string}} feed - Feed the item came from.
 * @returns {{title: string, description: (string|null), url: string,
 *   source: string, pubDate: (string|null)}|null} The article, or null when
 *   the item has no usable title or URL.
 */
function toArticle(item, feed) {
  const title = String(item.title || '').trim();
  // Fall back to the guid when the item carries no link.
  const url = String(item.link || item.guid || '').trim();

  if (!title || !url) {
    return null;
  }

  return {
    title,
    description: item.contentSnippet || item.content || item.summary || null,
    url,
    source: feed.label ? `rss:${feed.label}` : 'rss',
    pubDate: item.isoDate || item.pubDate || null,
  };
}

/**
 * Collects articles from every feed in `config.rssFeeds`.
 *
 * Feeds are processed one after another. A feed is skipped (with a one-time
 * warning) when its label is in invalidFeedLabels or its host is blocked.
 * Failures while fetching or parsing a feed never abort the run:
 *   - a recognised malformed-XML error on a label in malformedFeedLabels and
 *     an upstream 401/403/404/429 are each reported once as a warning;
 *   - anything else is logged with console.error.
 * Items lacking a title or URL are dropped.
 *
 * @returns {Promise<Array<{title: string, description: (string|null),
 *   url: string, source: string, pubDate: (string|null)}>>} Articles from all
 *   feeds that could be read, in feed order.
 */
async function fetchRssArticles() {
  const articles = [];

  for (const feed of config.rssFeeds || []) {
    const label = feed.label || feed.url;

    if (invalidFeedLabels.has(label)) {
      warnOnce(loggedInvalidFeeds, label, `RSS feed skipped for invalid endpoint ${label}`);
      continue;
    }

    if (isBlockedFeed(feed)) {
      const hostname = getHostname(feed.url);
      warnOnce(loggedBlockedFeeds, hostname, `RSS feed skipped for blocked domain ${hostname}`);
      continue;
    }

    try {
      const parsed = await parseFeed(feed.url);
      for (const item of parsed.items || []) {
        const article = toArticle(item, feed);
        if (article) {
          articles.push(article);
        }
      }
    } catch (error) {
      if (malformedFeedLabels.has(label) && isMalformedFeedError(error)) {
        warnOnce(loggedInvalidFeeds, label, `RSS feed skipped for malformed XML ${label}`);
        continue;
      }

      const status = getErrorStatus(error);
      if (status === 401 || status === 403 || status === 404 || status === 429) {
        warnOnce(
          loggedUpstreamFeedSkips,
          `${label}:${status}`,
          `RSS feed skipped for ${label}: upstream returned ${status}`,
        );
        continue;
      }

      // Unexpected failure: keep the full error for diagnosis, then move on.
      console.error(`Failed to fetch RSS feed: ${label}`, error);
    }
  }

  return articles;
}
|
|
|
|
module.exports = {
|
|
fetchRssArticles,
|
|
};
|