enhance news crawler configuration with new sources and improved request headers

This commit is contained in:
ImBenji 2026-04-16 23:32:56 +01:00
parent c91e4ddb60
commit 11647e6a35
6 changed files with 449 additions and 33 deletions

View file

@ -26,7 +26,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
- SQLite archive file defaults to `./archive.sqlite`.
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery.
- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
- Main article images are stored as ultra-compressed base64 WebP.
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.

View file

@ -365,6 +365,22 @@
{
"url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
"label": "El Comercio Peru"
},
{
"url": "https://jamaica-gleaner.com/feed/business.xml",
"label": "Jamaica Gleaner"
},
{
"url": "https://www.jamaicaobserver.com/app/business/",
"label": "Jamaica Observer"
},
{
"url": "https://www.stabroeknews.com/feed/",
"label": "Stabroek News"
},
{
"url": "https://nationnews.com/rss-feed/",
"label": "Nation News Barbados"
}
],
"gdelt": {
@ -376,24 +392,264 @@
"format": "json"
},
"newsCrawler": {
"sites": [
{
"name": "crawler_reuters",
"maxPages": 15,
"maxDepth": 1,
"requestTimeout": 15000,
"disabledLabels": [
"Arab News",
"Arabian Business",
"Australian Fin Review",
"BFM Business",
"Business Daily Africa",
"Business Standard IN",
"BusinessLive SA",
"Caixin Global",
"Cinco Dias",
"City A.M.",
"El Comercio Peru",
"El Economista ES",
"El Economista MX",
"FD.nl",
"Gulf News Business",
"Il Sole 24 Ore",
"Infobae Economia AR",
"Japan Times Business",
"Korea JoongAng Daily",
"Les Echos",
"Live Mint",
"Moneycontrol",
"NZ Herald Business",
"Portafolio Colombia",
"Reuters",
"The Star Malaysia",
"This Is Money",
"Xinhua Business"
],
"overrides": {
"Al Jazeera": {
"allowedHosts": [
"www.reuters.com",
"reuters.com"
"www.aljazeera.com",
"aljazeera.com"
],
"seeds": [
"https://www.reuters.com/world/",
"https://www.reuters.com/business/",
"https://www.reuters.com/markets/",
"https://www.reuters.com/technology/"
"https://www.aljazeera.com/",
"https://www.aljazeera.com/economy/",
"https://www.aljazeera.com/tag/technology/"
]
},
"Ars Technica": {
"allowedHosts": [
"arstechnica.com",
"www.arstechnica.com"
],
"maxPages": 100,
"maxDepth": 2,
"requestTimeout": 15000
"seeds": [
"https://arstechnica.com/",
"https://arstechnica.com/tech-policy/",
"https://arstechnica.com/information-technology/"
]
},
"BBC Business": {
"allowedHosts": [
"www.bbc.com",
"bbc.com"
],
"seeds": [
"https://www.bbc.com/news/business",
"https://www.bbc.com/news/technology"
]
},
"CNBC": {
"allowedHosts": [
"www.cnbc.com",
"cnbc.com"
],
"seeds": [
"https://www.cnbc.com/world/",
"https://www.cnbc.com/business/",
"https://www.cnbc.com/technology/"
]
},
"Guardian Business": {
"allowedHosts": [
"www.theguardian.com",
"theguardian.com"
],
"seeds": [
"https://www.theguardian.com/",
"https://www.theguardian.com/business",
"https://www.theguardian.com/technology"
]
},
"Jamaica Gleaner": {
"allowedHosts": [
"jamaica-gleaner.com",
"www.jamaica-gleaner.com"
],
"seeds": [
"https://jamaica-gleaner.com/",
"https://jamaica-gleaner.com/news",
"https://jamaica-gleaner.com/business"
]
},
"Jamaica Observer": {
"allowedHosts": [
"www.jamaicaobserver.com",
"jamaicaobserver.com"
],
"seeds": [
"https://www.jamaicaobserver.com/",
"https://www.jamaicaobserver.com/news/",
"https://www.jamaicaobserver.com/business/"
]
},
"Nation News Barbados": {
"allowedHosts": [
"nationnews.com",
"www.nationnews.com"
],
"seeds": [
"https://nationnews.com/",
"https://nationnews.com/category/business/",
"https://nationnews.com/category/news/"
]
},
"NPR Business": {
"allowedHosts": [
"www.npr.org",
"npr.org"
],
"seeds": [
"https://www.npr.org/sections/business/",
"https://www.npr.org/sections/technology/"
]
},
"The Verge": {
"allowedHosts": [
"www.theverge.com",
"theverge.com"
],
"seeds": [
"https://www.theverge.com/tech",
"https://www.theverge.com/business",
"https://www.theverge.com/archives"
]
},
"TechCrunch": {
"allowedHosts": [
"techcrunch.com",
"www.techcrunch.com"
],
"seeds": [
"https://techcrunch.com/",
"https://techcrunch.com/category/startups/",
"https://techcrunch.com/category/venture/"
]
},
"The Economist": {
"allowedHosts": [
"www.economist.com",
"economist.com"
],
"seeds": [
"https://www.economist.com/finance-and-economics",
"https://www.economist.com/business",
"https://www.economist.com/science-and-technology"
]
},
"Federal Reserve": {
"allowedHosts": [
"www.federalreserve.gov",
"federalreserve.gov"
],
"seeds": [
"https://www.federalreserve.gov/newsevents.htm",
"https://www.federalreserve.gov/monetarypolicy.htm"
]
},
"Fortune": {
"allowedHosts": [
"fortune.com",
"www.fortune.com"
],
"seeds": [
"https://fortune.com/",
"https://fortune.com/section/tech/",
"https://fortune.com/section/finance/"
]
},
"Forbes Business": {
"allowedHosts": [
"www.forbes.com",
"forbes.com"
],
"seeds": [
"https://www.forbes.com/business/",
"https://www.forbes.com/innovation/"
]
},
"Nikkei Asia": {
"allowedHosts": [
"asia.nikkei.com"
],
"seeds": [
"https://asia.nikkei.com/",
"https://asia.nikkei.com/Business",
"https://asia.nikkei.com/Technology"
]
},
"South China Morning Post": {
"allowedHosts": [
"www.scmp.com",
"scmp.com"
],
"seeds": [
"https://www.scmp.com/",
"https://www.scmp.com/business",
"https://www.scmp.com/tech"
]
},
"Stabroek News": {
"allowedHosts": [
"www.stabroeknews.com",
"stabroeknews.com"
],
"seeds": [
"https://www.stabroeknews.com/",
"https://www.stabroeknews.com/category/business/",
"https://www.stabroeknews.com/category/news/"
]
},
"Wall Street Journal": {
"allowedHosts": [
"www.wsj.com",
"wsj.com"
],
"seeds": [
"https://www.wsj.com/news/business",
"https://www.wsj.com/tech"
]
},
"Wired Business": {
"allowedHosts": [
"www.wired.com",
"wired.com"
],
"seeds": [
"https://www.wired.com/category/business/",
"https://www.wired.com/category/security/"
]
},
"Yahoo Finance": {
"allowedHosts": [
"finance.yahoo.com"
],
"seeds": [
"https://finance.yahoo.com/",
"https://finance.yahoo.com/news/",
"https://finance.yahoo.com/topic/tech/"
]
}
]
}
},
"scheduler": {
"rss": "0 */6 * * *",

View file

@ -44,8 +44,19 @@ const blockedContentDomains = [
];
const loggedBlockedDomains = new Set();
// Browser-like request headers (Chrome 135 on macOS, including client hints and
// Sec-Fetch-* navigation metadata).
// NOTE(review): presumably sent when downloading article HTML; the call site is
// outside this view, so confirm before relying on that.
// Each key is defined exactly once: a repeated key in an object literal is
// silently overridden by the later entry, which hides the value actually sent.
const articleFetchHeaders = {
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  'Accept-Language': 'en-US,en;q=0.9',
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
let contentBackfillRunning = false;
@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
}
/**
 * Records a content-fetch attempt for an article by running a prepared statement.
 *
 * The statement is executed exactly once. Its placeholder count, derived from
 * the number of `?` characters in `statement.source`, selects the argument list:
 *   - 3 placeholders: (message, attemptedAt, id)
 *   - 2 placeholders: (attemptedAt, id)
 *
 * @param {{source: string, run: Function}} statement - Prepared statement exposing
 *   its SQL text as `source` and an executor `run` (looks like a better-sqlite3
 *   Statement; confirm against the caller).
 * @param {*} id - Article row identifier, bound as the last parameter.
 * @param {string} message - Status/error message; only bound when the statement
 *   has three placeholders.
 * @throws {Error} When the statement has neither two nor three placeholders.
 */
function markArticleStatus(statement, id, message) {
  const attemptedAt = new Date().toISOString();
  // Counting "?" assumes the SQL has no literal question marks outside of
  // positional placeholders.
  const parameterCount = statement.source.split('?').length - 1;

  if (parameterCount === 3) {
    statement.run(message, attemptedAt, id);
    return;
  }

  if (parameterCount === 2) {
    statement.run(attemptedAt, id);
    return;
  }

  throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
}
async function fetchCompressedImage(url) {

View file

@ -1,6 +1,17 @@
// Default request headers: a browser-like profile (Chrome 135 on macOS, with
// client hints and Sec-Fetch-* navigation metadata).
// Each key is defined exactly once: a repeated key in an object literal is
// silently overridden by the later entry, which hides the value actually sent.
// NOTE(review): Accept now prefers HTML. If these defaults are also applied to
// JSON API requests, confirm those endpoints still answer with JSON and accept
// a browser User-Agent.
const DEFAULT_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
function sleep(ms) {

View file

@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
const { fetchEdgarArticles } = require('./sources/edgar');
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
const { fetchFinnhubArticles } = require('./sources/finnhub');
const { fetchCrawlerArticles } = require('./sources/newsCrawler');
const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
const { backfillMissingContent } = require('./content');
const { backfillMissingEmbeddings } = require('./embeddings');
@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
}
}
/**
 * Crawls every configured crawler site, one after another.
 *
 * Each site is passed through `runSource` under its own `site.name`, with a
 * fetcher that calls `crawlSite(site)`. Sites are awaited sequentially, so the
 * next crawl starts only after the previous `runSource` call has settled.
 *
 * @returns {Promise<Array>} The `runSource` results, in site order.
 */
async function runCrawlerSources() {
  const outcomes = [];
  const sites = getConfiguredCrawlerSites();

  for (const site of sites) {
    const crawlThisSite = () => crawlSite(site);
    const outcome = await runSource(site.name, crawlThisSite);
    outcomes.push(outcome);
  }

  return outcomes;
}
async function runAllIngestions() {
const results = [];
@ -28,7 +38,7 @@ async function runAllIngestions() {
results.push(await runSource('edgar', fetchEdgarArticles));
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
results.push(await runSource('finnhub', fetchFinnhubArticles));
results.push(await runSource('news_crawler', fetchCrawlerArticles));
results.push(...await runCrawlerSources());
try {
await backfillMissingContent();
@ -68,7 +78,7 @@ function startScheduler() {
if (config.scheduler.newsCrawler) {
cron.schedule(config.scheduler.newsCrawler, async () => {
await runSource('news_crawler', fetchCrawlerArticles);
await runCrawlerSources();
});
}

View file

@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
// Matches a dated URL path segment: /YYYY/MM/DD or /YYYY/MM, each followed by "/" or end of string.
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
// Matches paths containing /article/, /articles/, /news/, /story/ or /stories/ (case-insensitive).
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
// Matches /search, /login, /account, /video(s), /podcast(s) or /live as a whole path segment
// (followed by "/" or end of string), case-insensitive.
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
// User-Agent string identifying the crawler in outgoing requests.
const USER_AGENT = 'duriin_api crawler/1.0';
function decodeHtmlEntities(value) {
return String(value || '')
@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
return null;
}
if (!isAllowedHost(url.hostname, allowedHosts)) {
if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
return null;
}
@ -323,30 +322,144 @@ function shouldQueueLink(url) {
return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
}
/**
 * Converts a feed label into a lowercase, underscore-separated slug.
 *
 * Every run of characters outside `a-z0-9` collapses to a single underscore,
 * and leading/trailing underscores are removed. Falsy labels yield ''.
 *
 * @param {*} label - Human-readable label, e.g. "City A.M.".
 * @returns {string} Slug such as "city_a_m".
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/g, '_');
  return underscored.replace(/^_+|_+$/g, '');
}
/**
 * Returns the distinct truthy entries of `values`, keeping first-seen order.
 *
 * @param {Array} values - Candidate values; falsy entries are dropped.
 * @returns {Array} De-duplicated truthy values.
 */
function unique(values) {
  const truthy = values.filter((value) => Boolean(value));
  return Array.from(new Set(truthy));
}
/**
 * Builds the pair of hostnames a crawler may visit for a publisher: the given
 * host plus its "www." counterpart (added when absent, stripped when present).
 *
 * The hostname is lowercased before the "www." check so mixed-case input such
 * as "WWW.Example.com" is recognised as already carrying the prefix instead of
 * producing "www.www.example.com".
 *
 * @param {string} hostname - Host to derive the allow-list from.
 * @returns {string[]} Lowercased, de-duplicated hosts; [] for a falsy hostname.
 */
function buildAllowedHosts(hostname) {
  if (!hostname) {
    return [];
  }

  const host = String(hostname).toLowerCase();
  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // unique() also drops the empty counterpart produced by a bare "www." input.
  return unique([host, counterpart]);
}
/**
 * Reduces an RSS/Atom feed URL path to the site section it most likely belongs
 * to, so it can be reused as a crawl seed path.
 *
 * Steps, in order:
 *   1. drop a trailing `index.<ext>` and any trailing file extension;
 *   2. strip publisher-specific feed prefixes (`/id/<n>/device/rss/`);
 *   3. truncate the path at the first generic feed segment
 *      (`/rss`, `/feed`, `/feeds`, `/xml`);
 *   4. remove remaining feed plumbing (`/arc/outboundfeeds/`, `/dynamo/`,
 *      `/contentexport/`, `/rssfeedstopstories`, trailing `/latest`);
 *   5. keep at most the first three non-feed segments.
 *
 * Step 2 must run before step 3: the generic `/rss` truncation would otherwise
 * consume the `rss/` part of `/id/<n>/device/rss/...` first, leaving a
 * meaningless `/id/<n>/device` path behind.
 *
 * @param {string} pathname - URL pathname of a feed, e.g. "/news/business/rss.xml".
 * @returns {string} Cleaned section path starting with "/", or "/" when nothing
 *   meaningful remains.
 */
function cleanFeedPath(pathname) {
  const withoutFeedParts = pathname
    .replace(/\/index\.[a-z0-9]+$/i, '/')
    .replace(/\.[a-z0-9]+$/i, '')
    .replace(/\/id\/\d+\/device\/rss\//i, '/')
    .replace(/\/rss(?:$|\/.*$)/i, '/')
    .replace(/\/feed(?:$|\/.*$)/i, '/')
    .replace(/\/feeds?(?:$|\/.*$)/i, '/')
    .replace(/\/xml(?:$|\/.*$)/i, '/')
    .replace(/\/arc\/outboundfeeds\//i, '/')
    .replace(/\/dynamo\//i, '/')
    .replace(/\/contentexport\//i, '/')
    .replace(/\/rssfeedstopstories$/i, '/')
    .replace(/\/latest$/i, '/')
    .replace(/\/+$/, '');

  if (!withoutFeedParts) {
    return '/';
  }

  const segments = withoutFeedParts
    .split('/')
    .filter(Boolean)
    .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment))
    .slice(0, 3);

  if (!segments.length) {
    return '/';
  }

  return `/${segments.join('/')}`;
}
/**
 * Derives default crawl seeds from a feed URL: the site root and, when the feed
 * path reduces to something other than "/", that section page as well.
 *
 * The site root is built from protocol and hostname only. Any failure while
 * parsing or canonicalizing (for example an unparseable `feedUrl`) yields an
 * empty list rather than an exception.
 *
 * @param {string} feedUrl - Absolute URL of an RSS/Atom feed.
 * @returns {string[]} De-duplicated seed URLs; [] on failure.
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const { protocol, hostname, pathname } = new URL(feedUrl);
    const siteRoot = `${protocol}//${hostname}`;
    const sectionPath = cleanFeedPath(pathname);

    const candidates = [canonicalizeUrl(siteRoot, siteRoot)];
    if (sectionPath !== '/') {
      candidates.push(canonicalizeUrl(`${siteRoot}${sectionPath}`, siteRoot));
    }

    return unique(candidates);
  } catch {
    return [];
  }
}
/**
 * Normalizes a raw crawler site definition into the shape the crawler consumes.
 *
 * - `allowedHosts` are lowercased, stripped of empty entries and de-duplicated.
 * - `seeds` are canonicalized against the allowed hosts; seeds for which
 *   `canonicalizeUrl` returns a falsy value are dropped, duplicates removed.
 * - `maxPages` defaults to 15 and is clamped to 1..500.
 * - `maxDepth` defaults to 1 and is clamped to 0..5. An explicit 0 is honoured
 *   rather than being replaced by the default, since 0 is inside the range.
 * - `requestTimeout` defaults to 15000 ms and is clamped to 1000..30000.
 *
 * @param {object} site - Raw site config (name, label, allowedHosts, seeds,
 *   maxPages, maxDepth, requestTimeout); all fields optional.
 * @returns {{name: string, label: string, allowedHosts: string[], seeds: string[],
 *   maxPages: number, maxDepth: number, requestTimeout: number}} Normalized site.
 */
function normalizeSite(site) {
  const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
  const seeds = unique((site.seeds || [])
    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
    .filter(Boolean));

  // `Number(x) || fallback` cannot tell an explicit 0 from "not set", so the
  // explicit zero is checked first; every other input keeps the old fallback.
  const requestedDepth = site.maxDepth === 0 || site.maxDepth === '0'
    ? 0
    : Number(site.maxDepth) || 1;

  return {
    name: String(site.name || '').trim(),
    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
    maxDepth: Math.max(0, Math.min(requestedDepth, 5)),
    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
  };
}
/**
 * Looks up the per-label crawler override block in `config.newsCrawler.overrides`.
 *
 * @param {string} label - Feed label used as the override key.
 * @returns {object|null} The override entry, or null when the crawler config,
 *   the overrides map, or the entry itself is missing or falsy.
 */
function getCrawlerSiteOverrides(label) {
  const overridesByLabel = config.newsCrawler?.overrides;
  const entry = overridesByLabel?.[label];
  return entry || null;
}
/**
 * Builds the list of crawler sites to run.
 *
 * Sources, in output order:
 *   1. explicit entries from `config.newsCrawler.sites`, normalized;
 *   2. one derived site per RSS feed label in `config.rssFeeds`, unless the
 *      label is blank, listed in `newsCrawler.disabledLabels`, or already
 *      claimed by an explicit site or an earlier feed with the same label.
 *
 * Derived sites take `name`, `allowedHosts`, `seeds`, `maxPages`, `maxDepth`
 * and `requestTimeout` from the per-label override when present, otherwise from
 * the feed URL (hosts/seeds) or the crawler-wide defaults (limits).
 *
 * Only sites with a name, at least one allowed host and at least one seed are
 * returned. Feeds whose URL cannot be parsed are skipped.
 *
 * @returns {Array<object>} Normalized, usable crawler site definitions.
 */
function getConfiguredCrawlerSites() {
  const defaults = config.newsCrawler || {};
  const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim()));
  const isUsable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);

  const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site));
  // Labels that already have a crawler source. Extended as sites are derived so
  // several feeds sharing one label still produce a single source.
  const claimedLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
  const derivedSites = [];

  for (const feed of config.rssFeeds || []) {
    const label = String(feed.label || '').trim();
    if (!label || disabledLabels.has(label) || claimedLabels.has(label)) {
      continue;
    }

    let hostname = '';
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // An unparseable feed URL cannot supply hosts or seeds; skip it without
      // claiming the label so a later valid feed with the same label still counts.
      continue;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const site = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || defaults.maxPages,
      maxDepth: override.maxDepth || defaults.maxDepth,
      requestTimeout: override.requestTimeout || defaults.requestTimeout,
    });

    if (isUsable(site)) {
      derivedSites.push(site);
      claimedLabels.add(label);
    }
  }

  return [...explicitSites.filter(isUsable), ...derivedSites];
}
async function fetchHtml(url, timeout) {
const response = await fetchWithPolicy(url, {
timeout,
retries: 1,
headers: {
Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
'User-Agent': USER_AGENT,
},
});
if (!response.ok) {
@ -438,7 +551,7 @@ async function crawlSite(site) {
async function fetchCrawlerArticles() {
const articles = [];
for (const site of config.newsCrawler?.sites || []) {
for (const site of getConfiguredCrawlerSites()) {
try {
articles.push(...await crawlSite(site));
} catch (error) {
@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {
// Public API of this module: the crawl entry points, the URL canonicalizer and
// the resolver for the configured crawler site list.
module.exports = {
fetchCrawlerArticles,
crawlSite,
canonicalizeUrl,
getConfiguredCrawlerSites,
};