enhance news crawler configuration with new sources and improved request headers

This commit is contained in:
ImBenji 2026-04-16 23:32:56 +01:00
parent c91e4ddb60
commit 11647e6a35
6 changed files with 449 additions and 33 deletions

View file

@ -26,7 +26,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
- SQLite archive file defaults to `./archive.sqlite`. - SQLite archive file defaults to `./archive.sqlite`.
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique. - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery. - `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content. - Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
- Main article images are stored as ultra-compressed base64 WebP. - Main article images are stored as ultra-compressed base64 WebP.
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search. - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.

View file

@ -365,6 +365,22 @@
{ {
"url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", "url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
"label": "El Comercio Peru" "label": "El Comercio Peru"
},
{
"url": "https://jamaica-gleaner.com/feed/business.xml",
"label": "Jamaica Gleaner"
},
{
"url": "https://www.jamaicaobserver.com/app/business/",
"label": "Jamaica Observer"
},
{
"url": "https://www.stabroeknews.com/feed/",
"label": "Stabroek News"
},
{
"url": "https://nationnews.com/rss-feed/",
"label": "Nation News Barbados"
} }
], ],
"gdelt": { "gdelt": {
@ -376,24 +392,264 @@
"format": "json" "format": "json"
}, },
"newsCrawler": { "newsCrawler": {
"sites": [ "maxPages": 15,
{ "maxDepth": 1,
"name": "crawler_reuters", "requestTimeout": 15000,
"disabledLabels": [
"Arab News",
"Arabian Business",
"Australian Fin Review",
"BFM Business",
"Business Daily Africa",
"Business Standard IN",
"BusinessLive SA",
"Caixin Global",
"Cinco Dias",
"City A.M.",
"El Comercio Peru",
"El Economista ES",
"El Economista MX",
"FD.nl",
"Gulf News Business",
"Il Sole 24 Ore",
"Infobae Economia AR",
"Japan Times Business",
"Korea JoongAng Daily",
"Les Echos",
"Live Mint",
"Moneycontrol",
"NZ Herald Business",
"Portafolio Colombia",
"Reuters",
"The Star Malaysia",
"This Is Money",
"Xinhua Business"
],
"overrides": {
"Al Jazeera": {
"allowedHosts": [ "allowedHosts": [
"www.reuters.com", "www.aljazeera.com",
"reuters.com" "aljazeera.com"
], ],
"seeds": [ "seeds": [
"https://www.reuters.com/world/", "https://www.aljazeera.com/",
"https://www.reuters.com/business/", "https://www.aljazeera.com/economy/",
"https://www.reuters.com/markets/", "https://www.aljazeera.com/tag/technology/"
"https://www.reuters.com/technology/" ]
},
"Ars Technica": {
"allowedHosts": [
"arstechnica.com",
"www.arstechnica.com"
], ],
"maxPages": 100, "seeds": [
"maxDepth": 2, "https://arstechnica.com/",
"requestTimeout": 15000 "https://arstechnica.com/tech-policy/",
"https://arstechnica.com/information-technology/"
]
},
"BBC Business": {
"allowedHosts": [
"www.bbc.com",
"bbc.com"
],
"seeds": [
"https://www.bbc.com/news/business",
"https://www.bbc.com/news/technology"
]
},
"CNBC": {
"allowedHosts": [
"www.cnbc.com",
"cnbc.com"
],
"seeds": [
"https://www.cnbc.com/world/",
"https://www.cnbc.com/business/",
"https://www.cnbc.com/technology/"
]
},
"Guardian Business": {
"allowedHosts": [
"www.theguardian.com",
"theguardian.com"
],
"seeds": [
"https://www.theguardian.com/",
"https://www.theguardian.com/business",
"https://www.theguardian.com/technology"
]
},
"Jamaica Gleaner": {
"allowedHosts": [
"jamaica-gleaner.com",
"www.jamaica-gleaner.com"
],
"seeds": [
"https://jamaica-gleaner.com/",
"https://jamaica-gleaner.com/news",
"https://jamaica-gleaner.com/business"
]
},
"Jamaica Observer": {
"allowedHosts": [
"www.jamaicaobserver.com",
"jamaicaobserver.com"
],
"seeds": [
"https://www.jamaicaobserver.com/",
"https://www.jamaicaobserver.com/news/",
"https://www.jamaicaobserver.com/business/"
]
},
"Nation News Barbados": {
"allowedHosts": [
"nationnews.com",
"www.nationnews.com"
],
"seeds": [
"https://nationnews.com/",
"https://nationnews.com/category/business/",
"https://nationnews.com/category/news/"
]
},
"NPR Business": {
"allowedHosts": [
"www.npr.org",
"npr.org"
],
"seeds": [
"https://www.npr.org/sections/business/",
"https://www.npr.org/sections/technology/"
]
},
"The Verge": {
"allowedHosts": [
"www.theverge.com",
"theverge.com"
],
"seeds": [
"https://www.theverge.com/tech",
"https://www.theverge.com/business",
"https://www.theverge.com/archives"
]
},
"TechCrunch": {
"allowedHosts": [
"techcrunch.com",
"www.techcrunch.com"
],
"seeds": [
"https://techcrunch.com/",
"https://techcrunch.com/category/startups/",
"https://techcrunch.com/category/venture/"
]
},
"The Economist": {
"allowedHosts": [
"www.economist.com",
"economist.com"
],
"seeds": [
"https://www.economist.com/finance-and-economics",
"https://www.economist.com/business",
"https://www.economist.com/science-and-technology"
]
},
"Federal Reserve": {
"allowedHosts": [
"www.federalreserve.gov",
"federalreserve.gov"
],
"seeds": [
"https://www.federalreserve.gov/newsevents.htm",
"https://www.federalreserve.gov/monetarypolicy.htm"
]
},
"Fortune": {
"allowedHosts": [
"fortune.com",
"www.fortune.com"
],
"seeds": [
"https://fortune.com/",
"https://fortune.com/section/tech/",
"https://fortune.com/section/finance/"
]
},
"Forbes Business": {
"allowedHosts": [
"www.forbes.com",
"forbes.com"
],
"seeds": [
"https://www.forbes.com/business/",
"https://www.forbes.com/innovation/"
]
},
"Nikkei Asia": {
"allowedHosts": [
"asia.nikkei.com"
],
"seeds": [
"https://asia.nikkei.com/",
"https://asia.nikkei.com/Business",
"https://asia.nikkei.com/Technology"
]
},
"South China Morning Post": {
"allowedHosts": [
"www.scmp.com",
"scmp.com"
],
"seeds": [
"https://www.scmp.com/",
"https://www.scmp.com/business",
"https://www.scmp.com/tech"
]
},
"Stabroek News": {
"allowedHosts": [
"www.stabroeknews.com",
"stabroeknews.com"
],
"seeds": [
"https://www.stabroeknews.com/",
"https://www.stabroeknews.com/category/business/",
"https://www.stabroeknews.com/category/news/"
]
},
"Wall Street Journal": {
"allowedHosts": [
"www.wsj.com",
"wsj.com"
],
"seeds": [
"https://www.wsj.com/news/business",
"https://www.wsj.com/tech"
]
},
"Wired Business": {
"allowedHosts": [
"www.wired.com",
"wired.com"
],
"seeds": [
"https://www.wired.com/category/business/",
"https://www.wired.com/category/security/"
]
},
"Yahoo Finance": {
"allowedHosts": [
"finance.yahoo.com"
],
"seeds": [
"https://finance.yahoo.com/",
"https://finance.yahoo.com/news/",
"https://finance.yahoo.com/topic/tech/"
]
} }
] }
}, },
"scheduler": { "scheduler": {
"rss": "0 */6 * * *", "rss": "0 */6 * * *",

View file

@ -44,8 +44,19 @@ const blockedContentDomains = [
]; ];
const loggedBlockedDomains = new Set(); const loggedBlockedDomains = new Set();
const articleFetchHeaders = { const articleFetchHeaders = {
Accept: 'text/html,application/xhtml+xml',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
'Upgrade-Insecure-Requests': '1',
'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
}; };
let contentBackfillRunning = false; let contentBackfillRunning = false;
@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
} }
function markArticleStatus(statement, id, message) { function markArticleStatus(statement, id, message) {
statement.run(message, new Date().toISOString(), id); const attemptedAt = new Date().toISOString();
const parameterCount = statement.source.split('?').length - 1;
if (parameterCount === 3) {
statement.run(message, attemptedAt, id);
return;
}
if (parameterCount === 2) {
statement.run(attemptedAt, id);
return;
}
throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
} }
async function fetchCompressedImage(url) { async function fetchCompressedImage(url) {

View file

@ -1,6 +1,17 @@
const DEFAULT_HEADERS = { const DEFAULT_HEADERS = {
'User-Agent': 'duriin_api/1.0', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
Accept: 'application/json, text/plain, */*', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
'Upgrade-Insecure-Requests': '1',
'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
}; };
function sleep(ms) { function sleep(ms) {

View file

@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
const { fetchEdgarArticles } = require('./sources/edgar'); const { fetchEdgarArticles } = require('./sources/edgar');
const { fetchAlphaVantageArticles } = require('./sources/alphavantage'); const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
const { fetchFinnhubArticles } = require('./sources/finnhub'); const { fetchFinnhubArticles } = require('./sources/finnhub');
const { fetchCrawlerArticles } = require('./sources/newsCrawler'); const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
const { backfillMissingContent } = require('./content'); const { backfillMissingContent } = require('./content');
const { backfillMissingEmbeddings } = require('./embeddings'); const { backfillMissingEmbeddings } = require('./embeddings');
@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
} }
} }
/**
 * Runs every configured crawler site through `runSource`, one at a time.
 *
 * Each site is reported under its own `site.name`, and the next site is not
 * started until the previous one has settled.
 *
 * @returns {Promise<Array>} The `runSource` results, in configuration order.
 */
async function runCrawlerSources() {
  const sites = getConfiguredCrawlerSites();
  const outcomes = [];

  for (let index = 0; index < sites.length; index += 1) {
    const site = sites[index];
    const outcome = await runSource(site.name, () => crawlSite(site));
    outcomes.push(outcome);
  }

  return outcomes;
}
async function runAllIngestions() { async function runAllIngestions() {
const results = []; const results = [];
@ -28,7 +38,7 @@ async function runAllIngestions() {
results.push(await runSource('edgar', fetchEdgarArticles)); results.push(await runSource('edgar', fetchEdgarArticles));
results.push(await runSource('alphavantage', fetchAlphaVantageArticles)); results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
results.push(await runSource('finnhub', fetchFinnhubArticles)); results.push(await runSource('finnhub', fetchFinnhubArticles));
results.push(await runSource('news_crawler', fetchCrawlerArticles)); results.push(...await runCrawlerSources());
try { try {
await backfillMissingContent(); await backfillMissingContent();
@ -68,7 +78,7 @@ function startScheduler() {
if (config.scheduler.newsCrawler) { if (config.scheduler.newsCrawler) {
cron.schedule(config.scheduler.newsCrawler, async () => { cron.schedule(config.scheduler.newsCrawler, async () => {
await runSource('news_crawler', fetchCrawlerArticles); await runCrawlerSources();
}); });
} }

View file

@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/; const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i; const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
const USER_AGENT = 'duriin_api crawler/1.0';
function decodeHtmlEntities(value) { function decodeHtmlEntities(value) {
return String(value || '') return String(value || '')
@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
return null; return null;
} }
if (!isAllowedHost(url.hostname, allowedHosts)) { if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
return null; return null;
} }
@ -323,30 +322,144 @@ function shouldQueueLink(url) {
return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname); return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
} }
/**
 * Converts a feed label into a lower-case, underscore-separated identifier.
 *
 * Every run of characters outside `a-z0-9` collapses to a single `_`, and
 * underscores at either end are removed. Falsy input yields an empty string.
 *
 * @param {*} label - Human-readable label, e.g. "City A.M.".
 * @returns {string} Slug such as "city_a_m".
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/g, '_');
  return underscored.replace(/^_+|_+$/g, '');
}
/**
 * Returns the truthy entries of `values` with duplicates removed.
 *
 * The first occurrence of each value keeps its position.
 *
 * @param {Array} values - Candidate values; falsy entries are dropped.
 * @returns {Array} De-duplicated truthy values in first-seen order.
 */
function unique(values) {
  const seen = new Set();
  for (const value of values) {
    if (value) {
      seen.add(value);
    }
  }
  return Array.from(seen);
}
/**
 * Builds the pair of host names a crawler may follow for a given hostname:
 * the host itself plus its `www.` / non-`www.` counterpart.
 *
 * The hostname is lower-cased before the `www.` prefix is inspected, so
 * mixed-case input such as `WWW.Example.com` is treated the same as
 * `www.example.com` and never produces a `www.www.` host.
 *
 * @param {string} hostname - Host name without scheme or port.
 * @returns {string[]} Lower-cased host first, then its counterpart; empty
 *   array when `hostname` is falsy.
 */
function buildAllowedHosts(hostname) {
  const host = String(hostname || '').toLowerCase();
  if (!host) {
    return [];
  }

  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // The counterpart is empty when the input is exactly "www."; drop it.
  return [...new Set([host, counterpart])].filter(Boolean);
}
/**
 * Reduces the path of a feed URL to a short section path for use as a seed.
 *
 * An ordered list of rewrite rules removes feed-specific pieces (file
 * extensions, `/rss`, `/feed(s)`, `/xml` and everything after them, plus a
 * few publisher-specific prefixes). Whatever remains is cut to at most three
 * path segments.
 *
 * @param {string} pathname - Path component of the feed URL.
 * @returns {string} A path starting with `/`; exactly `/` when nothing
 *   usable is left.
 */
function cleanFeedPath(pathname) {
  // Each rule sees the output of the previous one, so the order is significant.
  const rewriteRules = [
    [/\/index\.[a-z0-9]+$/i, '/'],
    [/\.[a-z0-9]+$/i, ''],
    [/\/rss(?:$|\/.*$)/i, '/'],
    [/\/feed(?:$|\/.*$)/i, '/'],
    [/\/feeds?(?:$|\/.*$)/i, '/'],
    [/\/xml(?:$|\/.*$)/i, '/'],
    [/\/arc\/outboundfeeds\//i, '/'],
    [/\/dynamo\//i, '/'],
    [/\/id\/\d+\/device\/rss\//i, '/'],
    [/\/contentexport\//i, '/'],
    [/\/rssfeedstopstories$/i, '/'],
    [/\/latest$/i, '/'],
    [/\/+$|^$/g, ''],
  ];

  let trimmed = pathname;
  for (const [pattern, replacement] of rewriteRules) {
    trimmed = trimmed.replace(pattern, replacement);
  }

  if (!trimmed) {
    return '/';
  }

  // Keep the first three segments that are not themselves feed keywords.
  const feedWord = /^(rss|feed|feeds|xml)$/i;
  const kept = [];
  for (const part of trimmed.split('/')) {
    if (!part || feedWord.test(part)) {
      continue;
    }
    kept.push(part);
    if (kept.length === 3) {
      break;
    }
  }

  return kept.length ? `/${kept.join('/')}` : '/';
}
/**
 * Derives default crawl seeds from an RSS feed URL: the site root and, when
 * the cleaned feed path is not just `/`, that section path as well.
 *
 * Both candidates go through `canonicalizeUrl` (called here without a host
 * allow-list) and are then de-duplicated, with falsy results dropped.
 *
 * @param {string} feedUrl - Absolute feed URL.
 * @returns {string[]} Zero to two seed URLs; empty when anything throws,
 *   including URL parsing of `feedUrl`.
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const { protocol, hostname, pathname } = new URL(feedUrl);
    // Built from protocol + hostname only, so any port on the feed URL is not carried over.
    const siteRoot = `${protocol}//${hostname}`;
    const sectionPath = cleanFeedPath(pathname);

    const candidates = [canonicalizeUrl(siteRoot, siteRoot)];
    if (sectionPath !== '/') {
      candidates.push(canonicalizeUrl(`${siteRoot}${sectionPath}`, siteRoot));
    }

    return unique(candidates);
  } catch {
    return [];
  }
}
function normalizeSite(site) { function normalizeSite(site) {
const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))]; const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
const seeds = [...new Set((site.seeds || []) const seeds = unique((site.seeds || [])
.map((seed) => canonicalizeUrl(seed, seed, allowedHosts)) .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
.filter(Boolean))]; .filter(Boolean));
return { return {
name: String(site.name || '').trim(), name: String(site.name || '').trim(),
label: String(site.label || '').trim(),
allowedHosts, allowedHosts,
seeds, seeds,
maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)), maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 2, 5)), maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)),
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)), requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
}; };
} }
/**
 * Looks up the per-label crawler override from `config.newsCrawler.overrides`.
 *
 * @param {string} label - Feed label used as the key in the overrides map.
 * @returns {object|null} The override entry, or `null` when the crawler
 *   section, the overrides map, or the entry is missing or falsy.
 */
function getCrawlerSiteOverrides(label) {
  const overridesByLabel = config.newsCrawler?.overrides;
  const entry = overridesByLabel?.[label];
  return entry ? entry : null;
}
/**
 * Builds the list of crawler sites from configuration.
 *
 * Sites come from two places:
 *  1. `config.newsCrawler.sites` — explicit entries, passed through
 *     `normalizeSite` and listed first.
 *  2. `config.rssFeeds` — one derived site per feed label, skipping labels
 *     listed in `disabledLabels` and labels already used by an explicit site.
 *     Per-label overrides may replace the name, allowed hosts, seeds and
 *     limits; otherwise they are derived from the feed URL and the
 *     `newsCrawler` defaults.
 *
 * Feeds that share a label are merged into a single site (union of allowed
 * hosts and seeds). Without the merge, every such feed would yield another
 * site with the same `name`, and the same publisher would be crawled and
 * reported more than once.
 *
 * Sites lacking a name, an allowed host, or a seed are left out.
 *
 * @returns {object[]} Normalized site definitions: explicit sites first, then
 *   derived sites in the order their labels first appear in `rssFeeds`.
 */
function getConfiguredCrawlerSites() {
  const defaults = config.newsCrawler || {};
  const isUsable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);

  const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim()));
  const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site));
  const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));

  // Map keeps insertion order, so derived sites stay in feed order.
  const derivedByLabel = new Map();

  for (const feed of config.rssFeeds || []) {
    const label = String(feed.label || '').trim();
    if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
      continue;
    }

    let hostname = '';
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // A feed with an unparsable URL cannot provide a host or seeds.
      continue;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const site = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || defaults.maxPages,
      maxDepth: override.maxDepth || defaults.maxDepth,
      requestTimeout: override.requestTimeout || defaults.requestTimeout,
    });

    if (!isUsable(site)) {
      continue;
    }

    const existing = derivedByLabel.get(label);
    if (!existing) {
      derivedByLabel.set(label, site);
      continue;
    }

    // Same label seen again: fold this feed's hosts and seeds into the first site.
    existing.allowedHosts = unique([...existing.allowedHosts, ...site.allowedHosts]);
    existing.seeds = unique([...existing.seeds, ...site.seeds]);
  }

  return [...explicitSites.filter(isUsable), ...derivedByLabel.values()];
}
async function fetchHtml(url, timeout) { async function fetchHtml(url, timeout) {
const response = await fetchWithPolicy(url, { const response = await fetchWithPolicy(url, {
timeout, timeout,
retries: 1, retries: 1,
headers: {
Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
'User-Agent': USER_AGENT,
},
}); });
if (!response.ok) { if (!response.ok) {
@ -438,7 +551,7 @@ async function crawlSite(site) {
async function fetchCrawlerArticles() { async function fetchCrawlerArticles() {
const articles = []; const articles = [];
for (const site of config.newsCrawler?.sites || []) { for (const site of getConfiguredCrawlerSites()) {
try { try {
articles.push(...await crawlSite(site)); articles.push(...await crawlSite(site));
} catch (error) { } catch (error) {
@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {
module.exports = { module.exports = {
fetchCrawlerArticles, fetchCrawlerArticles,
crawlSite,
canonicalizeUrl, canonicalizeUrl,
getConfiguredCrawlerSites,
}; };