enhance news crawler configuration with new sources and improved request headers

This commit is contained in:
ImBenji
2026-04-16 23:32:56 +01:00
parent c91e4ddb60
commit 11647e6a35
6 changed files with 449 additions and 33 deletions
+26 -2
View File
@@ -44,8 +44,19 @@ const blockedContentDomains = [
];
const loggedBlockedDomains = new Set();
// Header set whose values imitate a desktop Chrome 135 browser on macOS doing
// a top-level page navigation (User-Agent, client hints, Sec-Fetch-* fields).
// Presumably sent with article page requests, going by the name — verify
// against the call sites.
// NOTE(review): `Accept` is listed twice in this literal. In an object literal
// the later entry wins, so the short value on the first line has no effect.
// The pair looks like the before/after lines of the diff view — confirm and
// keep only one.
const articleFetchHeaders = {
Accept: 'text/html,application/xhtml+xml',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
// Both cache headers ask intermediaries for a fresh copy.
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
'Upgrade-Insecure-Requests': '1',
// Client hints; the version numbers match the Chrome/135 User-Agent above.
'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
// Fetch metadata describing a user-initiated, top-level document navigation.
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
};
let contentBackfillRunning = false;
@@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
}
/**
 * Runs a prepared status statement for one article, stamping it with the
 * current time as an ISO-8601 string.
 *
 * The argument list is chosen from the number of `?` characters found in
 * `statement.source`:
 *   - 3 placeholders: run(message, attemptedAt, id)
 *   - 2 placeholders: run(attemptedAt, id)
 *
 * @param statement Prepared statement exposing `.source` and `.run(...)`
 *   (presumably a better-sqlite3 Statement — TODO confirm).
 * @param id Article identifier, always bound last.
 * @param message Status text; only bound for 3-placeholder statements.
 * @throws {Error} When the placeholder count is neither 2 nor 3.
 */
function markArticleStatus(statement, id, message) {
// NOTE(review): this unconditional call executes before the placeholder-count
// dispatch below, so as written the statement is run here and then again in
// one of the branches. It looks like the pre-change line retained by the diff
// view — confirm and remove it if so.
statement.run(message, new Date().toISOString(), id);
const attemptedAt = new Date().toISOString();
// Counts every `?` character in the SQL text, including any that might sit
// inside a string literal or comment in the statement.
const parameterCount = statement.source.split('?').length - 1;
if (parameterCount === 3) {
statement.run(message, attemptedAt, id);
return;
}
if (parameterCount === 2) {
statement.run(attemptedAt, id);
return;
}
// Fail loudly rather than guessing a binding order for an unknown statement.
throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
}
async function fetchCompressedImage(url) {
+13 -2
View File
@@ -1,6 +1,17 @@
// Default request headers. The effective values imitate a desktop Chrome 135
// browser on macOS doing a top-level page navigation.
// NOTE(review): `User-Agent` and `Accept` each appear twice in this literal.
// In an object literal the later entry wins, so the `duriin_api/1.0` agent and
// the JSON-oriented `Accept` on the first two lines have no effect. They look
// like the pre-change lines retained by the diff view — confirm and keep only
// one pair.
const DEFAULT_HEADERS = {
'User-Agent': 'duriin_api/1.0',
Accept: 'application/json, text/plain, */*',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
// Both cache headers ask intermediaries for a fresh copy.
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
'Upgrade-Insecure-Requests': '1',
// Client hints; the version numbers match the Chrome/135 User-Agent above.
'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"macOS"',
// Fetch metadata describing a user-initiated, top-level document navigation.
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
};
function sleep(ms) {
+13 -3
View File
@@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
const { fetchEdgarArticles } = require('./sources/edgar');
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
const { fetchFinnhubArticles } = require('./sources/finnhub');
const { fetchCrawlerArticles } = require('./sources/newsCrawler');
const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
const { backfillMissingContent } = require('./content');
const { backfillMissingEmbeddings } = require('./embeddings');
@@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
}
}
/**
 * Crawls every configured crawler site through `runSource`, one at a time.
 *
 * Each site is reported under its own `site.name`. Sites are awaited
 * sequentially, in the order `getConfiguredCrawlerSites()` returns them.
 *
 * @returns {Promise<Array>} The values resolved by `runSource`, in site order.
 */
async function runCrawlerSources() {
  const outcomes = [];
  const sites = getConfiguredCrawlerSites();

  for (const crawlerSite of sites) {
    const crawl = () => crawlSite(crawlerSite);
    const outcome = await runSource(crawlerSite.name, crawl);
    outcomes.push(outcome);
  }

  return outcomes;
}
async function runAllIngestions() {
const results = [];
@@ -28,7 +38,7 @@ async function runAllIngestions() {
results.push(await runSource('edgar', fetchEdgarArticles));
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
results.push(await runSource('finnhub', fetchFinnhubArticles));
results.push(await runSource('news_crawler', fetchCrawlerArticles));
results.push(...await runCrawlerSources());
try {
await backfillMissingContent();
@@ -68,7 +78,7 @@ function startScheduler() {
if (config.scheduler.newsCrawler) {
cron.schedule(config.scheduler.newsCrawler, async () => {
await runSource('news_crawler', fetchCrawlerArticles);
await runCrawlerSources();
});
}
+127 -12
View File
@@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
const USER_AGENT = 'duriin_api crawler/1.0';
function decodeHtmlEntities(value) {
return String(value || '')
@@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
return null;
}
if (!isAllowedHost(url.hostname, allowedHosts)) {
if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
return null;
}
@@ -323,30 +322,144 @@ function shouldQueueLink(url) {
return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
}
/**
 * Turns a human-readable label into a lowercase, underscore-separated slug.
 *
 * Every run of characters outside `a-z0-9` collapses to a single `_`, and
 * leading/trailing underscores are removed. Falsy input yields `''`.
 *
 * @param {*} label Label to slugify.
 * @returns {string} The slug, possibly empty.
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/g, '_');
  return underscored.replace(/^_+|_+$/g, '');
}
/**
 * Drops falsy entries and duplicates from an array, keeping first-seen order.
 *
 * @param {Array} values Values to filter.
 * @returns {Array} A new array of distinct truthy values.
 */
function unique(values) {
  const truthyValues = values.filter(Boolean);
  return Array.from(new Set(truthyValues));
}
/**
 * Builds the host allow-list for a site: the hostname itself plus its
 * `www.` / non-`www.` counterpart, both lowercased.
 *
 * The hostname is lowercased BEFORE the `www.` check so that mixed-case input
 * such as `WWW.Example.com` is recognised as already carrying the prefix
 * (instead of producing `www.www.example.com`).
 *
 * @param {string} hostname Hostname to expand; falsy input yields `[]`.
 * @returns {string[]} Distinct, non-empty lowercase hostnames.
 */
function buildAllowedHosts(hostname) {
  if (!hostname) {
    return [];
  }

  const host = String(hostname).toLowerCase();
  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // `unique` also drops the empty counterpart produced by a bare `www.` host.
  return unique([host, counterpart]);
}
/**
 * Reduces an RSS/Atom feed path to the site section it most likely belongs to.
 *
 * Feed-specific parts (index files, file extensions, `/rss`, `/feed(s)`,
 * `/xml`, and a few publisher-specific prefixes) are stripped, and the result
 * is capped at the first three path segments.
 *
 * @param {string} pathname URL pathname of the feed (e.g. `/news/world/rss.xml`).
 * @returns {string} A path starting with `/`; `/` when nothing useful remains.
 */
function cleanFeedPath(pathname) {
  const stripped = String(pathname || '')
    .replace(/\/index\.[a-z0-9]+$/i, '/')
    .replace(/\.[a-z0-9]+$/i, '')
    // Must run before the generic `/rss` rule: that rule truncates everything
    // from the first `/rss` segment onwards, which would remove the
    // `/device/rss/` marker this pattern needs and leave `/id/<n>/device`.
    .replace(/\/id\/\d+\/device\/rss\//i, '/')
    // Generic feed segments: cut the path at the first one found.
    .replace(/\/rss(?:$|\/.*$)/i, '/')
    .replace(/\/feeds?(?:$|\/.*$)/i, '/')
    .replace(/\/xml(?:$|\/.*$)/i, '/')
    // Publisher-specific feed prefixes and suffixes.
    .replace(/\/arc\/outboundfeeds\//i, '/')
    .replace(/\/dynamo\//i, '/')
    .replace(/\/contentexport\//i, '/')
    .replace(/\/rssfeedstopstories$/i, '/')
    .replace(/\/latest$/i, '/')
    .replace(/\/+$/, '');

  if (!stripped) {
    return '/';
  }

  const segments = stripped
    .split('/')
    .filter(Boolean)
    .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment))
    .slice(0, 3);

  return segments.length ? `/${segments.join('/')}` : '/';
}
/**
 * Derives default crawl seeds from a feed URL: the site root, plus the
 * section path obtained by cleaning the feed's pathname (when it is not `/`).
 *
 * The origin is built from `host` rather than `hostname` so that a feed served
 * on a non-default port produces seeds on that same port.
 *
 * @param {string} feedUrl Absolute URL of the feed.
 * @returns {string[]} Distinct canonical seed URLs; `[]` when `feedUrl` cannot
 *   be parsed or seed construction throws.
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const parsed = new URL(feedUrl);
    const origin = `${parsed.protocol}//${parsed.host}`;
    const sectionPath = cleanFeedPath(parsed.pathname);

    return unique([
      canonicalizeUrl(origin, origin),
      sectionPath === '/' ? null : canonicalizeUrl(`${origin}${sectionPath}`, origin),
    ]);
  } catch {
    // Best effort: an unusable feed URL simply contributes no seeds.
    return [];
  }
}
/**
 * Normalizes one crawler site configuration into the shape the crawler uses.
 *
 * - `allowedHosts`: stringified, lowercased, empties removed, de-duplicated.
 * - `seeds`: each seed passed through `canonicalizeUrl(seed, seed, allowedHosts)`,
 *   falsy results removed, de-duplicated.
 * - `name` / `label`: stringified and trimmed (`''` when missing).
 * - `maxPages`, `maxDepth`, `requestTimeout`: coerced with `Number(...)`,
 *   replaced by a fallback when the result is falsy (so `0` also falls back),
 *   then clamped to a fixed range.
 *
 * NOTE(review): `allowedHosts` and `seeds` are each declared twice below, the
 * `.filter(Boolean)` terminator of the seeds chain appears twice, and
 * `maxPages` / `maxDepth` are listed twice in the returned literal. These look
 * like before/after line pairs retained by the diff view; as written the
 * duplicate `const` declarations are not valid JavaScript. Confirm which line
 * of each pair should remain (the `unique(...)` forms and the 15 / 1 fallbacks
 * appear to be the newer ones).
 */
function normalizeSite(site) {
const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))];
const seeds = [...new Set((site.seeds || [])
const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
const seeds = unique((site.seeds || [])
.map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
.filter(Boolean))];
.filter(Boolean));
return {
name: String(site.name || '').trim(),
label: String(site.label || '').trim(),
allowedHosts,
seeds,
// Fallback 100, clamped to [1, 500]; overridden by the later `maxPages` key.
maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)),
// Fallback 2, clamped to [0, 5]; overridden by the later `maxDepth` key.
maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 2, 5)),
// Fallback 15, clamped to [1, 500].
maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
// Fallback 1, clamped to [0, 5]; a configured 0 is falsy and also becomes 1.
maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)),
// Fallback 15000, clamped to [1000, 30000].
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
};
}
/**
 * Looks up the per-label crawler override from `config.newsCrawler.overrides`.
 *
 * @param {string} label Feed/site label used as the override key.
 * @returns {*} The configured override, or `null` when the crawler config,
 *   the overrides map, or the entry is missing or falsy.
 */
function getCrawlerSiteOverrides(label) {
  const override = config.newsCrawler?.overrides?.[label];
  if (override) {
    return override;
  }
  return null;
}
/**
 * Builds the list of sites to crawl.
 *
 * The list is made of:
 *   1. Explicit sites from `config.newsCrawler.sites`, normalized.
 *   2. One derived site per entry in `config.rssFeeds` whose label is
 *      non-empty, not listed in `disabledLabels`, and not already used by an
 *      explicit site. Derived sites take per-label overrides first, then
 *      crawler-wide defaults, then values derived from the feed URL.
 *
 * Only sites with a name, at least one allowed host, and at least one seed are
 * returned. Explicit sites come first, then derived sites in feed order.
 *
 * @returns {Array<object>} Normalized crawler site definitions.
 */
function getConfiguredCrawlerSites() {
  const crawlerConfig = config.newsCrawler || {};
  const hasEssentials = (site) => site.name && site.allowedHosts.length && site.seeds.length;

  const skippedLabels = new Set(
    (crawlerConfig.disabledLabels || []).map((label) => String(label || '').trim()),
  );
  const configuredSites = (crawlerConfig.sites || []).map((site) => normalizeSite(site));
  // Labels of ALL explicit sites (even ones dropped later) suppress derivation.
  const configuredLabels = new Set(configuredSites.map((site) => site.label).filter(Boolean));

  // Returns the derived site for one feed, or null when the feed is skipped.
  const deriveSiteFromFeed = (feed) => {
    const label = String(feed.label || '').trim();
    const isSkipped = !label || skippedLabels.has(label) || configuredLabels.has(label);
    if (isSkipped) {
      return null;
    }

    let hostname;
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // Feeds with an unparseable URL contribute no site.
      return null;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const candidate = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || crawlerConfig.maxPages,
      maxDepth: override.maxDepth || crawlerConfig.maxDepth,
      requestTimeout: override.requestTimeout || crawlerConfig.requestTimeout,
    });

    return hasEssentials(candidate) ? candidate : null;
  };

  const feedSites = [];
  for (const feed of config.rssFeeds || []) {
    const candidate = deriveSiteFromFeed(feed);
    if (candidate) {
      feedSites.push(candidate);
    }
  }

  return [...configuredSites.filter(hasEssentials), ...feedSites];
}
async function fetchHtml(url, timeout) {
const response = await fetchWithPolicy(url, {
timeout,
retries: 1,
headers: {
Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
'User-Agent': USER_AGENT,
},
});
if (!response.ok) {
@@ -438,7 +551,7 @@ async function crawlSite(site) {
async function fetchCrawlerArticles() {
const articles = [];
for (const site of config.newsCrawler?.sites || []) {
for (const site of getConfiguredCrawlerSites()) {
try {
articles.push(...await crawlSite(site));
} catch (error) {
@@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {
module.exports = {
fetchCrawlerArticles,
crawlSite,
canonicalizeUrl,
getConfiguredCrawlerSites,
};