enhance news crawler configuration with new sources and improved request headers
This commit is contained in:
+26
-2
@@ -44,8 +44,19 @@ const blockedContentDomains = [
|
||||
];
|
||||
const loggedBlockedDomains = new Set();
|
||||
/**
 * Request headers for article fetches (per the constant's name; the call
 * sites are outside this view).
 *
 * The values imitate a top-level navigation from desktop Chrome 135 on macOS:
 * a browser User-Agent, a browser-style Accept list, client hints
 * (`sec-ch-ua*`) and fetch metadata (`Sec-Fetch-*`).
 *
 * Each header is listed exactly once. A repeated key in an object literal is
 * legal but only the last value survives, which hides stale entries.
 */
const articleFetchHeaders = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  // Both cache headers ask for an uncached response.
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  // Client hints kept consistent with the Chrome 135 / macOS User-Agent above.
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  // Fetch metadata describing a user-initiated document navigation.
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
|
||||
|
||||
let contentBackfillRunning = false;
|
||||
@@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
|
||||
}
|
||||
|
||||
/**
 * Runs a content-status statement for one article, stamping the attempt time.
 *
 * The number of `?` placeholders in the statement's SQL text decides the
 * argument list:
 *   - 3 placeholders: `run(message, attemptedAt, id)`
 *   - 2 placeholders: `run(attemptedAt, id)` (the message is not stored)
 *
 * The statement is executed exactly once per call.
 *
 * @param {{ source: string, run: Function }} statement - Prepared statement
 *   exposing its SQL text as `source` and an executor as `run`.
 * @param {*} id - Article identifier, bound as the last parameter.
 * @param {*} message - Status/error text, bound first when the statement
 *   takes three parameters.
 * @throws {Error} When the statement has neither two nor three placeholders.
 */
function markArticleStatus(statement, id, message) {
  const attemptedAt = new Date().toISOString();

  // Blank out single-quoted SQL string literals before counting, so a literal
  // question mark (e.g. 'why?') is not mistaken for a bind placeholder.
  const sqlWithoutLiterals = String(statement.source).replace(/'(?:[^']|'')*'/g, "''");
  const parameterCount = sqlWithoutLiterals.split('?').length - 1;

  if (parameterCount === 3) {
    statement.run(message, attemptedAt, id);
    return;
  }

  if (parameterCount === 2) {
    statement.run(attemptedAt, id);
    return;
  }

  throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
}
|
||||
|
||||
async function fetchCompressedImage(url) {
|
||||
|
||||
+13
-2
@@ -1,6 +1,17 @@
|
||||
/**
 * Default request headers for this module's outgoing requests (how they are
 * merged with per-call headers is outside this view).
 *
 * The values imitate a top-level navigation from desktop Chrome 135 on macOS.
 * Each header is listed exactly once; a repeated key in an object literal
 * silently keeps only the last value, which hides stale entries.
 */
const DEFAULT_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  // Both cache headers ask for an uncached response.
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  // Client hints kept consistent with the Chrome 135 / macOS User-Agent above.
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  // Fetch metadata describing a user-initiated document navigation.
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
|
||||
|
||||
function sleep(ms) {
|
||||
|
||||
+13
-3
@@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
|
||||
const { fetchEdgarArticles } = require('./sources/edgar');
|
||||
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
||||
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
||||
const { fetchCrawlerArticles } = require('./sources/newsCrawler');
|
||||
const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
|
||||
const { backfillMissingContent } = require('./content');
|
||||
const { backfillMissingEmbeddings } = require('./embeddings');
|
||||
|
||||
@@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Crawls every configured crawler site, one after another, reporting each
 * site as its own source through `runSource`.
 *
 * Sites are awaited sequentially (never in parallel), in the order returned
 * by `getConfiguredCrawlerSites()`.
 *
 * @returns {Promise<Array>} One `runSource` result per site, in crawl order.
 */
async function runCrawlerSources() {
  const configuredSites = getConfiguredCrawlerSites();
  const outcomes = [];

  for (const crawlerSite of configuredSites) {
    // The site's name doubles as the source identifier handed to runSource.
    const crawl = () => crawlSite(crawlerSite);
    const outcome = await runSource(crawlerSite.name, crawl);
    outcomes.push(outcome);
  }

  return outcomes;
}
|
||||
|
||||
async function runAllIngestions() {
|
||||
const results = [];
|
||||
|
||||
@@ -28,7 +38,7 @@ async function runAllIngestions() {
|
||||
results.push(await runSource('edgar', fetchEdgarArticles));
|
||||
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
||||
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
||||
results.push(await runSource('news_crawler', fetchCrawlerArticles));
|
||||
results.push(...await runCrawlerSources());
|
||||
|
||||
try {
|
||||
await backfillMissingContent();
|
||||
@@ -68,7 +78,7 @@ function startScheduler() {
|
||||
|
||||
if (config.scheduler.newsCrawler) {
|
||||
cron.schedule(config.scheduler.newsCrawler, async () => {
|
||||
await runSource('news_crawler', fetchCrawlerArticles);
|
||||
await runCrawlerSources();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
+127
-12
@@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
|
||||
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
|
||||
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
||||
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
||||
const USER_AGENT = 'duriin_api crawler/1.0';
|
||||
|
||||
function decodeHtmlEntities(value) {
|
||||
return String(value || '')
|
||||
@@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isAllowedHost(url.hostname, allowedHosts)) {
|
||||
if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -323,30 +322,144 @@ function shouldQueueLink(url) {
|
||||
return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
|
||||
}
|
||||
|
||||
/**
 * Converts a human-readable label into a lowercase `snake_case` slug made of
 * `a-z`, `0-9` and `_`.
 *
 * Accented Latin letters are folded to their base letter first (for example
 * "É" becomes "e") so they are not replaced by separators. Every remaining
 * run of other characters collapses into a single underscore, and leading or
 * trailing underscores are removed.
 *
 * @param {*} label - Label to slugify; falsy values are treated as ''.
 * @returns {string} The slug, or '' when nothing usable remains.
 */
function slugifyLabel(label) {
  return String(label || '')
    // Decompose accented characters, then drop the combining marks.
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '_')
    .replace(/^_+|_+$/g, '');
}
|
||||
|
||||
/**
 * Returns the distinct truthy entries of `values`, keeping first-seen order.
 *
 * @param {Iterable<*>|null|undefined} values - Any iterable; `null` and
 *   `undefined` are treated as an empty list.
 * @returns {Array} New array without falsy entries or duplicates.
 */
function unique(values) {
  if (values == null) {
    return [];
  }

  // Array.from accepts arrays, Sets and other iterables alike.
  return [...new Set(Array.from(values).filter(Boolean))];
}
|
||||
|
||||
/**
 * Builds the host allow-list for a site: the hostname itself plus its
 * `www.` / non-`www.` counterpart, all lowercase.
 *
 * The hostname is lowercased before the `www.` check, so mixed-case input
 * such as "WWW.Example.com" pairs with "example.com" rather than
 * "www.www.example.com".
 *
 * @param {string} hostname - Hostname to expand; falsy input yields [].
 * @returns {string[]} The given host first, then its counterpart.
 */
function buildAllowedHosts(hostname) {
  const host = String(hostname || '').trim().toLowerCase();
  if (!host) {
    return [];
  }

  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // Drop an empty counterpart (input was just "www.") and any duplicate.
  return [...new Set([host, counterpart].filter(Boolean))];
}
|
||||
|
||||
/**
 * Reduces an RSS/Atom feed path to the site section it most likely belongs
 * to, for use as a crawl seed path.
 *
 * Steps, in order:
 *   1. Drop a trailing `/index.<ext>` or file extension.
 *   2. Remove the `/id/<digits>/device/rss/` prefix. This must run before the
 *      generic `/rss` rule: that rule truncates everything from the first
 *      `/rss` segment onward, which would leave `/id/<digits>/device` behind.
 *   3. Truncate at the first `/rss`, `/feed(s)` or `/xml` segment.
 *   4. Remove other known feed-only segments and trailing slashes.
 *   5. Keep at most the first three remaining path segments.
 *
 * @param {string} pathname - URL pathname of the feed.
 * @returns {string} Cleaned path beginning with '/', or '/' when nothing
 *   section-like remains.
 */
function cleanFeedPath(pathname) {
  const withoutIndex = String(pathname || '')
    .replace(/\/index\.[a-z0-9]+$/i, '/')
    .replace(/\.[a-z0-9]+$/i, '')
    // Must precede the generic /rss rule (see step 2 above).
    .replace(/\/id\/\d+\/device\/rss\//i, '/')
    .replace(/\/rss(?:$|\/.*$)/i, '/')
    // `feeds?` covers both /feed and /feeds, so one rule is enough.
    .replace(/\/feeds?(?:$|\/.*$)/i, '/')
    .replace(/\/xml(?:$|\/.*$)/i, '/')
    .replace(/\/arc\/outboundfeeds\//i, '/')
    .replace(/\/dynamo\//i, '/')
    .replace(/\/contentexport\//i, '/')
    .replace(/\/rssfeedstopstories$/i, '/')
    .replace(/\/latest$/i, '/')
    .replace(/\/+$|^$/g, '');

  if (!withoutIndex) {
    return '/';
  }

  const segments = withoutIndex
    .split('/')
    .filter(Boolean)
    // Drop feed-marker segments the rules above could not match.
    .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment))
    .slice(0, 3);

  if (!segments.length) {
    return '/';
  }

  return `/${segments.join('/')}`;
}
|
||||
|
||||
/**
 * Derives default crawl seed URLs from a feed URL: the site root, plus the
 * section path recovered from the feed path by `cleanFeedPath` when that is
 * not just '/'.
 *
 * The origin is built from `host` rather than `hostname`, so a feed served on
 * a non-default port produces seeds on that same port.
 *
 * @param {string} feedUrl - Absolute URL of the feed.
 * @returns {string[]} Distinct canonical seed URLs; [] when `feedUrl` cannot
 *   be parsed (or anything else in the derivation throws).
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const parsed = new URL(feedUrl);
    // `host` includes the port when it is not the scheme's default.
    const origin = `${parsed.protocol}//${parsed.host}`;
    const cleanedPath = cleanFeedPath(parsed.pathname);

    return unique([
      canonicalizeUrl(origin, origin),
      cleanedPath === '/' ? null : canonicalizeUrl(`${origin}${cleanedPath}`, origin),
    ]);
  } catch {
    // Best effort: a feed we cannot turn into seeds simply contributes none.
    return [];
  }
}
|
||||
|
||||
/**
 * Normalizes one crawler site definition into a predictable shape with
 * clamped limits.
 *
 * - `allowedHosts`: lowercased, empty entries removed, de-duplicated.
 * - `seeds`: passed through `canonicalizeUrl` against the allowed hosts;
 *   rejected (falsy) results and duplicates are dropped.
 * - `maxPages`: default 15, clamped to 1..500.
 * - `maxDepth`: default 1, clamped to 0..5. An explicit 0 is honoured (the
 *   clamp allows it); only a missing or non-numeric value falls back to 1.
 * - `requestTimeout`: default 15000, clamped to 1000..30000.
 *
 * @param {object} site - Raw site definition.
 * @returns {{name: string, label: string, allowedHosts: string[],
 *   seeds: string[], maxPages: number, maxDepth: number,
 *   requestTimeout: number}} Normalized site.
 */
function normalizeSite(site) {
  const allowedHosts = [...new Set(
    (site.allowedHosts || [])
      .map((host) => String(host || '').toLowerCase())
      .filter(Boolean),
  )];

  const seeds = [...new Set(
    (site.seeds || [])
      .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
      .filter(Boolean),
  )];

  // `Number(x) || 1` would turn a deliberate 0 into the default, so detect an
  // explicitly supplied numeric depth instead of relying on truthiness.
  const rawDepth = site.maxDepth;
  const hasExplicitDepth = (typeof rawDepth === 'number' && !Number.isNaN(rawDepth))
    || (typeof rawDepth === 'string' && rawDepth.trim() !== '' && !Number.isNaN(Number(rawDepth)));
  const requestedDepth = hasExplicitDepth ? Number(rawDepth) : 1;

  return {
    name: String(site.name || '').trim(),
    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
    maxDepth: Math.max(0, Math.min(requestedDepth, 5)),
    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
  };
}
|
||||
|
||||
/**
 * Looks up the crawler override entry configured for a feed label under
 * `config.newsCrawler.overrides`.
 *
 * Only the overrides object's own keys are consulted, so a label that
 * happens to match an inherited property name (such as "constructor") does
 * not resolve to a built-in.
 *
 * @param {string} label - Feed label used as the overrides key.
 * @returns {object|null} The override entry, or null when none is configured.
 */
function getCrawlerSiteOverrides(label) {
  const overrides = config.newsCrawler?.overrides;

  if (!overrides || !Object.prototype.hasOwnProperty.call(overrides, label)) {
    return null;
  }

  return overrides[label] || null;
}
|
||||
|
||||
/**
 * Produces the list of sites to crawl.
 *
 * Two inputs are combined:
 *   1. Sites listed in `config.newsCrawler.sites`, normalized.
 *   2. Sites derived from `config.rssFeeds`: one per feed whose label is
 *      non-empty, is not in `config.newsCrawler.disabledLabels`, and is not
 *      already used by a listed site. Per-label overrides take precedence
 *      over values derived from the feed URL and over the crawler defaults.
 *
 * A site is kept only if it has a name, at least one allowed host and at
 * least one seed. Listed sites come first, then derived ones.
 *
 * @returns {Array<object>} Normalized, usable crawler sites.
 */
function getConfiguredCrawlerSites() {
  const crawlerConfig = config.newsCrawler || {};
  const isUsable = (site) => site.name && site.allowedHosts.length && site.seeds.length;

  const skippedLabels = new Set(
    (crawlerConfig.disabledLabels || []).map((label) => String(label || '').trim()),
  );
  const listedSites = (crawlerConfig.sites || []).map((site) => normalizeSite(site));
  const listedLabels = new Set(listedSites.map((site) => site.label).filter(Boolean));

  const feedSites = [];

  for (const feed of config.rssFeeds || []) {
    const label = String(feed.label || '').trim();
    const wanted = label && !skippedLabels.has(label) && !listedLabels.has(label);
    if (!wanted) {
      continue;
    }

    // Feeds whose URL does not parse cannot be turned into a site.
    let feedHost = '';
    try {
      feedHost = new URL(feed.url).hostname;
    } catch {
      continue;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const candidate = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(feedHost),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || crawlerConfig.maxPages,
      maxDepth: override.maxDepth || crawlerConfig.maxDepth,
      requestTimeout: override.requestTimeout || crawlerConfig.requestTimeout,
    });

    if (isUsable(candidate)) {
      feedSites.push(candidate);
    }
  }

  const usableListedSites = listedSites.filter(isUsable);
  return [...usableListedSites, ...feedSites];
}
|
||||
|
||||
async function fetchHtml(url, timeout) {
|
||||
const response = await fetchWithPolicy(url, {
|
||||
timeout,
|
||||
retries: 1,
|
||||
headers: {
|
||||
Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
|
||||
'User-Agent': USER_AGENT,
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
@@ -438,7 +551,7 @@ async function crawlSite(site) {
|
||||
async function fetchCrawlerArticles() {
|
||||
const articles = [];
|
||||
|
||||
for (const site of config.newsCrawler?.sites || []) {
|
||||
for (const site of getConfiguredCrawlerSites()) {
|
||||
try {
|
||||
articles.push(...await crawlSite(site));
|
||||
} catch (error) {
|
||||
@@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {
|
||||
|
||||
module.exports = {
|
||||
fetchCrawlerArticles,
|
||||
crawlSite,
|
||||
canonicalizeUrl,
|
||||
getConfiguredCrawlerSites,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user