enhance news crawler configuration with new sources and improved request headers
This commit is contained in:
parent
c91e4ddb60
commit
11647e6a35
6 changed files with 449 additions and 33 deletions
|
|
@ -26,7 +26,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing
|
|||
|
||||
- SQLite archive file defaults to `./archive.sqlite`.
|
||||
- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique.
|
||||
- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery.
|
||||
- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts.
|
||||
- Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content.
|
||||
- Main article images are stored as ultra-compressed base64 WebP.
|
||||
- Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search.
|
||||
|
|
|
|||
284
config.json
284
config.json
|
|
@ -365,6 +365,22 @@
|
|||
{
|
||||
"url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/",
|
||||
"label": "El Comercio Peru"
|
||||
},
|
||||
{
|
||||
"url": "https://jamaica-gleaner.com/feed/business.xml",
|
||||
"label": "Jamaica Gleaner"
|
||||
},
|
||||
{
|
||||
"url": "https://www.jamaicaobserver.com/app/business/",
|
||||
"label": "Jamaica Observer"
|
||||
},
|
||||
{
|
||||
"url": "https://www.stabroeknews.com/feed/",
|
||||
"label": "Stabroek News"
|
||||
},
|
||||
{
|
||||
"url": "https://nationnews.com/rss-feed/",
|
||||
"label": "Nation News Barbados"
|
||||
}
|
||||
],
|
||||
"gdelt": {
|
||||
|
|
@ -376,25 +392,265 @@
|
|||
"format": "json"
|
||||
},
|
||||
"newsCrawler": {
|
||||
"sites": [
|
||||
{
|
||||
"name": "crawler_reuters",
|
||||
"maxPages": 15,
|
||||
"maxDepth": 1,
|
||||
"requestTimeout": 15000,
|
||||
"disabledLabels": [
|
||||
"Arab News",
|
||||
"Arabian Business",
|
||||
"Australian Fin Review",
|
||||
"BFM Business",
|
||||
"Business Daily Africa",
|
||||
"Business Standard IN",
|
||||
"BusinessLive SA",
|
||||
"Caixin Global",
|
||||
"Cinco Dias",
|
||||
"City A.M.",
|
||||
"El Comercio Peru",
|
||||
"El Economista ES",
|
||||
"El Economista MX",
|
||||
"FD.nl",
|
||||
"Gulf News Business",
|
||||
"Il Sole 24 Ore",
|
||||
"Infobae Economia AR",
|
||||
"Japan Times Business",
|
||||
"Korea JoongAng Daily",
|
||||
"Les Echos",
|
||||
"Live Mint",
|
||||
"Moneycontrol",
|
||||
"NZ Herald Business",
|
||||
"Portafolio Colombia",
|
||||
"Reuters",
|
||||
"The Star Malaysia",
|
||||
"This Is Money",
|
||||
"Xinhua Business"
|
||||
],
|
||||
"overrides": {
|
||||
"Al Jazeera": {
|
||||
"allowedHosts": [
|
||||
"www.reuters.com",
|
||||
"reuters.com"
|
||||
"www.aljazeera.com",
|
||||
"aljazeera.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.reuters.com/world/",
|
||||
"https://www.reuters.com/business/",
|
||||
"https://www.reuters.com/markets/",
|
||||
"https://www.reuters.com/technology/"
|
||||
],
|
||||
"maxPages": 100,
|
||||
"maxDepth": 2,
|
||||
"requestTimeout": 15000
|
||||
}
|
||||
"https://www.aljazeera.com/",
|
||||
"https://www.aljazeera.com/economy/",
|
||||
"https://www.aljazeera.com/tag/technology/"
|
||||
]
|
||||
},
|
||||
"Ars Technica": {
|
||||
"allowedHosts": [
|
||||
"arstechnica.com",
|
||||
"www.arstechnica.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://arstechnica.com/",
|
||||
"https://arstechnica.com/tech-policy/",
|
||||
"https://arstechnica.com/information-technology/"
|
||||
]
|
||||
},
|
||||
"BBC Business": {
|
||||
"allowedHosts": [
|
||||
"www.bbc.com",
|
||||
"bbc.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.bbc.com/news/business",
|
||||
"https://www.bbc.com/news/technology"
|
||||
]
|
||||
},
|
||||
"CNBC": {
|
||||
"allowedHosts": [
|
||||
"www.cnbc.com",
|
||||
"cnbc.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.cnbc.com/world/",
|
||||
"https://www.cnbc.com/business/",
|
||||
"https://www.cnbc.com/technology/"
|
||||
]
|
||||
},
|
||||
"Guardian Business": {
|
||||
"allowedHosts": [
|
||||
"www.theguardian.com",
|
||||
"theguardian.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.theguardian.com/",
|
||||
"https://www.theguardian.com/business",
|
||||
"https://www.theguardian.com/technology"
|
||||
]
|
||||
},
|
||||
"Jamaica Gleaner": {
|
||||
"allowedHosts": [
|
||||
"jamaica-gleaner.com",
|
||||
"www.jamaica-gleaner.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://jamaica-gleaner.com/",
|
||||
"https://jamaica-gleaner.com/news",
|
||||
"https://jamaica-gleaner.com/business"
|
||||
]
|
||||
},
|
||||
"Jamaica Observer": {
|
||||
"allowedHosts": [
|
||||
"www.jamaicaobserver.com",
|
||||
"jamaicaobserver.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.jamaicaobserver.com/",
|
||||
"https://www.jamaicaobserver.com/news/",
|
||||
"https://www.jamaicaobserver.com/business/"
|
||||
]
|
||||
},
|
||||
"Nation News Barbados": {
|
||||
"allowedHosts": [
|
||||
"nationnews.com",
|
||||
"www.nationnews.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://nationnews.com/",
|
||||
"https://nationnews.com/category/business/",
|
||||
"https://nationnews.com/category/news/"
|
||||
]
|
||||
},
|
||||
"NPR Business": {
|
||||
"allowedHosts": [
|
||||
"www.npr.org",
|
||||
"npr.org"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.npr.org/sections/business/",
|
||||
"https://www.npr.org/sections/technology/"
|
||||
]
|
||||
},
|
||||
"The Verge": {
|
||||
"allowedHosts": [
|
||||
"www.theverge.com",
|
||||
"theverge.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.theverge.com/tech",
|
||||
"https://www.theverge.com/business",
|
||||
"https://www.theverge.com/archives"
|
||||
]
|
||||
},
|
||||
"TechCrunch": {
|
||||
"allowedHosts": [
|
||||
"techcrunch.com",
|
||||
"www.techcrunch.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://techcrunch.com/",
|
||||
"https://techcrunch.com/category/startups/",
|
||||
"https://techcrunch.com/category/venture/"
|
||||
]
|
||||
},
|
||||
"The Economist": {
|
||||
"allowedHosts": [
|
||||
"www.economist.com",
|
||||
"economist.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.economist.com/finance-and-economics",
|
||||
"https://www.economist.com/business",
|
||||
"https://www.economist.com/science-and-technology"
|
||||
]
|
||||
},
|
||||
"Federal Reserve": {
|
||||
"allowedHosts": [
|
||||
"www.federalreserve.gov",
|
||||
"federalreserve.gov"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.federalreserve.gov/newsevents.htm",
|
||||
"https://www.federalreserve.gov/monetarypolicy.htm"
|
||||
]
|
||||
},
|
||||
"Fortune": {
|
||||
"allowedHosts": [
|
||||
"fortune.com",
|
||||
"www.fortune.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://fortune.com/",
|
||||
"https://fortune.com/section/tech/",
|
||||
"https://fortune.com/section/finance/"
|
||||
]
|
||||
},
|
||||
"Forbes Business": {
|
||||
"allowedHosts": [
|
||||
"www.forbes.com",
|
||||
"forbes.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.forbes.com/business/",
|
||||
"https://www.forbes.com/innovation/"
|
||||
]
|
||||
},
|
||||
"Nikkei Asia": {
|
||||
"allowedHosts": [
|
||||
"asia.nikkei.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://asia.nikkei.com/",
|
||||
"https://asia.nikkei.com/Business",
|
||||
"https://asia.nikkei.com/Technology"
|
||||
]
|
||||
},
|
||||
"South China Morning Post": {
|
||||
"allowedHosts": [
|
||||
"www.scmp.com",
|
||||
"scmp.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.scmp.com/",
|
||||
"https://www.scmp.com/business",
|
||||
"https://www.scmp.com/tech"
|
||||
]
|
||||
},
|
||||
"Stabroek News": {
|
||||
"allowedHosts": [
|
||||
"www.stabroeknews.com",
|
||||
"stabroeknews.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.stabroeknews.com/",
|
||||
"https://www.stabroeknews.com/category/business/",
|
||||
"https://www.stabroeknews.com/category/news/"
|
||||
]
|
||||
},
|
||||
"Wall Street Journal": {
|
||||
"allowedHosts": [
|
||||
"www.wsj.com",
|
||||
"wsj.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.wsj.com/news/business",
|
||||
"https://www.wsj.com/tech"
|
||||
]
|
||||
},
|
||||
"Wired Business": {
|
||||
"allowedHosts": [
|
||||
"www.wired.com",
|
||||
"wired.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://www.wired.com/category/business/",
|
||||
"https://www.wired.com/category/security/"
|
||||
]
|
||||
},
|
||||
"Yahoo Finance": {
|
||||
"allowedHosts": [
|
||||
"finance.yahoo.com"
|
||||
],
|
||||
"seeds": [
|
||||
"https://finance.yahoo.com/",
|
||||
"https://finance.yahoo.com/news/",
|
||||
"https://finance.yahoo.com/topic/tech/"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"rss": "0 */6 * * *",
|
||||
"gdelt": "0 */6 * * *",
|
||||
|
|
|
|||
|
|
@ -44,8 +44,19 @@ const blockedContentDomains = [
|
|||
];
|
||||
const loggedBlockedDomains = new Set();
|
||||
// Request headers for article fetches (per the constant's name; the call site
// is outside this excerpt). The values imitate a desktop Chrome 135 top-level
// navigation on macOS, as spelled out by the User-Agent / sec-ch-ua entries.
// Each header appears exactly once: a duplicated key in an object literal is
// silently overridden by the later entry, which hides the value actually sent.
const articleFetchHeaders = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  // Ask intermediaries for a fresh copy (HTTP/1.1 and legacy HTTP/1.0 forms).
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  // Client-hint headers kept consistent with the User-Agent string above.
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  // Fetch-metadata headers describing a user-initiated document navigation.
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
|
||||
|
||||
let contentBackfillRunning = false;
|
||||
|
|
@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
|
|||
}
|
||||
|
||||
/**
 * Runs a prepared "content status" statement for one article, stamping it with
 * the current time.
 *
 * The statement is executed exactly once. Its argument list is chosen from the
 * number of `?` placeholders found in `statement.source`:
 *   - 3 placeholders: (message, attemptedAt, id)
 *   - 2 placeholders: (attemptedAt, id)
 *
 * Note that the count is a plain character count, so a literal `?` inside the
 * SQL text (for example in a quoted string) would be counted as a placeholder.
 *
 * @param {{source: string, run: Function}} statement - Prepared statement
 *   exposing its SQL text as `source` and a `run(...params)` method
 *   (presumably a better-sqlite3 Statement; confirm against the caller).
 * @param {*} id - Identifier bound to the last placeholder.
 * @param {*} message - Status/error message; only bound when the statement has
 *   three placeholders.
 * @returns {void}
 * @throws {Error} When the statement has neither 2 nor 3 placeholders.
 */
function markArticleStatus(statement, id, message) {
  const attemptedAt = new Date().toISOString();
  const parameterCount = statement.source.split('?').length - 1;

  if (parameterCount === 3) {
    statement.run(message, attemptedAt, id);
    return;
  }

  if (parameterCount === 2) {
    statement.run(attemptedAt, id);
    return;
  }

  throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
}
|
||||
|
||||
async function fetchCompressedImage(url) {
|
||||
|
|
|
|||
15
src/http.js
15
src/http.js
|
|
@ -1,6 +1,17 @@
|
|||
// Default request headers for the HTTP helper in this module. The values
// imitate a desktop Chrome 135 top-level navigation on macOS. Each header is
// listed once: duplicate keys in an object literal are silently overridden by
// the later entry, which makes the effective value easy to misread.
//
// NOTE(review): these are document-navigation headers. Callers that talk to
// JSON APIs, or to services that expect a declared client identity in
// User-Agent (SEC EDGAR is one such service), presumably override the relevant
// headers per request - confirm at the call sites.
const DEFAULT_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  // Ask intermediaries for a fresh copy (HTTP/1.1 and legacy HTTP/1.0 forms).
  'Cache-Control': 'no-cache',
  Pragma: 'no-cache',
  'Upgrade-Insecure-Requests': '1',
  // Client-hint headers kept consistent with the User-Agent string above.
  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
  'sec-ch-ua-mobile': '?0',
  'sec-ch-ua-platform': '"macOS"',
  // Fetch-metadata headers describing a user-initiated document navigation.
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
};
|
||||
|
||||
function sleep(ms) {
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
|
|||
const { fetchEdgarArticles } = require('./sources/edgar');
|
||||
const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
|
||||
const { fetchFinnhubArticles } = require('./sources/finnhub');
|
||||
const { fetchCrawlerArticles } = require('./sources/newsCrawler');
|
||||
const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
|
||||
const { backfillMissingContent } = require('./content');
|
||||
const { backfillMissingEmbeddings } = require('./embeddings');
|
||||
|
||||
|
|
@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Crawls every configured crawler site, one after another.
 *
 * Each site is run through `runSource` under its own `site.name`, and the next
 * site is not started until the previous run has settled.
 *
 * @returns {Promise<Array>} The values resolved by `runSource`, in site order.
 */
async function runCrawlerSources() {
  const sites = getConfiguredCrawlerSites();
  const outcomes = [];

  for (let index = 0; index < sites.length; index += 1) {
    const site = sites[index];
    const outcome = await runSource(site.name, () => crawlSite(site));
    outcomes.push(outcome);
  }

  return outcomes;
}
|
||||
|
||||
async function runAllIngestions() {
|
||||
const results = [];
|
||||
|
||||
|
|
@ -28,7 +38,7 @@ async function runAllIngestions() {
|
|||
results.push(await runSource('edgar', fetchEdgarArticles));
|
||||
results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
|
||||
results.push(await runSource('finnhub', fetchFinnhubArticles));
|
||||
results.push(await runSource('news_crawler', fetchCrawlerArticles));
|
||||
results.push(...await runCrawlerSources());
|
||||
|
||||
try {
|
||||
await backfillMissingContent();
|
||||
|
|
@ -68,7 +78,7 @@ function startScheduler() {
|
|||
|
||||
if (config.scheduler.newsCrawler) {
|
||||
cron.schedule(config.scheduler.newsCrawler, async () => {
|
||||
await runSource('news_crawler', fetchCrawlerArticles);
|
||||
await runCrawlerSources();
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
|
|||
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
|
||||
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
||||
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
||||
const USER_AGENT = 'duriin_api crawler/1.0';
|
||||
|
||||
function decodeHtmlEntities(value) {
|
||||
return String(value || '')
|
||||
|
|
@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
|
|||
return null;
|
||||
}
|
||||
|
||||
if (!isAllowedHost(url.hostname, allowedHosts)) {
|
||||
if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
|
@ -323,30 +322,144 @@ function shouldQueueLink(url) {
|
|||
return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
|
||||
}
|
||||
|
||||
/**
 * Turns a feed label into a lowercase identifier made of `a-z`, `0-9` and
 * single underscores.
 *
 * Every run of other characters becomes one underscore, and the result never
 * starts or ends with an underscore. Falsy labels yield an empty string.
 *
 * @param {*} label - Label to convert; coerced to a string.
 * @returns {string} The slug, possibly empty.
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();

  // Splitting on separator runs and dropping the empty edge pieces is the same
  // as collapsing the runs to "_" and trimming leading/trailing underscores.
  const words = lowered.split(/[^a-z0-9]+/).filter(Boolean);

  return words.join('_');
}
|
||||
|
||||
/**
 * Returns the distinct truthy entries of `values`, keeping first-seen order.
 *
 * @param {Array} values - Entries to de-duplicate; falsy entries are dropped.
 * @returns {Array} A new array without duplicates or falsy values.
 */
function unique(values) {
  const seen = new Set();

  for (const value of values) {
    if (value) {
      seen.add(value);
    }
  }

  return Array.from(seen);
}
|
||||
|
||||
/**
 * Builds the host allow-list for a site from a single hostname: the hostname
 * itself plus its `www.` counterpart (added when missing, stripped when
 * present).
 *
 * The hostname is lowercased before the `www.` check, so mixed-case input such
 * as `WWW.Example.com` is recognised as already carrying the prefix instead of
 * producing `www.www.example.com`.
 *
 * @param {string} hostname - Hostname without scheme, port or path.
 * @returns {string[]} Lowercase hostnames, the given one first; empty when no
 *   hostname is supplied.
 */
function buildAllowedHosts(hostname) {
  const host = String(hostname || '').trim().toLowerCase();
  if (!host) {
    return [];
  }

  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // The two variants always differ in length, so only an empty counterpart
  // (a bare "www." input) needs to be filtered out.
  return [host, counterpart].filter(Boolean);
}
|
||||
|
||||
/**
 * Reduces the path of a feed URL to a short path that can serve as a crawl
 * seed on the same site.
 *
 * The path is passed through an ordered list of rewrite rules (index files,
 * file extensions, feed-specific suffixes and prefixes), then cut down to at
 * most its first three segments.
 *
 * @param {string} pathname - URL pathname of the feed.
 * @returns {string} A path starting with `/`; `/` alone when nothing useful
 *   remains.
 */
function cleanFeedPath(pathname) {
  // Order matters: every rule sees the output of the one before it.
  // NOTE(review): the "/rss" rule discards everything from the first "/rss/"
  // onwards, so the later "/id/<n>/device/rss/" rule looks unreachable - verify.
  const rewriteRules = [
    [/\/index\.[a-z0-9]+$/i, '/'],
    [/\.[a-z0-9]+$/i, ''],
    [/\/rss(?:$|\/.*$)/i, '/'],
    [/\/feed(?:$|\/.*$)/i, '/'],
    [/\/feeds?(?:$|\/.*$)/i, '/'],
    [/\/xml(?:$|\/.*$)/i, '/'],
    [/\/arc\/outboundfeeds\//i, '/'],
    [/\/dynamo\//i, '/'],
    [/\/id\/\d+\/device\/rss\//i, '/'],
    [/\/contentexport\//i, '/'],
    [/\/rssfeedstopstories$/i, '/'],
    [/\/latest$/i, '/'],
    [/\/+$|^$/g, ''],
  ];

  let trimmed = pathname;
  for (const [pattern, replacement] of rewriteRules) {
    trimmed = trimmed.replace(pattern, replacement);
  }

  if (trimmed === '') {
    return '/';
  }

  // Keep the first three meaningful segments, skipping empty pieces and any
  // segment that is only a feed marker.
  const kept = [];
  for (const part of trimmed.split('/')) {
    if (part === '' || /^(rss|feed|feeds|xml)$/i.test(part)) {
      continue;
    }

    kept.push(part);
    if (kept.length === 3) {
      break;
    }
  }

  return kept.length === 0 ? '/' : `/${kept.join('/')}`;
}
|
||||
|
||||
/**
 * Derives default crawl seeds from a feed URL: the site root, plus the cleaned
 * feed path when it differs from the root.
 *
 * The site root is built from `host` rather than `hostname`, so a feed served
 * on a non-default port yields seeds on that same port instead of silently
 * pointing at the default one.
 *
 * @param {string} feedUrl - Absolute URL of the feed.
 * @returns {string[]} Distinct canonical seed URLs; empty when the URL cannot
 *   be parsed or a helper throws.
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const parsed = new URL(feedUrl);
    // `host` includes the port when it is not the scheme default.
    const origin = `${parsed.protocol}//${parsed.host}`;
    const cleanedPath = cleanFeedPath(parsed.pathname);

    return unique([
      canonicalizeUrl(origin, origin),
      cleanedPath === '/' ? null : canonicalizeUrl(`${origin}${cleanedPath}`, origin),
    ]);
  } catch {
    // Deliberate best effort: a feed with an unusable URL contributes no seeds.
    return [];
  }
}
|
||||
|
||||
/**
 * Reads a configured crawl depth, falling back to 1 when it is missing or not
 * numeric. Unlike `Number(value) || 1`, an explicit 0 is kept, since 0 lies
 * inside the accepted [0, 5] range.
 *
 * @param {*} value - Raw `maxDepth` setting.
 * @returns {number} The parsed depth (not yet clamped), or 1.
 */
function resolveCrawlerMaxDepth(value) {
  const isExplicit = typeof value === 'number'
    || (typeof value === 'string' && value.trim() !== '');
  const parsed = isExplicit ? Number(value) : Number.NaN;

  return Number.isNaN(parsed) ? 1 : parsed;
}

/**
 * Normalizes a raw crawler site definition into a predictable shape.
 *
 * - `allowedHosts`: lowercased, de-duplicated, empty entries dropped.
 * - `seeds`: each canonicalized against itself with the allowed hosts;
 *   entries that do not canonicalize are dropped, duplicates removed.
 * - `maxPages`: default 15, clamped to [1, 500].
 * - `maxDepth`: default 1, clamped to [0, 5]; an explicit 0 is honoured.
 * - `requestTimeout`: default 15000, clamped to [1000, 30000].
 *
 * @param {object} site - Raw site definition (`name`, `label`, `allowedHosts`,
 *   `seeds`, `maxPages`, `maxDepth`, `requestTimeout`; all optional).
 * @returns {{name: string, label: string, allowedHosts: string[], seeds: string[],
 *   maxPages: number, maxDepth: number, requestTimeout: number}} Normalized site.
 */
function normalizeSite(site) {
  // `unique` already drops falsy entries, so no separate filter pass is needed.
  const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()));
  const seeds = unique((site.seeds || []).map((seed) => canonicalizeUrl(seed, seed, allowedHosts)));

  return {
    name: String(site.name || '').trim(),
    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
    maxDepth: Math.max(0, Math.min(resolveCrawlerMaxDepth(site.maxDepth), 5)),
    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
  };
}
|
||||
|
||||
/**
 * Looks up the per-label crawler override from `config.newsCrawler.overrides`.
 *
 * @param {string} label - Feed label used as the override key.
 * @returns {object|null} The override entry, or `null` when the crawler
 *   section, the overrides map, or a truthy entry for the label is missing.
 */
function getCrawlerSiteOverrides(label) {
  const overridesByLabel = config.newsCrawler?.overrides;
  const entry = overridesByLabel?.[label];

  if (entry) {
    return entry;
  }

  return null;
}
|
||||
|
||||
/**
 * Builds the list of crawler sites to run.
 *
 * Sites come from two places:
 *   1. `config.newsCrawler.sites`: explicit definitions, normalized as given.
 *   2. `config.rssFeeds`: one derived site per feed label, unless the label is
 *      listed in `disabledLabels` or already used by an explicit site.
 *
 * Feeds that share a label are folded into a single derived site whose default
 * hosts and seeds are the union over those feeds, so a label never yields two
 * sites with the same name. A per-label override, when present, replaces the
 * derived `name`, `allowedHosts`, `seeds`, `maxPages`, `maxDepth` and
 * `requestTimeout`.
 *
 * Sites lacking a name, an allowed host or a seed are left out.
 *
 * @returns {object[]} Normalized sites: explicit ones first, then derived ones
 *   in order of each label's first appearance in `config.rssFeeds`.
 */
function getConfiguredCrawlerSites() {
  const defaults = config.newsCrawler || {};
  const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim()));
  const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site));
  const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
  const isRunnable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);

  // Group usable feeds by label; a Map keeps labels in first-seen order.
  const feedsByLabel = new Map();
  for (const feed of config.rssFeeds || []) {
    const label = String(feed.label || '').trim();
    if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
      continue;
    }

    let hostname = '';
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // A feed with an unparsable URL cannot contribute hosts or seeds.
      continue;
    }

    const group = feedsByLabel.get(label) || { hostnames: [], urls: [] };
    group.hostnames.push(hostname);
    group.urls.push(feed.url);
    feedsByLabel.set(label, group);
  }

  const derivedSites = [];
  for (const [label, group] of feedsByLabel) {
    const override = getCrawlerSiteOverrides(label) || {};
    const site = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || group.hostnames.flatMap((hostname) => buildAllowedHosts(hostname)),
      seeds: override.seeds || group.urls.flatMap((url) => buildDefaultSeeds(url)),
      maxPages: override.maxPages || defaults.maxPages,
      // `??` keeps an explicit depth of 0 from being treated as "unset" here.
      maxDepth: override.maxDepth ?? defaults.maxDepth,
      requestTimeout: override.requestTimeout || defaults.requestTimeout,
    });

    if (isRunnable(site)) {
      derivedSites.push(site);
    }
  }

  return [...explicitSites.filter(isRunnable), ...derivedSites];
}
|
||||
|
||||
async function fetchHtml(url, timeout) {
|
||||
const response = await fetchWithPolicy(url, {
|
||||
timeout,
|
||||
retries: 1,
|
||||
headers: {
|
||||
Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
|
||||
'User-Agent': USER_AGENT,
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
|
|
@ -438,7 +551,7 @@ async function crawlSite(site) {
|
|||
async function fetchCrawlerArticles() {
|
||||
const articles = [];
|
||||
|
||||
for (const site of config.newsCrawler?.sites || []) {
|
||||
for (const site of getConfiguredCrawlerSites()) {
|
||||
try {
|
||||
articles.push(...await crawlSite(site));
|
||||
} catch (error) {
|
||||
|
|
@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {
|
|||
|
||||
module.exports = {
|
||||
fetchCrawlerArticles,
|
||||
crawlSite,
|
||||
canonicalizeUrl,
|
||||
getConfiguredCrawlerSites,
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue