diff --git a/README.md b/README.md index 1de7d6f..0f4754d 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing - SQLite archive file defaults to `./archive.sqlite`. - Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique. -- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery. +- `newsCrawler` reuses `rssFeeds` as the publisher catalog, derives one crawler source per feed label, and supports `disabledLabels` plus per-label `overrides` for seeds and allowed hosts. - Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content. - Main article images are stored as ultra-compressed base64 WebP. - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search. diff --git a/config.json b/config.json index 677df70..c137619 100644 --- a/config.json +++ b/config.json @@ -365,6 +365,22 @@ { "url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", "label": "El Comercio Peru" + }, + { + "url": "https://jamaica-gleaner.com/feed/business.xml", + "label": "Jamaica Gleaner" + }, + { + "url": "https://www.jamaicaobserver.com/app/business/", + "label": "Jamaica Observer" + }, + { + "url": "https://www.stabroeknews.com/feed/", + "label": "Stabroek News" + }, + { + "url": "https://nationnews.com/rss-feed/", + "label": "Nation News Barbados" } ], "gdelt": { @@ -376,24 +392,264 @@ "format": "json" }, "newsCrawler": { - "sites": [ - { - "name": "crawler_reuters", + "maxPages": 15, + "maxDepth": 1, + "requestTimeout": 15000, + "disabledLabels": [ + "Arab News", + "Arabian Business", + "Australian Fin Review", + "BFM Business", + "Business Daily Africa", + "Business Standard IN", + "BusinessLive SA", + "Caixin Global", + "Cinco Dias", + "City A.M.", + "El Comercio Peru", + "El Economista ES", + "El Economista MX", + "FD.nl", + "Gulf News Business", + "Il Sole 24 Ore", + "Infobae Economia AR", + "Japan Times Business", + "Korea JoongAng Daily", + "Les Echos", + "Live Mint", + "Moneycontrol", + "NZ Herald Business", + "Portafolio Colombia", + "Reuters", + "The Star Malaysia", + "This Is Money", + "Xinhua Business" + ], + "overrides": { + "Al Jazeera": { "allowedHosts": [ - "www.reuters.com", - "reuters.com" + "www.aljazeera.com", + "aljazeera.com" ], "seeds": [ - "https://www.reuters.com/world/", - "https://www.reuters.com/business/", - "https://www.reuters.com/markets/", - "https://www.reuters.com/technology/" + "https://www.aljazeera.com/", + "https://www.aljazeera.com/economy/", + "https://www.aljazeera.com/tag/technology/" + ] + }, + "Ars Technica": { + "allowedHosts": [ + "arstechnica.com", + "www.arstechnica.com" ], - "maxPages": 100, - "maxDepth": 2, - "requestTimeout": 15000 + "seeds": [ + "https://arstechnica.com/", + "https://arstechnica.com/tech-policy/", + "https://arstechnica.com/information-technology/" + ] + }, + "BBC Business": { + "allowedHosts": [ + "www.bbc.com", + "bbc.com" + ], + "seeds": [ + "https://www.bbc.com/news/business", + "https://www.bbc.com/news/technology" + ] + }, + "CNBC": { + "allowedHosts": [ + "www.cnbc.com", + "cnbc.com" + ], + "seeds": [ + "https://www.cnbc.com/world/", + "https://www.cnbc.com/business/", + "https://www.cnbc.com/technology/" + ] + }, + "Guardian Business": { + "allowedHosts": [ + "www.theguardian.com", + "theguardian.com" + ], + "seeds": [ + "https://www.theguardian.com/", + "https://www.theguardian.com/business", + "https://www.theguardian.com/technology" + ] + }, + "Jamaica Gleaner": { + "allowedHosts": [ + "jamaica-gleaner.com", + "www.jamaica-gleaner.com" + ], + "seeds": [ + "https://jamaica-gleaner.com/", + "https://jamaica-gleaner.com/news", + "https://jamaica-gleaner.com/business" + ] + }, + "Jamaica Observer": { + "allowedHosts": [ + "www.jamaicaobserver.com", + "jamaicaobserver.com" + ], + "seeds": [ + "https://www.jamaicaobserver.com/", + "https://www.jamaicaobserver.com/news/", + "https://www.jamaicaobserver.com/business/" + ] + }, + "Nation News Barbados": { + "allowedHosts": [ + "nationnews.com", + "www.nationnews.com" + ], + "seeds": [ + "https://nationnews.com/", + "https://nationnews.com/category/business/", + "https://nationnews.com/category/news/" + ] + }, + "NPR Business": { + "allowedHosts": [ + "www.npr.org", + "npr.org" + ], + "seeds": [ + "https://www.npr.org/sections/business/", + "https://www.npr.org/sections/technology/" + ] + }, + "The Verge": { + "allowedHosts": [ + "www.theverge.com", + "theverge.com" + ], + "seeds": [ + "https://www.theverge.com/tech", + "https://www.theverge.com/business", + "https://www.theverge.com/archives" + ] + }, + "TechCrunch": { + "allowedHosts": [ + "techcrunch.com", + "www.techcrunch.com" + ], + "seeds": [ + "https://techcrunch.com/", + "https://techcrunch.com/category/startups/", + "https://techcrunch.com/category/venture/" + ] + }, + "The Economist": { + "allowedHosts": [ + "www.economist.com", + "economist.com" + ], + "seeds": [ + "https://www.economist.com/finance-and-economics", + "https://www.economist.com/business", + "https://www.economist.com/science-and-technology" + ] + }, + "Federal Reserve": { + "allowedHosts": [ + "www.federalreserve.gov", + "federalreserve.gov" + ], + "seeds": [ + "https://www.federalreserve.gov/newsevents.htm", + "https://www.federalreserve.gov/monetarypolicy.htm" + ] + }, + "Fortune": { + "allowedHosts": [ + "fortune.com", + "www.fortune.com" + ], + "seeds": [ + "https://fortune.com/", + "https://fortune.com/section/tech/", + "https://fortune.com/section/finance/" + ] + }, + "Forbes Business": { + "allowedHosts": [ + "www.forbes.com", + "forbes.com" + ], + "seeds": [ + "https://www.forbes.com/business/", + "https://www.forbes.com/innovation/" + ] + }, + "Nikkei Asia": { + "allowedHosts": [ + "asia.nikkei.com" + ], + "seeds": [ + "https://asia.nikkei.com/", + "https://asia.nikkei.com/Business", + "https://asia.nikkei.com/Technology" + ] + }, + "South China Morning Post": { + "allowedHosts": [ + "www.scmp.com", + "scmp.com" + ], + "seeds": [ + "https://www.scmp.com/", + "https://www.scmp.com/business", + "https://www.scmp.com/tech" + ] + }, + "Stabroek News": { + "allowedHosts": [ + "www.stabroeknews.com", + "stabroeknews.com" + ], + "seeds": [ + "https://www.stabroeknews.com/", + "https://www.stabroeknews.com/category/business/", + "https://www.stabroeknews.com/category/news/" + ] + }, + "Wall Street Journal": { + "allowedHosts": [ + "www.wsj.com", + "wsj.com" + ], + "seeds": [ + "https://www.wsj.com/news/business", + "https://www.wsj.com/tech" + ] + }, + "Wired Business": { + "allowedHosts": [ + "www.wired.com", + "wired.com" + ], + "seeds": [ + "https://www.wired.com/category/business/", + "https://www.wired.com/category/security/" + ] + }, + "Yahoo Finance": { + "allowedHosts": [ + "finance.yahoo.com" + ], + "seeds": [ + "https://finance.yahoo.com/", + "https://finance.yahoo.com/news/", + "https://finance.yahoo.com/topic/tech/" + ] } - ] + } }, "scheduler": { "rss": "0 */6 * * *", diff --git a/src/content.js b/src/content.js index 51b2cc6..27f6c73 100644 --- a/src/content.js +++ b/src/content.js @@ -44,8 +44,19 @@ const blockedContentDomains = [ ]; const loggedBlockedDomains = new Set(); const articleFetchHeaders = { - Accept: 'text/html,application/xhtml+xml', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'no-cache', + Pragma: 'no-cache', + 'Upgrade-Insecure-Requests': '1', + 'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', }; let contentBackfillRunning = false; @@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) { } function markArticleStatus(statement, id, message) { - statement.run(message, new Date().toISOString(), id); + const attemptedAt = new Date().toISOString(); + const parameterCount = statement.source.split('?').length - 1; + + if (parameterCount === 3) { + statement.run(message, attemptedAt, id); + return; + } + + if (parameterCount === 2) { + statement.run(attemptedAt, id); + return; + } + + throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`); } async function fetchCompressedImage(url) { diff --git a/src/http.js b/src/http.js index c459fbe..6f9de11 100644 --- a/src/http.js +++ b/src/http.js @@ -1,6 +1,17 @@ const DEFAULT_HEADERS = { - 'User-Agent': 'duriin_api/1.0', - Accept: 'application/json, text/plain, */*', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'no-cache', + Pragma: 'no-cache', + 'Upgrade-Insecure-Requests': '1', + 'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', }; function sleep(ms) { diff --git a/src/scheduler.js b/src/scheduler.js index 5b4abed..f3b23d1 100644 --- a/src/scheduler.js +++ b/src/scheduler.js @@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt'); const { fetchEdgarArticles } = require('./sources/edgar'); const { fetchAlphaVantageArticles } = require('./sources/alphavantage'); const { fetchFinnhubArticles } = require('./sources/finnhub'); -const { fetchCrawlerArticles } = require('./sources/newsCrawler'); +const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler'); const { backfillMissingContent } = require('./content'); const { backfillMissingEmbeddings } = require('./embeddings'); @@ -20,6 +20,16 @@ async function runSource(source, fetcher) { } } +async function runCrawlerSources() { + const results = []; + + for (const site of getConfiguredCrawlerSites()) { + results.push(await runSource(site.name, () => crawlSite(site))); + } + + return results; +} + async function runAllIngestions() { const results = []; @@ -28,7 +38,7 @@ async function runAllIngestions() { results.push(await runSource('edgar', fetchEdgarArticles)); results.push(await runSource('alphavantage', fetchAlphaVantageArticles)); results.push(await runSource('finnhub', fetchFinnhubArticles)); - results.push(await runSource('news_crawler', fetchCrawlerArticles)); + results.push(...await runCrawlerSources()); try { await backfillMissingContent(); @@ -68,7 +78,7 @@ function startScheduler() { if (config.scheduler.newsCrawler) { cron.schedule(config.scheduler.newsCrawler, async () => { - await runSource('news_crawler', fetchCrawlerArticles); + await runCrawlerSources(); }); } diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js index 4f79cc5..0ed6804 100644 --- a/src/sources/newsCrawler.js +++ b/src/sources/newsCrawler.js @@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/; const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; -const USER_AGENT = 'duriin_api crawler/1.0'; function decodeHtmlEntities(value) { return String(value || '') @@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) { return null; } - if (!isAllowedHost(url.hostname, allowedHosts)) { + if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) { return null; } @@ -323,30 +322,144 @@ function shouldQueueLink(url) { return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname); } +function slugifyLabel(label) { + return String(label || '') + .toLowerCase() + .replace(/[^a-z0-9]+/g, '_') + .replace(/^_+|_+$/g, ''); +} + +function unique(values) { + return [...new Set(values.filter(Boolean))]; +} + +function buildAllowedHosts(hostname) { + if (!hostname) { + return []; + } + + const hosts = [hostname.toLowerCase()]; + if (hostname.startsWith('www.')) { + hosts.push(hostname.slice(4).toLowerCase()); + } else { + hosts.push(`www.${hostname}`.toLowerCase()); + } + + return unique(hosts); +} + +function cleanFeedPath(pathname) { + const withoutIndex = pathname + .replace(/\/index\.[a-z0-9]+$/i, '/') + .replace(/\.[a-z0-9]+$/i, '') + .replace(/\/rss(?:$|\/.*$)/i, '/') + .replace(/\/feed(?:$|\/.*$)/i, '/') + .replace(/\/feeds?(?:$|\/.*$)/i, '/') + .replace(/\/xml(?:$|\/.*$)/i, '/') + .replace(/\/arc\/outboundfeeds\//i, '/') + .replace(/\/dynamo\//i, '/') + .replace(/\/id\/\d+\/device\/rss\//i, '/') + .replace(/\/contentexport\//i, '/') + .replace(/\/rssfeedstopstories$/i, '/') + .replace(/\/latest$/i, '/') + .replace(/\/+$|^$/g, ''); + + if (!withoutIndex) { + return '/'; + } + + const segments = withoutIndex + .split('/') + .filter(Boolean) + .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment)) + .slice(0, 3); + + if (!segments.length) { + return '/'; + } + + return `/${segments.join('/')}`; +} + +function buildDefaultSeeds(feedUrl) { + try { + const parsed = new URL(feedUrl); + const origin = `${parsed.protocol}//${parsed.hostname}`; + const cleanedPath = cleanFeedPath(parsed.pathname); + + return unique([ + canonicalizeUrl(origin, origin), + cleanedPath === '/' ? null : canonicalizeUrl(`${origin}${cleanedPath}`, origin), + ]); + } catch { + return []; + } +} + function normalizeSite(site) { - const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))]; - const seeds = [...new Set((site.seeds || []) + const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean)); + const seeds = unique((site.seeds || []) .map((seed) => canonicalizeUrl(seed, seed, allowedHosts)) - .filter(Boolean))]; + .filter(Boolean)); return { name: String(site.name || '').trim(), + label: String(site.label || '').trim(), allowedHosts, seeds, - maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)), - maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 2, 5)), + maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)), + maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)), requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)), }; } +function getCrawlerSiteOverrides(label) { + return config.newsCrawler?.overrides?.[label] || null; +} + +function getConfiguredCrawlerSites() { + const defaults = config.newsCrawler || {}; + const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim())); + const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site)); + const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean)); + const derivedSites = []; + + for (const feed of config.rssFeeds || []) { + const label = String(feed.label || '').trim(); + if (!label || disabledLabels.has(label) || explicitLabels.has(label)) { + continue; + } + + let hostname = ''; + try { + hostname = new URL(feed.url).hostname; + } catch { + continue; + } + + const override = getCrawlerSiteOverrides(label) || {}; + const site = normalizeSite({ + label, + name: override.name || `crawler_${slugifyLabel(label)}`, + allowedHosts: override.allowedHosts || buildAllowedHosts(hostname), + seeds: override.seeds || buildDefaultSeeds(feed.url), + maxPages: override.maxPages || defaults.maxPages, + maxDepth: override.maxDepth || defaults.maxDepth, + requestTimeout: override.requestTimeout || defaults.requestTimeout, + }); + + if (site.name && site.allowedHosts.length && site.seeds.length) { + derivedSites.push(site); + } + } + + return [...explicitSites.filter((site) => site.name && site.allowedHosts.length && site.seeds.length), ...derivedSites]; +} + async function fetchHtml(url, timeout) { const response = await fetchWithPolicy(url, { timeout, retries: 1, - headers: { - Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8', - 'User-Agent': USER_AGENT, - }, }); if (!response.ok) { @@ -438,7 +551,7 @@ async function crawlSite(site) { async function fetchCrawlerArticles() { const articles = []; - for (const site of config.newsCrawler?.sites || []) { + for (const site of getConfiguredCrawlerSites()) { try { articles.push(...await crawlSite(site)); } catch (error) { @@ -451,5 +564,7 @@ async function fetchCrawlerArticles() { module.exports = { fetchCrawlerArticles, + crawlSite, canonicalizeUrl, + getConfiguredCrawlerSites, };