diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..589619a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,10 @@ +node_modules +npm-debug.log +Dockerfile* +docker-compose*.yml +.git +.gitignore +archive.sqlite +archive.sqlite-shm +archive.sqlite-wal +data diff --git a/.gitignore b/.gitignore index 61317ae..a46d4f1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ node_modules/ .env .env.* -config.json +#config.json *.sqlite *.sqlite-shm diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fb7e02b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM node:22-bookworm-slim + +ENV NODE_ENV=production + +WORKDIR /app + +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev \ + && npm cache clean --force \ + && mkdir -p /data \ + && ln -s /data/archive.sqlite /app/archive.sqlite + +COPY . . + +EXPOSE 3001 + +CMD ["npm", "start"] diff --git a/README.md b/README.md index 0b1727a..1de7d6f 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ Node.js Fastify server that ingests news articles from RSS, SEC EDGAR 8-K filing ## Notes - SQLite archive file defaults to `./archive.sqlite`. -- Deduplication is enforced on `url` and normalized title. +- Deduplication is enforced on `url`; normalized titles are stored and indexed for matching but are not unique. +- `newsCrawler.sites` can be configured with same-site seed pages for bounded HTML crawling and historical article discovery. - Article body extraction runs asynchronously after insertion, with hourly retries for rows still missing content. - Main article images are stored as ultra-compressed base64 WebP. - Embeddings are generated asynchronously with OpenRouter `perplexity/pplx-embed-v1-0.6b` and indexed in `sqlite-vec` for similarity search. 
diff --git a/config.json b/config.json new file mode 100644 index 0000000..677df70 --- /dev/null +++ b/config.json @@ -0,0 +1,406 @@ +{ + "server": { + "port": 3001, + "host": "0.0.0.0" + }, + "database": { + "path": "./archive.sqlite" + }, + "sec": { + "userAgent": "Augor benjamin.watt@imbenji.net", + "tickers": [] + }, + "alphaVantage": { + "apiKey": "KJ68ZQEW0PF524UA", + "tickers": [] + }, + "finnhub": { + "apiKey": "d7gg0h1r01qmqj4573sgd7gg0h1r01qmqj4573t0", + "tickers": [] + }, + "openRouter": { + "apiKey": "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" + }, + "rssFeeds": [ + { + "url": "https://www.aljazeera.com/xml/rss/all.xml", + "label": "Al Jazeera" + }, + { + "url": "https://feeds.bbci.co.uk/news/business/rss.xml", + "label": "BBC Business" + }, + { + "url": "https://feeds.businessinsider.com/custom/all", + "label": "Business Insider" + }, + { + "url": "https://feeds.bloomberg.com/markets/news.rss", + "label": "Bloomberg Markets" + }, + { + "url": "https://www.cnbc.com/id/100003114/device/rss/rss.html", + "label": "CNBC" + }, + { + "url": "https://feeds.a.dj.com/rss/RSSMarketsMain.xml", + "label": "Wall Street Journal" + }, + { + "url": "https://feeds.marketwatch.com/marketwatch/topstories/", + "label": "MarketWatch" + }, + { + "url": "https://finance.yahoo.com/news/rssindex", + "label": "Yahoo Finance" + }, + { + "url": "https://seekingalpha.com/feed.xml", + "label": "Seeking Alpha" + }, + { + "url": "https://www.ft.com/?format=rss", + "label": "Financial Times" + }, + { + "url": "https://www.economist.com/finance-and-economics/rss.xml", + "label": "The Economist" + }, + { + "url": "https://fortune.com/feed", + "label": "Fortune" + }, + { + "url": "https://www.forbes.com/business/feed/", + "label": "Forbes Business" + }, + { + "url": "https://www.inc.com/rss", + "label": "Inc Magazine" + }, + { + "url": "https://www.fastcompany.com/latest/rss", + "label": "Fast Company" + }, + { + "url": 
"https://www.entrepreneur.com/latest.rss", + "label": "Entrepreneur" + }, + { + "url": "https://api.axios.com/feed/", + "label": "Axios" + }, + { + "url": "https://www.wired.com/feed/category/business/latest/rss", + "label": "Wired Business" + }, + { + "url": "https://feeds.npr.org/1006/rss.xml", + "label": "NPR Business" + }, + { + "url": "https://www.federalreserve.gov/feeds/press_all.xml", + "label": "Federal Reserve" + }, + { + "url": "https://techcrunch.com/feed/", + "label": "TechCrunch" + }, + { + "url": "https://www.theverge.com/rss/index.xml", + "label": "The Verge" + }, + { + "url": "https://feeds.arstechnica.com/arstechnica/index", + "label": "Ars Technica" + }, + { + "url": "https://www.retaildive.com/feeds/news/", + "label": "Retail Dive" + }, + { + "url": "https://www.manufacturingdive.com/feeds/news/", + "label": "Manufacturing Dive" + }, + { + "url": "https://www.bankingdive.com/feeds/news/", + "label": "Banking Dive" + }, + { + "url": "https://financialpost.com/feed", + "label": "Financial Post CA" + }, + { + "url": "https://www.theglobeandmail.com/arc/outboundfeeds/rss/category/business/", + "label": "Globe and Mail" + }, + { + "url": "https://www.theguardian.com/uk/business/rss", + "label": "Guardian Business" + }, + { + "url": "https://feeds.skynews.com/feeds/rss/business.xml", + "label": "Sky News Business" + }, + { + "url": "https://www.thisismoney.co.uk/money/news/index.rss", + "label": "This Is Money" + }, + { + "url": "https://www.cityam.com/feed/", + "label": "City A.M." 
+ }, + { + "url": "https://www.spiegel.de/wirtschaft/index.rss", + "label": "Spiegel Wirtschaft" + }, + { + "url": "https://www.handelsblatt.com/contentexport/feed/schlagzeilen", + "label": "Handelsblatt" + }, + { + "url": "https://www.faz.net/rss/aktuell/wirtschaft/", + "label": "FAZ Wirtschaft" + }, + { + "url": "https://www.welt.de/feeds/section/wirtschaft.rss", + "label": "Die Welt Wirtschaft" + }, + { + "url": "https://feeds.lesechos.fr/rss/rss_la_une.xml", + "label": "Les Echos" + }, + { + "url": "https://www.lemonde.fr/economie/rss_full.xml", + "label": "Le Monde Economie" + }, + { + "url": "https://bfmbusiness.bfmtv.com/rss/news-flux-rss/", + "label": "BFM Business" + }, + { + "url": "https://www.eleconomista.es/rss/rss-de-portada.php", + "label": "El Economista ES" + }, + { + "url": "https://e00-expansion.uecdn.es/rss/portada.xml", + "label": "Expansion ES" + }, + { + "url": "https://cincodias.elpais.com/rss/cincodias/ultima_hora_mercados.xml", + "label": "Cinco Dias" + }, + { + "url": "https://www.ilsole24ore.com/rss/economia--finanza.xml", + "label": "Il Sole 24 Ore" + }, + { + "url": "https://fd.nl/rss", + "label": "FD.nl" + }, + { + "url": "https://www.nzz.ch/wirtschaft.rss", + "label": "NZZ Wirtschaft" + }, + { + "url": "https://www.themoscowtimes.com/rss/news", + "label": "Moscow Times" + }, + { + "url": "https://rssexport.rbc.ru/rbcnews/news/30/full.rss", + "label": "RBC Russia" + }, + { + "url": "https://economictimes.indiatimes.com/rssfeedstopstories.cms", + "label": "Economic Times India" + }, + { + "url": "https://www.business-standard.com/rss/home_page_top_stories.rss", + "label": "Business Standard IN" + }, + { + "url": "https://www.livemint.com/rss/headlines", + "label": "Live Mint" + }, + { + "url": "https://www.moneycontrol.com/rss/MCtopnews.xml", + "label": "Moneycontrol" + }, + { + "url": "https://www.thehindubusinessline.com/feeder/default.rss", + "label": "Hindu Business Line" + }, + { + "url": 
"https://www.caixinglobal.com/rss/newsfeeds/", + "label": "Caixin Global" + }, + { + "url": "https://www.chinadaily.com.cn/rss/bizchina_rss.xml", + "label": "China Daily Business" + }, + { + "url": "https://english.news.cn/rss/business.xml", + "label": "Xinhua Business" + }, + { + "url": "https://www.scmp.com/rss/91/feed", + "label": "South China Morning Post" + }, + { + "url": "https://asia.nikkei.com/rss/feed/nar", + "label": "Nikkei Asia" + }, + { + "url": "https://www.japantimes.co.jp/feed/business/", + "label": "Japan Times Business" + }, + { + "url": "https://www.koreaherald.com/rss/010000000000.xml", + "label": "Korea Herald" + }, + { + "url": "https://koreajoongangdaily.joins.com/rss/", + "label": "Korea JoongAng Daily" + }, + { + "url": "https://www.businesstimes.com.sg/rss.xml", + "label": "Business Times SG" + }, + { + "url": "https://www.straitstimes.com/news/business/rss.xml", + "label": "Straits Times Business" + }, + { + "url": "https://www.channelnewsasia.com/rssfeeds/8395986", + "label": "Channel NewsAsia" + }, + { + "url": "https://www.bangkokpost.com/rss/data/business.xml", + "label": "Bangkok Post Business" + }, + { + "url": "https://www.thestar.com.my/rss/Business/Business-News", + "label": "The Star Malaysia" + }, + { + "url": "https://www.afr.com/rss", + "label": "Australian Fin Review" + }, + { + "url": "https://www.abc.net.au/news/feed/52278/rss.xml", + "label": "ABC Business AU" + }, + { + "url": "https://www.nzherald.co.nz/arc/outboundfeeds/rss/section/business/", + "label": "NZ Herald Business" + }, + { + "url": "https://www.arabianbusiness.com/rss.xml", + "label": "Arabian Business" + }, + { + "url": "https://gulfnews.com/rss/business", + "label": "Gulf News Business" + }, + { + "url": "https://www.arabnews.com/rss/front_page.xml", + "label": "Arab News" + }, + { + "url": "https://www.thenationalnews.com/arc/outboundfeeds/rss/?outputType=xml", + "label": "The National UAE" + }, + { + "url": "https://businessday.ng/feed/", + "label": 
"BusinessDay Nigeria" + }, + { + "url": "https://www.moneyweb.co.za/feed/", + "label": "Moneyweb SA" + }, + { + "url": "https://www.businesslive.co.za/rss/bd/", + "label": "BusinessLive SA" + }, + { + "url": "https://www.businessdailyafrica.com/rss/", + "label": "Business Daily Africa" + }, + { + "url": "https://www.vanguardngr.com/category/business/feed/", + "label": "Vanguard Business NG" + }, + { + "url": "https://feeds.folha.uol.com.br/mercado/rss091.xml", + "label": "Folha Mercado BR" + }, + { + "url": "https://g1.globo.com/dynamo/economia/rss2.xml", + "label": "G1 Economia BR" + }, + { + "url": "https://exame.com/feed/", + "label": "Exame BR" + }, + { + "url": "https://www.eleconomista.com.mx/rss/rss.html", + "label": "El Economista MX" + }, + { + "url": "https://expansion.mx/rss", + "label": "Expansion MX" + }, + { + "url": "https://www.lanacion.com.ar/arc/outboundfeeds/rss/category/economia/", + "label": "La Nacion AR" + }, + { + "url": "https://www.infobae.com/feeds/rss/economia/", + "label": "Infobae Economia AR" + }, + { + "url": "https://www.portafolio.co/rss/portafolio.xml", + "label": "Portafolio Colombia" + }, + { + "url": "https://elcomercio.pe/arc/outboundfeeds/rss/section/economia/", + "label": "El Comercio Peru" + } + ], + "gdelt": { + "queries": [ + "technology" + ], + "mode": "ArtList", + "maxRecords": 50, + "format": "json" + }, + "newsCrawler": { + "sites": [ + { + "name": "crawler_reuters", + "allowedHosts": [ + "www.reuters.com", + "reuters.com" + ], + "seeds": [ + "https://www.reuters.com/world/", + "https://www.reuters.com/business/", + "https://www.reuters.com/markets/", + "https://www.reuters.com/technology/" + ], + "maxPages": 100, + "maxDepth": 2, + "requestTimeout": 15000 + } + ] + }, + "scheduler": { + "rss": "0 */6 * * *", + "gdelt": "0 */6 * * *", + "edgar": "15 0 * * *", + "alphaVantage": "30 0 * * *", + "finnhub": "45 0 * * *", + "newsCrawler": "15 */12 * * *" + } +} diff --git a/docker-compose.yml b/docker-compose.yml new file 
mode 100644 index 0000000..0691091 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,16 @@ +services: + api: + build: + context: . + volumes: + - ./config.json:/app/config.json:ro + - ./data:/data + environment: + NODE_ENV: production + restart: unless-stopped + networks: + - nginx_proxy_manager_default + +networks: + nginx_proxy_manager_default: + external: true diff --git a/src/scheduler.js b/src/scheduler.js index 9f331f7..5b4abed 100644 --- a/src/scheduler.js +++ b/src/scheduler.js @@ -6,6 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt'); const { fetchEdgarArticles } = require('./sources/edgar'); const { fetchAlphaVantageArticles } = require('./sources/alphavantage'); const { fetchFinnhubArticles } = require('./sources/finnhub'); +const { fetchCrawlerArticles } = require('./sources/newsCrawler'); const { backfillMissingContent } = require('./content'); const { backfillMissingEmbeddings } = require('./embeddings'); @@ -27,6 +28,7 @@ async function runAllIngestions() { results.push(await runSource('edgar', fetchEdgarArticles)); results.push(await runSource('alphavantage', fetchAlphaVantageArticles)); results.push(await runSource('finnhub', fetchFinnhubArticles)); + results.push(await runSource('news_crawler', fetchCrawlerArticles)); try { await backfillMissingContent(); @@ -64,6 +66,12 @@ function startScheduler() { await runSource('finnhub', fetchFinnhubArticles); }); + if (config.scheduler.newsCrawler) { + cron.schedule(config.scheduler.newsCrawler, async () => { + await runSource('news_crawler', fetchCrawlerArticles); + }); + } + cron.schedule('0 * * * *', async () => { try { await backfillMissingContent();
diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js
new file mode 100644
index 0000000..4f79cc5
--- /dev/null
+++ b/src/sources/newsCrawler.js
@@ -0,0 +1,587 @@
+const config = require('../config');
+const { fetchWithPolicy } = require('../http');
+
+// Query-parameter names that canonicalizeUrl() strips from every URL.
+const TRACKING_PARAM_PATTERNS = [
+  /^utm_/i,
+  /^fbclid$/i,
+  /^gclid$/i,
+  /^mkt_tok$/i,
+  /^mc_cid$/i,
+  /^mc_eid$/i,
+  /^ref$/i,
+  /^ref_src$/i,
+  /^s$/i,
+  /^cmpid$/i,
+  /^guccounter$/i,
+  /^guce_referrer$/i,
+  /^guce_referrer_sig$/i,
+];
+// Path hints used by scorePage() to tell listings from articles.
+const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
+const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
+const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
+// Paths that shouldQueueLink() never follows.
+const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
+const USER_AGENT = 'duriin_api crawler/1.0';
+// Largest value String.fromCodePoint() accepts without throwing.
+const MAX_CODE_POINT = 0x10ffff;
+
+/**
+ * Decodes one numeric character reference. Values above MAX_CODE_POINT are
+ * returned as written so malformed markup cannot raise a RangeError.
+ */
+function decodeCodePoint(entity, digits, radix) {
+  const codePoint = Number.parseInt(digits, radix);
+  return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : entity;
+}
+
+/**
+ * Decodes numeric character references and a small set of named entities.
+ *
+ * @param {*} value - Raw text; falsy values are treated as ''.
+ * @returns {string} Text with the supported entities decoded once.
+ */
+function decodeHtmlEntities(value) {
+  return String(value || '')
+    .replace(/&#x([0-9a-f]+);/gi, (entity, hex) => decodeCodePoint(entity, hex, 16))
+    .replace(/&#(\d+);/g, (entity, dec) => decodeCodePoint(entity, dec, 10))
+    .replace(/&quot;/g, '"')
+    .replace(/&apos;/g, "'")
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&nbsp;/g, ' ')
+    // Decoded last so "&amp;lt;" becomes the text "&lt;" rather than "<".
+    .replace(/&amp;/g, '&');
+}
+
+/** Replaces every tag with a space, decodes entities and collapses whitespace. */
+function stripTags(value) {
+  return decodeHtmlEntities(String(value || '').replace(/<[^>]*>/g, ' ')).replace(/\s+/g, ' ').trim();
+}
+
+/** Plain-text form of an HTML fragment; stripTags() already collapses whitespace. */
+function normalizeText(value) {
+  return stripTags(value);
+}
+
+/**
+ * Checks whether a hostname equals, or is a subdomain of, an allowed host.
+ *
+ * @param {string} hostname - Host to test; compared case-insensitively.
+ * @param {string[]} allowedHosts - Accepted host names.
+ * @returns {boolean} True when the host is accepted.
+ */
+function isAllowedHost(hostname, allowedHosts) {
+  const normalized = String(hostname || '').toLowerCase();
+  return allowedHosts.some((allowedHost) => {
+    const candidate = String(allowedHost || '').toLowerCase();
+    return normalized === candidate || normalized.endsWith(`.${candidate}`);
+  });
+}
+
+/** True when a query-parameter name matches one of TRACKING_PARAM_PATTERNS. */
+function shouldDropParam(key) {
+  return TRACKING_PARAM_PATTERNS.some((pattern) => pattern.test(key));
+}
+
+/**
+ * Resolves a link against a base URL and reduces it to one canonical form:
+ * http(s) only, allowed hosts only, no fragment or credentials, tracking
+ * parameters removed, remaining parameters sorted by name, and trailing
+ * slashes trimmed from non-root paths.
+ *
+ * @param {string} rawUrl - Absolute or relative URL.
+ * @param {string} baseUrl - Base used to resolve relative URLs.
+ * @param {string[]} allowedHosts - Hosts accepted by isAllowedHost().
+ * @returns {string|null} Canonical URL, or null when rejected or unparsable.
+ */
+function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
+  try {
+    const url = new URL(rawUrl, baseUrl);
+
+    if (!['http:', 'https:'].includes(url.protocol)) {
+      return null;
+    }
+
+    if (!isAllowedHost(url.hostname, allowedHosts)) {
+      return null;
+    }
+
+    url.hash = '';
+    url.username = '';
+    url.password = '';
+
+    const params = [...url.searchParams.entries()]
+      .filter(([key]) => !shouldDropParam(key))
+      .sort(([left], [right]) => left.localeCompare(right));
+
+    url.search = '';
+    for (const [key, value] of params) {
+      url.searchParams.append(key, value);
+    }
+
+    if (url.pathname !== '/') {
+      url.pathname = url.pathname.replace(/\/+$/, '') || '/';
+    }
+
+    return url.toString();
+  } catch {
+    // new URL() throws on unparsable input; such links are reported as null.
+    return null;
+  }
+}
+
+/**
+ * Reads a quoted attribute value from the source text of a single tag.
+ *
+ * @param {string} tag - Source text of one tag.
+ * @param {string} name - Attribute name; interpolated into a RegExp unescaped.
+ * @returns {string} Decoded, trimmed value, or '' when the attribute is absent.
+ */
+function extractAttribute(tag, name) {
+  // The leading boundary keeps "name" from matching inside "data-name".
+  const match = tag.match(new RegExp(`(?:^|[\\s"'])${name}\\s*=\\s*(["'])(.*?)\\1`, 'i'));
+  return match ? decodeHtmlEntities(match[2]).trim() : '';
+}
+
+/**
+ * Collects <meta> tags into a Map keyed by lower-cased property (or name).
+ * Tags without a key or content are skipped; later duplicates overwrite earlier ones.
+ *
+ * @param {string} html - Page markup.
+ * @returns {Map<string, string>} Meta key to content value.
+ */
+function extractMetaMap(html) {
+  const metas = new Map();
+  const metaTags = html.match(/<meta\b[^>]*>/gi) || [];
+
+  for (const tag of metaTags) {
+    const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name');
+    const content = extractAttribute(tag, 'content');
+
+    if (!key || !content) {
+      continue;
+    }
+
+    metas.set(key.toLowerCase(), content);
+  }
+
+  return metas;
+}
+
+/** Returns the href of the first <link rel="canonical"> that has one, or null. */
+function extractCanonicalHref(html) {
+  const links = html.match(/<link\b[^>]*>/gi) || [];
+
+  for (const tag of links) {
+    const rel = extractAttribute(tag, 'rel').toLowerCase();
+    if (!rel || !rel.split(/\s+/).includes('canonical')) {
+      continue;
+    }
+
+    const href = extractAttribute(tag, 'href');
+    if (href) {
+      return href;
+    }
+  }
+
+  return null;
+}
+
+/** Plain text of the first <title> element, or null when there is none. */
+function extractTitleTag(html) {
+  const match = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
+  return match ? normalizeText(match[1]) : null;
+}
+
+/** Plain text of the first <h1> element, or null when there is none. */
+function extractH1(html) {
+  const match = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
+  return match ? normalizeText(match[1]) : null;
+}
+
+/** datetime attribute of the first <time> tag that carries one, or null. */
+function extractTimeDatetime(html) {
+  const match = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);
+  return match ? decodeHtmlEntities(match[2]).trim() : null;
+}
+
+/** Combined plain-text length of the first ten <p> elements. */
+function extractParagraphTextLength(html) {
+  const paragraphs = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];
+  return paragraphs.slice(0, 10).reduce((total, paragraph) => total + normalizeText(paragraph).length, 0);
+}
+
+/**
+ * Parses every <script type="application/ld+json"> block of a page.
+ * Blocks that are empty or not valid JSON are skipped.
+ *
+ * @param {string} html - Page markup.
+ * @returns {Array<*>} Parsed JSON values in document order.
+ */
+function extractJsonLdBlocks(html) {
+  const blocks = [];
+  const regex = /<script\b[^>]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi;
+  let match;
+
+  while ((match = regex.exec(html)) !== null) {
+    const raw = String(match[2] || '').trim();
+    if (!raw) {
+      continue;
+    }
+
+    try {
+      blocks.push(JSON.parse(raw));
+    } catch {
+      // Malformed JSON-LD is ignored rather than failing the whole page.
+      continue;
+    }
+  }
+
+  return blocks;
+}
+
+/**
+ * Calls visit() on every object nested anywhere inside value, parents before
+ * children. Arrays are traversed but are not passed to visit() themselves.
+ */
+function walkJson(value, visit) {
+  if (Array.isArray(value)) {
+    for (const item of value) {
+      walkJson(item, visit);
+    }
+    return;
+  }
+
+  if (!value || typeof value !== 'object') {
+    return;
+  }
+
+  visit(value);
+
+  for (const child of Object.values(value)) {
+    walkJson(child, visit);
+  }
+}
+
+/** True when a JSON-LD @type (string or array) names Article or NewsArticle. */
+function isArticleType(type) {
+  if (Array.isArray(type)) {
+    return type.some((entry) => isArticleType(entry));
+  }
+
+  return ['article', 'newsarticle'].includes(String(type || '').toLowerCase());
+}
+
+/** Returns the first JSON-LD object whose @type is an article type, or null. */
+function extractArticleJsonLd(html) {
+  const blocks = extractJsonLdBlocks(html);
+  let article = null;
+
+  for (const block of blocks) {
+    walkJson(block, (value) => {
+      if (!article && isArticleType(value['@type'])) {
+        article = value;
+      }
+    });
+
+    if (article) {
+      return article;
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Collects the distinct, canonicalized, allowed-host links of a page.
+ *
+ * @param {string} html - Page markup.
+ * @param {string} pageUrl - URL used to resolve relative hrefs.
+ * @param {string[]} allowedHosts - Hosts accepted by canonicalizeUrl().
+ * @returns {Array<{url: string, text: string}>} Links in document order.
+ */
+function extractLinks(html, pageUrl, allowedHosts) {
+  const links = [];
+  const seen = new Set();
+  const regex = /<a\b[^>]*href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi;
+  let match;
+
+  while ((match = regex.exec(html)) !== null) {
+    // href is attribute text, so entities such as "&amp;" are decoded before parsing.
+    const url = canonicalizeUrl(decodeHtmlEntities(match[2]).trim(), pageUrl, allowedHosts);
+    if (!url || seen.has(url)) {
+      continue;
+    }
+
+    const text = normalizeText(match[3]);
+    seen.add(url);
+    links.push({ url, text });
+  }
+
+  return links;
+}
+
+/** First non-blank of og:title, twitter:title, JSON-LD headline, <h1>, <title>; else null. */
+function selectTitle(meta, jsonLdArticle, html) {
+  return [
+    meta.get('og:title'),
+    meta.get('twitter:title'),
+    jsonLdArticle && jsonLdArticle.headline,
+    extractH1(html),
+    extractTitleTag(html),
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+/** First non-blank of og:description, description, JSON-LD description; else null. */
+function selectDescription(meta, jsonLdArticle) {
+  return [
+    meta.get('og:description'),
+    meta.get('description'),
+    jsonLdArticle && jsonLdArticle.description,
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+/** First non-blank of JSON-LD datePublished, published_time metas, <time datetime>; else null. */
+function selectPubDate(meta, jsonLdArticle, html) {
+  return [
+    jsonLdArticle && jsonLdArticle.datePublished,
+    meta.get('article:published_time'),
+    meta.get('og:article:published_time'),
+    extractTimeDatetime(html),
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+/**
+ * Scores how strongly a page looks like an article and like a listing.
+ *
+ * @param {string} pageUrl - Absolute URL of the page.
+ * @param {Map<string, string>} meta - Result of extractMetaMap().
+ * @param {string} html - Page markup.
+ * @param {object|null} jsonLdArticle - Result of extractArticleJsonLd().
+ * @param {Array<{url: string, text: string}>} links - Result of extractLinks().
+ * @returns {{articleScore: number, listingScore: number}} Heuristic scores.
+ */
+function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
+  let articleScore = 0;
+  let listingScore = 0;
+  const { pathname } = new URL(pageUrl);
+  const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
+
+  if (jsonLdArticle) {
+    articleScore += 3;
+  }
+
+  if (String(meta.get('og:type') || '').toLowerCase() === 'article') {
+    articleScore += 2;
+  }
+
+  if (meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) {
+    articleScore += 2;
+  }
+
+  // NOTE(review): the <article> and URL-path checks below were reconstructed; the
+  // ARTICLE_* constants had no other use in this module. Confirm both weights.
+  if (/<article\b/i.test(html)) {
+    articleScore += 1;
+  }
+
+  if (ARTICLE_DATE_PATH.test(pathname) || ARTICLE_PATH_HINT.test(pathname)) {
+    articleScore += 1;
+  }
+
+  if (extractParagraphTextLength(html) >= 500) {
+    articleScore += 1;
+  }
+
+  if (links.length >= 20) {
+    listingScore += 2;
+  }
+
+  if (headlineLinks >= 8) {
+    listingScore += 2;
+  }
+
+  if (LISTING_PATH_HINT.test(pathname)) {
+    listingScore += 1;
+  }
+
+  // Any article signal counts against treating the page as a listing.
+  if (articleScore > 0) {
+    listingScore -= 2;
+  }
+
+  return { articleScore, listingScore };
+}
+
+/** False for blocked paths and for URLs whose path ends in a media/document extension. */
+function shouldQueueLink(url) {
+  const pathname = new URL(url).pathname.toLowerCase();
+
+  if (BLOCKED_PATH_HINT.test(pathname)) {
+    return false;
+  }
+
+  return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
+}
+
+/**
+ * Validates and clamps one newsCrawler.sites entry.
+ * maxPages is limited to 1-500 (default 100), maxDepth to 0-5 (default 2)
+ * and requestTimeout to 1000-30000 (default 15000).
+ */
+function normalizeSite(site) {
+  const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))];
+  const seeds = [...new Set((site.seeds || [])
+    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
+    .filter(Boolean))];
+  // An explicit 0 means "seeds only"; only a missing or non-numeric value falls back to 2.
+  const requestedDepth = site.maxDepth == null || site.maxDepth === '' ? NaN : Number(site.maxDepth);
+
+  return {
+    name: String(site.name || '').trim(),
+    allowedHosts,
+    seeds,
+    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)),
+    maxDepth: Math.max(0, Math.min(Number.isFinite(requestedDepth) ? requestedDepth : 2, 5)),
+    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
+  };
+}
+
+/**
+ * Fetches a page and returns its body only for successful HTML/XHTML responses.
+ *
+ * @param {string} url - Page to fetch.
+ * @param {number} timeout - Passed to fetchWithPolicy() as its timeout option.
+ * @returns {Promise<string|null>} Body text, or null for non-OK or non-HTML responses.
+ */
+async function fetchHtml(url, timeout) {
+  const response = await fetchWithPolicy(url, {
+    timeout,
+    retries: 1,
+    headers: {
+      Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
+      'User-Agent': USER_AGENT,
+    },
+  });
+
+  if (!response.ok) {
+    return null;
+  }
+
+  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
+  if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
+    return null;
+  }
+
+  return response.text();
+}
+
+/**
+ * Breadth-first crawl of one site starting from its seed URLs.
+ * Stops after maxPages fetch attempts. Links are followed only from pages
+ * below maxDepth whose listingScore is at least 2.
+ *
+ * @param {object} site - One newsCrawler.sites entry; see normalizeSite().
+ * @returns {Promise<Array<object>>} Records with title, description, url, source, pubDate.
+ */
+async function crawlSite(site) {
+  const normalizedSite = normalizeSite(site);
+
+  if (!normalizedSite.name || !normalizedSite.allowedHosts.length || !normalizedSite.seeds.length) {
+    return [];
+  }
+
+  const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 }));
+  const queuedUrls = new Set(normalizedSite.seeds);
+  const visitedUrls = new Set();
+  const discoveredArticleUrls = new Set();
+  const articles = [];
+
+  // A read index replaces queue.shift(), which re-indexes the array on every call.
+  for (let head = 0; head < queue.length && visitedUrls.size < normalizedSite.maxPages; head += 1) {
+    const current = queue[head];
+
+    if (visitedUrls.has(current.url)) {
+      continue;
+    }
+
+    visitedUrls.add(current.url);
+
+    let html;
+    try {
+      html = await fetchHtml(current.url, normalizedSite.requestTimeout);
+    } catch (error) {
+      console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error);
+      continue;
+    }
+
+    if (!html) {
+      continue;
+    }
+
+    const meta = extractMetaMap(html);
+    const jsonLdArticle = extractArticleJsonLd(html);
+    const canonicalHref = extractCanonicalHref(html);
+    const canonicalUrl = canonicalHref
+      ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
+      : current.url;
+    const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
+    const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
+
+    if (articleScore >= 3 && !discoveredArticleUrls.has(canonicalUrl)) {
+      const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
+      if (title) {
+        discoveredArticleUrls.add(canonicalUrl);
+        articles.push({
+          title,
+          description: normalizeText(selectDescription(meta, jsonLdArticle)) || null,
+          url: canonicalUrl,
+          source: normalizedSite.name,
+          pubDate: selectPubDate(meta, jsonLdArticle, html),
+        });
+      }
+    }
+
+    if (current.depth >= normalizedSite.maxDepth || listingScore < 2) {
+      continue;
+    }
+
+    for (const link of links) {
+      if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) {
+        continue;
+      }
+
+      queuedUrls.add(link.url);
+      queue.push({ url: link.url, depth: current.depth + 1 });
+    }
+  }
+
+  return articles;
+}
+
+/**
+ * Crawls every site in config.newsCrawler.sites, one after another.
+ * A site whose crawl throws is logged and skipped; the others still run.
+ *
+ * @returns {Promise<Array<object>>} Article records from all sites, concatenated.
+ */
+async function fetchCrawlerArticles() {
+  const articles = [];
+
+  for (const site of config.newsCrawler?.sites || []) {
+    try {
+      articles.push(...await crawlSite(site));
+    } catch (error) {
+      console.error(`Crawler failed for ${site && site.name ? site.name : 'unknown_site'}`, error);
+    }
+  }
+
+  return articles;
+}
+
+module.exports = {
+  fetchCrawlerArticles,
+  canonicalizeUrl,
+};