const config = require('../config'); const { fetchWithPolicy } = require('../http'); const TRACKING_PARAM_PATTERNS = [ /^utm_/i, /^fbclid$/i, /^gclid$/i, /^mkt_tok$/i, /^mc_cid$/i, /^mc_eid$/i, /^ref$/i, /^ref_src$/i, /^s$/i, /^cmpid$/i, /^guccounter$/i, /^guce_referrer$/i, /^guce_referrer_sig$/i, ]; const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i; const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/; const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i; const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//; const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; function decodeHtmlEntities(value) { return String(value || '') .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(parseInt(hex, 16))) .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10))) .replace(/"/g, '"') .replace(/'/g, "'") .replace(/'/g, "'") .replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/ /g, ' '); } function stripTags(value) { return decodeHtmlEntities(String(value || '').replace(/<[^>]*>/g, ' ')).replace(/\s+/g, ' ').trim(); } function normalizeText(value) { return stripTags(value).replace(/\s+/g, ' ').trim(); } function isAllowedHost(hostname, allowedHosts) { const normalized = String(hostname || '').toLowerCase(); return allowedHosts.some((allowedHost) => { const candidate = String(allowedHost || '').toLowerCase(); return normalized === candidate || normalized.endsWith(`.${candidate}`); }); } function shouldDropParam(key) { return TRACKING_PARAM_PATTERNS.some((pattern) => pattern.test(key)); } function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) { try { const url = new URL(rawUrl, baseUrl); if (!['http:', 'https:'].includes(url.protocol)) { return null; } if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) { return null; } url.hash = ''; url.username = ''; url.password = ''; const params = [...url.searchParams.entries()] .filter(([key]) => !shouldDropParam(key)) .sort(([left], [right]) => left.localeCompare(right)); url.search = ''; for (const [key, value] of params) { url.searchParams.append(key, value); } if (url.pathname !== '/') { url.pathname = url.pathname.replace(/\/+$/, '') || '/'; } return url.toString(); } catch { return null; } } function extractAttribute(tag, name) { const match = tag.match(new RegExp(`${name}\\s*=\\s*(["'])(.*?)\\1`, 'i')); return match ? decodeHtmlEntities(match[2]).trim() : ''; } function extractMetaMap(html) { const metas = new Map(); const metaTags = html.match(/]*>/gi) || []; for (const tag of metaTags) { const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name'); const content = extractAttribute(tag, 'content'); if (!key || !content) { continue; } metas.set(key.toLowerCase(), content); } return metas; } function extractCanonicalHref(html) { const links = html.match(/]*>/gi) || []; for (const tag of links) { const rel = extractAttribute(tag, 'rel').toLowerCase(); if (!rel || !rel.split(/\s+/).includes('canonical')) { continue; } const href = extractAttribute(tag, 'href'); if (href) { return href; } } return null; } function extractTitleTag(html) { const match = html.match(/]*>([\s\S]*?)<\/title>/i); return match ? normalizeText(match[1]) : null; } function extractH1(html) { const match = html.match(/]*>([\s\S]*?)<\/h1>/i); return match ? normalizeText(match[1]) : null; } function extractTimeDatetime(html) { const match = html.match(/]*datetime\s*=\s*(["'])(.*?)\1/i); return match ? decodeHtmlEntities(match[2]).trim() : null; } function extractParagraphTextLength(html) { const paragraphs = html.match(/]*>[\s\S]*?<\/p>/gi) || []; return paragraphs.slice(0, 10).reduce((total, paragraph) => total + normalizeText(paragraph).length, 0); } function extractJsonLdBlocks(html) { const blocks = []; const regex = /]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi; let match; while ((match = regex.exec(html)) !== null) { const raw = String(match[2] || '').trim(); if (!raw) { continue; } try { blocks.push(JSON.parse(raw)); } catch { continue; } } return blocks; } function walkJson(value, visit) { if (Array.isArray(value)) { for (const item of value) { walkJson(item, visit); } return; } if (!value || typeof value !== 'object') { return; } visit(value); for (const child of Object.values(value)) { walkJson(child, visit); } } function isArticleType(type) { if (Array.isArray(type)) { return type.some((entry) => isArticleType(entry)); } return ['article', 'newsarticle'].includes(String(type || '').toLowerCase()); } function extractArticleJsonLd(html) { const blocks = extractJsonLdBlocks(html); let article = null; for (const block of blocks) { walkJson(block, (value) => { if (!article && isArticleType(value['@type'])) { article = value; } }); if (article) { return article; } } return null; } function extractLinks(html, pageUrl, allowedHosts) { const links = []; const seen = new Set(); const regex = /]*href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi; let match; while ((match = regex.exec(html)) !== null) { const url = canonicalizeUrl(match[2], pageUrl, allowedHosts); if (!url || seen.has(url)) { continue; } const text = normalizeText(match[3]); seen.add(url); links.push({ url, text }); } return links; } function selectTitle(meta, jsonLdArticle, html) { return [ meta.get('og:title'), meta.get('twitter:title'), jsonLdArticle && jsonLdArticle.headline, extractH1(html), extractTitleTag(html), ].find((value) => String(value || '').trim()) || null; } function selectDescription(meta, jsonLdArticle) { return [ meta.get('og:description'), meta.get('description'), jsonLdArticle && jsonLdArticle.description, ].find((value) => String(value || '').trim()) || null; } function selectPubDate(meta, jsonLdArticle, html) { return [ jsonLdArticle && jsonLdArticle.datePublished, meta.get('article:published_time'), meta.get('og:article:published_time'), extractTimeDatetime(html), ].find((value) => String(value || '').trim()) || null; } function scorePage(pageUrl, meta, html, jsonLdArticle, links) { let articleScore = 0; let listingScore = 0; const pathname = new URL(pageUrl).pathname; const hasArticleDatePath = ARTICLE_DATE_PATH.test(pageUrl); const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl); const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname); const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname); const paragraphTextLength = extractParagraphTextLength(html); const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length; if (jsonLdArticle) { articleScore += 4; } if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) { articleScore += 1; } if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) { articleScore += 1; } if (/= 500) { articleScore += 2; } if (links.length >= 20) { listingScore += 2; } if (headlineLinks >= 8) { listingScore += 2; } if (LISTING_PATH_HINT.test(pathname)) { listingScore += 1; } if (hasListingFalsePositivePath) { listingScore += 3; } if (articleScore > 0) { listingScore -= 1; } const isArticleCandidate = articleScore >= 4 && articleScore > listingScore && (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500); return { articleScore, listingScore, isArticleCandidate }; } function shouldQueueLink(url) { const pathname = new URL(url).pathname.toLowerCase(); if (BLOCKED_PATH_HINT.test(pathname)) { return false; } return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname); } function slugifyLabel(label) { return String(label || '') .toLowerCase() .replace(/[^a-z0-9]+/g, '_') .replace(/^_+|_+$/g, ''); } function unique(values) { return [...new Set(values.filter(Boolean))]; } function buildAllowedHosts(hostname) { if (!hostname) { return []; } const hosts = [hostname.toLowerCase()]; if (hostname.startsWith('www.')) { hosts.push(hostname.slice(4).toLowerCase()); } else { hosts.push(`www.${hostname}`.toLowerCase()); } return unique(hosts); } function cleanFeedPath(pathname) { const withoutIndex = pathname .replace(/\/index\.[a-z0-9]+$/i, '/') .replace(/\.[a-z0-9]+$/i, '') .replace(/\/rss(?:$|\/.*$)/i, '/') .replace(/\/feed(?:$|\/.*$)/i, '/') .replace(/\/feeds?(?:$|\/.*$)/i, '/') .replace(/\/xml(?:$|\/.*$)/i, '/') .replace(/\/arc\/outboundfeeds\//i, '/') .replace(/\/dynamo\//i, '/') .replace(/\/id\/\d+\/device\/rss\//i, '/') .replace(/\/contentexport\//i, '/') .replace(/\/rssfeedstopstories$/i, '/') .replace(/\/latest$/i, '/') .replace(/\/+$|^$/g, ''); if (!withoutIndex) { return '/'; } const segments = withoutIndex .split('/') .filter(Boolean) .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment)) .slice(0, 3); if (!segments.length) { return '/'; } return `/${segments.join('/')}`; } function buildDefaultSeeds(feedUrl) { try { const parsed = new URL(feedUrl); const origin = `${parsed.protocol}//${parsed.hostname}`; const cleanedPath = cleanFeedPath(parsed.pathname); return unique([ canonicalizeUrl(origin, origin), cleanedPath === '/' ? null : canonicalizeUrl(`${origin}${cleanedPath}`, origin), ]); } catch { return []; } } function normalizeLimit(value, fallback, minimum, maximum) { const numeric = Number(value); if (numeric === -1) { return Number.POSITIVE_INFINITY; } if (!Number.isFinite(numeric)) { return fallback; } return Math.max(minimum, Math.min(numeric, maximum)); } function normalizeSite(site) { const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean)); const seeds = unique((site.seeds || []) .map((seed) => canonicalizeUrl(seed, seed, allowedHosts)) .filter(Boolean)); return { name: String(site.name || '').trim(), label: String(site.label || '').trim(), allowedHosts, seeds, maxPages: normalizeLimit(site.maxPages, 15, 1, 500), maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5), requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)), }; } function getCrawlerSiteOverrides(label) { return config.newsCrawler?.overrides?.[label] || null; } function getConfiguredCrawlerSites() { const defaults = config.newsCrawler || {}; const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim())); const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site)); const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean)); const derivedSites = []; for (const feed of config.rssFeeds || []) { const label = String(feed.label || '').trim(); if (!label || disabledLabels.has(label) || explicitLabels.has(label)) { continue; } let hostname = ''; try { hostname = new URL(feed.url).hostname; } catch { continue; } const override = getCrawlerSiteOverrides(label) || {}; const site = normalizeSite({ label, name: override.name || `crawler_${slugifyLabel(label)}`, allowedHosts: override.allowedHosts || buildAllowedHosts(hostname), seeds: override.seeds || buildDefaultSeeds(feed.url), maxPages: override.maxPages || defaults.maxPages, maxDepth: override.maxDepth || defaults.maxDepth, requestTimeout: override.requestTimeout || defaults.requestTimeout, }); if (site.name && site.allowedHosts.length && site.seeds.length) { derivedSites.push(site); } } return [...explicitSites.filter((site) => site.name && site.allowedHosts.length && site.seeds.length), ...derivedSites]; } async function fetchHtml(url, timeout) { const response = await fetchWithPolicy(url, { timeout, retries: 1, }); if (!response.ok) { return null; } const contentType = String(response.headers.get('content-type') || '').toLowerCase(); if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) { return null; } return response.text(); } async function crawlSite(site) { const normalizedSite = normalizeSite(site); if (!normalizedSite.name || !normalizedSite.allowedHosts.length || !normalizedSite.seeds.length) { return []; } const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 })); const queuedUrls = new Set(normalizedSite.seeds); const visitedUrls = new Set(); const discoveredArticleUrls = new Set(); const articles = []; while (queue.length && visitedUrls.size < normalizedSite.maxPages) { const current = queue.shift(); if (!current || visitedUrls.has(current.url)) { continue; } visitedUrls.add(current.url); let html; try { html = await fetchHtml(current.url, normalizedSite.requestTimeout); } catch (error) { console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error); continue; } if (!html) { continue; } const meta = extractMetaMap(html); const jsonLdArticle = extractArticleJsonLd(html); const canonicalHref = extractCanonicalHref(html); const canonicalUrl = canonicalHref ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url : current.url; const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts); const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links); if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) { const title = normalizeText(selectTitle(meta, jsonLdArticle, html)); if (title) { discoveredArticleUrls.add(canonicalUrl); articles.push({ title, description: normalizeText(selectDescription(meta, jsonLdArticle)) || null, url: canonicalUrl, source: normalizedSite.name, pubDate: selectPubDate(meta, jsonLdArticle, html), }); } } if (current.depth >= normalizedSite.maxDepth || listingScore < 2) { continue; } for (const link of links) { if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) { continue; } queuedUrls.add(link.url); queue.push({ url: link.url, depth: current.depth + 1 }); } } return articles; } async function fetchCrawlerArticles() { const articles = []; for (const site of getConfiguredCrawlerSites()) { try { articles.push(...await crawlSite(site)); } catch (error) { console.error(`Crawler failed for ${site && site.name ? site.name : 'unknown_site'}`, error); } } return articles; } module.exports = { fetchCrawlerArticles, crawlSite, canonicalizeUrl, getConfiguredCrawlerSites, };