const config = require('../config'); const { fetchWithPolicy } = require('../http'); const TRACKING_PARAM_PATTERNS = [ /^utm_/i, /^fbclid$/i, /^gclid$/i, /^mkt_tok$/i, /^mc_cid$/i, /^mc_eid$/i, /^ref$/i, /^ref_src$/i, /^s$/i, /^cmpid$/i, /^guccounter$/i, /^guce_referrer$/i, /^guce_referrer_sig$/i, ]; const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i; const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/; const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i; const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//; const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; function decodeHtmlEntities(value) { return String(value || '') .replace(/([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(parseInt(hex, 16))) .replace(/(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10))) .replace(/"/g, '"') .replace(/'/g, "'") .replace(/'/g, "'") .replace(/&/g, '&') .replace(/</g, '<') .replace(/>/g, '>') .replace(/ /g, ' '); } function stripTags(value) { return decodeHtmlEntities(String(value || '').replace(/<[^>]*>/g, ' ')).replace(/\s+/g, ' ').trim(); } function normalizeText(value) { return stripTags(value).replace(/\s+/g, ' ').trim(); } function isAllowedHost(hostname, allowedHosts) { const normalized = String(hostname || '').toLowerCase(); return allowedHosts.some((allowedHost) => { const candidate = String(allowedHost || '').toLowerCase(); return normalized === candidate || normalized.endsWith(`.${candidate}`); }); } function shouldDropParam(key) { return TRACKING_PARAM_PATTERNS.some((pattern) => pattern.test(key)); } function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) { try { const url = new URL(rawUrl, baseUrl); if (!['http:', 'https:'].includes(url.protocol)) { return null; } if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) { return null; } url.hash = ''; url.username = ''; url.password = ''; const params = [...url.searchParams.entries()] .filter(([key]) => !shouldDropParam(key)) .sort(([left], [right]) => left.localeCompare(right)); url.search = ''; for (const [key, value] of params) { url.searchParams.append(key, value); } if (url.pathname !== '/') { url.pathname = url.pathname.replace(/\/+$/, '') || '/'; } return url.toString(); } catch { return null; } } function extractAttribute(tag, name) { const match = tag.match(new RegExp(`${name}\\s*=\\s*(["'])(.*?)\\1`, 'i')); return match ? decodeHtmlEntities(match[2]).trim() : ''; } function extractMetaMap(html) { const metas = new Map(); const metaTags = html.match(/]*>/gi) || []; for (const tag of metaTags) { const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name'); const content = extractAttribute(tag, 'content'); if (!key || !content) { continue; } metas.set(key.toLowerCase(), content); } return metas; } function extractCanonicalHref(html) { const links = html.match(/]*>/gi) || []; for (const tag of links) { const rel = extractAttribute(tag, 'rel').toLowerCase(); if (!rel || !rel.split(/\s+/).includes('canonical')) { continue; } const href = extractAttribute(tag, 'href'); if (href) { return href; } } return null; } function extractTitleTag(html) { const match = html.match(/