// Duriin-API/src/sources/newsCrawler.js

const config = require('../config');
const { fetchWithPolicy } = require('../http');

// Query-parameter name patterns that canonicalizeUrl removes (via shouldDropParam)
// so URLs differing only in these parameters canonicalize to the same string.
// NOTE(review): `s` and `ref` are also used as functional parameters on some
// sites (e.g. `s` as a search query) — confirm dropping them is intended.
const TRACKING_PARAM_PATTERNS = [
  /^utm_/i,
  /^fbclid$/i,
  /^gclid$/i,
  /^mkt_tok$/i,
  /^mc_cid$/i,
  /^mc_eid$/i,
  /^ref$/i,
  /^ref_src$/i,
  /^s$/i,
  /^cmpid$/i,
  /^guccounter$/i,
  /^guce_referrer$/i,
  /^guce_referrer_sig$/i,
];
// Keywords that, when found anywhere in the pathname, add to the listing score in scorePage.
const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
// Matches a /YYYY/MM/DD or /YYYY/MM run followed by "/" or end of input; an article signal in scorePage.
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
// Path segments that scorePage counts as an article signal.
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
// Stricter /YYYY/MM/DD/ form (trailing slash required); one of the gates for article candidacy in scorePage.
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
// Listing-style paths; scorePage uses a hit to ignore og:type / published-time signals and to add listing score.
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
// Paths that shouldQueueLink refuses to enqueue.
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;

/**
 * Decodes the small set of HTML entities this module cares about.
 *
 * Supports hexadecimal (`&#x41;`) and decimal (`&#65;`) numeric references and
 * the named entities `&quot;`, `&apos;`, `&amp;`, `&lt;`, `&gt;` and `&nbsp;`
 * (decoded to a regular space). Decoding happens in a single pass, so the
 * output of one replacement is never decoded a second time: `&amp;lt;`
 * yields the text `&lt;`, not `<`.
 *
 * @param {*} value - Raw text; falsy values are treated as an empty string.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(value) {
  const namedEntities = {
    quot: '"',
    apos: "'",
    amp: '&',
    lt: '<',
    gt: '>',
    nbsp: ' ',
  };

  return String(value || '').replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(quot|apos|amp|lt|gt|nbsp));/g,
    (entity, hex, dec, name) => {
      if (name !== undefined) {
        return namedEntities[name];
      }

      const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);

      // String.fromCodePoint throws a RangeError above U+10FFFF; malformed
      // references in scraped HTML are left untouched instead of crashing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}

/**
 * Removes markup from an HTML fragment and returns its plain text.
 *
 * Each tag is replaced by a space, entities are decoded afterwards, and runs
 * of whitespace are collapsed to a single space before trimming.
 *
 * @param {*} value - HTML fragment; falsy values are treated as an empty string.
 * @returns {string} Plain, whitespace-normalized text.
 */
function stripTags(value) {
  const withoutMarkup = String(value || '').replace(/<[^>]*>/g, ' ');
  const decoded = decodeHtmlEntities(withoutMarkup);

  return decoded.replace(/\s+/g, ' ').trim();
}

/**
 * Converts an HTML fragment (or plain value) to single-spaced, trimmed text.
 *
 * @param {*} value - HTML fragment or text.
 * @returns {string} Tag-free text with collapsed whitespace.
 */
function normalizeText(value) {
  const plainText = stripTags(value);
  const singleSpaced = plainText.replace(/\s+/g, ' ');

  return singleSpaced.trim();
}

/**
 * Tells whether a hostname equals, or is a subdomain of, any allowed host.
 * The comparison is case-insensitive.
 *
 * @param {string} hostname - Hostname to check.
 * @param {string[]} allowedHosts - Hosts to match against.
 * @returns {boolean} True when the hostname matches an allowed host.
 */
function isAllowedHost(hostname, allowedHosts) {
  const host = String(hostname || '').toLowerCase();

  const matchesHost = (entry) => {
    const allowed = String(entry || '').toLowerCase();
    if (host === allowed) {
      return true;
    }

    // The leading dot ensures "badexample.com" does not match "example.com".
    return host.endsWith(`.${allowed}`);
  };

  return allowedHosts.some(matchesHost);
}

/**
 * Tells whether a query-parameter name matches any entry of TRACKING_PARAM_PATTERNS.
 *
 * @param {string} key - Query-parameter name.
 * @returns {boolean} True when the parameter should be removed.
 */
function shouldDropParam(key) {
  for (const pattern of TRACKING_PARAM_PATTERNS) {
    if (pattern.test(key)) {
      return true;
    }
  }

  return false;
}

/**
 * Resolves and normalizes a URL so equivalent links compare equal.
 *
 * The fragment and credentials are removed, parameters rejected by
 * shouldDropParam are dropped, the remaining parameters are sorted by name,
 * and trailing slashes are trimmed from any non-root path.
 *
 * @param {string} rawUrl - Absolute or relative URL.
 * @param {string} [baseUrl] - Base used to resolve relative URLs.
 * @param {string[]} [allowedHosts] - When non-empty, the host must match one of these.
 * @returns {?string} The canonical URL, or null when the URL is unparsable,
 *   is not http(s), or is on a host that is not allowed.
 */
function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
  try {
    const parsed = new URL(rawUrl, baseUrl);

    const isWebScheme = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    if (!isWebScheme) {
      return null;
    }

    const restrictHosts = Boolean(allowedHosts && allowedHosts.length);
    if (restrictHosts && !isAllowedHost(parsed.hostname, allowedHosts)) {
      return null;
    }

    parsed.hash = '';
    parsed.username = '';
    parsed.password = '';

    const keptParams = [];
    for (const [key, value] of parsed.searchParams) {
      if (!shouldDropParam(key)) {
        keptParams.push([key, value]);
      }
    }
    keptParams.sort((a, b) => a[0].localeCompare(b[0]));

    // Rebuild the query string from scratch in sorted order.
    parsed.search = '';
    keptParams.forEach(([key, value]) => {
      parsed.searchParams.append(key, value);
    });

    if (parsed.pathname !== '/') {
      const trimmedPath = parsed.pathname.replace(/\/+$/, '');
      parsed.pathname = trimmedPath || '/';
    }

    return parsed.toString();
  } catch {
    // Any failure (invalid URL, bad arguments) means "no usable URL".
    return null;
  }
}

/**
 * Reads a quoted attribute value out of a single HTML tag string.
 *
 * The attribute name is matched case-insensitively and must not be the tail
 * of a longer name, so asking for `href` does not pick up `data-href`.
 * Unquoted attribute values are not supported.
 *
 * @param {string} tag - Source text of one tag, e.g. `<meta name="x" content="y">`.
 * @param {string} name - Attribute name to look up.
 * @returns {string} The entity-decoded, trimmed value, or '' when absent.
 */
function extractAttribute(tag, name) {
  // Escape regex metacharacters so the name is always matched literally.
  const escapedName = String(name).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // The lookbehind rejects matches preceded by a character that can belong to
  // an attribute name (letters, digits, "_", ":", ".", "-").
  const pattern = new RegExp(`(?<![\\w:.-])${escapedName}\\s*=\\s*(["'])(.*?)\\1`, 'i');
  const match = tag.match(pattern);

  return match ? decodeHtmlEntities(match[2]).trim() : '';
}

/**
 * Collects `<meta>` tags into a map keyed by lower-cased `property` (preferred)
 * or `name`, with the tag's `content` as the value.
 *
 * Tags lacking a key or content are skipped; a later tag with the same key
 * overwrites an earlier one.
 *
 * @param {string} html - Page HTML.
 * @returns {Map<string, string>} Meta key to content.
 */
function extractMetaMap(html) {
  const collected = new Map();
  const metaTags = html.match(/<meta\b[^>]*>/gi) || [];

  for (const metaTag of metaTags) {
    const key = extractAttribute(metaTag, 'property') || extractAttribute(metaTag, 'name');
    const content = extractAttribute(metaTag, 'content');

    if (key && content) {
      collected.set(key.toLowerCase(), content);
    }
  }

  return collected;
}

/**
 * Finds the `href` of the first `<link>` tag whose `rel` token list contains
 * `canonical` and that has a non-empty href.
 *
 * @param {string} html - Page HTML.
 * @returns {?string} The raw (unresolved) href, or null when none is found.
 */
function extractCanonicalHref(html) {
  const linkTags = html.match(/<link\b[^>]*>/gi) || [];

  for (const linkTag of linkTags) {
    const relTokens = extractAttribute(linkTag, 'rel').toLowerCase().split(/\s+/);

    if (relTokens.includes('canonical')) {
      const href = extractAttribute(linkTag, 'href');
      if (href) {
        return href;
      }
    }
  }

  return null;
}

/**
 * Returns the normalized text of the first `<title>` element.
 *
 * @param {string} html - Page HTML.
 * @returns {?string} Title text, or null when there is no title element.
 */
function extractTitleTag(html) {
  const found = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (!found) {
    return null;
  }

  return normalizeText(found[1]);
}

/**
 * Returns the normalized text of the first `<h1>` element.
 *
 * @param {string} html - Page HTML.
 * @returns {?string} Heading text, or null when there is no `<h1>`.
 */
function extractH1(html) {
  const found = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
  if (!found) {
    return null;
  }

  return normalizeText(found[1]);
}

/**
 * Returns the quoted `datetime` attribute of the first `<time>` tag that has one.
 *
 * @param {string} html - Page HTML.
 * @returns {?string} The decoded, trimmed attribute value, or null when absent.
 */
function extractTimeDatetime(html) {
  const found = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);
  if (!found) {
    return null;
  }

  const rawValue = found[2];
  return decodeHtmlEntities(rawValue).trim();
}

/**
 * Sums the normalized text length of the first ten `<p>` elements.
 *
 * @param {string} html - Page HTML.
 * @returns {number} Total character count across those paragraphs.
 */
function extractParagraphTextLength(html) {
  const paragraphs = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];
  let totalLength = 0;

  for (const paragraph of paragraphs.slice(0, 10)) {
    totalLength += normalizeText(paragraph).length;
  }

  return totalLength;
}

/**
 * Parses every `<script type="application/ld+json">` block in the page.
 *
 * Empty blocks and blocks that are not valid JSON are skipped.
 *
 * @param {string} html - Page HTML.
 * @returns {Array<*>} Parsed JSON values in document order.
 */
function extractJsonLdBlocks(html) {
  const scriptPattern = /<script\b[^>]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi;
  const parsedBlocks = [];

  for (const [, , body] of String(html).matchAll(scriptPattern)) {
    const source = String(body || '').trim();

    if (source) {
      try {
        parsedBlocks.push(JSON.parse(source));
      } catch {
        // Malformed JSON-LD is common in the wild; ignore the block.
      }
    }
  }

  return parsedBlocks;
}

/**
 * Walks a parsed JSON value depth-first, calling `visit` on every non-array
 * object before descending into its property values. Arrays are traversed
 * but not passed to `visit`; primitives and null are ignored.
 *
 * @param {*} value - Parsed JSON value.
 * @param {function(Object): void} visit - Called once per object node.
 * @returns {void}
 */
function walkJson(value, visit) {
  if (Array.isArray(value)) {
    value.forEach((item) => walkJson(item, visit));
    return;
  }

  if (value === null || typeof value !== 'object') {
    return;
  }

  visit(value);
  Object.values(value).forEach((child) => walkJson(child, visit));
}

/**
 * Tells whether a JSON-LD `@type` value denotes `Article` or `NewsArticle`
 * (case-insensitive). Arrays match when any entry, at any nesting level, matches.
 *
 * @param {*} type - The `@type` value: a string, an array, or anything else.
 * @returns {boolean} True for an article type.
 */
function isArticleType(type) {
  if (!Array.isArray(type)) {
    const normalized = String(type || '').toLowerCase();
    return normalized === 'article' || normalized === 'newsarticle';
  }

  return type.some((entry) => isArticleType(entry));
}

/**
 * Finds the first JSON-LD node on the page whose `@type` is an article type.
 *
 * Blocks are searched in document order; within a block the first matching
 * node in walkJson's traversal order wins.
 *
 * @param {string} html - Page HTML.
 * @returns {?Object} The matching JSON-LD object, or null when none exists.
 */
function extractArticleJsonLd(html) {
  for (const block of extractJsonLdBlocks(html)) {
    let found = null;

    walkJson(block, (node) => {
      if (found === null && isArticleType(node['@type'])) {
        found = node;
      }
    });

    if (found !== null) {
      return found;
    }
  }

  return null;
}

/**
 * Extracts anchors with a quoted `href`, canonicalizes their targets against
 * the page URL and returns one entry per distinct canonical URL (the first
 * occurrence wins).
 *
 * @param {string} html - Page HTML.
 * @param {string} pageUrl - Base URL used to resolve relative hrefs.
 * @param {string[]} allowedHosts - Host allow-list passed to canonicalizeUrl.
 * @returns {Array<{url: string, text: string}>} Links in document order.
 */
function extractLinks(html, pageUrl, allowedHosts) {
  const anchorPattern = /<a\b[^>]*href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi;
  const linksByUrl = new Map();

  for (const [, , href, innerHtml] of String(html).matchAll(anchorPattern)) {
    const url = canonicalizeUrl(href, pageUrl, allowedHosts);

    if (url && !linksByUrl.has(url)) {
      linksByUrl.set(url, { url, text: normalizeText(innerHtml) });
    }
  }

  // Map iteration follows insertion order, i.e. first appearance in the page.
  return [...linksByUrl.values()];
}

/**
 * Picks the page title from the first non-blank candidate, in priority order:
 * `og:title`, `twitter:title`, JSON-LD `headline`, first `<h1>`, `<title>`.
 *
 * @param {Map<string, string>} meta - Meta map of the page.
 * @param {?Object} jsonLdArticle - Article JSON-LD node, if any.
 * @param {string} html - Page HTML.
 * @returns {*} The chosen candidate as found (not normalized), or null.
 */
function selectTitle(meta, jsonLdArticle, html) {
  const candidates = [
    meta.get('og:title'),
    meta.get('twitter:title'),
    jsonLdArticle ? jsonLdArticle.headline : undefined,
    extractH1(html),
    extractTitleTag(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}

/**
 * Picks the page description from the first non-blank candidate, in priority
 * order: `og:description`, `description`, JSON-LD `description`.
 *
 * @param {Map<string, string>} meta - Meta map of the page.
 * @param {?Object} jsonLdArticle - Article JSON-LD node, if any.
 * @returns {*} The chosen candidate as found (not normalized), or null.
 */
function selectDescription(meta, jsonLdArticle) {
  const candidates = [
    meta.get('og:description'),
    meta.get('description'),
    jsonLdArticle ? jsonLdArticle.description : undefined,
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}

/**
 * Picks the publication date from the first non-blank candidate, in priority
 * order: JSON-LD `datePublished`, `article:published_time`,
 * `og:article:published_time`, first `<time datetime>`.
 *
 * @param {Map<string, string>} meta - Meta map of the page.
 * @param {?Object} jsonLdArticle - Article JSON-LD node, if any.
 * @param {string} html - Page HTML.
 * @returns {*} The chosen value as found (not parsed or validated), or null.
 */
function selectPubDate(meta, jsonLdArticle, html) {
  const candidates = [
    jsonLdArticle ? jsonLdArticle.datePublished : undefined,
    meta.get('article:published_time'),
    meta.get('og:article:published_time'),
    extractTimeDatetime(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}

/**
 * Scores a fetched page as "article" versus "listing" using URL shape,
 * metadata and markup heuristics.
 *
 * @param {string} pageUrl - Absolute page URL (parsed with `new URL`).
 * @param {Map<string, string>} meta - Meta map; og:type and published-time keys are read.
 * @param {string} html - Page HTML.
 * @param {?Object} jsonLdArticle - Article JSON-LD node, or a falsy value when none was found.
 * @param {Array<{url: string, text: string}>} links - Links on the page; only their count and `text` length are used.
 * @returns {{articleScore: number, listingScore: number, isArticleCandidate: boolean}}
 *   The two scores and the final article verdict.
 */
function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
  let articleScore = 0;
  let listingScore = 0;
  const pathname = new URL(pageUrl).pathname;
  // NOTE(review): these two patterns are tested against the full URL (query
  // string included) while the others below use only the pathname — confirm intended.
  const hasArticleDatePath = ARTICLE_DATE_PATH.test(pageUrl);
  const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl);
  const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
  const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
  const paragraphTextLength = extractParagraphTextLength(html);
  // Links whose anchor text is 25-180 characters long are counted as headline-like.
  const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;

  // Article JSON-LD is the strongest single signal: it alone reaches the >= 4 threshold below.
  if (jsonLdArticle) {
    articleScore += 4;
  }

  // og:type and published-time metadata are ignored on listing-style paths.
  if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
    articleScore += 1;
  }

  if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) {
    articleScore += 1;
  }

  if (/<article\b/i.test(html)) {
    articleScore += 1;
  }

  if (hasArticleDatePath || hasArticlePathHint) {
    articleScore += 2;
  }

  // An <h1> plus at least 500 characters of paragraph text (first ten <p> elements).
  if (extractH1(html) && paragraphTextLength >= 500) {
    articleScore += 2;
  }

  if (links.length >= 20) {
    listingScore += 2;
  }

  if (headlineLinks >= 8) {
    listingScore += 2;
  }

  if (LISTING_PATH_HINT.test(pathname)) {
    listingScore += 1;
  }

  if (hasListingFalsePositivePath) {
    listingScore += 3;
  }

  // Any article evidence lowers the listing score by one (it may go negative).
  if (articleScore > 0) {
    listingScore -= 1;
  }

  // A candidate needs a score of at least 4, must outscore the listing side,
  // and must have at least one strong structural signal.
  const isArticleCandidate = articleScore >= 4
    && articleScore > listingScore
    && (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);

  return { articleScore, listingScore, isArticleCandidate };
}

/**
 * Decides whether a discovered link is worth crawling.
 *
 * Links are rejected when their path matches BLOCKED_PATH_HINT or ends in one
 * of the listed image, document, archive or media file extensions.
 *
 * @param {string} url - Absolute URL (must be parsable by `new URL`).
 * @returns {boolean} True when the link may be queued.
 */
function shouldQueueLink(url) {
  const { pathname } = new URL(url);
  const lowerPath = pathname.toLowerCase();

  if (BLOCKED_PATH_HINT.test(lowerPath)) {
    return false;
  }

  const isStaticAsset = /\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(lowerPath);
  return !isStaticAsset;
}

/**
 * Turns a label into a lower-case slug made of `a-z0-9` tokens joined by
 * underscores, with no leading or trailing underscore.
 *
 * @param {*} label - Label text; falsy values give an empty slug.
 * @returns {string} The slug.
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();

  // Splitting on runs of non-alphanumerics leaves empty tokens only at the
  // ends; dropping them is equivalent to trimming the underscores.
  const tokens = lowered.split(/[^a-z0-9]+/).filter((token) => token !== '');

  return tokens.join('_');
}

/**
 * Returns the distinct truthy values of a list, keeping first-seen order.
 *
 * @param {Array<*>} values - Input values.
 * @returns {Array<*>} De-duplicated values with falsy entries removed.
 */
function unique(values) {
  const seen = new Set();

  for (const value of values) {
    if (value) {
      seen.add(value);
    }
  }

  return Array.from(seen);
}

/**
 * Builds the host allow-list for a site: the hostname itself plus its
 * `www.` counterpart (prefix added when missing, removed when present).
 *
 * The hostname is lower-cased before the `www.` check so a mixed-case input
 * such as `WWW.Example.com` pairs with `example.com` instead of
 * `www.www.example.com`.
 *
 * @param {string} hostname - Site hostname; falsy values give an empty list.
 * @returns {string[]} Lower-cased, de-duplicated hosts, with the given host first.
 */
function buildAllowedHosts(hostname) {
  if (!hostname) {
    return [];
  }

  const host = String(hostname).toLowerCase();
  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  // Drop an empty counterpart (input was exactly "www.") and de-duplicate.
  return [...new Set([host, counterpart].filter(Boolean))];
}

/**
 * Derives a site-section path from a feed URL path by removing feed-specific
 * parts (index files, file extensions, rss/feed/xml segments and a few
 * vendor-specific prefixes), then keeping at most the first three segments.
 *
 * @param {string} pathname - Path component of a feed URL.
 * @returns {string} The cleaned path, or '/' when nothing remains.
 */
function cleanFeedPath(pathname) {
  // Applied in order; each rule sees the output of the previous one.
  const rewrites = [
    [/\/index\.[a-z0-9]+$/i, '/'],
    [/\.[a-z0-9]+$/i, ''],
    [/\/rss(?:$|\/.*$)/i, '/'],
    [/\/feed(?:$|\/.*$)/i, '/'],
    [/\/feeds?(?:$|\/.*$)/i, '/'],
    [/\/xml(?:$|\/.*$)/i, '/'],
    [/\/arc\/outboundfeeds\//i, '/'],
    [/\/dynamo\//i, '/'],
    [/\/id\/\d+\/device\/rss\//i, '/'],
    [/\/contentexport\//i, '/'],
    [/\/rssfeedstopstories$/i, '/'],
    [/\/latest$/i, '/'],
    [/\/+$|^$/g, ''],
  ];

  let stripped = pathname;
  for (const [pattern, replacement] of rewrites) {
    stripped = stripped.replace(pattern, replacement);
  }

  const keptSegments = stripped
    .split('/')
    .filter((segment) => segment && !/^(rss|feed|feeds|xml)$/i.test(segment))
    .slice(0, 3);

  return keptSegments.length ? `/${keptSegments.join('/')}` : '/';
}

/**
 * Derives default crawl seeds from a feed URL: the site root and, when the
 * cleaned feed path is not just '/', that section path on the same site.
 *
 * The origin is built from `host` rather than `hostname` so a non-default
 * port in the feed URL is kept; otherwise the seeds would point at a
 * different server than the feed.
 *
 * @param {string} feedUrl - Absolute feed URL.
 * @returns {string[]} Distinct canonical seed URLs; empty when the feed URL
 *   cannot be parsed or is not http(s).
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const parsed = new URL(feedUrl);
    // `host` includes the port when it is not the scheme default.
    const origin = `${parsed.protocol}//${parsed.host}`;
    const cleanedPath = cleanFeedPath(parsed.pathname);

    return unique([
      canonicalizeUrl(origin, origin),
      cleanedPath === '/' ? null : canonicalizeUrl(`${origin}${cleanedPath}`, origin),
    ]);
  } catch {
    // An unparsable feed URL simply yields no seeds.
    return [];
  }
}

/**
 * Normalizes a numeric limit from configuration.
 *
 * - `null`, `undefined` and blank strings count as "not set" and give `fallback`
 *   (without this check `Number(null)` and `Number('')` would silently become 0).
 * - `-1` means "unlimited" and gives `Infinity`.
 * - Other non-finite or non-numeric values give `fallback`.
 * - Anything else is clamped to `[minimum, maximum]`.
 *
 * @param {*} value - Configured value (number or numeric string).
 * @param {number} fallback - Value used when `value` is missing or invalid.
 * @param {number} minimum - Lower clamp bound.
 * @param {number} maximum - Upper clamp bound.
 * @returns {number} The normalized limit.
 */
function normalizeLimit(value, fallback, minimum, maximum) {
  const isUnset = value == null || (typeof value === 'string' && value.trim() === '');
  if (isUnset) {
    return fallback;
  }

  const numeric = Number(value);

  if (numeric === -1) {
    return Number.POSITIVE_INFINITY;
  }

  if (!Number.isFinite(numeric)) {
    return fallback;
  }

  return Math.max(minimum, Math.min(numeric, maximum));
}

/**
 * Produces a validated crawler site definition from loosely-typed config.
 *
 * Hosts are lower-cased and de-duplicated; seeds are canonicalized against the
 * allow-list and invalid ones dropped; limits are clamped (maxPages 1-500 with
 * default 15, maxDepth 0-5 with default 1, requestTimeout 1000-30000 ms with
 * default 15000).
 *
 * @param {Object} site - Raw site configuration.
 * @returns {{name: string, label: string, allowedHosts: string[], seeds: string[],
 *   maxPages: number, maxDepth: number, requestTimeout: number}} Normalized site.
 */
function normalizeSite(site) {
  const rawHosts = site.allowedHosts || [];
  const lowerCasedHosts = rawHosts
    .map((host) => String(host || '').toLowerCase())
    .filter(Boolean);
  const allowedHosts = unique(lowerCasedHosts);

  const rawSeeds = site.seeds || [];
  const canonicalSeeds = rawSeeds
    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
    .filter(Boolean);
  const seeds = unique(canonicalSeeds);

  // A missing, zero or non-numeric timeout falls back to 15 s before clamping.
  const requestedTimeout = Number(site.requestTimeout) || 15000;
  const requestTimeout = Math.max(1000, Math.min(requestedTimeout, 30000));

  return {
    name: String(site.name || '').trim(),
    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
    maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
    maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
    requestTimeout,
  };
}

/**
 * Looks up per-label crawler overrides in `config.newsCrawler.overrides`.
 *
 * @param {string} label - Feed label.
 * @returns {?Object} The override entry, or null when missing or falsy.
 */
function getCrawlerSiteOverrides(label) {
  const overrides = config.newsCrawler?.overrides;
  const entry = overrides?.[label];

  return entry || null;
}

/**
 * Builds the list of sites to crawl.
 *
 * Sites come from two places: explicit entries in `config.newsCrawler.sites`,
 * and sites derived from `config.rssFeeds` for every labelled feed that is
 * neither disabled nor already covered by an explicit site with the same
 * label. Derived sites take per-label overrides first, then crawler-wide
 * defaults. Only sites with a name, at least one allowed host and at least
 * one seed are returned.
 *
 * @returns {Array<Object>} Normalized sites, explicit ones first.
 */
function getConfiguredCrawlerSites() {
  const defaults = config.newsCrawler || {};
  const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim()));
  const isUsable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);

  const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site));
  const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
  const derivedSites = [];

  for (const feed of config.rssFeeds || []) {
    const label = String(feed.label || '').trim();
    if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
      continue;
    }

    let hostname = '';
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // A feed with an unparsable URL cannot yield a site; skip it.
      continue;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const site = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || defaults.maxPages,
      // `??` instead of `||`: a depth of 0 (seeds only) is a valid override
      // and must not be replaced by the default.
      maxDepth: override.maxDepth ?? defaults.maxDepth,
      requestTimeout: override.requestTimeout || defaults.requestTimeout,
    });

    if (isUsable(site)) {
      derivedSites.push(site);
    }
  }

  return [...explicitSites.filter(isUsable), ...derivedSites];
}

/**
 * Fetches a URL through `fetchWithPolicy` (one retry) and returns its body
 * when the response is successful and declares an HTML or XHTML content type.
 *
 * @param {string} url - URL to fetch.
 * @param {number} timeout - Timeout passed through to `fetchWithPolicy`.
 * @returns {Promise<?string>} The response text, or null for a non-OK or non-HTML response.
 */
async function fetchHtml(url, timeout) {
  const response = await fetchWithPolicy(url, { timeout, retries: 1 });

  if (!response.ok) {
    return null;
  }

  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
  const isHtml = contentType.includes('text/html') || contentType.includes('application/xhtml+xml');

  return isHtml ? response.text() : null;
}

/**
 * Crawls one site starting from its seeds and returns the articles found.
 *
 * Pages are taken from a first-in-first-out queue, fetched one at a time,
 * scored with scorePage, and recorded as articles when they qualify and have
 * a title. Links are followed only from pages that look like listings
 * (listingScore >= 2) and only while below the site's maxDepth.
 *
 * @param {Object} site - Site configuration; it is normalized again here.
 * @returns {Promise<Array<{title: string, description: ?string, url: string,
 *   source: string, pubDate: *}>>} Discovered articles; empty when the site
 *   has no name, no allowed hosts or no seeds.
 */
async function crawlSite(site) {
  const normalizedSite = normalizeSite(site);

  if (!normalizedSite.name || !normalizedSite.allowedHosts.length || !normalizedSite.seeds.length) {
    return [];
  }

  const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 }));
  const queuedUrls = new Set(normalizedSite.seeds);
  const visitedUrls = new Set();
  const discoveredArticleUrls = new Set();
  const articles = [];

  // maxPages bounds the number of URLs attempted, not the number fetched successfully.
  while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
    const current = queue.shift();

    if (!current || visitedUrls.has(current.url)) {
      continue;
    }

    // Marked as visited before fetching, so a failed fetch still uses up page budget.
    visitedUrls.add(current.url);

    let html;
    try {
      html = await fetchHtml(current.url, normalizedSite.requestTimeout);
    } catch (error) {
      // One failing page is logged and skipped; the crawl continues.
      console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error);
      continue;
    }

    // fetchHtml returns null for non-OK or non-HTML responses.
    if (!html) {
      continue;
    }

    const meta = extractMetaMap(html);
    const jsonLdArticle = extractArticleJsonLd(html);
    const canonicalHref = extractCanonicalHref(html);
    // Prefer the page's declared canonical URL when it is valid and on an
    // allowed host; otherwise fall back to the URL that was fetched.
    const canonicalUrl = canonicalHref
      ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
      : current.url;
    const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
    const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);

    // Articles are de-duplicated by canonical URL; pages without a title are not recorded.
    if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
      const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
      if (title) {
        discoveredArticleUrls.add(canonicalUrl);
        articles.push({
          title,
          description: normalizeText(selectDescription(meta, jsonLdArticle)) || null,
          url: canonicalUrl,
          source: normalizedSite.name,
          pubDate: selectPubDate(meta, jsonLdArticle, html),
        });
      }
    }

    // Only expand links from listing-like pages that are still below the depth limit.
    if (current.depth >= normalizedSite.maxDepth || listingScore < 2) {
      continue;
    }

    for (const link of links) {
      if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) {
        continue;
      }

      queuedUrls.add(link.url);
      queue.push({ url: link.url, depth: current.depth + 1 });
    }
  }

  return articles;
}

/**
 * Crawls every configured site, one after another, and returns all
 * discovered articles. A failure in one site is logged and does not stop
 * the remaining sites.
 *
 * @returns {Promise<Array<Object>>} Articles from all sites, in crawl order.
 */
async function fetchCrawlerArticles() {
  const collected = [];

  for (const site of getConfiguredCrawlerSites()) {
    try {
      const siteArticles = await crawlSite(site);
      collected.push(...siteArticles);
    } catch (error) {
      const siteName = site?.name || 'unknown_site';
      console.error(`Crawler failed for ${siteName}`, error);
    }
  }

  return collected;
}

// Public API of the crawler module; every other function in this file is internal.
module.exports = {
  fetchCrawlerArticles,
  crawlSite,
  canonicalizeUrl,
  getConfiguredCrawlerSites,
};