enhance news crawler configuration with new sources and improved request headers

2026-04-16 23:32:56 +01:00
parent c91e4ddb60
commit 11647e6a35
6 changed files with 449 additions and 33 deletions
@@ -44,8 +44,19 @@ const blockedContentDomains = [
 ];
 const loggedBlockedDomains = new Set();
+// Header set whose values imitate a top-level page navigation from desktop
+// Chrome 135 on macOS: browser User-Agent, matching `sec-ch-ua*` client hints,
+// and `Sec-Fetch-*` values for a user-initiated document load.
+// `Cache-Control` / `Pragma: no-cache` request an uncached copy.
+// NOTE(review): presumably attached to article-page fetches; the call site is
+// outside this hunk. The Chrome version appears in both `User-Agent` and
+// `sec-ch-ua`, so the two must be updated together.
 const articleFetchHeaders = {
-  Accept: 'text/html,application/xhtml+xml',
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+  'Accept-Language': 'en-US,en;q=0.9',
+  'Cache-Control': 'no-cache',
+  Pragma: 'no-cache',
+  'Upgrade-Insecure-Requests': '1',
+  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
+  'sec-ch-ua-mobile': '?0',
+  'sec-ch-ua-platform': '"macOS"',
+  'Sec-Fetch-Dest': 'document',
+  'Sec-Fetch-Mode': 'navigate',
+  'Sec-Fetch-Site': 'none',
+  'Sec-Fetch-User': '?1',
 };

 let contentBackfillRunning = false;
@@ -78,7 +89,20 @@ function getErrorMessage(error, fallback) {
 }

+/**
+ * Runs a prepared status statement for one article, stamping it with the
+ * current time as an ISO-8601 string.
+ *
+ * The argument list is chosen from the number of `?` placeholders found in
+ * `statement.source`: three placeholders bind `(message, attemptedAt, id)`,
+ * two bind `(attemptedAt, id)` and `message` is not used.
+ *
+ * NOTE(review): the count is a plain split on `?`, so a `?` inside an SQL
+ * string literal or comment would also be counted. Confirm that the
+ * statements passed here contain none.
+ *
+ * @param {object} statement - Prepared statement exposing `source` and `run`.
+ * @param {*} id - Bound as the last parameter.
+ * @param {string} message - Bound first when the statement has three placeholders.
+ * @throws {Error} When the statement has neither two nor three placeholders.
+ */
 function markArticleStatus(statement, id, message) {
-  statement.run(message, new Date().toISOString(), id);
+  const attemptedAt = new Date().toISOString();
+  const parameterCount = statement.source.split('?').length - 1;
+
+  if (parameterCount === 3) {
+    statement.run(message, attemptedAt, id);
+    return;
+  }
+
+  if (parameterCount === 2) {
+    statement.run(attemptedAt, id);
+    return;
+  }
+
+  throw new Error(`Unexpected content status statement parameter count: ${parameterCount}`);
 }

 async function fetchCompressedImage(url) {
@@ -1,6 +1,17 @@
+// Default request headers. The values imitate a top-level page navigation from
+// desktop Chrome 135 on macOS (User-Agent, `sec-ch-ua*` client hints, `Sec-Fetch-*`).
+// NOTE(review): the previous default `Accept` was 'application/json, text/plain, */*';
+// the new one ranks HTML/XML first and leaves JSON to the `*/*;q=0.8` fallback.
+// Confirm that callers expecting JSON pass their own `Accept` header.
 const DEFAULT_HEADERS = {
-  'User-Agent': 'duriin_api/1.0',
-  Accept: 'application/json, text/plain, */*',
+  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+  'Accept-Language': 'en-US,en;q=0.9',
+  'Cache-Control': 'no-cache',
+  Pragma: 'no-cache',
+  'Upgrade-Insecure-Requests': '1',
+  'sec-ch-ua': '"Google Chrome";v="135", "Chromium";v="135", "Not.A/Brand";v="24"',
+  'sec-ch-ua-mobile': '?0',
+  'sec-ch-ua-platform': '"macOS"',
+  'Sec-Fetch-Dest': 'document',
+  'Sec-Fetch-Mode': 'navigate',
+  'Sec-Fetch-Site': 'none',
+  'Sec-Fetch-User': '?1',
 };

 function sleep(ms) {
@@ -6,7 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
 const { fetchEdgarArticles } = require('./sources/edgar');
 const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
 const { fetchFinnhubArticles } = require('./sources/finnhub');
-const { fetchCrawlerArticles } = require('./sources/newsCrawler');
+const { crawlSite, getConfiguredCrawlerSites } = require('./sources/newsCrawler');
 const { backfillMissingContent } = require('./content');
 const { backfillMissingEmbeddings } = require('./embeddings');

@@ -20,6 +20,16 @@ async function runSource(source, fetcher) {
  }
 }

+/**
+ * Runs every configured crawler site through `runSource`, one after another.
+ *
+ * Each site is reported under its own `site.name`; the next site starts only
+ * after the previous `runSource` call has settled.
+ *
+ * @returns {Promise<Array>} The `runSource` results, in site order.
+ */
+async function runCrawlerSources() {
+  const sites = getConfiguredCrawlerSites();
+  const outcomes = [];
+
+  for (const site of sites) {
+    const crawlThisSite = () => crawlSite(site);
+    outcomes.push(await runSource(site.name, crawlThisSite));
+  }
+
+  return outcomes;
+}
+
 async function runAllIngestions() {
  const results = [];

@@ -28,7 +38,7 @@ async function runAllIngestions() {
  results.push(await runSource('edgar', fetchEdgarArticles));
  results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
  results.push(await runSource('finnhub', fetchFinnhubArticles));
-  results.push(await runSource('news_crawler', fetchCrawlerArticles));
+  results.push(...await runCrawlerSources());

  try {
    await backfillMissingContent();
@@ -68,7 +78,7 @@ function startScheduler() {

  if (config.scheduler.newsCrawler) {
    cron.schedule(config.scheduler.newsCrawler, async () => {
-      await runSource('news_crawler', fetchCrawlerArticles);
+      await runCrawlerSources();
    });
  }

@@ -20,7 +20,6 @@ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|section
+// Matches a `/YYYY/MM/DD` or `/YYYY/MM` run of path segments that is followed by `/` or the end of the path.
 const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
+// Matches paths containing an `/article/`, `/articles/`, `/news/`, `/story/` or `/stories/` segment (case-insensitive).
 const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
+// Matches paths containing a search, login, account, video(s), podcast(s) or live segment (case-insensitive).
 const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
-const USER_AGENT = 'duriin_api crawler/1.0';

 function decodeHtmlEntities(value) {
  return String(value || '')
@@ -63,7 +62,7 @@ function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
      return null;
    }

-    if (!isAllowedHost(url.hostname, allowedHosts)) {
+    if (allowedHosts && allowedHosts.length && !isAllowedHost(url.hostname, allowedHosts)) {
      return null;
    }

@@ -323,30 +322,144 @@ function shouldQueueLink(url) {
  return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
 }

+/**
+ * Turns a label into a lowercase identifier made of `a-z`, `0-9` and `_`.
+ *
+ * Every run of other characters becomes a single underscore, and underscores
+ * at either end are removed. Falsy input yields an empty string.
+ *
+ * @param {*} label - Value to slugify; coerced with `String`.
+ * @returns {string} The slug, possibly empty.
+ */
+function slugifyLabel(label) {
+  const lowered = String(label || '').toLowerCase();
+  const underscored = lowered.replace(/[^a-z0-9]+/g, '_');
+
+  return underscored.replace(/^_+|_+$/g, '');
+}
+
+/**
+ * Returns the truthy entries of `values` with duplicates removed, keeping the
+ * order in which each entry first appears.
+ *
+ * @param {Array} values - Input list; falsy entries are dropped.
+ * @returns {Array} A new array of distinct truthy values.
+ */
+function unique(values) {
+  const truthyValues = values.filter((value) => Boolean(value));
+
+  return Array.from(new Set(truthyValues));
+}
+
+/**
+ * Builds the host allow-list for a site: the hostname itself plus its
+ * `www.` / non-`www.` counterpart, all lowercase.
+ *
+ * The hostname is lowercased before the `www.` check, because `startsWith`
+ * is case-sensitive. Otherwise an input such as `WWW.Example.com` would be
+ * treated as lacking the prefix and produce `www.www.example.com`.
+ *
+ * @param {string} hostname - Host name without scheme or port.
+ * @returns {string[]} Distinct, non-empty host names; empty when `hostname` is falsy.
+ */
+function buildAllowedHosts(hostname) {
+  if (!hostname) {
+    return [];
+  }
+
+  const host = hostname.toLowerCase();
+  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;
+
+  // `unique` also drops the empty counterpart produced by a bare `www.` input.
+  return unique([host, counterpart]);
+}
+
+/**
+ * Reduces a feed URL path to the section path it most likely belongs to.
+ *
+ * Feed-specific parts (index files, file extensions, `rss` / `feed` / `feeds` /
+ * `xml` segments and a few publisher-specific feed prefixes) are stripped, and
+ * at most the first three remaining segments are kept.
+ *
+ * @param {string} pathname - The `pathname` of a feed URL.
+ * @returns {string} A path starting with `/`; `/` when nothing meaningful remains.
+ */
+function cleanFeedPath(pathname) {
+  const withoutIndex = pathname
+    .replace(/\/index\.[a-z0-9]+$/i, '/')
+    .replace(/\.[a-z0-9]+$/i, '')
+    // Must run before the generic `/rss` rule: that rule swallows the `/rss/...`
+    // tail, after which this pattern can never match and the `/id/<n>/device`
+    // prefix would leak into the result.
+    .replace(/\/id\/\d+\/device\/rss\//i, '/')
+    .replace(/\/rss(?:$|\/.*$)/i, '/')
+    .replace(/\/feed(?:$|\/.*$)/i, '/')
+    .replace(/\/feeds?(?:$|\/.*$)/i, '/')
+    .replace(/\/xml(?:$|\/.*$)/i, '/')
+    .replace(/\/arc\/outboundfeeds\//i, '/')
+    .replace(/\/dynamo\//i, '/')
+    .replace(/\/contentexport\//i, '/')
+    .replace(/\/rssfeedstopstories$/i, '/')
+    .replace(/\/latest$/i, '/')
+    .replace(/\/+$|^$/g, '');
+
+  if (!withoutIndex) {
+    return '/';
+  }
+
+  const segments = withoutIndex
+    .split('/')
+    .filter(Boolean)
+    .filter((segment) => !/^(rss|feed|feeds|xml)$/i.test(segment))
+    .slice(0, 3);
+
+  if (!segments.length) {
+    return '/';
+  }
+
+  return `/${segments.join('/')}`;
+}
+
+/**
+ * Derives crawl seed URLs from a feed URL: the site root and, when the feed
+ * path reduces to something other than `/`, that section path.
+ *
+ * The origin is rebuilt from protocol and hostname only. Any failure while
+ * parsing or canonicalizing yields an empty list instead of throwing.
+ *
+ * @param {string} feedUrl - Absolute feed URL.
+ * @returns {string[]} Distinct canonical seed URLs, possibly empty.
+ */
+function buildDefaultSeeds(feedUrl) {
+  let seeds;
+
+  try {
+    const { protocol, hostname, pathname } = new URL(feedUrl);
+    const origin = `${protocol}//${hostname}`;
+    const sectionPath = cleanFeedPath(pathname);
+
+    const candidates = [canonicalizeUrl(origin, origin)];
+    if (sectionPath === '/') {
+      candidates.push(null);
+    } else {
+      candidates.push(canonicalizeUrl(`${origin}${sectionPath}`, origin));
+    }
+
+    seeds = unique(candidates);
+  } catch {
+    seeds = [];
+  }
+
+  return seeds;
+}
+
+/**
+ * Normalizes one crawler site definition into the shape used by the crawler.
+ *
+ * - `allowedHosts`: lowercased, empty entries dropped, de-duplicated.
+ * - `seeds`: each passed through `canonicalizeUrl(seed, seed, allowedHosts)`;
+ *   seeds that do not canonicalize are dropped, the rest de-duplicated.
+ * - `name` / `label`: coerced to string and trimmed.
+ * - `maxPages`: default 15, clamped to 1..500.
+ * - `maxDepth`: default 1, clamped to 0..5.
+ * - `requestTimeout`: default 15000, clamped to 1000..30000.
+ *
+ * NOTE(review): the numeric defaults are applied with `||`, so a configured
+ * `0` falls back to the default. A `maxDepth` of 0 therefore cannot be
+ * selected even though the clamp allows it.
+ *
+ * @param {object} site - Raw site definition.
+ * @returns {object} Normalized site.
+ */
 function normalizeSite(site) {
-  const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))];
-  const seeds = [...new Set((site.seeds || [])
+  const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
+  const seeds = unique((site.seeds || [])
    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
-    .filter(Boolean))];
+    .filter(Boolean));

  return {
    name: String(site.name || '').trim(),
+    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
-    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)),
-    maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 2, 5)),
+    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
+    maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)),
    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
  };
 }

+/**
+ * Looks up the per-label crawler override in `config.newsCrawler.overrides`.
+ *
+ * @param {string} label - Feed label used as the lookup key.
+ * @returns {object|null} The override entry, or `null` when none is configured.
+ */
+function getCrawlerSiteOverrides(label) {
+  const overridesByLabel = config.newsCrawler?.overrides;
+  const entry = overridesByLabel?.[label];
+
+  return entry || null;
+}
+
+/**
+ * Builds the list of sites to crawl.
+ *
+ * Explicit entries from `config.newsCrawler.sites` come first. One site is then
+ * derived from every entry in `config.rssFeeds` whose label is non-empty, not
+ * listed in `disabledLabels`, and not already used by an explicit site. Derived
+ * sites take their values from the label's override when present, otherwise
+ * from the feed URL and the crawler-wide defaults. Sites without a name, an
+ * allowed host or a seed are left out.
+ *
+ * @returns {object[]} Normalized sites: explicit ones, then derived ones in feed order.
+ */
+function getConfiguredCrawlerSites() {
+  const crawlerConfig = config.newsCrawler || {};
+  const isCrawlable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);
+
+  const explicitSites = (crawlerConfig.sites || []).map((entry) => normalizeSite(entry));
+
+  // Labels that must not produce a derived site: disabled ones and those that
+  // already have an explicit definition.
+  const skippedLabels = new Set();
+  for (const rawLabel of crawlerConfig.disabledLabels || []) {
+    skippedLabels.add(String(rawLabel || '').trim());
+  }
+  for (const { label: explicitLabel } of explicitSites) {
+    if (explicitLabel) {
+      skippedLabels.add(explicitLabel);
+    }
+  }
+
+  const derivedSites = [];
+
+  for (const feed of config.rssFeeds || []) {
+    const label = String(feed.label || '').trim();
+    if (label === '' || skippedLabels.has(label)) {
+      continue;
+    }
+
+    let feedHost;
+    try {
+      feedHost = new URL(feed.url).hostname;
+    } catch {
+      // A feed without a parseable URL cannot be turned into a site.
+      continue;
+    }
+
+    const {
+      name,
+      allowedHosts,
+      seeds,
+      maxPages,
+      maxDepth,
+      requestTimeout,
+    } = getCrawlerSiteOverrides(label) || {};
+
+    const candidate = normalizeSite({
+      label,
+      name: name || `crawler_${slugifyLabel(label)}`,
+      allowedHosts: allowedHosts || buildAllowedHosts(feedHost),
+      seeds: seeds || buildDefaultSeeds(feed.url),
+      maxPages: maxPages || crawlerConfig.maxPages,
+      maxDepth: maxDepth || crawlerConfig.maxDepth,
+      requestTimeout: requestTimeout || crawlerConfig.requestTimeout,
+    });
+
+    if (isCrawlable(candidate)) {
+      derivedSites.push(candidate);
+    }
+  }
+
+  return explicitSites.filter((site) => isCrawlable(site)).concat(derivedSites);
+}
+
 async function fetchHtml(url, timeout) {
  const response = await fetchWithPolicy(url, {
    timeout,
    retries: 1,
-    headers: {
-      Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
-      'User-Agent': USER_AGENT,
-    },
  });

  if (!response.ok) {
@@ -438,7 +551,7 @@ async function crawlSite(site) {
 async function fetchCrawlerArticles() {
  const articles = [];

-  for (const site of config.newsCrawler?.sites || []) {
+  for (const site of getConfiguredCrawlerSites()) {
    try {
      articles.push(...await crawlSite(site));
    } catch (error) {
@@ -451,5 +564,7 @@ async function fetchCrawlerArticles() {

+// `crawlSite` and `getConfiguredCrawlerSites` are exported alongside the
+// aggregate `fetchCrawlerArticles` so a caller can run each site separately.
 module.exports = {
  fetchCrawlerArticles,
+  crawlSite,
  canonicalizeUrl,
+  getConfiguredCrawlerSites,
 };