add Docker configuration and news crawler implementation

2026-04-16 22:54:27 +01:00
parent 7724fafbdc
commit c91e4ddb60
8 changed files with 915 additions and 2 deletions
@@ -6,6 +6,7 @@ const { fetchGdeltArticles } = require('./sources/gdelt');
 const { fetchEdgarArticles } = require('./sources/edgar');
 const { fetchAlphaVantageArticles } = require('./sources/alphavantage');
 const { fetchFinnhubArticles } = require('./sources/finnhub');
+const { fetchCrawlerArticles } = require('./sources/newsCrawler');
 const { backfillMissingContent } = require('./content');
 const { backfillMissingEmbeddings } = require('./embeddings');

@@ -27,6 +28,7 @@ async function runAllIngestions() {
  results.push(await runSource('edgar', fetchEdgarArticles));
  results.push(await runSource('alphavantage', fetchAlphaVantageArticles));
  results.push(await runSource('finnhub', fetchFinnhubArticles));
+  results.push(await runSource('news_crawler', fetchCrawlerArticles));

  try {
    await backfillMissingContent();
@@ -64,6 +66,12 @@ function startScheduler() {
    await runSource('finnhub', fetchFinnhubArticles);
  });

+  if (config.scheduler.newsCrawler) {
+    cron.schedule(config.scheduler.newsCrawler, async () => {
+      await runSource('news_crawler', fetchCrawlerArticles);
+    });
+  }
+
  cron.schedule('0 * * * *', async () => {
    try {
      await backfillMissingContent();
@@ -0,0 +1,455 @@
+const config = require('../config');
+const { fetchWithPolicy } = require('../http');
+
+// Query-string keys treated as tracking/attribution noise. shouldDropParam()
+// tests a key against each pattern so canonicalizeUrl() can strip matching
+// parameters before URLs are compared or stored.
+const TRACKING_PARAM_PATTERNS = [
+  /^utm_/i,
+  /^fbclid$/i,
+  /^gclid$/i,
+  /^mkt_tok$/i,
+  /^mc_cid$/i,
+  /^mc_eid$/i,
+  /^ref$/i,
+  /^ref_src$/i,
+  /^s$/i,
+  /^cmpid$/i,
+  /^guccounter$/i,
+  /^guce_referrer$/i,
+  /^guce_referrer_sig$/i,
+];
+// Words that scorePage() counts as evidence of a listing page when found in the
+// URL pathname. The pattern is unanchored, so it matches anywhere in the path.
+const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
+// Dated path segments: /YYYY/MM/DD or /YYYY/MM, each followed by "/" or the end
+// of the string. scorePage() counts a match as evidence of an article page.
+const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
+// Path segments that scorePage() counts as evidence of an article page.
+const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
+// Path segments (search, login, account, video, podcast, live) that make
+// shouldQueueLink() reject a URL.
+const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
+// User-Agent header value sent by fetchHtml().
+const USER_AGENT = 'duriin_api crawler/1.0';
+
+function decodeHtmlEntities(value) {
+  return String(value || '')
+    .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCodePoint(parseInt(hex, 16)))
+    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10)))
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&apos;/g, "'")
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&nbsp;/g, ' ');
+}
+
+function stripTags(value) {
+  return decodeHtmlEntities(String(value || '').replace(/<[^>]*>/g, ' ')).replace(/\s+/g, ' ').trim();
+}
+
+function normalizeText(value) {
+  return stripTags(value).replace(/\s+/g, ' ').trim();
+}
+
+function isAllowedHost(hostname, allowedHosts) {
+  const normalized = String(hostname || '').toLowerCase();
+  return allowedHosts.some((allowedHost) => {
+    const candidate = String(allowedHost || '').toLowerCase();
+    return normalized === candidate || normalized.endsWith(`.${candidate}`);
+  });
+}
+
+function shouldDropParam(key) {
+  return TRACKING_PARAM_PATTERNS.some((pattern) => pattern.test(key));
+}
+
+function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
+  try {
+    const url = new URL(rawUrl, baseUrl);
+
+    if (!['http:', 'https:'].includes(url.protocol)) {
+      return null;
+    }
+
+    if (!isAllowedHost(url.hostname, allowedHosts)) {
+      return null;
+    }
+
+    url.hash = '';
+    url.username = '';
+    url.password = '';
+
+    const params = [...url.searchParams.entries()]
+      .filter(([key]) => !shouldDropParam(key))
+      .sort(([left], [right]) => left.localeCompare(right));
+
+    url.search = '';
+    for (const [key, value] of params) {
+      url.searchParams.append(key, value);
+    }
+
+    if (url.pathname !== '/') {
+      url.pathname = url.pathname.replace(/\/+$/, '') || '/';
+    }
+
+    return url.toString();
+  } catch {
+    return null;
+  }
+}
+
+function extractAttribute(tag, name) {
+  const match = tag.match(new RegExp(`${name}\\s*=\\s*(["'])(.*?)\\1`, 'i'));
+  return match ? decodeHtmlEntities(match[2]).trim() : '';
+}
+
+function extractMetaMap(html) {
+  const metas = new Map();
+  const metaTags = html.match(/<meta\b[^>]*>/gi) || [];
+
+  for (const tag of metaTags) {
+    const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name');
+    const content = extractAttribute(tag, 'content');
+
+    if (!key || !content) {
+      continue;
+    }
+
+    metas.set(key.toLowerCase(), content);
+  }
+
+  return metas;
+}
+
+function extractCanonicalHref(html) {
+  const links = html.match(/<link\b[^>]*>/gi) || [];
+
+  for (const tag of links) {
+    const rel = extractAttribute(tag, 'rel').toLowerCase();
+    if (!rel || !rel.split(/\s+/).includes('canonical')) {
+      continue;
+    }
+
+    const href = extractAttribute(tag, 'href');
+    if (href) {
+      return href;
+    }
+  }
+
+  return null;
+}
+
+function extractTitleTag(html) {
+  const match = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
+  return match ? normalizeText(match[1]) : null;
+}
+
+function extractH1(html) {
+  const match = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
+  return match ? normalizeText(match[1]) : null;
+}
+
+function extractTimeDatetime(html) {
+  const match = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);
+  return match ? decodeHtmlEntities(match[2]).trim() : null;
+}
+
+function extractParagraphTextLength(html) {
+  const paragraphs = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];
+  return paragraphs.slice(0, 10).reduce((total, paragraph) => total + normalizeText(paragraph).length, 0);
+}
+
+function extractJsonLdBlocks(html) {
+  const blocks = [];
+  const regex = /<script\b[^>]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi;
+  let match;
+
+  while ((match = regex.exec(html)) !== null) {
+    const raw = String(match[2] || '').trim();
+    if (!raw) {
+      continue;
+    }
+
+    try {
+      blocks.push(JSON.parse(raw));
+    } catch {
+      continue;
+    }
+  }
+
+  return blocks;
+}
+
+function walkJson(value, visit) {
+  if (Array.isArray(value)) {
+    for (const item of value) {
+      walkJson(item, visit);
+    }
+    return;
+  }
+
+  if (!value || typeof value !== 'object') {
+    return;
+  }
+
+  visit(value);
+
+  for (const child of Object.values(value)) {
+    walkJson(child, visit);
+  }
+}
+
+function isArticleType(type) {
+  if (Array.isArray(type)) {
+    return type.some((entry) => isArticleType(entry));
+  }
+
+  return ['article', 'newsarticle'].includes(String(type || '').toLowerCase());
+}
+
+function extractArticleJsonLd(html) {
+  const blocks = extractJsonLdBlocks(html);
+  let article = null;
+
+  for (const block of blocks) {
+    walkJson(block, (value) => {
+      if (!article && isArticleType(value['@type'])) {
+        article = value;
+      }
+    });
+
+    if (article) {
+      return article;
+    }
+  }
+
+  return null;
+}
+
+function extractLinks(html, pageUrl, allowedHosts) {
+  const links = [];
+  const seen = new Set();
+  const regex = /<a\b[^>]*href\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi;
+  let match;
+
+  while ((match = regex.exec(html)) !== null) {
+    const url = canonicalizeUrl(match[2], pageUrl, allowedHosts);
+    if (!url || seen.has(url)) {
+      continue;
+    }
+
+    const text = normalizeText(match[3]);
+    seen.add(url);
+    links.push({ url, text });
+  }
+
+  return links;
+}
+
+function selectTitle(meta, jsonLdArticle, html) {
+  return [
+    meta.get('og:title'),
+    meta.get('twitter:title'),
+    jsonLdArticle && jsonLdArticle.headline,
+    extractH1(html),
+    extractTitleTag(html),
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+function selectDescription(meta, jsonLdArticle) {
+  return [
+    meta.get('og:description'),
+    meta.get('description'),
+    jsonLdArticle && jsonLdArticle.description,
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+function selectPubDate(meta, jsonLdArticle, html) {
+  return [
+    jsonLdArticle && jsonLdArticle.datePublished,
+    meta.get('article:published_time'),
+    meta.get('og:article:published_time'),
+    extractTimeDatetime(html),
+  ].find((value) => String(value || '').trim()) || null;
+}
+
+function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
+  let articleScore = 0;
+  let listingScore = 0;
+  const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
+
+  if (jsonLdArticle) {
+    articleScore += 3;
+  }
+
+  if (String(meta.get('og:type') || '').toLowerCase() === 'article') {
+    articleScore += 2;
+  }
+
+  if (meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) {
+    articleScore += 2;
+  }
+
+  if (/<article\b/i.test(html)) {
+    articleScore += 1;
+  }
+
+  if (ARTICLE_DATE_PATH.test(pageUrl) || ARTICLE_PATH_HINT.test(pageUrl)) {
+    articleScore += 1;
+  }
+
+  if (extractH1(html) && extractParagraphTextLength(html) >= 500) {
+    articleScore += 1;
+  }
+
+  if (links.length >= 20) {
+    listingScore += 2;
+  }
+
+  if (headlineLinks >= 8) {
+    listingScore += 2;
+  }
+
+  if (LISTING_PATH_HINT.test(new URL(pageUrl).pathname)) {
+    listingScore += 1;
+  }
+
+  if (articleScore > 0) {
+    listingScore -= 2;
+  }
+
+  return { articleScore, listingScore };
+}
+
+function shouldQueueLink(url) {
+  const pathname = new URL(url).pathname.toLowerCase();
+
+  if (BLOCKED_PATH_HINT.test(pathname)) {
+    return false;
+  }
+
+  return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
+}
+
+function normalizeSite(site) {
+  const allowedHosts = [...new Set((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean))];
+  const seeds = [...new Set((site.seeds || [])
+    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
+    .filter(Boolean))];
+
+  return {
+    name: String(site.name || '').trim(),
+    allowedHosts,
+    seeds,
+    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 100, 500)),
+    maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 2, 5)),
+    requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
+  };
+}
+
+async function fetchHtml(url, timeout) {
+  const response = await fetchWithPolicy(url, {
+    timeout,
+    retries: 1,
+    headers: {
+      Accept: 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.8',
+      'User-Agent': USER_AGENT,
+    },
+  });
+
+  if (!response.ok) {
+    return null;
+  }
+
+  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
+  if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
+    return null;
+  }
+
+  return response.text();
+}
+
+async function crawlSite(site) {
+  const normalizedSite = normalizeSite(site);
+
+  if (!normalizedSite.name || !normalizedSite.allowedHosts.length || !normalizedSite.seeds.length) {
+    return [];
+  }
+
+  const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 }));
+  const queuedUrls = new Set(normalizedSite.seeds);
+  const visitedUrls = new Set();
+  const discoveredArticleUrls = new Set();
+  const articles = [];
+
+  while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
+    const current = queue.shift();
+
+    if (!current || visitedUrls.has(current.url)) {
+      continue;
+    }
+
+    visitedUrls.add(current.url);
+
+    let html;
+    try {
+      html = await fetchHtml(current.url, normalizedSite.requestTimeout);
+    } catch (error) {
+      console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error);
+      continue;
+    }
+
+    if (!html) {
+      continue;
+    }
+
+    const meta = extractMetaMap(html);
+    const jsonLdArticle = extractArticleJsonLd(html);
+    const canonicalHref = extractCanonicalHref(html);
+    const canonicalUrl = canonicalHref
+      ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
+      : current.url;
+    const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
+    const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
+
+    if (articleScore >= 3 && !discoveredArticleUrls.has(canonicalUrl)) {
+      const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
+      if (title) {
+        discoveredArticleUrls.add(canonicalUrl);
+        articles.push({
+          title,
+          description: normalizeText(selectDescription(meta, jsonLdArticle)) || null,
+          url: canonicalUrl,
+          source: normalizedSite.name,
+          pubDate: selectPubDate(meta, jsonLdArticle, html),
+        });
+      }
+    }
+
+    if (current.depth >= normalizedSite.maxDepth || listingScore < 2) {
+      continue;
+    }
+
+    for (const link of links) {
+      if (!shouldQueueLink(link.url) || visitedUrls.has(link.url) || queuedUrls.has(link.url)) {
+        continue;
+      }
+
+      queuedUrls.add(link.url);
+      queue.push({ url: link.url, depth: current.depth + 1 });
+    }
+  }
+
+  return articles;
+}
+
+async function fetchCrawlerArticles() {
+  const articles = [];
+
+  for (const site of config.newsCrawler?.sites || []) {
+    try {
+      articles.push(...await crawlSite(site));
+    } catch (error) {
+      console.error(`Crawler failed for ${site && site.name ? site.name : 'unknown_site'}`, error);
+    }
+  }
+
+  return articles;
+}
+
+// Public surface of this module: the crawler entry point and the URL
+// canonicalizer.
+module.exports = {
+  fetchCrawlerArticles,
+  canonicalizeUrl,
+};