From 77db05a5552daedf4bf995f6e9104abc4998befc Mon Sep 17 00:00:00 2001
From: ImBenji <benjamin.watt@imbenji.net>
Date: Thu, 16 Apr 2026 23:50:24 +0100
Subject: [PATCH] enhance news crawler configuration with new sources and
 improved request headers

---
 config.json                |  4 +--
 src/sources/newsCrawler.js | 64 ++++++++++++++++++++++++++++----------
 src/sources/rss.js         | 43 ++++++++++++++++++++++++-
 3 files changed, 91 insertions(+), 20 deletions(-)

diff --git a/config.json b/config.json
index c137619..a19de9b 100644
--- a/config.json
+++ b/config.json
@@ -392,8 +392,8 @@
     "format": "json"
   },
   "newsCrawler": {
-    "maxPages": 15,
-    "maxDepth": 1,
+    "maxPages": -1,
+    "maxDepth": 10,
     "requestTimeout": 15000,
     "disabledLabels": [
       "Arab News",
diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js
index 0ed6804..3ddab52 100644
--- a/src/sources/newsCrawler.js
+++ b/src/sources/newsCrawler.js
@@ -19,6 +19,11 @@ const TRACKING_PARAM_PATTERNS = [
 const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
 const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
 const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
+// Stricter than ARTICLE_DATE_PATH: needs a full /YYYY/MM/DD/ run that is followed by a slash.
+const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
+// Listing-style path segments; scorePage withholds the og:type / published-time article
+// bonuses and adds to the listing score when the pathname matches this pattern.
+const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
 const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
 
 function decodeHtmlEntities(value) {
@@ -267,30 +269,36 @@ function selectPubDate(meta, jsonLdArticle, html) {
 function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
   let articleScore = 0;
   let listingScore = 0;
+  const pathname = new URL(pageUrl).pathname;
+  const hasArticleDatePath = ARTICLE_DATE_PATH.test(pageUrl);
+  const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl);
+  const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
+  const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
+  const paragraphTextLength = extractParagraphTextLength(html);
   const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
 
   if (jsonLdArticle) {
-    articleScore += 3;
+    articleScore += 4;
   }
 
-  if (String(meta.get('og:type') || '').toLowerCase() === 'article') {
-    articleScore += 2;
+  if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
+    articleScore += 1;
   }
 
-  if (meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) {
-    articleScore += 2;
+  if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) {
+    articleScore += 1;
   }
 
   if (/<article\b/i.test(html)) {
     articleScore += 1;
   }
 
-  if (ARTICLE_DATE_PATH.test(pageUrl) || ARTICLE_PATH_HINT.test(pageUrl)) {
-    articleScore += 1;
+  if (hasArticleDatePath || hasArticlePathHint) {
+    articleScore += 2;
   }
 
-  if (extractH1(html) && extractParagraphTextLength(html) >= 500) {
-    articleScore += 1;
+  if (extractH1(html) && paragraphTextLength >= 500) {
+    articleScore += 2;
   }
 
   if (links.length >= 20) {
@@ -301,15 +309,23 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
     listingScore += 2;
   }
 
-  if (LISTING_PATH_HINT.test(new URL(pageUrl).pathname)) {
+  if (LISTING_PATH_HINT.test(pathname)) {
     listingScore += 1;
   }
 
-  if (articleScore > 0) {
-    listingScore -= 2;
+  if (hasListingFalsePositivePath) {
+    listingScore += 3;
   }
 
-  return { articleScore, listingScore };
+  if (articleScore > 0) {
+    listingScore -= 1;
+  }
+
+  const isArticleCandidate = articleScore >= 4
+    && articleScore > listingScore
+    && (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
+
+  return { articleScore, listingScore, isArticleCandidate };
 }
 
 function shouldQueueLink(url) {
@@ -396,6 +412,35 @@ function buildDefaultSeeds(feedUrl) {
   }
 }
 
+/**
+ * Normalizes a configured crawl limit such as `maxPages` or `maxDepth`.
+ *
+ * @param {*} value - Raw configured value; a number or numeric string.
+ * @param {number} fallback - Returned when `value` is missing or not numeric.
+ * @param {number} minimum - Lower bound applied to finite values.
+ * @param {number} maximum - Upper bound applied to finite values.
+ * @returns {number} `Infinity` when `value` is -1, `fallback` when it is
+ *   missing or not numeric, otherwise `value` clamped to [minimum, maximum].
+ */
+function normalizeLimit(value, fallback, minimum, maximum) {
+  // Number(null), Number('') and Number(false) all evaluate to 0, which would be
+  // clamped to `minimum` instead of yielding `fallback`; treat them as unset.
+  const isNumericInput = typeof value === 'number'
+    || (typeof value === 'string' && value.trim() !== '');
+  const numeric = isNumericInput ? Number(value) : Number.NaN;
+
+  if (numeric === -1) {
+    // -1 is the "no limit" sentinel, so it deliberately bypasses `maximum`.
+    return Number.POSITIVE_INFINITY;
+  }
+
+  if (!Number.isFinite(numeric)) {
+    return fallback;
+  }
+
+  return Math.max(minimum, Math.min(numeric, maximum));
+}
+
 function normalizeSite(site) {
   const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
   const seeds = unique((site.seeds || [])
@@ -407,8 +437,8 @@ function normalizeSite(site) {
     label: String(site.label || '').trim(),
     allowedHosts,
     seeds,
-    maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
-    maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)),
+    maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
+    maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
     requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
   };
 }
@@ -515,9 +545,9 @@ async function crawlSite(site) {
       ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
       : current.url;
     const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
-    const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
+    const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
 
-    if (articleScore >= 3 && !discoveredArticleUrls.has(canonicalUrl)) {
+    if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
       const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
       if (title) {
         discoveredArticleUrls.add(canonicalUrl);
diff --git a/src/sources/rss.js b/src/sources/rss.js
index b7e6243..4f9f7a1 100644
--- a/src/sources/rss.js
+++ b/src/sources/rss.js
@@ -1,5 +1,6 @@
 const Parser = require('rss-parser');
 const config = require('../config');
+const { fetchWithPolicy } = require('../http');
 
 const parser = new Parser({
   timeout: 10000,
@@ -43,9 +44,11 @@ const invalidFeedLabels = new Set([
 const malformedFeedLabels = new Set([
   'BFM Business',
   'Business Daily Africa',
+  'Nation News Barbados',
 ]);
 const loggedBlockedFeeds = new Set();
 const loggedInvalidFeeds = new Set();
+const loggedUpstreamFeedSkips = new Set();
 
 function getHostname(url) {
   try {
@@ -65,6 +68,54 @@ function isMalformedFeedError(error) {
   return message.includes('Invalid character in entity name') || message.includes('Attribute without value');
 }
 
+/**
+ * Extracts an HTTP status code from a feed-fetch error.
+ *
+ * Prefers an integer `status` property; otherwise looks for 401, 403, 404,
+ * 408, 429 or any 5xx code appearing as a standalone token in the message.
+ *
+ * @param {*} error - Error (or any other value) raised while fetching a feed.
+ * @returns {number|null} The status code, or null when none can be found.
+ */
+function getErrorStatus(error) {
+  const explicitStatus = error ? error.status : undefined;
+  if (Number.isInteger(explicitStatus)) {
+    return explicitStatus;
+  }
+
+  const message = String((error && error.message) || '');
+  const found = /\b(401|403|404|408|429|5\d\d)\b/.exec(message);
+  return found === null ? null : Number(found[1]);
+}
+
+/**
+ * Downloads a feed through fetchWithPolicy and parses the response body with
+ * the module-level rss-parser instance.
+ *
+ * @param {string} feedUrl - URL of the feed to fetch.
+ * @returns {Promise<object>} Whatever `parser.parseString` yields for the body.
+ * @throws {Error} With message `Status code <n>` and a `status` property
+ *   copied from the response when the response is not ok.
+ */
+async function parseFeed(feedUrl) {
+  const requestOptions = {
+    timeout: 10000,
+    retries: 1,
+    headers: {
+      Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
+    },
+  };
+  const response = await fetchWithPolicy(feedUrl, requestOptions);
+
+  if (response.ok) {
+    return parser.parseString(await response.text());
+  }
+
+  const error = new Error(`Status code ${response.status}`);
+  error.status = response.status;
+  throw error;
+}
+
 async function fetchRssArticles() {
   const articles = [];
 
@@ -89,7 +120,7 @@ async function fetchRssArticles() {
     }
 
     try {
-      const parsed = await parser.parseURL(feed.url);
+      const parsed = await parseFeed(feed.url);
       for (const item of parsed.items || []) {
         const title = String(item.title || '').trim();
         const url = String(item.link || item.guid || '').trim();
@@ -115,6 +146,16 @@ async function fetchRssArticles() {
         continue;
       }
 
+      const status = getErrorStatus(error);
+      if (status === 401 || status === 403 || status === 404 || status === 429) {
+        const key = `${label}:${status}`;
+        if (!loggedUpstreamFeedSkips.has(key)) {
+          loggedUpstreamFeedSkips.add(key);
+          console.warn(`RSS feed skipped for ${label}: upstream returned ${status}`);
+        }
+        continue;
+      }
+
       console.error(`Failed to fetch RSS feed: ${label}`, error);
     }
   }