From 77db05a5552daedf4bf995f6e9104abc4998befc Mon Sep 17 00:00:00 2001 From: ImBenji Date: Thu, 16 Apr 2026 23:50:24 +0100 Subject: [PATCH] enhance news crawler configuration with new sources and improved request headers --- config.json | 4 +-- src/sources/newsCrawler.js | 64 ++++++++++++++++++++++++++++---------- src/sources/rss.js | 43 ++++++++++++++++++++++++- 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/config.json b/config.json index c137619..a19de9b 100644 --- a/config.json +++ b/config.json @@ -392,8 +392,8 @@ "format": "json" }, "newsCrawler": { - "maxPages": 15, - "maxDepth": 1, + "maxPages": -1, + "maxDepth": 10, "requestTimeout": 15000, "disabledLabels": [ "Arab News", diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js index 0ed6804..3ddab52 100644 --- a/src/sources/newsCrawler.js +++ b/src/sources/newsCrawler.js @@ -19,6 +19,8 @@ const TRACKING_PARAM_PATTERNS = [ const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i; const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/; const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i; +const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//; +const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; function decodeHtmlEntities(value) { @@ -267,30 +269,36 @@ function selectPubDate(meta, jsonLdArticle, html) { function scorePage(pageUrl, meta, html, jsonLdArticle, links) { let articleScore = 0; let listingScore = 0; + const pathname = new URL(pageUrl).pathname; + const hasArticleDatePath = ARTICLE_DATE_PATH.test(pageUrl); + const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl); + const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname); + const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname); + const paragraphTextLength = extractParagraphTextLength(html); const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length; if (jsonLdArticle) { - articleScore += 3; + articleScore += 4; } - if (String(meta.get('og:type') || '').toLowerCase() === 'article') { - articleScore += 2; + if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) { + articleScore += 1; } - if (meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) { - articleScore += 2; + if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) { + articleScore += 1; } if (/= 500) { - articleScore += 1; + if (extractH1(html) && paragraphTextLength >= 500) { + articleScore += 2; } if (links.length >= 20) { @@ -301,15 +309,23 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) { listingScore += 2; } - if (LISTING_PATH_HINT.test(new URL(pageUrl).pathname)) { + if (LISTING_PATH_HINT.test(pathname)) { listingScore += 1; } - if (articleScore > 0) { - listingScore -= 2; + if (hasListingFalsePositivePath) { + listingScore += 3; } - return { articleScore, listingScore }; + if (articleScore > 0) { + listingScore -= 1; + } + + const isArticleCandidate = articleScore >= 4 + && articleScore > listingScore + && (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500); + + return { articleScore, listingScore, isArticleCandidate }; } function shouldQueueLink(url) { @@ -396,6 +412,20 @@ function buildDefaultSeeds(feedUrl) { } } +function normalizeLimit(value, fallback, minimum, maximum) { + const numeric = Number(value); + + if (numeric === -1) { + return Number.POSITIVE_INFINITY; + } + + if (!Number.isFinite(numeric)) { + return fallback; + } + + return Math.max(minimum, Math.min(numeric, maximum)); +} + function normalizeSite(site) { const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean)); const seeds = unique((site.seeds || []) @@ -407,8 +437,8 @@ function normalizeSite(site) { label: String(site.label || '').trim(), allowedHosts, seeds, - maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)), - maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)), + maxPages: normalizeLimit(site.maxPages, 15, 1, 500), + maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5), requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)), }; } @@ -515,9 +545,9 @@ async function crawlSite(site) { ? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url : current.url; const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts); - const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links); + const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links); - if (articleScore >= 3 && !discoveredArticleUrls.has(canonicalUrl)) { + if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) { const title = normalizeText(selectTitle(meta, jsonLdArticle, html)); if (title) { discoveredArticleUrls.add(canonicalUrl); diff --git a/src/sources/rss.js b/src/sources/rss.js index b7e6243..4f9f7a1 100644 --- a/src/sources/rss.js +++ b/src/sources/rss.js @@ -1,5 +1,6 @@ const Parser = require('rss-parser'); const config = require('../config'); +const { fetchWithPolicy } = require('../http'); const parser = new Parser({ timeout: 10000, @@ -43,9 +44,11 @@ const invalidFeedLabels = new Set([ const malformedFeedLabels = new Set([ 'BFM Business', 'Business Daily Africa', + 'Nation News Barbados', ]); const loggedBlockedFeeds = new Set(); const loggedInvalidFeeds = new Set(); +const loggedUpstreamFeedSkips = new Set(); function getHostname(url) { try { @@ -65,6 +68,34 @@ function isMalformedFeedError(error) { return message.includes('Invalid character in entity name') || message.includes('Attribute without value'); } +function getErrorStatus(error) { + if (error && Number.isInteger(error.status)) { + return error.status; + } + + const match = String(error && error.message || '').match(/\b(401|403|404|408|429|5\d\d)\b/); + return match ? Number(match[1]) : null; +} + +async function parseFeed(feedUrl) { + const response = await fetchWithPolicy(feedUrl, { + timeout: 10000, + retries: 1, + headers: { + Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8', + }, + }); + + if (!response.ok) { + const error = new Error(`Status code ${response.status}`); + error.status = response.status; + throw error; + } + + const xml = await response.text(); + return parser.parseString(xml); +} + async function fetchRssArticles() { const articles = []; @@ -89,7 +120,7 @@ async function fetchRssArticles() { } try { - const parsed = await parser.parseURL(feed.url); + const parsed = await parseFeed(feed.url); for (const item of parsed.items || []) { const title = String(item.title || '').trim(); const url = String(item.link || item.guid || '').trim(); @@ -115,6 +146,16 @@ async function fetchRssArticles() { continue; } + const status = getErrorStatus(error); + if (status === 401 || status === 403 || status === 404 || status === 429) { + const key = `${label}:${status}`; + if (!loggedUpstreamFeedSkips.has(key)) { + loggedUpstreamFeedSkips.add(key); + console.warn(`RSS feed skipped for ${label}: upstream returned ${status}`); + } + continue; + } + console.error(`Failed to fetch RSS feed: ${label}`, error); } }