enhance news crawler configuration with new sources and improved request headers
This commit is contained in:
parent
11647e6a35
commit
77db05a555
3 changed files with 91 additions and 20 deletions
|
|
@ -392,8 +392,8 @@
|
||||||
"format": "json"
|
"format": "json"
|
||||||
},
|
},
|
||||||
"newsCrawler": {
|
"newsCrawler": {
|
||||||
"maxPages": 15,
|
"maxPages": -1,
|
||||||
"maxDepth": 1,
|
"maxDepth": 10,
|
||||||
"requestTimeout": 15000,
|
"requestTimeout": 15000,
|
||||||
"disabledLabels": [
|
"disabledLabels": [
|
||||||
"Arab News",
|
"Arab News",
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,8 @@ const TRACKING_PARAM_PATTERNS = [
|
||||||
const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
|
const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;
|
||||||
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
|
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;
|
||||||
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;
|
||||||
|
// Matches a complete /YYYY/MM/DD/ date segment in a path; unlike ARTICLE_DATE_PATH it
// requires the day component and a trailing slash. scorePage tests it against the URL
// pathname and accepts a match as one of the corroborating signals for an article candidate.
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
|
||||||
|
// Path segments (category, tag, latest, topic(s), section(s)) that mark a listing page even
// when its metadata looks article-like. When the pathname matches, scorePage ignores the
// og:type and published-time article signals and adds to the listing score instead.
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
|
||||||
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
||||||
|
|
||||||
function decodeHtmlEntities(value) {
|
function decodeHtmlEntities(value) {
|
||||||
|
|
@ -267,30 +269,36 @@ function selectPubDate(meta, jsonLdArticle, html) {
|
||||||
function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
||||||
let articleScore = 0;
|
let articleScore = 0;
|
||||||
let listingScore = 0;
|
let listingScore = 0;
|
||||||
|
const pathname = new URL(pageUrl).pathname;
|
||||||
|
const hasArticleDatePath = ARTICLE_DATE_PATH.test(pageUrl);
|
||||||
|
const hasArticlePathHint = ARTICLE_PATH_HINT.test(pageUrl);
|
||||||
|
const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
|
||||||
|
const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
|
||||||
|
const paragraphTextLength = extractParagraphTextLength(html);
|
||||||
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
|
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
|
||||||
|
|
||||||
if (jsonLdArticle) {
|
if (jsonLdArticle) {
|
||||||
articleScore += 3;
|
articleScore += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (String(meta.get('og:type') || '').toLowerCase() === 'article') {
|
if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
|
||||||
articleScore += 2;
|
articleScore += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) {
|
if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) {
|
||||||
articleScore += 2;
|
articleScore += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (/<article\b/i.test(html)) {
|
if (/<article\b/i.test(html)) {
|
||||||
articleScore += 1;
|
articleScore += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ARTICLE_DATE_PATH.test(pageUrl) || ARTICLE_PATH_HINT.test(pageUrl)) {
|
if (hasArticleDatePath || hasArticlePathHint) {
|
||||||
articleScore += 1;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (extractH1(html) && extractParagraphTextLength(html) >= 500) {
|
if (extractH1(html) && paragraphTextLength >= 500) {
|
||||||
articleScore += 1;
|
articleScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (links.length >= 20) {
|
if (links.length >= 20) {
|
||||||
|
|
@ -301,15 +309,23 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
||||||
listingScore += 2;
|
listingScore += 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (LISTING_PATH_HINT.test(new URL(pageUrl).pathname)) {
|
if (LISTING_PATH_HINT.test(pathname)) {
|
||||||
listingScore += 1;
|
listingScore += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (articleScore > 0) {
|
if (hasListingFalsePositivePath) {
|
||||||
listingScore -= 2;
|
listingScore += 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
return { articleScore, listingScore };
|
if (articleScore > 0) {
|
||||||
|
listingScore -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const isArticleCandidate = articleScore >= 4
|
||||||
|
&& articleScore > listingScore
|
||||||
|
&& (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
|
||||||
|
|
||||||
|
return { articleScore, listingScore, isArticleCandidate };
|
||||||
}
|
}
|
||||||
|
|
||||||
function shouldQueueLink(url) {
|
function shouldQueueLink(url) {
|
||||||
|
|
@ -396,6 +412,20 @@ function buildDefaultSeeds(feedUrl) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a configured numeric limit such as maxPages or maxDepth.
 *
 * Rules, in order:
 *  - missing or blank input (null, undefined, '' or whitespace-only string) yields `fallback`;
 *  - -1 (number or numeric string) means "unlimited" and yields Number.POSITIVE_INFINITY,
 *    deliberately bypassing `maximum`;
 *  - anything that does not coerce to a finite number yields `fallback`;
 *  - every other value is clamped into the inclusive range [minimum, maximum].
 *
 * @param {*} value - Raw configured value.
 * @param {number} fallback - Returned when the value is missing or not numeric.
 * @param {number} minimum - Lower clamp bound.
 * @param {number} maximum - Upper clamp bound.
 * @returns {number} The normalized limit, possibly Number.POSITIVE_INFINITY.
 */
function normalizeLimit(value, fallback, minimum, maximum) {
  // Number(null), Number('') and Number('   ') are all 0, which would silently turn an
  // absent setting into the minimum instead of the documented default. Treat them as missing.
  const isBlankString = typeof value === 'string' && value.trim() === '';
  if (value === null || value === undefined || isBlankString) {
    return fallback;
  }

  const numeric = Number(value);

  // Sentinel for "no limit".
  if (numeric === -1) {
    return Number.POSITIVE_INFINITY;
  }

  // NaN and +/-Infinity are not usable limits.
  if (!Number.isFinite(numeric)) {
    return fallback;
  }

  return Math.max(minimum, Math.min(numeric, maximum));
}
|
||||||
|
|
||||||
function normalizeSite(site) {
|
function normalizeSite(site) {
|
||||||
const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
|
const allowedHosts = unique((site.allowedHosts || []).map((host) => String(host || '').toLowerCase()).filter(Boolean));
|
||||||
const seeds = unique((site.seeds || [])
|
const seeds = unique((site.seeds || [])
|
||||||
|
|
@ -407,8 +437,8 @@ function normalizeSite(site) {
|
||||||
label: String(site.label || '').trim(),
|
label: String(site.label || '').trim(),
|
||||||
allowedHosts,
|
allowedHosts,
|
||||||
seeds,
|
seeds,
|
||||||
maxPages: Math.max(1, Math.min(Number(site.maxPages) || 15, 500)),
|
maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
|
||||||
maxDepth: Math.max(0, Math.min(Number(site.maxDepth) || 1, 5)),
|
maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
|
||||||
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
|
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
@ -515,9 +545,9 @@ async function crawlSite(site) {
|
||||||
? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
|
? canonicalizeUrl(canonicalHref, current.url, normalizedSite.allowedHosts) || current.url
|
||||||
: current.url;
|
: current.url;
|
||||||
const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
|
const links = extractLinks(html, canonicalUrl, normalizedSite.allowedHosts);
|
||||||
const { articleScore, listingScore } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
|
const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);
|
||||||
|
|
||||||
if (articleScore >= 3 && !discoveredArticleUrls.has(canonicalUrl)) {
|
if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
|
||||||
const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
|
const title = normalizeText(selectTitle(meta, jsonLdArticle, html));
|
||||||
if (title) {
|
if (title) {
|
||||||
discoveredArticleUrls.add(canonicalUrl);
|
discoveredArticleUrls.add(canonicalUrl);
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
const Parser = require('rss-parser');
|
const Parser = require('rss-parser');
|
||||||
const config = require('../config');
|
const config = require('../config');
|
||||||
|
const { fetchWithPolicy } = require('../http');
|
||||||
|
|
||||||
const parser = new Parser({
|
const parser = new Parser({
|
||||||
timeout: 10000,
|
timeout: 10000,
|
||||||
|
|
@ -43,9 +44,11 @@ const invalidFeedLabels = new Set([
|
||||||
const malformedFeedLabels = new Set([
|
const malformedFeedLabels = new Set([
|
||||||
'BFM Business',
|
'BFM Business',
|
||||||
'Business Daily Africa',
|
'Business Daily Africa',
|
||||||
|
'Nation News Barbados',
|
||||||
]);
|
]);
|
||||||
const loggedBlockedFeeds = new Set();
|
const loggedBlockedFeeds = new Set();
|
||||||
const loggedInvalidFeeds = new Set();
|
const loggedInvalidFeeds = new Set();
|
||||||
|
const loggedUpstreamFeedSkips = new Set();
|
||||||
|
|
||||||
function getHostname(url) {
|
function getHostname(url) {
|
||||||
try {
|
try {
|
||||||
|
|
@ -65,6 +68,34 @@ function isMalformedFeedError(error) {
|
||||||
return message.includes('Invalid character in entity name') || message.includes('Attribute without value');
|
return message.includes('Invalid character in entity name') || message.includes('Attribute without value');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Derives an HTTP status code from an error-like value.
 *
 * An integer `status` property wins. Otherwise the error message is scanned for a
 * standalone 401, 403, 404, 408, 429 or 5xx token and the first hit is returned.
 *
 * @param {*} error - Thrown value; may be null/undefined or lack `status`/`message`.
 * @returns {number|null} The status code, or null when none can be determined.
 */
function getErrorStatus(error) {
  const explicitStatus = error ? error.status : undefined;
  if (Number.isInteger(explicitStatus)) {
    return explicitStatus;
  }

  const rawMessage = error ? error.message : undefined;
  const found = /\b(401|403|404|408|429|5\d\d)\b/.exec(String(rawMessage || ''));
  if (found === null) {
    return null;
  }

  return Number(found[1]);
}
|
||||||
|
|
||||||
|
/**
 * Downloads a feed via fetchWithPolicy and parses the response body with the
 * module-level rss-parser instance.
 *
 * The request uses a 10 second timeout, one retry, and an Accept header that
 * prefers RSS/XML content types.
 * NOTE(review): exact timeout/retry semantics are defined by fetchWithPolicy in ../http.
 *
 * @param {string} feedUrl - URL of the feed to fetch.
 * @returns {Promise<object>} Result of parser.parseString on the response text.
 * @throws {Error} With a numeric `status` property when the response is not ok.
 */
async function parseFeed(feedUrl) {
  const requestOptions = {
    timeout: 10000,
    retries: 1,
    headers: {
      Accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8',
    },
  };
  const response = await fetchWithPolicy(feedUrl, requestOptions);

  if (response.ok) {
    const xml = await response.text();
    return parser.parseString(xml);
  }

  // Attach the status so callers can classify the failure without parsing the message.
  const failure = new Error(`Status code ${response.status}`);
  failure.status = response.status;
  throw failure;
}
|
||||||
|
|
||||||
async function fetchRssArticles() {
|
async function fetchRssArticles() {
|
||||||
const articles = [];
|
const articles = [];
|
||||||
|
|
||||||
|
|
@ -89,7 +120,7 @@ async function fetchRssArticles() {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const parsed = await parser.parseURL(feed.url);
|
const parsed = await parseFeed(feed.url);
|
||||||
for (const item of parsed.items || []) {
|
for (const item of parsed.items || []) {
|
||||||
const title = String(item.title || '').trim();
|
const title = String(item.title || '').trim();
|
||||||
const url = String(item.link || item.guid || '').trim();
|
const url = String(item.link || item.guid || '').trim();
|
||||||
|
|
@ -115,6 +146,16 @@ async function fetchRssArticles() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const status = getErrorStatus(error);
|
||||||
|
if (status === 401 || status === 403 || status === 404 || status === 429) {
|
||||||
|
const key = `${label}:${status}`;
|
||||||
|
if (!loggedUpstreamFeedSkips.has(key)) {
|
||||||
|
loggedUpstreamFeedSkips.add(key);
|
||||||
|
console.warn(`RSS feed skipped for ${label}: upstream returned ${status}`);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
console.error(`Failed to fetch RSS feed: ${label}`, error);
|
console.error(`Failed to fetch RSS feed: ${label}`, error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue