600 lines
16 KiB
JavaScript
600 lines
16 KiB
JavaScript
const config = require('../config');
|
|
const { fetchWithPolicy } = require('../http');
|
|
|
|
/**
 * Query-parameter name patterns that shouldDropParam() treats as tracking
 * noise; matching parameters are removed when a URL is canonicalized.
 */
const TRACKING_PARAM_PATTERNS = [
  /^utm_/i,
  /^fbclid$/i,
  /^gclid$/i,
  /^mkt_tok$/i,
  /^mc_cid$/i,
  /^mc_eid$/i,
  /^ref$/i,
  /^ref_src$/i,
  /^s$/i,
  /^cmpid$/i,
  /^guccounter$/i,
  /^guce_referrer$/i,
  /^guce_referrer_sig$/i,
];

/** Words whose presence in a path adds to the listing score in scorePage(). */
const LISTING_PATH_HINT = /(archive|archives|latest|topic|topics|section|sections|category|categories|news|world|business|politics|technology|tech|markets|economy|page|tag|tags)/i;

/** Matches a `/YYYY/MM/DD` or `/YYYY/MM` segment run followed by `/` or the end of the string. */
const ARTICLE_DATE_PATH = /\/\d{4}\/\d{2}\/\d{2}(?:\/|$)|\/\d{4}\/\d{2}(?:\/|$)/;

/** Path segments that scorePage() counts as an article hint. */
const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/stories\/)/i;

/** Matches `/YYYY/MM/DD/` only when a further path component follows the date. */
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;

/** Listing-style paths for which scorePage() withholds some article points and adds listing points. */
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;

/** Paths that shouldQueueLink() refuses to enqueue (search, login, account, video, podcast, live). */
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
|
|
|
/** Named character references this module decodes (`&nbsp;` becomes a plain space). */
const HTML_NAMED_ENTITIES = Object.freeze({
  quot: '"',
  apos: "'",
  amp: '&',
  lt: '<',
  gt: '>',
  nbsp: ' ',
});

/**
 * Decodes numeric (`&#39;`, `&#x27;`) and a small set of named HTML character
 * references (`&quot; &apos; &amp; &lt; &gt; &nbsp;`).
 *
 * Decoding happens in a single pass so already-escaped text is not decoded
 * twice (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * @param {*} value - Text to decode; falsy values are treated as ''.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(value) {
  const pattern = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(quot|apos|amp|lt|gt|nbsp));/g;

  return String(value || '').replace(pattern, (entity, hex, dec, name) => {
    if (name) {
      return HTML_NAMED_ENTITIES[name];
    }

    const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);

    // String.fromCodePoint throws a RangeError above U+10FFFF; substitute the
    // replacement character instead of letting bad markup abort the caller.
    if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
      return '\uFFFD';
    }

    return String.fromCodePoint(codePoint);
  });
}
|
|
|
|
/**
 * Replaces every `<...>` tag with a space, decodes HTML entities, then
 * collapses whitespace runs to single spaces and trims the result.
 *
 * @param {*} value - Markup to flatten; falsy values are treated as ''.
 * @returns {string} Plain text.
 */
function stripTags(value) {
  const withoutMarkup = String(value || '').replace(/<[^>]*>/g, ' ');
  const decoded = decodeHtmlEntities(withoutMarkup);

  return decoded.replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Produces trimmed, single-spaced plain text from a markup fragment by
 * running it through stripTags() and collapsing whitespace once more.
 *
 * @param {*} value - Markup or text.
 * @returns {string} Normalized plain text.
 */
function normalizeText(value) {
  const plain = stripTags(value);

  return plain.replace(/\s+/g, ' ').trim();
}
|
|
|
|
/**
 * Checks whether a hostname equals, or is a subdomain of, one of the allowed
 * hosts. Comparison is case-insensitive.
 *
 * Empty hostnames and empty allow-list entries never match: an empty entry
 * would otherwise accept any hostname ending in "." (for example the
 * fully-qualified form "evil.com.").
 *
 * @param {string} hostname - Host to check.
 * @param {string[]} allowedHosts - Allowed host names.
 * @returns {boolean} True when the host is permitted.
 */
function isAllowedHost(hostname, allowedHosts) {
  const host = String(hostname || '').toLowerCase();

  if (!host || !Array.isArray(allowedHosts)) {
    return false;
  }

  return allowedHosts.some((allowedHost) => {
    const candidate = String(allowedHost || '').toLowerCase();

    if (!candidate) {
      return false;
    }

    return host === candidate || host.endsWith(`.${candidate}`);
  });
}
|
|
|
|
/**
 * Tells whether a query-parameter name matches any tracking pattern.
 *
 * @param {string} key - Query-parameter name.
 * @returns {boolean} True when the parameter should be removed.
 */
function shouldDropParam(key) {
  for (const pattern of TRACKING_PARAM_PATTERNS) {
    if (pattern.test(key)) {
      return true;
    }
  }

  return false;
}
|
|
|
|
/**
 * Resolves a URL against a base and reduces it to a canonical string:
 * http(s) only, optional host allow-list, no fragment or credentials,
 * tracking parameters removed, remaining parameters sorted by name, and
 * trailing slashes stripped from non-root paths.
 *
 * @param {string} rawUrl - Absolute or relative URL.
 * @param {string} baseUrl - Base used to resolve relative URLs.
 * @param {string[]} [allowedHosts] - When non-empty, hosts the URL must belong to.
 * @returns {string|null} Canonical URL, or null when the URL is unusable.
 */
function canonicalizeUrl(rawUrl, baseUrl, allowedHosts) {
  try {
    const parsed = new URL(rawUrl, baseUrl);
    const isWebScheme = ['http:', 'https:'].includes(parsed.protocol);

    if (!isWebScheme) {
      return null;
    }

    const hostRestricted = allowedHosts && allowedHosts.length;
    if (hostRestricted && !isAllowedHost(parsed.hostname, allowedHosts)) {
      return null;
    }

    parsed.hash = '';
    parsed.username = '';
    parsed.password = '';

    const keptParams = [];
    for (const [key, value] of parsed.searchParams.entries()) {
      if (!shouldDropParam(key)) {
        keptParams.push([key, value]);
      }
    }
    keptParams.sort((first, second) => first[0].localeCompare(second[0]));

    // Rebuild the query string from scratch in sorted order.
    parsed.search = '';
    keptParams.forEach(([key, value]) => {
      parsed.searchParams.append(key, value);
    });

    if (parsed.pathname !== '/') {
      const trimmedPath = parsed.pathname.replace(/\/+$/, '');
      parsed.pathname = trimmedPath || '/';
    }

    return parsed.toString();
  } catch {
    // Unparseable input (or an unusable allow-list) yields "no URL".
    return null;
  }
}
|
|
|
|
/**
 * Reads a quoted attribute value from the source text of a single HTML tag.
 *
 * The attribute name is matched as a whole name (so `name` does not match
 * inside `data-name`), is regex-escaped before use, and the value may span
 * several lines.
 *
 * @param {string} tag - Tag source, e.g. `<meta name="x" content="y">`.
 * @param {string} name - Attribute name, matched case-insensitively.
 * @returns {string} Entity-decoded, trimmed value, or '' when absent.
 */
function extractAttribute(tag, name) {
  const escapedName = String(name).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // The lookbehind rejects matches that are the tail of a longer attribute name.
  const pattern = new RegExp(`(?<![\\w:.-])${escapedName}\\s*=\\s*(["'])([\\s\\S]*?)\\1`, 'i');
  const match = String(tag || '').match(pattern);

  return match ? decodeHtmlEntities(match[2]).trim() : '';
}
|
|
|
|
/**
 * Collects `<meta>` tags into a Map keyed by the lower-cased `property`
 * (preferred) or `name` attribute. Tags without both a key and a `content`
 * value are skipped; a later tag with the same key overwrites an earlier one.
 *
 * @param {string} html - Page markup.
 * @returns {Map<string, string>} Meta content by key.
 */
function extractMetaMap(html) {
  const metaByKey = new Map();

  for (const tag of html.match(/<meta\b[^>]*>/gi) || []) {
    const key = extractAttribute(tag, 'property') || extractAttribute(tag, 'name');
    const content = extractAttribute(tag, 'content');

    if (key && content) {
      metaByKey.set(key.toLowerCase(), content);
    }
  }

  return metaByKey;
}
|
|
|
|
/**
 * Finds the first `<link>` tag whose `rel` tokens include "canonical" and
 * that carries a non-empty `href`.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} The href value, or null when none is found.
 */
function extractCanonicalHref(html) {
  const linkTags = html.match(/<link\b[^>]*>/gi) || [];

  for (const tag of linkTags) {
    const relTokens = extractAttribute(tag, 'rel').toLowerCase().split(/\s+/);

    if (relTokens.includes('canonical')) {
      const href = extractAttribute(tag, 'href');
      if (href) {
        return href;
      }
    }
  }

  return null;
}
|
|
|
|
/**
 * Returns the normalized text of the first `<title>` element.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} Title text, or null when there is no `<title>`.
 */
function extractTitleTag(html) {
  const found = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);

  if (!found) {
    return null;
  }

  return normalizeText(found[1]);
}
|
|
|
|
/**
 * Returns the normalized text of the first `<h1>` element.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} Heading text, or null when there is no `<h1>`.
 */
function extractH1(html) {
  const found = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);

  if (!found) {
    return null;
  }

  return normalizeText(found[1]);
}
|
|
|
|
/**
 * Returns the quoted `datetime` attribute of the first `<time>` tag that has one.
 *
 * @param {string} html - Page markup.
 * @returns {string|null} Entity-decoded, trimmed value, or null when absent.
 */
function extractTimeDatetime(html) {
  const found = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);

  if (!found) {
    return null;
  }

  return decodeHtmlEntities(found[2]).trim();
}
|
|
|
|
/**
 * Sums the normalized text length of the first ten `<p>` elements.
 *
 * @param {string} html - Page markup.
 * @returns {number} Total character count.
 */
function extractParagraphTextLength(html) {
  const paragraphs = html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || [];
  let totalLength = 0;

  for (const paragraph of paragraphs.slice(0, 10)) {
    totalLength += normalizeText(paragraph).length;
  }

  return totalLength;
}
|
|
|
|
/**
 * Parses every `<script type="application/ld+json">` body in the markup.
 * Empty bodies and bodies that are not valid JSON are skipped.
 *
 * @param {string} html - Page markup.
 * @returns {Array<*>} Parsed JSON values in document order.
 */
function extractJsonLdBlocks(html) {
  const scriptPattern = /<script\b[^>]*type\s*=\s*(["'])application\/ld\+json\1[^>]*>([\s\S]*?)<\/script>/gi;
  const parsedBlocks = [];

  for (const found of String(html).matchAll(scriptPattern)) {
    const payload = String(found[2] || '').trim();

    if (payload) {
      try {
        parsedBlocks.push(JSON.parse(payload));
      } catch {
        // Malformed JSON-LD is ignored on purpose; other blocks may still parse.
      }
    }
  }

  return parsedBlocks;
}
|
|
|
|
/**
 * Visits every plain (non-array) object inside a JSON-like value in
 * depth-first pre-order: an object is visited before its property values,
 * and array items / property values are processed in their natural order.
 *
 * The traversal is iterative, so deeply nested input cannot overflow the
 * call stack, and each object is entered at most once, so a cyclic structure
 * terminates.
 *
 * @param {*} value - Root value; primitives and null are ignored.
 * @param {(node: object) => void} visit - Called once per object found.
 * @returns {void}
 */
function walkJson(value, visit) {
  const pending = [value];
  const entered = new WeakSet();

  while (pending.length) {
    const node = pending.pop();

    if (!node || typeof node !== 'object' || entered.has(node)) {
      continue;
    }
    entered.add(node);

    let children;
    if (Array.isArray(node)) {
      children = node;
    } else {
      visit(node);
      children = Object.values(node);
    }

    // Push in reverse so the first child is popped (and processed) first.
    for (let index = children.length - 1; index >= 0; index -= 1) {
      pending.push(children[index]);
    }
  }
}
|
|
|
|
/**
 * Tells whether a JSON-LD `@type` value names an article. Accepts a single
 * value or a (possibly nested) array of values; matching is case-insensitive
 * against "Article" and "NewsArticle".
 *
 * @param {*} type - `@type` value.
 * @returns {boolean} True when any entry is an article type.
 */
function isArticleType(type) {
  if (!Array.isArray(type)) {
    const normalized = String(type || '').toLowerCase();
    return ['article', 'newsarticle'].includes(normalized);
  }

  for (const entry of type) {
    if (isArticleType(entry)) {
      return true;
    }
  }

  return false;
}
|
|
|
|
/**
 * Returns the first JSON-LD object in the page whose `@type` is an article
 * type, searching the blocks in document order and each block with walkJson().
 *
 * @param {string} html - Page markup.
 * @returns {object|null} The article node, or null when none exists.
 */
function extractArticleJsonLd(html) {
  for (const block of extractJsonLdBlocks(html)) {
    let firstArticle = null;

    walkJson(block, (node) => {
      if (!firstArticle && isArticleType(node['@type'])) {
        firstArticle = node;
      }
    });

    if (firstArticle) {
      return firstArticle;
    }
  }

  return null;
}
|
|
|
|
/**
 * Extracts the distinct, canonicalized link targets of `<a href>` elements
 * together with their normalized link text.
 *
 * The href is entity-decoded before it is resolved, because `&` is written
 * as `&amp;` inside HTML attributes and would otherwise corrupt the query
 * string. Only a real `href` attribute is matched (not `data-href` etc.),
 * and the first one in the tag wins.
 *
 * @param {string} html - Page markup.
 * @param {string} pageUrl - Base URL for resolving relative links.
 * @param {string[]} allowedHosts - Host allow-list passed to canonicalizeUrl().
 * @returns {Array<{url: string, text: string}>} Links in document order;
 *   for a repeated URL only the first occurrence is kept.
 */
function extractLinks(html, pageUrl, allowedHosts) {
  const links = [];
  const seen = new Set();
  const anchorPattern = /<a\b[^>]*?\shref\s*=\s*(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi;

  for (const found of String(html).matchAll(anchorPattern)) {
    const href = decodeHtmlEntities(found[2]).trim();
    const url = canonicalizeUrl(href, pageUrl, allowedHosts);

    if (!url || seen.has(url)) {
      continue;
    }

    seen.add(url);
    links.push({ url, text: normalizeText(found[3]) });
  }

  return links;
}
|
|
|
|
/**
 * Picks the first non-blank title candidate, in priority order: `og:title`,
 * `twitter:title`, JSON-LD `headline`, first `<h1>`, `<title>`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap().
 * @param {object|null} jsonLdArticle - JSON-LD article node, if any.
 * @param {string} html - Page markup.
 * @returns {*} The chosen candidate, or null when all are blank.
 */
function selectTitle(meta, jsonLdArticle, html) {
  const candidates = [
    meta.get('og:title'),
    meta.get('twitter:title'),
    jsonLdArticle && jsonLdArticle.headline,
    extractH1(html),
    extractTitleTag(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
|
|
|
/**
 * Picks the first non-blank description candidate, in priority order:
 * `og:description`, `description`, JSON-LD `description`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap().
 * @param {object|null} jsonLdArticle - JSON-LD article node, if any.
 * @returns {*} The chosen candidate, or null when all are blank.
 */
function selectDescription(meta, jsonLdArticle) {
  const candidates = [
    meta.get('og:description'),
    meta.get('description'),
    jsonLdArticle && jsonLdArticle.description,
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
|
|
|
/**
 * Picks the first non-blank publication-date candidate, in priority order:
 * JSON-LD `datePublished`, `article:published_time`,
 * `og:article:published_time`, first `<time datetime>`.
 *
 * @param {Map<string, string>} meta - Meta map from extractMetaMap().
 * @param {object|null} jsonLdArticle - JSON-LD article node, if any.
 * @param {string} html - Page markup.
 * @returns {*} The chosen raw value, or null when all are blank.
 */
function selectPubDate(meta, jsonLdArticle, html) {
  const candidates = [
    jsonLdArticle && jsonLdArticle.datePublished,
    meta.get('article:published_time'),
    meta.get('og:article:published_time'),
    extractTimeDatetime(html),
  ];

  for (const candidate of candidates) {
    if (String(candidate || '').trim()) {
      return candidate;
    }
  }

  return null;
}
|
|
|
|
/**
 * Scores a fetched page as "article" versus "listing" from its URL path,
 * metadata, markup and outgoing links.
 *
 * All path heuristics are evaluated against the URL's pathname only, so a
 * query string or host name that happens to contain `/2024/01/` or `/news/`
 * cannot earn article points.
 *
 * @param {string} pageUrl - Absolute page URL (must be parseable by `URL`).
 * @param {Map<string, string>} meta - Meta map from extractMetaMap().
 * @param {string} html - Page markup.
 * @param {object|null} jsonLdArticle - JSON-LD article node, if any.
 * @param {Array<{url: string, text: string}>} links - Links found on the page.
 * @returns {{articleScore: number, listingScore: number, isArticleCandidate: boolean}}
 * @throws {TypeError} When `pageUrl` is not a valid absolute URL.
 */
function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
  const { pathname } = new URL(pageUrl);
  const hasArticleDatePath = ARTICLE_DATE_PATH.test(pathname);
  const hasArticlePathHint = ARTICLE_PATH_HINT.test(pathname);
  const hasStrongArticlePath = ARTICLE_PATH_STRONG_HINT.test(pathname);
  const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
  const paragraphTextLength = extractParagraphTextLength(html);
  const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
  const hasPublishedTime = Boolean(
    meta.get('article:published_time')
      || meta.get('og:article:published_time')
      || extractTimeDatetime(html),
  );

  let articleScore = 0;
  let listingScore = 0;

  if (jsonLdArticle) {
    articleScore += 4;
  }

  // Metadata signals are not trusted on listing-style paths (category/tag/...).
  if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
    articleScore += 1;
  }

  if (hasPublishedTime && !hasListingFalsePositivePath) {
    articleScore += 1;
  }

  if (/<article\b/i.test(html)) {
    articleScore += 1;
  }

  if (hasArticleDatePath || hasArticlePathHint) {
    articleScore += 2;
  }

  if (extractH1(html) && paragraphTextLength >= 500) {
    articleScore += 2;
  }

  if (links.length >= 20) {
    listingScore += 2;
  }

  if (headlineLinks >= 8) {
    listingScore += 2;
  }

  if (LISTING_PATH_HINT.test(pathname)) {
    listingScore += 1;
  }

  if (hasListingFalsePositivePath) {
    listingScore += 3;
  }

  // Any article evidence slightly discounts the listing score.
  if (articleScore > 0) {
    listingScore -= 1;
  }

  const hasStrongEvidence = Boolean(jsonLdArticle)
    || hasStrongArticlePath
    || hasArticlePathHint
    || paragraphTextLength >= 500;
  const isArticleCandidate = articleScore >= 4 && articleScore > listingScore && hasStrongEvidence;

  return { articleScore, listingScore, isArticleCandidate };
}
|
|
|
|
/**
 * Decides whether a discovered link should be enqueued: its lower-cased
 * path must not match BLOCKED_PATH_HINT and must not end in a listed media
 * or document file extension.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True when the link may be queued.
 */
function shouldQueueLink(url) {
  const { pathname } = new URL(url);
  const path = pathname.toLowerCase();

  if (BLOCKED_PATH_HINT.test(path)) {
    return false;
  }

  const looksLikeAsset = /\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(path);
  return !looksLikeAsset;
}
|
|
|
|
/**
 * Converts a label into a lower-case slug: every run of characters outside
 * `a-z0-9` becomes a single underscore, and leading/trailing underscores
 * are removed.
 *
 * @param {*} label - Label to convert; falsy values are treated as ''.
 * @returns {string} The slug (possibly empty).
 */
function slugifyLabel(label) {
  const lowered = String(label || '').toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/g, '_');

  return underscored.replace(/^_+|_+$/g, '');
}
|
|
|
|
/**
 * Returns the distinct truthy entries of an array, keeping first-seen order.
 *
 * @param {Array<*>} values - Input values.
 * @returns {Array<*>} De-duplicated truthy values.
 */
function unique(values) {
  const truthyValues = values.filter((value) => Boolean(value));
  const distinct = new Set(truthyValues);

  return Array.from(distinct);
}
|
|
|
|
/**
 * Builds the default host allow-list for a site: the host itself plus its
 * `www.` counterpart (added when missing, stripped when present).
 *
 * The hostname is lower-cased before the `www.` check, so an upper- or
 * mixed-case "WWW." prefix is recognized instead of being prefixed again.
 *
 * @param {string} hostname - Site hostname.
 * @returns {string[]} Lower-cased allowed hosts; empty when `hostname` is blank.
 */
function buildAllowedHosts(hostname) {
  const host = String(hostname || '').trim().toLowerCase();

  if (!host) {
    return [];
  }

  const counterpart = host.startsWith('www.') ? host.slice(4) : `www.${host}`;

  return unique([host, counterpart]);
}
|
|
|
|
/**
 * Reduces a feed URL path to a section path by applying an ordered series of
 * regex rewrites (index files, file extensions, rss/feed/xml segments and a
 * few provider-specific prefixes), dropping leftover feed-like segments, and
 * keeping at most the first three segments.
 *
 * @param {string} pathname - Feed URL pathname.
 * @returns {string} Section path starting with "/", or "/" when nothing remains.
 */
function cleanFeedPath(pathname) {
  // Order matters: each rewrite operates on the output of the previous one.
  const rewriteSteps = [
    [/\/index\.[a-z0-9]+$/i, '/'],
    [/\.[a-z0-9]+$/i, ''],
    [/\/rss(?:$|\/.*$)/i, '/'],
    [/\/feed(?:$|\/.*$)/i, '/'],
    [/\/feeds?(?:$|\/.*$)/i, '/'],
    [/\/xml(?:$|\/.*$)/i, '/'],
    [/\/arc\/outboundfeeds\//i, '/'],
    [/\/dynamo\//i, '/'],
    [/\/id\/\d+\/device\/rss\//i, '/'],
    [/\/contentexport\//i, '/'],
    [/\/rssfeedstopstories$/i, '/'],
    [/\/latest$/i, '/'],
    [/\/+$|^$/g, ''],
  ];

  let stripped = pathname;
  for (const [pattern, replacement] of rewriteSteps) {
    stripped = stripped.replace(pattern, replacement);
  }

  if (!stripped) {
    return '/';
  }

  const kept = stripped
    .split('/')
    .filter((segment) => segment && !/^(rss|feed|feeds|xml)$/i.test(segment))
    .slice(0, 3);

  return kept.length ? `/${kept.join('/')}` : '/';
}
|
|
|
|
/**
 * Derives default crawl seeds from a feed URL: the site root and, when the
 * cleaned feed path is not "/", that section path as well.
 *
 * The origin is built from `host` (hostname plus any explicit port), so a
 * feed served on a non-default port produces seeds on that same port.
 *
 * @param {string} feedUrl - Absolute feed URL.
 * @returns {string[]} Distinct canonical seed URLs; empty when `feedUrl` is invalid.
 */
function buildDefaultSeeds(feedUrl) {
  try {
    const parsed = new URL(feedUrl);
    const origin = `${parsed.protocol}//${parsed.host}`;
    const sectionPath = cleanFeedPath(parsed.pathname);
    const sectionSeed = sectionPath === '/'
      ? null
      : canonicalizeUrl(`${origin}${sectionPath}`, origin);

    return unique([canonicalizeUrl(origin, origin), sectionSeed]);
  } catch {
    // An unparseable feed URL simply yields no seeds.
    return [];
  }
}
|
|
|
|
/**
 * Normalizes a numeric limit from configuration.
 *
 * - `null`, `undefined`, blank strings and non-numeric values give `fallback`.
 * - `-1` means "unlimited" and gives `Infinity`. `Infinity` itself is also
 *   accepted as unlimited, so normalizing an already-normalized value is a
 *   no-op instead of silently reverting to `fallback`.
 * - Any other finite number is clamped to `[minimum, maximum]`.
 *
 * @param {*} value - Raw configured value.
 * @param {number} fallback - Value used when `value` is unset or not numeric.
 * @param {number} minimum - Lower clamp bound.
 * @param {number} maximum - Upper clamp bound.
 * @returns {number} The normalized limit.
 */
function normalizeLimit(value, fallback, minimum, maximum) {
  const isUnset = value === null
    || value === undefined
    || (typeof value === 'string' && value.trim() === '');

  if (isUnset) {
    return fallback;
  }

  const numeric = Number(value);

  if (numeric === -1 || numeric === Number.POSITIVE_INFINITY) {
    return Number.POSITIVE_INFINITY;
  }

  if (!Number.isFinite(numeric)) {
    return fallback;
  }

  return Math.max(minimum, Math.min(numeric, maximum));
}
|
|
|
|
/**
 * Normalizes a crawler site definition: lower-cased distinct allowed hosts,
 * canonical distinct seeds restricted to those hosts, trimmed name/label,
 * and clamped limits (maxPages 1-500 default 15, maxDepth 0-5 default 1,
 * requestTimeout 1000-30000 default 15000).
 *
 * @param {object} site - Raw site definition.
 * @returns {{name: string, label: string, allowedHosts: string[], seeds: string[],
 *   maxPages: number, maxDepth: number, requestTimeout: number}} Normalized site.
 */
function normalizeSite(site) {
  const loweredHosts = (site.allowedHosts || [])
    .map((host) => String(host || '').toLowerCase())
    .filter(Boolean);
  const allowedHosts = unique(loweredHosts);

  const canonicalSeeds = (site.seeds || [])
    .map((seed) => canonicalizeUrl(seed, seed, allowedHosts))
    .filter(Boolean);
  const seeds = unique(canonicalSeeds);

  const rawTimeout = Number(site.requestTimeout) || 15000;
  const requestTimeout = Math.max(1000, Math.min(rawTimeout, 30000));

  return {
    name: String(site.name || '').trim(),
    label: String(site.label || '').trim(),
    allowedHosts,
    seeds,
    maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
    maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
    requestTimeout,
  };
}
|
|
|
|
/**
 * Looks up the per-label crawler override in `config.newsCrawler.overrides`.
 *
 * @param {string} label - Feed label.
 * @returns {object|null} The override entry, or null when none is configured.
 */
function getCrawlerSiteOverrides(label) {
  const overridesByLabel = config.newsCrawler?.overrides;
  const entry = overridesByLabel?.[label];

  return entry || null;
}
|
|
|
|
/**
 * Builds the list of sites to crawl: explicitly configured sites
 * (`config.newsCrawler.sites`) followed by sites derived from
 * `config.rssFeeds`. A feed is skipped when its label is blank, disabled,
 * already covered by an explicit site, or its URL cannot be parsed.
 * Only sites with a name, at least one allowed host and at least one seed
 * are returned.
 *
 * An override `maxDepth` of `0` is honored (nullish fallback), since 0 is a
 * valid depth meaning "seeds only".
 *
 * @returns {object[]} Normalized site definitions.
 */
function getConfiguredCrawlerSites() {
  const defaults = config.newsCrawler || {};
  const disabledLabels = new Set((defaults.disabledLabels || []).map((label) => String(label || '').trim()));
  const explicitSites = (defaults.sites || []).map((site) => normalizeSite(site));
  const explicitLabels = new Set(explicitSites.map((site) => site.label).filter(Boolean));
  const isUsable = (site) => Boolean(site.name && site.allowedHosts.length && site.seeds.length);
  const derivedSites = [];

  for (const feed of config.rssFeeds || []) {
    const label = String(feed?.label || '').trim();

    if (!label || disabledLabels.has(label) || explicitLabels.has(label)) {
      continue;
    }

    let hostname = '';
    try {
      hostname = new URL(feed.url).hostname;
    } catch {
      // A feed with an unparseable URL cannot seed a crawl.
      continue;
    }

    const override = getCrawlerSiteOverrides(label) || {};
    const site = normalizeSite({
      label,
      name: override.name || `crawler_${slugifyLabel(label)}`,
      allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
      seeds: override.seeds || buildDefaultSeeds(feed.url),
      maxPages: override.maxPages || defaults.maxPages,
      // `??` rather than `||`: an explicit depth of 0 must not fall back.
      maxDepth: override.maxDepth ?? defaults.maxDepth,
      requestTimeout: override.requestTimeout || defaults.requestTimeout,
    });

    if (isUsable(site)) {
      derivedSites.push(site);
    }
  }

  return [...explicitSites.filter(isUsable), ...derivedSites];
}
|
|
|
|
/**
 * Fetches a URL through fetchWithPolicy() (one retry) and returns its body
 * only when the response is OK and its content type is HTML or XHTML.
 *
 * @param {string} url - URL to fetch.
 * @param {number} timeout - Timeout value passed through to fetchWithPolicy().
 * @returns {Promise<string|null>} The body text, or null for non-OK or non-HTML responses.
 */
async function fetchHtml(url, timeout) {
  const response = await fetchWithPolicy(url, { timeout, retries: 1 });

  if (!response.ok) {
    return null;
  }

  const contentType = String(response.headers.get('content-type') || '').toLowerCase();
  const isHtml = contentType.includes('text/html') || contentType.includes('application/xhtml+xml');

  return isHtml ? response.text() : null;
}
|
|
|
|
/**
 * Crawls one site breadth-first from its seeds and returns the pages that
 * scorePage() classifies as articles.
 *
 * Every dequeued URL counts toward `maxPages`, including pages that fail to
 * fetch. Links are followed only from pages below `maxDepth` whose listing
 * score is at least 2. Articles are de-duplicated by canonical URL.
 *
 * @param {object} site - Site definition (normalized again here).
 * @returns {Promise<Array<{title: string, description: (string|null), url: string,
 *   source: string, pubDate: *}>>} Discovered articles in crawl order.
 */
async function crawlSite(site) {
  const target = normalizeSite(site);
  const { name, allowedHosts, seeds, maxPages, maxDepth, requestTimeout } = target;

  if (!name || !allowedHosts.length || !seeds.length) {
    return [];
  }

  const queue = seeds.map((url) => ({ url, depth: 0 }));
  const queuedUrls = new Set(seeds);
  const visitedUrls = new Set();
  const discoveredArticleUrls = new Set();
  const articles = [];

  while (queue.length && visitedUrls.size < maxPages) {
    const current = queue.shift();

    if (!current || visitedUrls.has(current.url)) {
      continue;
    }

    visitedUrls.add(current.url);

    let html;
    try {
      html = await fetchHtml(current.url, requestTimeout);
    } catch (error) {
      // A failed page is logged and skipped; the crawl continues.
      console.error(`Crawler fetch failed for ${name}: ${current.url}`, error);
      continue;
    }

    if (!html) {
      continue;
    }

    const meta = extractMetaMap(html);
    const jsonLdArticle = extractArticleJsonLd(html);
    const canonicalHref = extractCanonicalHref(html);
    // Prefer the page's declared canonical URL when it is usable and allowed.
    const canonicalUrl = (canonicalHref && canonicalizeUrl(canonicalHref, current.url, allowedHosts))
      || current.url;
    const links = extractLinks(html, canonicalUrl, allowedHosts);
    const { listingScore, isArticleCandidate } = scorePage(canonicalUrl, meta, html, jsonLdArticle, links);

    if (isArticleCandidate && !discoveredArticleUrls.has(canonicalUrl)) {
      const title = normalizeText(selectTitle(meta, jsonLdArticle, html));

      if (title) {
        const description = normalizeText(selectDescription(meta, jsonLdArticle)) || null;
        const pubDate = selectPubDate(meta, jsonLdArticle, html);

        discoveredArticleUrls.add(canonicalUrl);
        articles.push({
          title,
          description,
          url: canonicalUrl,
          source: name,
          pubDate,
        });
      }
    }

    if (current.depth >= maxDepth || listingScore < 2) {
      continue;
    }

    const nextDepth = current.depth + 1;
    for (const { url: linkUrl } of links) {
      const alreadyKnown = visitedUrls.has(linkUrl) || queuedUrls.has(linkUrl);

      if (shouldQueueLink(linkUrl) && !alreadyKnown) {
        queuedUrls.add(linkUrl);
        queue.push({ url: linkUrl, depth: nextDepth });
      }
    }
  }

  return articles;
}
|
|
|
|
/**
 * Crawls every configured site one after another and concatenates the
 * articles found. A site whose crawl throws is logged and skipped.
 *
 * @returns {Promise<object[]>} Articles from all sites, in site order.
 */
async function fetchCrawlerArticles() {
  const collected = [];

  for (const site of getConfiguredCrawlerSites()) {
    try {
      const siteArticles = await crawlSite(site);
      collected.push(...siteArticles);
    } catch (error) {
      const siteName = site && site.name ? site.name : 'unknown_site';
      console.error(`Crawler failed for ${siteName}`, error);
    }
  }

  return collected;
}
|
|
|
|
module.exports = {
|
|
fetchCrawlerArticles,
|
|
crawlSite,
|
|
canonicalizeUrl,
|
|
getConfiguredCrawlerSites,
|
|
};
|