add browser crawling capabilities and enhance configuration options

This commit is contained in:
ImBenji 2026-04-17 16:53:18 +01:00
parent c6bbe2a061
commit 1a8504389a
6 changed files with 298 additions and 49 deletions

View file

@ -394,6 +394,7 @@
"newsCrawler": {
"maxPages": -1,
"maxDepth": 10,
"pageConcurrency": 4,
"requestTimeout": 15000,
"disabledLabels": [
"Arab News",
@ -463,6 +464,7 @@
"www.cnbc.com",
"cnbc.com"
],
"renderMode": "browser",
"seeds": [
"https://www.cnbc.com/world/",
"https://www.cnbc.com/business/",
@ -572,6 +574,7 @@
"fortune.com",
"www.fortune.com"
],
"renderMode": "browser",
"seeds": [
"https://fortune.com/",
"https://fortune.com/section/tech/",
@ -583,11 +586,23 @@
"www.forbes.com",
"forbes.com"
],
"renderMode": "browser",
"seeds": [
"https://www.forbes.com/business/",
"https://www.forbes.com/innovation/"
]
},
"Financial Times": {
"allowedHosts": [
"www.ft.com",
"ft.com"
],
"renderMode": "browser",
"seeds": [
"https://www.ft.com/world/us",
"https://www.ft.com/technology"
]
},
"Nikkei Asia": {
"allowedHosts": [
"asia.nikkei.com"
@ -635,6 +650,7 @@
"www.wired.com",
"wired.com"
],
"renderMode": "browser",
"seeds": [
"https://www.wired.com/category/business/",
"https://www.wired.com/category/security/"
@ -644,6 +660,7 @@
"allowedHosts": [
"finance.yahoo.com"
],
"renderMode": "browser",
"seeds": [
"https://finance.yahoo.com/",
"https://finance.yahoo.com/news/",

82
package-lock.json generated
View file

@ -10,9 +10,11 @@
"license": "ISC",
"dependencies": {
"@extractus/article-extractor": "^8.0.18",
"@fastify/cors": "^11.2.0",
"better-sqlite3": "^12.4.1",
"fastify": "^5.6.1",
"node-cron": "^4.2.1",
"playwright": "^1.59.1",
"rss-parser": "^3.13.0",
"sharp": "^0.34.5",
"sqlite-vec": "^0.1.9"
@ -65,6 +67,26 @@
"fast-uri": "^3.0.0"
}
},
"node_modules/@fastify/cors": {
"version": "11.2.0",
"resolved": "https://registry.npmjs.org/@fastify/cors/-/cors-11.2.0.tgz",
"integrity": "sha512-LbLHBuSAdGdSFZYTLVA3+Ch2t+sA6nq3Ejc6XLAKiQ6ViS2qFnvicpj0htsx03FyYeLs04HfRNBsz/a8SvbcUw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fastify"
},
{
"type": "opencollective",
"url": "https://opencollective.com/fastify"
}
],
"license": "MIT",
"dependencies": {
"fastify-plugin": "^5.0.0",
"toad-cache": "^3.7.0"
}
},
"node_modules/@fastify/error": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/@fastify/error/-/error-4.2.0.tgz",
@ -1097,6 +1119,22 @@
"toad-cache": "^3.7.0"
}
},
"node_modules/fastify-plugin": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/fastify-plugin/-/fastify-plugin-5.1.0.tgz",
"integrity": "sha512-FAIDA8eovSt5qcDgcBvDuX/v0Cjz0ohGhENZ/wpc3y+oZCY2afZ9Baqql3g/lC+OHRnciQol4ww7tuthOb9idw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fastify"
},
{
"type": "opencollective",
"url": "https://opencollective.com/fastify"
}
],
"license": "MIT"
},
"node_modules/fastq": {
"version": "1.20.1",
"resolved": "https://registry.npmjs.org/fastq/-/fastq-1.20.1.tgz",
@ -1132,6 +1170,20 @@
"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==",
"license": "MIT"
},
"node_modules/fsevents": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
"hasInstallScript": true,
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/github-from-package": {
"version": "0.0.0",
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
@ -1482,6 +1534,36 @@
"integrity": "sha512-BndPH67/JxGExRgiX1dX0w1FvZck5Wa4aal9198SrRhZjH3GxKQUKIBnYJTdj2HDN3UQAS06HlfcSbQj2OHmaw==",
"license": "MIT"
},
"node_modules/playwright": {
"version": "1.59.1",
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
"integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
"license": "Apache-2.0",
"dependencies": {
"playwright-core": "1.59.1"
},
"bin": {
"playwright": "cli.js"
},
"engines": {
"node": ">=18"
},
"optionalDependencies": {
"fsevents": "2.3.2"
}
},
"node_modules/playwright-core": {
"version": "1.59.1",
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
"integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
"license": "Apache-2.0",
"bin": {
"playwright-core": "cli.js"
},
"engines": {
"node": ">=18"
}
},
"node_modules/postcss": {
"version": "8.5.10",
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.10.tgz",

View file

@ -12,9 +12,11 @@
"type": "commonjs",
"dependencies": {
"@extractus/article-extractor": "^8.0.18",
"@fastify/cors": "^11.2.0",
"better-sqlite3": "^12.4.1",
"fastify": "^5.6.1",
"node-cron": "^4.2.1",
"playwright": "^1.59.1",
"rss-parser": "^3.13.0",
"sharp": "^0.34.5",
"sqlite-vec": "^0.1.9"

View file

@ -1,4 +1,5 @@
const Fastify = require('fastify');
const cors = require('@fastify/cors');
const articleRoutes = require('./src/routes/articles');
const statusRoutes = require('./src/routes/status');
const config = require('./src/config');
@ -6,6 +7,7 @@ const { startScheduler } = require('./src/scheduler');
const app = Fastify({ logger: true });
app.register(cors, { origin: true });
app.register(articleRoutes);
app.register(statusRoutes);

View file

@ -0,0 +1,90 @@
const { chromium } = require('playwright');
// User-Agent string passed to every browser context created by createBrowserSession().
const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36';
// Cached promise for the Chromium instance; null until getBrowser() launches one.
let browserPromise = null;
/**
 * Returns the shared Chromium instance, launching it on first use.
 *
 * The launch promise is cached in `browserPromise` so that concurrent callers
 * share a single browser. The cache is cleared when the launch rejects or when
 * the browser later disconnects, so the next call starts a fresh browser
 * instead of handing back a permanently failed or dead one.
 *
 * NOTE(review): the browser is launched headed (`headless: false`); confirm
 * that the deployment environment provides a display.
 *
 * @returns {Promise<object>} Resolves with the Playwright browser.
 */
async function getBrowser() {
  if (!browserPromise) {
    const launching = chromium.launch({
      headless: false,
    });
    browserPromise = launching;

    // Only clear the cache if it still points at this launch attempt; a newer
    // attempt may already have replaced it.
    const forgetLaunch = () => {
      if (browserPromise === launching) {
        browserPromise = null;
      }
    };

    launching.then(
      (browser) => {
        browser.on('disconnected', forgetLaunch);
      },
      // The rejection itself is still delivered to callers through the
      // returned promise; here we only make sure it is not cached forever.
      forgetLaunch,
    );
  }

  return browserPromise;
}
/**
 * Gives a freshly navigated page a bounded chance to finish rendering.
 *
 * Two waits run one after the other: first for the `networkidle` load state,
 * then for the document to contain more than 20 `a[href]` elements. Each wait
 * is capped at the smaller of `site.requestTimeout` and 5000 ms. Both are
 * best-effort: any failure (including a timeout) is ignored and the function
 * always resolves.
 *
 * @param {object} page - Playwright page that has already been navigated.
 * @param {object} site - Site settings; only `requestTimeout` is read.
 * @returns {Promise<void>}
 */
async function waitForUsefulDom(page, site) {
  const boundedTimeout = () => Math.min(site.requestTimeout, 5000);

  const settleSteps = [
    () => page.waitForLoadState('networkidle', { timeout: boundedTimeout() }),
    () => page.waitForFunction(() => document.querySelectorAll('a[href]').length > 20, {
      timeout: boundedTimeout(),
    }),
  ];

  for (const step of settleSteps) {
    try {
      await step();
    } catch {
      // Best-effort wait: fall through and use whatever DOM is available.
    }
  }
}
/**
 * Opens a dedicated browser context for crawling `site` and returns a small
 * session object for fetching rendered pages through it.
 *
 * Requests for images, media and fonts are aborted inside the context; all
 * other requests continue unchanged. If installing that route fails, the
 * context is closed before the error is rethrown so it is not leaked.
 *
 * @param {object} site - Site settings; `requestTimeout` is used as the
 *   navigation timeout and is passed on to waitForUsefulDom().
 * @returns {Promise<{fetchRenderedHtml: Function, close: Function}>}
 *   `fetchRenderedHtml(url, options)` resolves with the page HTML, or with
 *   `{ html, finalUrl, title, linkCount }` when `options.includeDebug` is set.
 *   `close()` closes the underlying context.
 */
async function createBrowserSession(site) {
  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent: BROWSER_USER_AGENT,
    viewport: { width: 1440, height: 1200 },
    javaScriptEnabled: true,
  });

  try {
    await context.route('**/*', async (route) => {
      const resourceType = route.request().resourceType();

      // Heavy assets are not needed to read links or article text.
      if (['image', 'media', 'font'].includes(resourceType)) {
        await route.abort();
        return;
      }

      await route.continue();
    });
  } catch (error) {
    // The caller never receives a session to close, so release the context
    // here. A failure while closing must not mask the original error.
    await context.close().catch(() => {});
    throw error;
  }

  return {
    async fetchRenderedHtml(url, options = {}) {
      const page = await context.newPage();

      try {
        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: site.requestTimeout,
        });
        await waitForUsefulDom(page, site);

        const html = await page.content();
        if (options.includeDebug) {
          return {
            html,
            finalUrl: page.url(),
            title: await page.title(),
            linkCount: await page.locator('a[href]').count(),
          };
        }

        return html;
      } finally {
        // Always release the page, even when navigation or extraction throws.
        await page.close();
      }
    },

    async close() {
      await context.close();
    },
  };
}
/**
 * Tells whether a site is configured to be fetched through the browser.
 *
 * @param {object} site - Site settings; only `renderMode` is read.
 * @returns {boolean} True only when `renderMode` is exactly 'browser'.
 */
function shouldUseBrowser(site) {
  const { renderMode } = site;
  return renderMode === 'browser';
}
// Public API of this module: the session factory and the render-mode check.
// getBrowser() and waitForUsefulDom() are not exported.
module.exports = {
createBrowserSession,
shouldUseBrowser,
};

View file

@ -1,5 +1,6 @@
const config = require('../config');
const { fetchWithPolicy } = require('../http');
const { createBrowserSession, shouldUseBrowser } = require('./browserCrawler');
const TRACKING_PARAM_PATTERNS = [
/^utm_/i,
@ -22,6 +23,7 @@ const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/storie
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
const EXPLORATION_PATH_HINT = /(\/page\/\d+(?:\/|$)|[?&]page=\d+|\/archive(?:s)?(?:\/|$)|\/latest(?:\/|$)|\/news(?:\/|$)|\/world(?:\/|$)|\/business(?:\/|$)|\/politics(?:\/|$)|\/technology(?:\/|$)|\/tech(?:\/|$)|\/markets(?:\/|$)|\/economy(?:\/|$)|\/topic(?:s)?(?:\/|$)|\/section(?:s)?(?:\/|$)|\/category(?:ies)?(?:\/|$)|\/tag(?:s)?(?:\/|$))/i;
function decodeHtmlEntities(value) {
return String(value || '')
@ -173,25 +175,6 @@ function extractJsonLdBlocks(html) {
return blocks;
}
/**
 * Depth-first, pre-order traversal of a parsed JSON value.
 *
 * `visit` is called once for every non-array object reached, parent before
 * children. Arrays are descended into but are not passed to `visit`;
 * primitives and null are ignored.
 *
 * @param {*} value - Parsed JSON value to walk.
 * @param {(node: object) => void} visit - Callback invoked for each object.
 * @returns {void}
 */
function walkJson(value, visit) {
  if (value === null || typeof value !== 'object') {
    return;
  }

  const isList = Array.isArray(value);
  if (!isList) {
    visit(value);
  }

  // Children are read after visit() has run, matching the visiting order.
  for (const child of isList ? value : Object.values(value)) {
    walkJson(child, visit);
  }
}
function isArticleType(type) {
if (Array.isArray(type)) {
return type.some((entry) => isArticleType(entry));
@ -202,15 +185,24 @@ function isArticleType(type) {
function extractArticleJsonLd(html) {
const blocks = extractJsonLdBlocks(html);
let article = null;
for (const block of blocks) {
walkJson(block, (value) => {
if (!article && isArticleType(value['@type'])) {
article = value;
}
});
if (!block || typeof block !== 'object') {
continue;
}
if (isArticleType(block['@type'])) {
return block;
}
const graph = Array.isArray(block['@graph']) ? block['@graph'] : [];
const directChildren = [
...graph,
...(Array.isArray(block.mainEntity) ? block.mainEntity : [block.mainEntity]),
...(Array.isArray(block.mainEntityOfPage) ? block.mainEntityOfPage : [block.mainEntityOfPage]),
].filter((value) => value && typeof value === 'object');
const article = directChildren.find((value) => isArticleType(value['@type']));
if (article) {
return article;
}
@ -276,16 +268,25 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
const paragraphTextLength = extractParagraphTextLength(html);
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
const h1 = extractH1(html);
const ogTitle = normalizeText(meta.get('og:title') || '');
const jsonLdHeadline = normalizeText(jsonLdArticle && jsonLdArticle.headline);
const jsonLdMatchesPage = jsonLdHeadline
&& ((h1 && (h1.includes(jsonLdHeadline) || jsonLdHeadline.includes(h1)))
|| (ogTitle && (ogTitle.includes(jsonLdHeadline) || jsonLdHeadline.includes(ogTitle))));
const hasJsonLdArticle = Boolean(jsonLdArticle && jsonLdMatchesPage);
const hasPublishTime = Boolean(meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html));
const hasOgArticle = String(meta.get('og:type') || '').toLowerCase() === 'article';
if (jsonLdArticle) {
if (hasJsonLdArticle) {
articleScore += 4;
}
if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
if (hasOgArticle && !hasListingFalsePositivePath) {
articleScore += 1;
}
if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) {
if (hasPublishTime && !hasListingFalsePositivePath) {
articleScore += 1;
}
@ -297,7 +298,7 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
articleScore += 2;
}
if (extractH1(html) && paragraphTextLength >= 500) {
if (h1 && paragraphTextLength >= 500) {
articleScore += 2;
}
@ -321,23 +322,49 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
listingScore -= 1;
}
const isArticleCandidate = articleScore >= 4
const hasArticleSignalsBeyondJsonLd = hasOgArticle || hasPublishTime || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500;
const looksLikeListingPage = headlineLinks >= 15;
const isArticleCandidate = !looksLikeListingPage
&& articleScore >= 5
&& articleScore > listingScore
&& (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
&& hasArticleSignalsBeyondJsonLd
&& (!jsonLdArticle || hasJsonLdArticle || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
return { articleScore, listingScore, isArticleCandidate };
}
/**
 * Decides whether a discovered link is worth adding to the crawl queue.
 *
 * A link is rejected when its path matches BLOCKED_PATH_HINT, when it points
 * at a consent or redirect endpoint (host containing `consent.yahoo.com`, or a
 * path starting with `/v2/` or `/redirect`), or when its path ends in a
 * static-asset or media file extension.
 *
 * @param {string} url - Absolute URL; `new URL(url)` throws if it is not one.
 * @returns {boolean} True when the link may be queued.
 */
function shouldQueueLink(url) {
  // Parse once: both the path and the host are needed below.
  const parsed = new URL(url);
  const pathname = parsed.pathname.toLowerCase();
  const hostname = parsed.hostname.toLowerCase();

  if (BLOCKED_PATH_HINT.test(pathname)) {
    return false;
  }

  if (hostname.includes('consent.yahoo.com') || pathname.startsWith('/v2/') || pathname.startsWith('/redirect')) {
    return false;
  }

  return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
}
/**
 * Decides whether the crawler should follow the links found on a page.
 *
 * Exploration continues when any of the following holds, checked in order:
 * the page is a seed (depth 0); it scored at least 1 as a listing; it is at
 * depth 1 or shallower and exposes at least 8 links; or its URL matches
 * EXPLORATION_PATH_HINT.
 *
 * @param {{url: string, depth: number}} current - Queue entry being processed.
 * @param {number} listingScore - Listing score computed for the page.
 * @param {Array} links - Links extracted from the page.
 * @returns {boolean} True when the page's links should be queued.
 */
function shouldContinueExploring(current, listingScore, links) {
  return current.depth === 0
    || listingScore >= 1
    || (current.depth <= 1 && links.length >= 8)
    || EXPLORATION_PATH_HINT.test(current.url);
}
function slugifyLabel(label) {
return String(label || '')
.toLowerCase()
@ -437,8 +464,10 @@ function normalizeSite(site) {
label: String(site.label || '').trim(),
allowedHosts,
seeds,
renderMode: String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 'browser' : 'http',
maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
pageConcurrency: normalizeLimit(site.pageConcurrency, String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 3 : 4, 1, 12),
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
};
}
@ -473,8 +502,10 @@ function getConfiguredCrawlerSites() {
name: override.name || `crawler_${slugifyLabel(label)}`,
allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
seeds: override.seeds || buildDefaultSeeds(feed.url),
renderMode: override.renderMode || defaults.renderMode,
maxPages: override.maxPages || defaults.maxPages,
maxDepth: override.maxDepth || defaults.maxDepth,
pageConcurrency: override.pageConcurrency,
requestTimeout: override.requestTimeout || defaults.requestTimeout,
});
@ -486,9 +517,13 @@ function getConfiguredCrawlerSites() {
return [...explicitSites.filter((site) => site.name && site.allowedHosts.length && site.seeds.length), ...derivedSites];
}
async function fetchHtml(url, timeout) {
async function fetchHtml(url, site, browserSession) {
if (browserSession) {
return browserSession.fetchRenderedHtml(url);
}
const response = await fetchWithPolicy(url, {
timeout,
timeout: site.requestTimeout,
retries: 1,
});
@ -511,31 +546,26 @@ async function crawlSite(site) {
return [];
}
const browserSession = shouldUseBrowser(normalizedSite)
? await createBrowserSession(normalizedSite)
: null;
const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 }));
const queuedUrls = new Set(normalizedSite.seeds);
const visitedUrls = new Set();
const discoveredArticleUrls = new Set();
const articles = [];
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
const current = queue.shift();
if (!current || visitedUrls.has(current.url)) {
continue;
}
visitedUrls.add(current.url);
async function processPage(current) {
let html;
try {
html = await fetchHtml(current.url, normalizedSite.requestTimeout);
html = await fetchHtml(current.url, normalizedSite, browserSession);
} catch (error) {
console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error);
continue;
return;
}
if (!html) {
continue;
return;
}
const meta = extractMetaMap(html);
@ -562,8 +592,8 @@ async function crawlSite(site) {
}
}
if (current.depth >= normalizedSite.maxDepth || listingScore < 2) {
continue;
if (current.depth >= normalizedSite.maxDepth || !shouldContinueExploring(current, listingScore, links)) {
return;
}
for (const link of links) {
@ -576,7 +606,33 @@ async function crawlSite(site) {
}
}
return articles;
try {
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
const batch = [];
while (queue.length && batch.length < normalizedSite.pageConcurrency && visitedUrls.size + batch.length < normalizedSite.maxPages) {
const current = queue.shift();
if (!current || visitedUrls.has(current.url)) {
continue;
}
visitedUrls.add(current.url);
batch.push(current);
}
if (!batch.length) {
continue;
}
await Promise.all(batch.map(processPage));
}
return articles;
} finally {
if (browserSession) {
await browserSession.close();
}
}
}
async function fetchCrawlerArticles() {