From 1a8504389ade05a22b58127fdb622587c346538b Mon Sep 17 00:00:00 2001 From: ImBenji Date: Fri, 17 Apr 2026 16:53:18 +0100 Subject: [PATCH] add browser crawling capabilities and enhance configuration options --- config.json | 17 ++++ package-lock.json | 82 ++++++++++++++++++ package.json | 2 + server.js | 2 + src/sources/browserCrawler.js | 90 ++++++++++++++++++++ src/sources/newsCrawler.js | 154 +++++++++++++++++++++++----------- 6 files changed, 298 insertions(+), 49 deletions(-) create mode 100644 src/sources/browserCrawler.js diff --git a/config.json b/config.json index a7b0d4f..0fa2203 100644 --- a/config.json +++ b/config.json @@ -394,6 +394,7 @@ "newsCrawler": { "maxPages": -1, "maxDepth": 10, + "pageConcurrency": 4, "requestTimeout": 15000, "disabledLabels": [ "Arab News", @@ -463,6 +464,7 @@ "www.cnbc.com", "cnbc.com" ], + "renderMode": "browser", "seeds": [ "https://www.cnbc.com/world/", "https://www.cnbc.com/business/", @@ -572,6 +574,7 @@ "fortune.com", "www.fortune.com" ], + "renderMode": "browser", "seeds": [ "https://fortune.com/", "https://fortune.com/section/tech/", @@ -583,11 +586,23 @@ "www.forbes.com", "forbes.com" ], + "renderMode": "browser", "seeds": [ "https://www.forbes.com/business/", "https://www.forbes.com/innovation/" ] }, + "Financial Times": { + "allowedHosts": [ + "www.ft.com", + "ft.com" + ], + "renderMode": "browser", + "seeds": [ + "https://www.ft.com/world/us", + "https://www.ft.com/technology" + ] + }, "Nikkei Asia": { "allowedHosts": [ "asia.nikkei.com" @@ -635,6 +650,7 @@ "www.wired.com", "wired.com" ], + "renderMode": "browser", "seeds": [ "https://www.wired.com/category/business/", "https://www.wired.com/category/security/" @@ -644,6 +660,7 @@ "allowedHosts": [ "finance.yahoo.com" ], + "renderMode": "browser", "seeds": [ "https://finance.yahoo.com/", "https://finance.yahoo.com/news/", diff --git a/package-lock.json b/package-lock.json index bd8ffd7..808f9eb 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,9 +10,11 @@ "license": "ISC", "dependencies": { "@extractus/article-extractor": "^8.0.18", + "@fastify/cors": "^11.2.0", "better-sqlite3": "^12.4.1", "fastify": "^5.6.1", "node-cron": "^4.2.1", + "playwright": "^1.59.1", "rss-parser": "^3.13.0", "sharp": "^0.34.5", "sqlite-vec": "^0.1.9" @@ -65,6 +67,26 @@ "fast-uri": "^3.0.0" } }, + "node_modules/@fastify/cors": { + "version": "11.2.0", + "resolved": "https://registry.npmjs.org/@fastify/cors/-/cors-11.2.0.tgz", + "integrity": "sha512-LbLHBuSAdGdSFZYTLVA3+Ch2t+sA6nq3Ejc6XLAKiQ6ViS2qFnvicpj0htsx03FyYeLs04HfRNBsz/a8SvbcUw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT", + "dependencies": { + "fastify-plugin": "^5.0.0", + "toad-cache": "^3.7.0" + } + }, "node_modules/@fastify/error": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/@fastify/error/-/error-4.2.0.tgz", @@ -1097,6 +1119,22 @@ "toad-cache": "^3.7.0" } }, + "node_modules/fastify-plugin": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/fastify-plugin/-/fastify-plugin-5.1.0.tgz", + "integrity": "sha512-FAIDA8eovSt5qcDgcBvDuX/v0Cjz0ohGhENZ/wpc3y+oZCY2afZ9Baqql3g/lC+OHRnciQol4ww7tuthOb9idw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "MIT" + }, "node_modules/fastq": { "version": "1.20.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.20.1.tgz", @@ -1132,6 +1170,20 @@ "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", "license": "MIT" }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/github-from-package": { "version": "0.0.0", "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", @@ -1482,6 +1534,36 @@ "integrity": "sha512-BndPH67/JxGExRgiX1dX0w1FvZck5Wa4aal9198SrRhZjH3GxKQUKIBnYJTdj2HDN3UQAS06HlfcSbQj2OHmaw==", "license": "MIT" }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/postcss": { "version": "8.5.10", "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.10.tgz", diff --git a/package.json b/package.json index cc1b8f8..5ffc6e9 100644 --- a/package.json +++ b/package.json @@ -12,9 +12,11 @@ "type": "commonjs", "dependencies": { "@extractus/article-extractor": "^8.0.18", + "@fastify/cors": "^11.2.0", "better-sqlite3": "^12.4.1", "fastify": "^5.6.1", "node-cron": "^4.2.1", + "playwright": "^1.59.1", "rss-parser": "^3.13.0", "sharp": "^0.34.5", "sqlite-vec": "^0.1.9" diff --git a/server.js b/server.js index 04862c7..87330bb 100644 --- a/server.js +++ b/server.js @@ -1,4 +1,5 @@ const Fastify = require('fastify'); +const cors = require('@fastify/cors'); const articleRoutes = require('./src/routes/articles'); const statusRoutes = require('./src/routes/status'); const config = require('./src/config'); @@ -6,6 +7,7 @@ const { startScheduler } = require('./src/scheduler'); const app = Fastify({ logger: true }); +app.register(cors, { origin: true }); app.register(articleRoutes); app.register(statusRoutes); diff --git a/src/sources/browserCrawler.js b/src/sources/browserCrawler.js new file mode 100644 index 0000000..6eab8e6 --- /dev/null +++ b/src/sources/browserCrawler.js @@ -0,0 +1,90 @@ +const { chromium } = require('playwright'); + +const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'; +let browserPromise = null; + +async function getBrowser() { + if (!browserPromise) { + browserPromise = chromium.launch({ + headless: false, + }); + } + + return browserPromise; +} + +async function waitForUsefulDom(page, site) { + try { + await page.waitForLoadState('networkidle', { timeout: Math.min(site.requestTimeout, 5000) }); + } catch { + } + + try { + await page.waitForFunction(() => document.querySelectorAll('a[href]').length > 20, { + timeout: Math.min(site.requestTimeout, 5000), + }); + } catch { + } +} + +async function createBrowserSession(site) { + const browser = await getBrowser(); + const context = await browser.newContext({ + userAgent: BROWSER_USER_AGENT, + viewport: { width: 1440, height: 1200 }, + javaScriptEnabled: true, + }); + + await context.route('**/*', async (route) => { + const request = route.request(); + const resourceType = request.resourceType(); + + if (['image', 'media', 'font'].includes(resourceType)) { + await route.abort(); + return; + } + + await route.continue(); + }); + + return { + async fetchRenderedHtml(url, options = {}) { + const page = await context.newPage(); + + try { + await page.goto(url, { + waitUntil: 'domcontentloaded', + timeout: site.requestTimeout, + }); + + await waitForUsefulDom(page, site); + const html = await page.content(); + + if (options.includeDebug) { + return { + html, + finalUrl: page.url(), + title: await page.title(), + linkCount: await page.locator('a[href]').count(), + }; + } + + return html; + } finally { + await page.close(); + } + }, + async close() { + await context.close(); + }, + }; +} + +function shouldUseBrowser(site) { + return site.renderMode === 'browser'; +} + +module.exports = { + createBrowserSession, + shouldUseBrowser, +}; diff --git a/src/sources/newsCrawler.js b/src/sources/newsCrawler.js index 0a14f49..2662fab 100644 --- a/src/sources/newsCrawler.js +++ b/src/sources/newsCrawler.js @@ -1,5 +1,6 @@ const config = require('../config'); const { fetchWithPolicy } = require('../http'); +const { createBrowserSession, shouldUseBrowser } = require('./browserCrawler'); const TRACKING_PARAM_PATTERNS = [ /^utm_/i, @@ -22,6 +23,7 @@ const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/storie const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//; const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i; const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i; +const EXPLORATION_PATH_HINT = /(\/page\/\d+(?:\/|$)|[?&]page=\d+|\/archive(?:s)?(?:\/|$)|\/latest(?:\/|$)|\/news(?:\/|$)|\/world(?:\/|$)|\/business(?:\/|$)|\/politics(?:\/|$)|\/technology(?:\/|$)|\/tech(?:\/|$)|\/markets(?:\/|$)|\/economy(?:\/|$)|\/topic(?:s)?(?:\/|$)|\/section(?:s)?(?:\/|$)|\/category(?:ies)?(?:\/|$)|\/tag(?:s)?(?:\/|$))/i; function decodeHtmlEntities(value) { return String(value || '') @@ -173,25 +175,6 @@ function extractJsonLdBlocks(html) { return blocks; } -function walkJson(value, visit) { - if (Array.isArray(value)) { - for (const item of value) { - walkJson(item, visit); - } - return; - } - - if (!value || typeof value !== 'object') { - return; - } - - visit(value); - - for (const child of Object.values(value)) { - walkJson(child, visit); - } -} - function isArticleType(type) { if (Array.isArray(type)) { return type.some((entry) => isArticleType(entry)); @@ -202,15 +185,24 @@ function isArticleType(type) { function extractArticleJsonLd(html) { const blocks = extractJsonLdBlocks(html); - let article = null; for (const block of blocks) { - walkJson(block, (value) => { - if (!article && isArticleType(value['@type'])) { - article = value; - } - }); + if (!block || typeof block !== 'object') { + continue; + } + if (isArticleType(block['@type'])) { + return block; + } + + const graph = Array.isArray(block['@graph']) ? block['@graph'] : []; + const directChildren = [ + ...graph, + ...(Array.isArray(block.mainEntity) ? block.mainEntity : [block.mainEntity]), + ...(Array.isArray(block.mainEntityOfPage) ? block.mainEntityOfPage : [block.mainEntityOfPage]), + ].filter((value) => value && typeof value === 'object'); + + const article = directChildren.find((value) => isArticleType(value['@type'])); if (article) { return article; } @@ -276,16 +268,25 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) { const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname); const paragraphTextLength = extractParagraphTextLength(html); const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length; + const h1 = extractH1(html); + const ogTitle = normalizeText(meta.get('og:title') || ''); + const jsonLdHeadline = normalizeText(jsonLdArticle && jsonLdArticle.headline); + const jsonLdMatchesPage = jsonLdHeadline + && ((h1 && (h1.includes(jsonLdHeadline) || jsonLdHeadline.includes(h1))) + || (ogTitle && (ogTitle.includes(jsonLdHeadline) || jsonLdHeadline.includes(ogTitle)))); + const hasJsonLdArticle = Boolean(jsonLdArticle && jsonLdMatchesPage); + const hasPublishTime = Boolean(meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)); + const hasOgArticle = String(meta.get('og:type') || '').toLowerCase() === 'article'; - if (jsonLdArticle) { + if (hasJsonLdArticle) { articleScore += 4; } - if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) { + if (hasOgArticle && !hasListingFalsePositivePath) { articleScore += 1; } - if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) { + if (hasPublishTime && !hasListingFalsePositivePath) { articleScore += 1; } @@ -297,7 +298,7 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) { articleScore += 2; } - if (extractH1(html) && paragraphTextLength >= 500) { + if (h1 && paragraphTextLength >= 500) { articleScore += 2; } @@ -321,23 +322,49 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) { listingScore -= 1; } - const isArticleCandidate = articleScore >= 4 + const hasArticleSignalsBeyondJsonLd = hasOgArticle || hasPublishTime || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500; + const looksLikeListingPage = headlineLinks >= 15; + const isArticleCandidate = !looksLikeListingPage + && articleScore >= 5 && articleScore > listingScore - && (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500); + && hasArticleSignalsBeyondJsonLd + && (!jsonLdArticle || hasJsonLdArticle || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500); return { articleScore, listingScore, isArticleCandidate }; } function shouldQueueLink(url) { - const pathname = new URL(url).pathname.toLowerCase(); + const parsed = new URL(url); + const pathname = parsed.pathname.toLowerCase(); + const hostname = parsed.hostname.toLowerCase(); if (BLOCKED_PATH_HINT.test(pathname)) { return false; } + if (hostname.includes('consent.yahoo.com') || pathname.startsWith('/v2/') || pathname.startsWith('/redirect')) { + return false; + } + return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname); } +function shouldContinueExploring(current, listingScore, links) { + if (current.depth === 0) { + return true; + } + + if (listingScore >= 1) { + return true; + } + + if (current.depth <= 1 && links.length >= 8) { + return true; + } + + return EXPLORATION_PATH_HINT.test(current.url); +} + function slugifyLabel(label) { return String(label || '') .toLowerCase() @@ -437,8 +464,10 @@ function normalizeSite(site) { label: String(site.label || '').trim(), allowedHosts, seeds, + renderMode: String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 'browser' : 'http', maxPages: normalizeLimit(site.maxPages, 15, 1, 500), maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5), + pageConcurrency: normalizeLimit(site.pageConcurrency, String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 3 : 4, 1, 12), requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)), }; } @@ -473,8 +502,10 @@ function getConfiguredCrawlerSites() { name: override.name || `crawler_${slugifyLabel(label)}`, allowedHosts: override.allowedHosts || buildAllowedHosts(hostname), seeds: override.seeds || buildDefaultSeeds(feed.url), + renderMode: override.renderMode || defaults.renderMode, maxPages: override.maxPages || defaults.maxPages, maxDepth: override.maxDepth || defaults.maxDepth, + pageConcurrency: override.pageConcurrency, requestTimeout: override.requestTimeout || defaults.requestTimeout, }); @@ -486,9 +517,13 @@ function getConfiguredCrawlerSites() { return [...explicitSites.filter((site) => site.name && site.allowedHosts.length && site.seeds.length), ...derivedSites]; } -async function fetchHtml(url, timeout) { +async function fetchHtml(url, site, browserSession) { + if (browserSession) { + return browserSession.fetchRenderedHtml(url); + } + const response = await fetchWithPolicy(url, { - timeout, + timeout: site.requestTimeout, retries: 1, }); @@ -511,31 +546,26 @@ async function crawlSite(site) { return []; } + const browserSession = shouldUseBrowser(normalizedSite) + ? await createBrowserSession(normalizedSite) + : null; const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 })); const queuedUrls = new Set(normalizedSite.seeds); const visitedUrls = new Set(); const discoveredArticleUrls = new Set(); const articles = []; - while (queue.length && visitedUrls.size < normalizedSite.maxPages) { - const current = queue.shift(); - - if (!current || visitedUrls.has(current.url)) { - continue; - } - - visitedUrls.add(current.url); - + async function processPage(current) { let html; try { - html = await fetchHtml(current.url, normalizedSite.requestTimeout); + html = await fetchHtml(current.url, normalizedSite, browserSession); } catch (error) { console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error); - continue; + return; } if (!html) { - continue; + return; } const meta = extractMetaMap(html); @@ -562,8 +592,8 @@ async function crawlSite(site) { } } - if (current.depth >= normalizedSite.maxDepth || listingScore < 2) { - continue; + if (current.depth >= normalizedSite.maxDepth || !shouldContinueExploring(current, listingScore, links)) { + return; } for (const link of links) { @@ -576,7 +606,33 @@ async function crawlSite(site) { } } - return articles; + try { + while (queue.length && visitedUrls.size < normalizedSite.maxPages) { + const batch = []; + + while (queue.length && batch.length < normalizedSite.pageConcurrency && visitedUrls.size + batch.length < normalizedSite.maxPages) { + const current = queue.shift(); + if (!current || visitedUrls.has(current.url)) { + continue; + } + + visitedUrls.add(current.url); + batch.push(current); + } + + if (!batch.length) { + continue; + } + + await Promise.all(batch.map(processPage)); + } + + return articles; + } finally { + if (browserSession) { + await browserSession.close(); + } + } } async function fetchCrawlerArticles() {