add browser crawling capabilities and enhance configuration options
This commit is contained in:
parent
c6bbe2a061
commit
1a8504389a
6 changed files with 298 additions and 49 deletions
17
config.json
17
config.json
|
|
@ -394,6 +394,7 @@
|
|||
"newsCrawler": {
|
||||
"maxPages": -1,
|
||||
"maxDepth": 10,
|
||||
"pageConcurrency": 4,
|
||||
"requestTimeout": 15000,
|
||||
"disabledLabels": [
|
||||
"Arab News",
|
||||
|
|
@ -463,6 +464,7 @@
|
|||
"www.cnbc.com",
|
||||
"cnbc.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://www.cnbc.com/world/",
|
||||
"https://www.cnbc.com/business/",
|
||||
|
|
@ -572,6 +574,7 @@
|
|||
"fortune.com",
|
||||
"www.fortune.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://fortune.com/",
|
||||
"https://fortune.com/section/tech/",
|
||||
|
|
@ -583,11 +586,23 @@
|
|||
"www.forbes.com",
|
||||
"forbes.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://www.forbes.com/business/",
|
||||
"https://www.forbes.com/innovation/"
|
||||
]
|
||||
},
|
||||
"Financial Times": {
|
||||
"allowedHosts": [
|
||||
"www.ft.com",
|
||||
"ft.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://www.ft.com/world/us",
|
||||
"https://www.ft.com/technology"
|
||||
]
|
||||
},
|
||||
"Nikkei Asia": {
|
||||
"allowedHosts": [
|
||||
"asia.nikkei.com"
|
||||
|
|
@ -635,6 +650,7 @@
|
|||
"www.wired.com",
|
||||
"wired.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://www.wired.com/category/business/",
|
||||
"https://www.wired.com/category/security/"
|
||||
|
|
@ -644,6 +660,7 @@
|
|||
"allowedHosts": [
|
||||
"finance.yahoo.com"
|
||||
],
|
||||
"renderMode": "browser",
|
||||
"seeds": [
|
||||
"https://finance.yahoo.com/",
|
||||
"https://finance.yahoo.com/news/",
|
||||
|
|
|
|||
82
package-lock.json
generated
82
package-lock.json
generated
|
|
@ -10,9 +10,11 @@
|
|||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@extractus/article-extractor": "^8.0.18",
|
||||
"@fastify/cors": "^11.2.0",
|
||||
"better-sqlite3": "^12.4.1",
|
||||
"fastify": "^5.6.1",
|
||||
"node-cron": "^4.2.1",
|
||||
"playwright": "^1.59.1",
|
||||
"rss-parser": "^3.13.0",
|
||||
"sharp": "^0.34.5",
|
||||
"sqlite-vec": "^0.1.9"
|
||||
|
|
@ -65,6 +67,26 @@
|
|||
"fast-uri": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@fastify/cors": {
|
||||
"version": "11.2.0",
|
||||
"resolved": "https://registry.npmjs.org/@fastify/cors/-/cors-11.2.0.tgz",
|
||||
"integrity": "sha512-LbLHBuSAdGdSFZYTLVA3+Ch2t+sA6nq3Ejc6XLAKiQ6ViS2qFnvicpj0htsx03FyYeLs04HfRNBsz/a8SvbcUw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fastify"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/fastify"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"fastify-plugin": "^5.0.0",
|
||||
"toad-cache": "^3.7.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@fastify/error": {
|
||||
"version": "4.2.0",
|
||||
"resolved": "https://registry.npmjs.org/@fastify/error/-/error-4.2.0.tgz",
|
||||
|
|
@ -1097,6 +1119,22 @@
|
|||
"toad-cache": "^3.7.0"
|
||||
}
|
||||
},
|
||||
"node_modules/fastify-plugin": {
|
||||
"version": "5.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fastify-plugin/-/fastify-plugin-5.1.0.tgz",
|
||||
"integrity": "sha512-FAIDA8eovSt5qcDgcBvDuX/v0Cjz0ohGhENZ/wpc3y+oZCY2afZ9Baqql3g/lC+OHRnciQol4ww7tuthOb9idw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fastify"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/fastify"
|
||||
}
|
||||
],
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fastq": {
|
||||
"version": "1.20.1",
|
||||
"resolved": "https://registry.npmjs.org/fastq/-/fastq-1.20.1.tgz",
|
||||
|
|
@ -1132,6 +1170,20 @@
|
|||
"integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fsevents": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/github-from-package": {
|
||||
"version": "0.0.0",
|
||||
"resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
|
||||
|
|
@ -1482,6 +1534,36 @@
|
|||
"integrity": "sha512-BndPH67/JxGExRgiX1dX0w1FvZck5Wa4aal9198SrRhZjH3GxKQUKIBnYJTdj2HDN3UQAS06HlfcSbQj2OHmaw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz",
|
||||
"integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.59.1"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.59.1",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz",
|
||||
"integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==",
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/postcss": {
|
||||
"version": "8.5.10",
|
||||
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.10.tgz",
|
||||
|
|
|
|||
|
|
@ -12,9 +12,11 @@
|
|||
"type": "commonjs",
|
||||
"dependencies": {
|
||||
"@extractus/article-extractor": "^8.0.18",
|
||||
"@fastify/cors": "^11.2.0",
|
||||
"better-sqlite3": "^12.4.1",
|
||||
"fastify": "^5.6.1",
|
||||
"node-cron": "^4.2.1",
|
||||
"playwright": "^1.59.1",
|
||||
"rss-parser": "^3.13.0",
|
||||
"sharp": "^0.34.5",
|
||||
"sqlite-vec": "^0.1.9"
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
const Fastify = require('fastify');
|
||||
const cors = require('@fastify/cors');
|
||||
const articleRoutes = require('./src/routes/articles');
|
||||
const statusRoutes = require('./src/routes/status');
|
||||
const config = require('./src/config');
|
||||
|
|
@ -6,6 +7,7 @@ const { startScheduler } = require('./src/scheduler');
|
|||
|
||||
const app = Fastify({ logger: true });
|
||||
|
||||
app.register(cors, { origin: true });
|
||||
app.register(articleRoutes);
|
||||
app.register(statusRoutes);
|
||||
|
||||
|
|
|
|||
90
src/sources/browserCrawler.js
Normal file
90
src/sources/browserCrawler.js
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
const { chromium } = require('playwright');
|
||||
|
||||
const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36';
|
||||
let browserPromise = null;
|
||||
|
||||
/**
 * Returns the shared Chromium instance, launching it on first use.
 *
 * The launch promise (not the resolved browser) is cached in `browserPromise`
 * so that concurrent callers share a single launch. The cache is cleared when
 * the launch fails or the browser disconnects, so a later call starts a fresh
 * browser instead of reusing a rejected promise or a dead process forever.
 *
 * @returns {Promise<import('playwright').Browser>} The shared browser.
 * @throws Rejects with the launch error when Chromium cannot be started.
 */
async function getBrowser() {
  if (!browserPromise) {
    const launchPromise = chromium.launch({
      // NOTE(review): headless is disabled, so this presumably needs a display
      // (or Xvfb) wherever the crawler runs — confirm this is intentional.
      headless: false,
    });

    browserPromise = launchPromise;

    // Only reset the cache if it still points at this launch; a newer launch
    // may already have replaced it.
    const forgetLaunch = () => {
      if (browserPromise === launchPromise) {
        browserPromise = null;
      }
    };

    // The second handler also marks the rejection as handled on this derived
    // chain; callers awaiting `browserPromise` still receive the launch error.
    launchPromise.then((browser) => {
      browser.once('disconnected', forgetLaunch);
    }, forgetLaunch);
  }

  return browserPromise;
}
|
||||
|
||||
/**
 * Best-effort wait for a rendered page to contain usable content.
 *
 * Waits first for the `networkidle` load state, then for the document to hold
 * more than 20 `a[href]` elements. Each wait is capped at 5 seconds (or
 * `site.requestTimeout` when that is smaller). Failures of either wait are
 * ignored so the caller can still read whatever DOM is present.
 *
 * @param {object} page - Playwright page (uses `waitForLoadState` and `waitForFunction`).
 * @param {{ requestTimeout: number }} site - Site settings; `requestTimeout` is in milliseconds.
 * @returns {Promise<void>}
 */
async function waitForUsefulDom(page, site) {
  const timeout = Math.min(site.requestTimeout, 5000);
  const minLinkCount = 20;

  try {
    await page.waitForLoadState('networkidle', { timeout });
  } catch {
    // Deliberately ignored: not reaching network idle in time is not fatal.
  }

  try {
    // Playwright's signature is waitForFunction(pageFunction, arg, options).
    // Passing the options object as the second argument would hand it to the
    // page function as `arg` and leave the timeout at Playwright's default.
    await page.waitForFunction(
      (threshold) => document.querySelectorAll('a[href]').length > threshold,
      minLinkCount,
      { timeout },
    );
  } catch {
    // Deliberately ignored: sparse pages never reach the link threshold.
  }
}
|
||||
|
||||
/**
 * Opens a browser context on the shared browser and returns a small session
 * object for fetching rendered HTML.
 *
 * Requests whose resource type is `image`, `media` or `font` are aborted for
 * every page in the context; all other requests continue unchanged.
 *
 * @param {{ requestTimeout: number }} site - Site settings; `requestTimeout`
 *   is used as the navigation timeout and passed on to `waitForUsefulDom`.
 * @returns {Promise<{
 *   fetchRenderedHtml: (url: string, options?: { includeDebug?: boolean }) => Promise<string|object>,
 *   close: () => Promise<void>
 * }>} Session bound to one browser context.
 */
async function createBrowserSession(site) {
  const sharedBrowser = await getBrowser();
  const context = await sharedBrowser.newContext({
    userAgent: BROWSER_USER_AGENT,
    viewport: { width: 1440, height: 1200 },
    javaScriptEnabled: true,
  });

  const skippedResourceTypes = ['image', 'media', 'font'];

  await context.route('**/*', async (route) => {
    const kind = route.request().resourceType();

    if (skippedResourceTypes.includes(kind)) {
      await route.abort();
    } else {
      await route.continue();
    }
  });

  /**
   * Loads `url` in a new page and returns its serialized HTML. With
   * `options.includeDebug` set, returns `{ html, finalUrl, title, linkCount }`
   * instead. The page is always closed afterwards.
   */
  async function fetchRenderedHtml(url, options = {}) {
    const page = await context.newPage();

    try {
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: site.requestTimeout,
      });
      await waitForUsefulDom(page, site);

      const html = await page.content();

      if (!options.includeDebug) {
        return html;
      }

      const finalUrl = page.url();
      const title = await page.title();
      const linkCount = await page.locator('a[href]').count();

      return { html, finalUrl, title, linkCount };
    } finally {
      await page.close();
    }
  }

  /** Closes the browser context owned by this session. */
  async function close() {
    await context.close();
  }

  return { fetchRenderedHtml, close };
}
|
||||
|
||||
/**
 * Tells whether a site is configured to be crawled through the browser.
 *
 * @param {{ renderMode?: string } | null | undefined} site - Site settings.
 * @returns {boolean} `true` only when `site.renderMode` is exactly `'browser'`;
 *   `false` for any other value and for a missing site.
 */
function shouldUseBrowser(site) {
  // `== null` covers both null and undefined so a missing site is treated as
  // "no browser" instead of throwing a TypeError.
  return site != null && site.renderMode === 'browser';
}
|
||||
|
||||
module.exports = {
|
||||
createBrowserSession,
|
||||
shouldUseBrowser,
|
||||
};
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
const config = require('../config');
|
||||
const { fetchWithPolicy } = require('../http');
|
||||
const { createBrowserSession, shouldUseBrowser } = require('./browserCrawler');
|
||||
|
||||
const TRACKING_PARAM_PATTERNS = [
|
||||
/^utm_/i,
|
||||
|
|
@ -22,6 +23,7 @@ const ARTICLE_PATH_HINT = /(\/article\/|\/articles\/|\/news\/|\/story\/|\/storie
|
|||
const ARTICLE_PATH_STRONG_HINT = /\/\d{4}\/\d{2}\/\d{2}\//;
|
||||
const LISTING_ARTICLE_FALSE_POSITIVE_PATH = /(\/category\/|\/tag\/|\/latest(?:\/|$)|\/topics?(?:\/|$)|\/sections?(?:\/|$))/i;
|
||||
const BLOCKED_PATH_HINT = /(\/search(?:\/|$)|\/login(?:\/|$)|\/account(?:\/|$)|\/video(?:\/|$)|\/videos(?:\/|$)|\/podcast(?:\/|$)|\/podcasts(?:\/|$)|\/live(?:\/|$))/i;
|
||||
const EXPLORATION_PATH_HINT = /(\/page\/\d+(?:\/|$)|[?&]page=\d+|\/archive(?:s)?(?:\/|$)|\/latest(?:\/|$)|\/news(?:\/|$)|\/world(?:\/|$)|\/business(?:\/|$)|\/politics(?:\/|$)|\/technology(?:\/|$)|\/tech(?:\/|$)|\/markets(?:\/|$)|\/economy(?:\/|$)|\/topic(?:s)?(?:\/|$)|\/section(?:s)?(?:\/|$)|\/category(?:ies)?(?:\/|$)|\/tag(?:s)?(?:\/|$))/i;
|
||||
|
||||
function decodeHtmlEntities(value) {
|
||||
return String(value || '')
|
||||
|
|
@ -173,25 +175,6 @@ function extractJsonLdBlocks(html) {
|
|||
return blocks;
|
||||
}
|
||||
|
||||
function walkJson(value, visit) {
|
||||
if (Array.isArray(value)) {
|
||||
for (const item of value) {
|
||||
walkJson(item, visit);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!value || typeof value !== 'object') {
|
||||
return;
|
||||
}
|
||||
|
||||
visit(value);
|
||||
|
||||
for (const child of Object.values(value)) {
|
||||
walkJson(child, visit);
|
||||
}
|
||||
}
|
||||
|
||||
function isArticleType(type) {
|
||||
if (Array.isArray(type)) {
|
||||
return type.some((entry) => isArticleType(entry));
|
||||
|
|
@ -202,15 +185,24 @@ function isArticleType(type) {
|
|||
|
||||
function extractArticleJsonLd(html) {
|
||||
const blocks = extractJsonLdBlocks(html);
|
||||
let article = null;
|
||||
|
||||
for (const block of blocks) {
|
||||
walkJson(block, (value) => {
|
||||
if (!article && isArticleType(value['@type'])) {
|
||||
article = value;
|
||||
if (!block || typeof block !== 'object') {
|
||||
continue;
|
||||
}
|
||||
});
|
||||
|
||||
if (isArticleType(block['@type'])) {
|
||||
return block;
|
||||
}
|
||||
|
||||
const graph = Array.isArray(block['@graph']) ? block['@graph'] : [];
|
||||
const directChildren = [
|
||||
...graph,
|
||||
...(Array.isArray(block.mainEntity) ? block.mainEntity : [block.mainEntity]),
|
||||
...(Array.isArray(block.mainEntityOfPage) ? block.mainEntityOfPage : [block.mainEntityOfPage]),
|
||||
].filter((value) => value && typeof value === 'object');
|
||||
|
||||
const article = directChildren.find((value) => isArticleType(value['@type']));
|
||||
if (article) {
|
||||
return article;
|
||||
}
|
||||
|
|
@ -276,16 +268,25 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
|||
const hasListingFalsePositivePath = LISTING_ARTICLE_FALSE_POSITIVE_PATH.test(pathname);
|
||||
const paragraphTextLength = extractParagraphTextLength(html);
|
||||
const headlineLinks = links.filter(({ text }) => text.length >= 25 && text.length <= 180).length;
|
||||
const h1 = extractH1(html);
|
||||
const ogTitle = normalizeText(meta.get('og:title') || '');
|
||||
const jsonLdHeadline = normalizeText(jsonLdArticle && jsonLdArticle.headline);
|
||||
const jsonLdMatchesPage = jsonLdHeadline
|
||||
&& ((h1 && (h1.includes(jsonLdHeadline) || jsonLdHeadline.includes(h1)))
|
||||
|| (ogTitle && (ogTitle.includes(jsonLdHeadline) || jsonLdHeadline.includes(ogTitle))));
|
||||
const hasJsonLdArticle = Boolean(jsonLdArticle && jsonLdMatchesPage);
|
||||
const hasPublishTime = Boolean(meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html));
|
||||
const hasOgArticle = String(meta.get('og:type') || '').toLowerCase() === 'article';
|
||||
|
||||
if (jsonLdArticle) {
|
||||
if (hasJsonLdArticle) {
|
||||
articleScore += 4;
|
||||
}
|
||||
|
||||
if (String(meta.get('og:type') || '').toLowerCase() === 'article' && !hasListingFalsePositivePath) {
|
||||
if (hasOgArticle && !hasListingFalsePositivePath) {
|
||||
articleScore += 1;
|
||||
}
|
||||
|
||||
if ((meta.get('article:published_time') || meta.get('og:article:published_time') || extractTimeDatetime(html)) && !hasListingFalsePositivePath) {
|
||||
if (hasPublishTime && !hasListingFalsePositivePath) {
|
||||
articleScore += 1;
|
||||
}
|
||||
|
||||
|
|
@ -297,7 +298,7 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
|||
articleScore += 2;
|
||||
}
|
||||
|
||||
if (extractH1(html) && paragraphTextLength >= 500) {
|
||||
if (h1 && paragraphTextLength >= 500) {
|
||||
articleScore += 2;
|
||||
}
|
||||
|
||||
|
|
@ -321,23 +322,49 @@ function scorePage(pageUrl, meta, html, jsonLdArticle, links) {
|
|||
listingScore -= 1;
|
||||
}
|
||||
|
||||
const isArticleCandidate = articleScore >= 4
|
||||
const hasArticleSignalsBeyondJsonLd = hasOgArticle || hasPublishTime || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500;
|
||||
const looksLikeListingPage = headlineLinks >= 15;
|
||||
const isArticleCandidate = !looksLikeListingPage
|
||||
&& articleScore >= 5
|
||||
&& articleScore > listingScore
|
||||
&& (Boolean(jsonLdArticle) || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
|
||||
&& hasArticleSignalsBeyondJsonLd
|
||||
&& (!jsonLdArticle || hasJsonLdArticle || hasStrongArticlePath || hasArticlePathHint || paragraphTextLength >= 500);
|
||||
|
||||
return { articleScore, listingScore, isArticleCandidate };
|
||||
}
|
||||
|
||||
/**
 * Decides whether a discovered link is worth adding to the crawl queue.
 *
 * A link is rejected when it cannot be parsed as a URL, when its path matches
 * `BLOCKED_PATH_HINT`, when it points at a consent or redirect endpoint, or
 * when its path ends in a known non-HTML file extension.
 *
 * @param {string} url - Absolute URL of the candidate link.
 * @returns {boolean} `true` when the link may be queued.
 */
function shouldQueueLink(url) {
  let parsed;

  try {
    parsed = new URL(url);
  } catch {
    // An unparseable link can never be fetched, so it is simply not queued.
    return false;
  }

  // Declared once: the URL is parsed a single time and both parts are taken
  // from that result.
  const pathname = parsed.pathname.toLowerCase();
  const hostname = parsed.hostname.toLowerCase();

  if (BLOCKED_PATH_HINT.test(pathname)) {
    return false;
  }

  // NOTE(review): the `/v2/` and `/redirect` prefixes are applied to every
  // host, not only Yahoo's consent flow — confirm that is intended.
  if (hostname.includes('consent.yahoo.com') || pathname.startsWith('/v2/') || pathname.startsWith('/redirect')) {
    return false;
  }

  return !/\.(?:jpg|jpeg|png|gif|webp|svg|pdf|zip|xml|mp4|mp3|avi|mov|wmv|m4v)$/i.test(pathname);
}
|
||||
|
||||
/**
 * Decides whether the links found on a crawled page should be followed.
 *
 * Exploration continues when any of these holds, checked in order:
 *   1. the page has depth 0;
 *   2. its listing score is at least 1;
 *   3. it is at depth 1 or less and exposes at least 8 links;
 *   4. its URL matches `EXPLORATION_PATH_HINT`.
 *
 * @param {{ url: string, depth: number }} current - Queue entry for the page.
 * @param {number} listingScore - Listing score computed for the page.
 * @param {Array} links - Links extracted from the page.
 * @returns {boolean} `true` when the page's links should be explored.
 */
function shouldContinueExploring(current, listingScore, links) {
  return (
    current.depth === 0
    || listingScore >= 1
    || (current.depth <= 1 && links.length >= 8)
    || EXPLORATION_PATH_HINT.test(current.url)
  );
}
|
||||
|
||||
function slugifyLabel(label) {
|
||||
return String(label || '')
|
||||
.toLowerCase()
|
||||
|
|
@ -437,8 +464,10 @@ function normalizeSite(site) {
|
|||
label: String(site.label || '').trim(),
|
||||
allowedHosts,
|
||||
seeds,
|
||||
renderMode: String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 'browser' : 'http',
|
||||
maxPages: normalizeLimit(site.maxPages, 15, 1, 500),
|
||||
maxDepth: normalizeLimit(site.maxDepth, 1, 0, 5),
|
||||
pageConcurrency: normalizeLimit(site.pageConcurrency, String(site.renderMode || 'http').trim().toLowerCase() === 'browser' ? 3 : 4, 1, 12),
|
||||
requestTimeout: Math.max(1000, Math.min(Number(site.requestTimeout) || 15000, 30000)),
|
||||
};
|
||||
}
|
||||
|
|
@ -473,8 +502,10 @@ function getConfiguredCrawlerSites() {
|
|||
name: override.name || `crawler_${slugifyLabel(label)}`,
|
||||
allowedHosts: override.allowedHosts || buildAllowedHosts(hostname),
|
||||
seeds: override.seeds || buildDefaultSeeds(feed.url),
|
||||
renderMode: override.renderMode || defaults.renderMode,
|
||||
maxPages: override.maxPages || defaults.maxPages,
|
||||
maxDepth: override.maxDepth || defaults.maxDepth,
|
||||
pageConcurrency: override.pageConcurrency,
|
||||
requestTimeout: override.requestTimeout || defaults.requestTimeout,
|
||||
});
|
||||
|
||||
|
|
@ -486,9 +517,13 @@ function getConfiguredCrawlerSites() {
|
|||
return [...explicitSites.filter((site) => site.name && site.allowedHosts.length && site.seeds.length), ...derivedSites];
|
||||
}
|
||||
|
||||
async function fetchHtml(url, timeout) {
|
||||
async function fetchHtml(url, site, browserSession) {
|
||||
if (browserSession) {
|
||||
return browserSession.fetchRenderedHtml(url);
|
||||
}
|
||||
|
||||
const response = await fetchWithPolicy(url, {
|
||||
timeout,
|
||||
timeout: site.requestTimeout,
|
||||
retries: 1,
|
||||
});
|
||||
|
||||
|
|
@ -511,31 +546,26 @@ async function crawlSite(site) {
|
|||
return [];
|
||||
}
|
||||
|
||||
const browserSession = shouldUseBrowser(normalizedSite)
|
||||
? await createBrowserSession(normalizedSite)
|
||||
: null;
|
||||
const queue = normalizedSite.seeds.map((url) => ({ url, depth: 0 }));
|
||||
const queuedUrls = new Set(normalizedSite.seeds);
|
||||
const visitedUrls = new Set();
|
||||
const discoveredArticleUrls = new Set();
|
||||
const articles = [];
|
||||
|
||||
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
|
||||
const current = queue.shift();
|
||||
|
||||
if (!current || visitedUrls.has(current.url)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
visitedUrls.add(current.url);
|
||||
|
||||
async function processPage(current) {
|
||||
let html;
|
||||
try {
|
||||
html = await fetchHtml(current.url, normalizedSite.requestTimeout);
|
||||
html = await fetchHtml(current.url, normalizedSite, browserSession);
|
||||
} catch (error) {
|
||||
console.error(`Crawler fetch failed for ${normalizedSite.name}: ${current.url}`, error);
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!html) {
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
const meta = extractMetaMap(html);
|
||||
|
|
@ -562,8 +592,8 @@ async function crawlSite(site) {
|
|||
}
|
||||
}
|
||||
|
||||
if (current.depth >= normalizedSite.maxDepth || listingScore < 2) {
|
||||
continue;
|
||||
if (current.depth >= normalizedSite.maxDepth || !shouldContinueExploring(current, listingScore, links)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const link of links) {
|
||||
|
|
@ -576,7 +606,33 @@ async function crawlSite(site) {
|
|||
}
|
||||
}
|
||||
|
||||
try {
|
||||
while (queue.length && visitedUrls.size < normalizedSite.maxPages) {
|
||||
const batch = [];
|
||||
|
||||
while (queue.length && batch.length < normalizedSite.pageConcurrency && visitedUrls.size + batch.length < normalizedSite.maxPages) {
|
||||
const current = queue.shift();
|
||||
if (!current || visitedUrls.has(current.url)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
visitedUrls.add(current.url);
|
||||
batch.push(current);
|
||||
}
|
||||
|
||||
if (!batch.length) {
|
||||
continue;
|
||||
}
|
||||
|
||||
await Promise.all(batch.map(processPage));
|
||||
}
|
||||
|
||||
return articles;
|
||||
} finally {
|
||||
if (browserSession) {
|
||||
await browserSession.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchCrawlerArticles() {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue