804 lines
25 KiB
JavaScript
804 lines
25 KiB
JavaScript
const db = require('../db');
|
|
const config = require('../config');
|
|
|
|
/**
 * Whitelist of signal/rule type names. Signals reported by the LLM are only
 * kept when their `type` is one of these entries.
 */
const POSITIVE_RULE_TYPES = new Set([
  // Metadata, structured data and URL shape.
  'meta_og_type', 'meta_has_publish_time', 'jsonld_type', 'has_tag',
  'url_pattern', 'path_segment', 'meta_presence', 'meta_value_pattern',
  // Markup presence.
  'selector_present', 'class_token_present', 'attr_presence',
  // Page-shape buckets.
  'link_density_bucket', 'paragraph_count_bucket', 'headline_container_pattern',
  // Article vs. listing hints.
  'byline_signal', 'time_signal', 'body_container_signal',
  'listing_container_signal', 'pagination_signal',
  'url_prefix_pattern', 'canonical_pattern', 'shallow_text_signal',
  'repeated_card_signal', 'nav_density_bucket',
  // Non-editorial page hints.
  'utility_path_signal', 'commercial_signal', 'media_signal',
]);
|
|
|
|
// Prepared lookup: stored classification for one exact URL (single `url` parameter).
const selectCachedClassification = db.prepare(`
  SELECT classification
  FROM crawler_page_classifications
  WHERE url = ?
`);
|
|
// Prepared upsert keyed on `url`. Parameters, in order: url, site_name,
// classification, pattern. On conflict the row is overwritten and
// `classified_at` is refreshed.
const upsertCachedClassification = db.prepare(`
  INSERT INTO crawler_page_classifications (url, site_name, classification, pattern)
  VALUES (?, ?, ?, ?)
  ON CONFLICT(url) DO UPDATE SET
    site_name = excluded.site_name,
    classification = excluded.classification,
    pattern = excluded.pattern,
    classified_at = datetime('now')
`);
|
|
// Prepared query: learned URL patterns for a site with at least the given
// hit count. Parameters, in order: site_name, minimum hit_count. Most-hit
// patterns come first; ties are broken alphabetically by pattern.
const selectPatternsForSite = db.prepare(`
  SELECT pattern, classification, hit_count
  FROM crawler_url_patterns
  WHERE site_name = ?
    AND hit_count >= ?
  ORDER BY hit_count DESC, pattern ASC
`);
|
|
// Prepared upsert keyed on (site_name, pattern). Parameters, in order:
// site_name, pattern, classification. A repeat observation with the same
// classification increments hit_count; a different classification replaces
// the stored one and restarts the count at 1.
const upsertPattern = db.prepare(`
  INSERT INTO crawler_url_patterns (site_name, pattern, classification, hit_count)
  VALUES (?, ?, ?, 1)
  ON CONFLICT(site_name, pattern) DO UPDATE SET
    classification = excluded.classification,
    hit_count = CASE
      WHEN crawler_url_patterns.classification = excluded.classification THEN crawler_url_patterns.hit_count + 1
      ELSE 1
    END,
    updated_at = datetime('now')
`);
|
|
// Prepared query: learned site rules with at least the given hit count.
// Parameters, in order: site_name, minimum hit_count. Ordered by hit_count
// (descending), then rule_type and rule_value (ascending).
const selectRulesForSite = db.prepare(`
  SELECT rule_type, rule_value, classification, hit_count
  FROM crawler_site_rules
  WHERE site_name = ?
    AND hit_count >= ?
  ORDER BY hit_count DESC, rule_type ASC, rule_value ASC
`);
|
|
// Prepared upsert keyed on (site_name, rule_type, rule_value). Parameters, in
// order: site_name, rule_type, rule_value, classification. Same counting
// policy as the URL-pattern upsert: a matching classification increments
// hit_count, a different one replaces it and restarts the count at 1.
const upsertRule = db.prepare(`
  INSERT INTO crawler_site_rules (site_name, rule_type, rule_value, classification, hit_count)
  VALUES (?, ?, ?, ?, 1)
  ON CONFLICT(site_name, rule_type, rule_value) DO UPDATE SET
    classification = excluded.classification,
    hit_count = CASE
      WHEN crawler_site_rules.classification = excluded.classification THEN crawler_site_rules.hit_count + 1
      ELSE 1
    END,
    updated_at = datetime('now')
`);
|
|
|
|
/**
 * Collapses a single URL path segment into a reusable token.
 *
 * Four digits become `{year}`, two digits `{num2}`, any other run of digits
 * `{id}`, and 8+ hex characters `{hex}`. Anything else is lower-cased.
 *
 * @param {string} segment - One path segment, without slashes.
 * @returns {string} Placeholder token or the lower-cased segment ('' for falsy input).
 */
function normalizePathSegment(segment) {
  // Order matters: the more specific digit shapes must win over `{id}` and `{hex}`.
  const placeholderShapes = [
    [/^\d{4}$/, '{year}'],
    [/^\d{2}$/, '{num2}'],
    [/^\d+$/, '{id}'],
    [/^[a-f0-9]{8,}$/i, '{hex}'],
  ];

  const hit = placeholderShapes.find(([shape]) => shape.test(segment));
  if (hit) {
    return hit[1];
  }

  return String(segment || '').toLowerCase();
}
|
|
|
|
/**
 * Derives a normalized path pattern from a URL: every non-empty path segment
 * is passed through `normalizePathSegment` and the results are re-joined
 * with '/'. Query string and fragment are ignored.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|null} Pattern beginning with '/', or null when the URL cannot be parsed.
 */
function buildUrlPattern(url) {
  try {
    const { pathname } = new URL(url);
    const tokens = [];

    for (const rawSegment of pathname.split('/')) {
      if (rawSegment) {
        tokens.push(normalizePathSegment(rawSegment));
      }
    }

    // Always starts with '/', so a root path yields exactly '/'.
    return `/${tokens.join('/')}`;
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Compiles a stored URL pattern (as produced by `buildUrlPattern`) into an
 * anchored, case-insensitive regular expression for matching pathnames.
 *
 * Placeholders expand as follows: `{year}` -> 4 digits, `{num2}` -> 2 digits,
 * `{id}` -> one or more digits, `{hex}` -> one or more hex characters. All
 * other text is matched literally.
 *
 * @param {string} pattern - Normalized path pattern, e.g. '/{year}/{num2}/story-{id}'.
 * @returns {RegExp} Regex that must match the whole pathname.
 */
function patternToRegex(pattern) {
  const placeholderSources = new Map([
    ['{year}', '\\d{4}'],
    ['{num2}', '\\d{2}'],
    ['{id}', '\\d+'],
    ['{hex}', '[a-f0-9]+'],
  ]);

  // Split on placeholders BEFORE escaping. Escaping first would rewrite
  // `{year}` to `\{year\}`, which the placeholder substitutions could then
  // never find, leaving every placeholder pattern unmatchable.
  const source = String(pattern || '')
    .split(/(\{(?:year|num2|id|hex)\})/)
    .map((chunk) => placeholderSources.get(chunk) ?? chunk.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('');

  return new RegExp(`^${source}$`, 'i');
}
|
|
|
|
/**
 * Reduces an HTML-ish value to a short plain-text snippet: tags are replaced
 * by spaces, whitespace runs are collapsed, the result is trimmed and then
 * cut to `maxLength` characters.
 *
 * @param {*} value - Text or markup; falsy values are treated as ''.
 * @param {number} [maxLength=200] - Maximum length of the returned string.
 * @returns {string} Cleaned, truncated text.
 */
function sanitizeText(value, maxLength = 200) {
  const withoutTags = String(value || '').replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ').trim();
  return singleSpaced.slice(0, maxLength);
}
|
|
|
|
/**
 * Canonical form of a rule value: lower-cased, then cleaned by
 * `sanitizeText` with a 160-character limit.
 *
 * @param {*} value - Raw rule value; falsy values become ''.
 * @returns {string} Normalized value (possibly empty).
 */
function normalizeRuleValue(value) {
  const lowered = String(value || '').toLowerCase();
  return sanitizeText(lowered, 160);
}
|
|
|
|
/**
 * Appends `{ ruleType, ruleValue }` to `signals`, with the value normalized
 * by `normalizeRuleValue`. Nothing is appended when the type is falsy or the
 * normalized value is empty.
 *
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Array mutated in place.
 * @param {string} ruleType - Signal type name.
 * @param {*} ruleValue - Raw signal value.
 * @returns {void}
 */
function pushSignal(signals, ruleType, ruleValue) {
  const cleanedValue = normalizeRuleValue(ruleValue);

  if (ruleType && cleanedValue) {
    signals.push({ ruleType, ruleValue: cleanedValue });
  }
}
|
|
|
|
/**
 * Removes duplicate signals, where two signals are duplicates when they share
 * both `ruleType` and `ruleValue`. The first occurrence wins and the original
 * ordering is kept.
 *
 * @param {Iterable<{ruleType: string, ruleValue: string}>} signals
 * @returns {Array<{ruleType: string, ruleValue: string}>} New array of the first occurrences.
 */
function uniqueSignals(signals) {
  // A Map iterates in insertion order, so first-seen order is preserved.
  const firstByKey = new Map();

  for (const candidate of signals) {
    const identity = `${candidate.ruleType}:${candidate.ruleValue}`;
    if (!firstByKey.has(identity)) {
      firstByKey.set(identity, candidate);
    }
  }

  return [...firstByKey.values()];
}
|
|
|
|
/**
 * Pulls the first top-level `{ ... }` object out of free-form text.
 *
 * Scanning starts at the first '{'. Braces inside double-quoted strings are
 * ignored, and a backslash makes the scanner skip the following character.
 *
 * @param {*} value - Text that may contain a JSON object; falsy values are treated as ''.
 * @returns {string} The balanced object text; everything from the first '{'
 *   onward when the object never closes; '' when there is no '{' at all.
 */
function extractJsonObjectString(value) {
  const source = String(value || '').trim();
  const openAt = source.indexOf('{');
  if (openAt < 0) {
    return '';
  }

  let nesting = 0;
  let insideString = false;
  let skipNext = false;

  for (let pos = openAt; pos < source.length; pos += 1) {
    const ch = source[pos];

    if (skipNext) {
      // Character following a backslash: consumed without interpretation.
      skipNext = false;
    } else if (ch === '\\') {
      skipNext = true;
    } else if (ch === '"') {
      insideString = !insideString;
    } else if (insideString) {
      // Structural characters inside a string literal do not count.
    } else if (ch === '{') {
      nesting += 1;
    } else if (ch === '}') {
      nesting -= 1;
      if (nesting === 0) {
        return source.slice(openAt, pos + 1);
      }
    }
  }

  // Unbalanced (e.g. truncated) input: hand back the open-ended remainder.
  return source.slice(openAt);
}
|
|
|
|
/**
 * Best-effort clean-up of an almost-JSON payload so that `JSON.parse` has a
 * chance of succeeding.
 *
 * Steps: strip a surrounding Markdown code fence, isolate the first object
 * with `extractJsonObjectString`, replace control characters with spaces,
 * drop trailing commas before a closer, map `undefined`/`NaN`/`Infinity`
 * values to `null`, and finally close any containers left open by truncation.
 *
 * @param {*} value - Raw text; falsy values are treated as ''.
 * @returns {string} Repaired text, or '' when no object could be located.
 *   The result is not guaranteed to be valid JSON.
 */
function repairJsonString(value) {
  let repaired = String(value || '').trim();
  if (!repaired) {
    return '';
  }

  repaired = repaired
    .replace(/^```(?:json)?\s*/i, '')
    .replace(/\s*```$/i, '')
    .trim();

  repaired = extractJsonObjectString(repaired);
  if (!repaired) {
    return '';
  }

  repaired = repaired
    .replace(/[\u0000-\u001f]+/g, ' ')
    .replace(/,\s*([}\]])/g, '$1')
    .replace(/:\s*undefined\b/g, ': null')
    .replace(/:\s*NaN\b/g, ': null')
    .replace(/:\s*Infinity\b/g, ': null');

  // Track open containers with a stack rather than raw counts, so that
  // (a) brackets inside string values are ignored and (b) the missing closers
  // are emitted in the correct nesting order, e.g. `[{` needs `}]`, not `]}`.
  const pendingClosers = [];
  let inString = false;
  let escaped = false;

  for (const char of repaired) {
    if (escaped) {
      escaped = false;
    } else if (char === '\\') {
      escaped = true;
    } else if (char === '"') {
      inString = !inString;
    } else if (inString) {
      // Ignore structural characters inside string literals.
    } else if (char === '{') {
      pendingClosers.push('}');
    } else if (char === '[') {
      pendingClosers.push(']');
    } else if ((char === '}' || char === ']') && pendingClosers[pendingClosers.length - 1] === char) {
      pendingClosers.pop();
    }
  }

  if (pendingClosers.length > 0) {
    // A comma left dangling at the truncation point would still break the
    // parse. Text cut off in the middle of a string is deliberately left
    // alone: completing a partial value could fabricate data.
    const body = inString ? repaired : repaired.replace(/,\s*$/, '');
    repaired = body + pendingClosers.reverse().join('');
  }

  return repaired;
}
|
|
|
|
/**
 * Parses a JSON object leniently and never throws.
 *
 * The text is first parsed as-is. If that fails it is passed through
 * `repairJsonString` and parsed again. Callers read properties off the
 * result directly, so anything that is not a plain object (`null`, arrays,
 * numbers, strings) is replaced by `{}`.
 *
 * @param {*} value - Raw payload text; falsy values are treated as ''.
 * @returns {object} The parsed object, or `{}` when nothing usable was found.
 */
function parseJsonLoose(value) {
  const direct = String(value || '').trim();
  if (!direct) {
    return {};
  }

  // `JSON.parse('null')` succeeds but yields a value that crashes on property access.
  const asPlainObject = (candidate) => (
    candidate !== null && typeof candidate === 'object' && !Array.isArray(candidate)
      ? candidate
      : {}
  );

  try {
    return asPlainObject(JSON.parse(direct));
  } catch {
    // Not valid JSON as-is; fall through to the repair path below.
  }

  const repaired = repairJsonString(direct);
  if (!repaired) {
    return {};
  }

  try {
    return asPlainObject(JSON.parse(repaired));
  } catch (error) {
    console.error('failed to parse crawler classification payload:', error, direct);
    return {};
  }
}
|
|
|
|
/**
 * Collects distinct, lower-cased CSS class tokens from quoted `class="..."`
 * attributes in the markup, in first-seen order.
 *
 * A token is kept when it is 3-40 characters long, contains a letter, and
 * does not look machine-generated (`jsx-123`, `sc-...`, `css-...`, a leading
 * underscore hash, or 10+ hex characters). For every kept token, its
 * `-`/`_`-separated parts of 4-24 characters that contain a letter are
 * added as tokens too.
 *
 * @param {string} html - Page markup; null/undefined is treated as ''.
 * @returns {string[]} Unique tokens.
 */
function extractClassTokens(html) {
  const generatedTokenPattern = /^(jsx-\d+|sc-[a-z0-9]+|css-[a-z0-9]+|_[a-z0-9]+|[a-f0-9]{10,})$/i;
  // A Set dedupes while preserving insertion (first-seen) order.
  const tokens = new Set();

  // One pass over the markup; capture group 2 is the attribute value.
  for (const [, , attributeValue] of String(html || '').matchAll(/\bclass\s*=\s*(["'])(.*?)\1/gi)) {
    for (const rawToken of attributeValue.split(/\s+/)) {
      const token = rawToken.trim().toLowerCase();

      if (token.length < 3 || token.length > 40) {
        continue;
      }
      if (!/[a-z]/.test(token)) {
        continue;
      }
      if (generatedTokenPattern.test(token)) {
        continue;
      }

      tokens.add(token);

      for (const part of token.split(/[_-]+/)) {
        if (part.length >= 4 && part.length <= 24 && /[a-z]/.test(part)) {
          tokens.add(part);
        }
      }
    }
  }

  return [...tokens];
}
|
|
|
|
/**
 * Lists the distinct element names that open in the markup, lower-cased and
 * in first-seen order. Collection stops once 50 distinct names were found.
 *
 * @param {string} html - Page markup.
 * @returns {string[]} Up to 50 unique tag names.
 */
function extractTagSummary(html) {
  const maxDistinctTags = 50;
  const names = new Set();

  for (const [, tagName] of String(html).matchAll(/<([a-z0-9:-]+)\b/gi)) {
    if (names.size >= maxDistinctTags) {
      break;
    }
    names.add(String(tagName || '').toLowerCase());
  }

  return Array.from(names);
}
|
|
|
|
/**
 * Returns the trimmed values of every quoted occurrence of an attribute in
 * the markup, in document order. The attribute name is matched
 * case-insensitively and literally.
 *
 * @param {string} html - Page markup; null/undefined is treated as ''.
 * @param {string} attrName - Attribute name, e.g. 'rel'.
 * @returns {string[]} Attribute values (duplicates are kept).
 */
function extractAttributeValues(html, attrName) {
  // Escape the name so characters such as '.' or '(' are matched literally
  // and cannot widen the match or make the RegExp constructor throw.
  const literalName = String(attrName).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const attributePattern = new RegExp(`\\b${literalName}\\s*=\\s*(["'])(.*?)\\1`, 'gi');

  return Array.from(
    String(html ?? '').matchAll(attributePattern),
    (match) => String(match[2] || '').trim(),
  );
}
|
|
|
|
/**
 * Buckets how link-heavy a page is, based on the number of links per 1000
 * characters of paragraph text and on the absolute link count.
 *
 * Without any paragraph text the result is 'high' for 15+ links and
 * 'medium' otherwise.
 *
 * @param {Array} links - Links found on the page; only `length` is used.
 * @param {number} paragraphTextLength - Total characters of paragraph text.
 * @returns {'high'|'medium'|'low'} Density bucket.
 */
function detectLinkDensityBucket(links, paragraphTextLength) {
  const linkCount = links.length;

  if (!paragraphTextLength) {
    if (linkCount >= 15) {
      return 'high';
    }
    return 'medium';
  }

  const linksPerThousandChars = (linkCount * 1000) / Math.max(paragraphTextLength, 1);

  // Checked top-down; the first tier met by either threshold wins.
  const tiers = [
    { bucket: 'high', minRatio: 18, minLinks: 60 },
    { bucket: 'medium', minRatio: 8, minLinks: 25 },
  ];
  const matchedTier = tiers.find(
    ({ minRatio, minLinks }) => linksPerThousandChars >= minRatio || linkCount >= minLinks,
  );

  return matchedTier ? matchedTier.bucket : 'low';
}
|
|
|
|
/**
 * Maps a paragraph count onto a coarse bucket label.
 *
 * @param {number} paragraphCount - Number of paragraphs on the page.
 * @returns {'0'|'1-2'|'3-7'|'8+'} Bucket label.
 */
function detectParagraphBucket(paragraphCount) {
  if (paragraphCount === 0) {
    return '0';
  }

  // Inclusive upper bounds, checked in ascending order.
  const upperBounds = [
    [2, '1-2'],
    [7, '3-7'],
  ];
  for (const [limit, label] of upperBounds) {
    if (paragraphCount <= limit) {
      return label;
    }
  }

  return '8+';
}
|
|
|
|
/**
 * Describes the page's heading layout from its h1/h2/h3 counts and the
 * number of headline links.
 *
 * @param {string} html - Page markup.
 * @param {number} headlineLinks - Count of links identified as headlines.
 * @returns {'single_h1'|'repeated_h2_cards'|'multiple_headline_links'|'mixed'}
 */
function detectHeadlineContainerPattern(html, headlineLinks) {
  const countOpenings = (openingTag) => (html.match(openingTag) || []).length;

  const h1Total = countOpenings(/<h1\b/gi);
  const h2Total = countOpenings(/<h2\b/gi);
  const h3Total = countOpenings(/<h3\b/gi);

  // Exactly one h1 with few competing headlines.
  const looksLikeSingleStory = h1Total === 1 && headlineLinks <= 6 && h2Total <= 4;
  if (looksLikeSingleStory) {
    return 'single_h1';
  }

  // Many sub-headings repeated down the page.
  const hasRepeatedSubheadings = h2Total >= 6 || h3Total >= 8;
  if (hasRepeatedSubheadings) {
    return 'repeated_h2_cards';
  }

  return headlineLinks >= 10 ? 'multiple_headline_links' : 'mixed';
}
|
|
|
|
/**
 * Labels the overall shape of a URL's path.
 *
 * @param {string} url - Absolute URL.
 * @returns {'root'|'dated_article'|'section_root'|'short_section_slug'|'multi_segment_slug'|'unknown'}
 *   'unknown' when the URL cannot be parsed.
 */
function detectCanonicalPattern(url) {
  let parts;
  try {
    parts = (new URL(url).pathname || '/').split('/').filter(Boolean);
  } catch {
    return 'unknown';
  }

  if (parts.length === 0) {
    return 'root';
  }

  // A leading four-digit segment wins regardless of path depth.
  if (/^\d{4}$/.test(parts[0])) {
    return 'dated_article';
  }

  switch (parts.length) {
    case 1:
      return 'section_root';
    case 2:
      return 'short_section_slug';
    default:
      return 'multi_segment_slug';
  }
}
|
|
|
|
/**
 * Derives the full list of structural signals for one page. Each signal is a
 * `{ ruleType, ruleValue }` pair. Signals come from the page's meta tags,
 * JSON-LD, markup (tags, attributes, class tokens), text-volume heuristics
 * and URL path. Values are normalized by `pushSignal`; duplicates are removed
 * while keeping first-seen order.
 *
 * The emission order below is significant: consumers truncate the list, so
 * earlier signals take priority.
 *
 * @param {string} url - Page URL; an unparseable URL is treated as path '/'.
 * @param {{get: function(string): *}} meta - Meta-tag lookup exposing `get(name)`.
 * @param {string} html - Page markup.
 * @param {?object} jsonLdArticle - JSON-LD article node, if any; `@type`,
 *   `datePublished` and `dateModified` are read.
 * @param {Array} links - Links on the page (only the count is used here).
 * @param {{paragraphCount: number, paragraphTextLength: number, headlineLinks: number}} heuristic
 *   Pre-computed page metrics.
 * @returns {Array<{ruleType: string, ruleValue: string}>} Unique signals.
 */
function buildRuleSignals(url, meta, html, jsonLdArticle, links, heuristic) {
  const signals = [];
  const add = (ruleType, ruleValue) => pushSignal(signals, ruleType, ruleValue);

  const classTokens = extractClassTokens(html);
  const presentTags = new Set(extractTagSummary(html));

  let pathname = '/';
  try {
    pathname = new URL(url).pathname || '/';
  } catch {
    // Unparseable URL: keep '/' so that no path-based signals are produced.
  }
  const segments = pathname.split('/').filter(Boolean);
  const normalizedSegments = segments.map((segment) => normalizePathSegment(segment));

  // Coerce the numeric metrics once. A missing or non-numeric text length
  // counts as 0; left raw, `undefined < 200` is false and such a page would
  // be reported as having 'substantial' text.
  const paragraphCount = Number(heuristic.paragraphCount || 0);
  const paragraphTextLength = Number(heuristic.paragraphTextLength) || 0;
  const { headlineLinks } = heuristic;

  // --- Meta tags -----------------------------------------------------------
  const selectedMetaKeys = [
    'og:type',
    'og:title',
    'article:published_time',
    'og:article:published_time',
    'author',
    'article:author',
    'twitter:title',
    'description',
    'article:section',
  ];
  for (const key of selectedMetaKeys) {
    const value = meta.get(key);
    if (value) {
      add('meta_presence', key);
      add('meta_value_pattern', `${key}:${value}`);
    }
  }

  const ogType = String(meta.get('og:type') || '').trim().toLowerCase();
  const articlePublishedTime = meta.get('article:published_time');
  const publishTime = String(articlePublishedTime || meta.get('og:article:published_time') || '').trim();
  const jsonLdType = String((jsonLdArticle && jsonLdArticle['@type']) || '').trim().toLowerCase();

  if (ogType) {
    add('meta_og_type', ogType);
  }
  if (publishTime) {
    add('meta_has_publish_time', 'true');
    add('time_signal', articlePublishedTime ? 'meta_article_published_time' : 'meta_og_article_published_time');
  }
  if (jsonLdType) {
    add('jsonld_type', jsonLdType);
  }

  // --- Structural tags and attributes --------------------------------------
  for (const tag of ['article', 'main', 'nav', 'aside', 'section', 'time', 'h1', 'h2']) {
    if (presentTags.has(tag)) {
      add('has_tag', tag);
      add('selector_present', tag);
    }
  }

  // <main> and <article> opening within 800 characters of each other.
  if (/<main\b[\s\S]{0,800}<article\b/i.test(html) || /<article\b[\s\S]{0,800}<main\b/i.test(html)) {
    add('selector_present', 'main article');
    add('body_container_signal', 'main_article');
  }
  if (/itemprop\s*=\s*(["'])articlebody\1/i.test(html)) {
    add('selector_present', '[itemprop="articlebody"]');
    add('attr_presence', 'itemprop:articlebody');
    add('body_container_signal', 'itemprop_articlebody');
  }
  if (/itemprop\s*=\s*(["'])headline\1/i.test(html)) {
    add('attr_presence', 'itemprop:headline');
  }
  if (/rel\s*=\s*(["'])author\1/i.test(html)) {
    add('selector_present', '[rel="author"]');
    add('attr_presence', 'rel:author');
    add('byline_signal', 'rel_author');
  }
  if (/role\s*=\s*(["'])main\1/i.test(html)) {
    add('attr_presence', 'role:main');
  }
  if (/role\s*=\s*(["'])navigation\1/i.test(html)) {
    add('attr_presence', 'role:navigation');
  }
  if (/<time\b[^>]*datetime\s*=/i.test(html)) {
    add('selector_present', 'time[datetime]');
    add('time_signal', 'time_datetime');
  }
  if (jsonLdArticle && jsonLdArticle.datePublished) {
    add('time_signal', 'jsonld_datepublished');
  }
  if (jsonLdArticle && jsonLdArticle.dateModified) {
    add('time_signal', 'jsonld_datemodified');
  }

  // --- Page-shape buckets --------------------------------------------------
  const linkDensityBucket = detectLinkDensityBucket(links, paragraphTextLength);
  add('paragraph_count_bucket', detectParagraphBucket(paragraphCount));
  add('link_density_bucket', linkDensityBucket);
  add('headline_container_pattern', detectHeadlineContainerPattern(html, headlineLinks));
  add('canonical_pattern', detectCanonicalPattern(url));

  let textDepth = 'substantial';
  if (paragraphTextLength < 200) {
    textDepth = 'very_low';
  } else if (paragraphTextLength < 600) {
    textDepth = 'low';
  }
  add('shallow_text_signal', textDepth);

  if (headlineLinks >= 8 || (html.match(/<h2\b/gi) || []).length >= 6) {
    add('repeated_card_signal', 'present');
  }

  if (presentTags.has('nav')) {
    add('nav_density_bucket', linkDensityBucket === 'high' ? 'high' : 'present');
  }

  // --- Byline hints --------------------------------------------------------
  if (meta.get('author') || meta.get('article:author')) {
    add('byline_signal', 'meta_author');
  }
  if (classTokens.some((token) => token.includes('byline'))) {
    add('byline_signal', 'class_token_byline');
  }
  if (classTokens.some((token) => token === 'author' || token.endsWith('author'))) {
    add('byline_signal', 'class_token_author');
  }
  if (/\bby\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b/.test(html)) {
    add('byline_signal', 'text_prefix_by');
  }

  // --- Class-token vocabulary ----------------------------------------------
  const tokenCategories = [
    ['body_container_signal', new Set(['article', 'story', 'headline', 'content', 'body', 'byline', 'author'])],
    ['listing_container_signal', new Set(['latest', 'archive', 'category', 'topic', 'topics', 'feed', 'river', 'grid', 'cards', 'listing', 'section', 'stream'])],
    ['commercial_signal', new Set(['subscribe', 'subscription', 'newsletter', 'advertise', 'sponsored'])],
    ['media_signal', new Set(['video', 'podcast', 'live', 'watch', 'media'])],
  ];
  for (const token of classTokens.slice(0, 60)) {
    add('class_token_present', token);
    for (const [ruleType, vocabulary] of tokenCategories) {
      if (vocabulary.has(token)) {
        add(ruleType, token);
      }
    }
  }

  // --- Pagination ----------------------------------------------------------
  const relTokens = new Set(
    extractAttributeValues(html, 'rel').flatMap((value) => value.toLowerCase().split(/\s+/)),
  );
  if (relTokens.has('next')) {
    add('pagination_signal', 'rel_next');
  }
  if (relTokens.has('prev')) {
    add('pagination_signal', 'rel_prev');
  }
  if (/[?&]page=\d+/i.test(url) || /\/page\/\d+(?:\/|$)/i.test(pathname)) {
    add('pagination_signal', 'page_param');
  }
  if (/load more/i.test(html)) {
    add('pagination_signal', 'load_more');
  }
  if (/(next page|older posts|older stories)/i.test(html)) {
    add('pagination_signal', 'next_page_text');
  }

  // --- URL path ------------------------------------------------------------
  const segmentCategories = [
    ['utility_path_signal', /^(login|signin|search|account|about|contact|privacy|terms)$/i],
    ['commercial_signal', /^(subscribe|subscription|newsletter|advertise|sponsored)$/i],
    ['media_signal', /^(video|videos|podcast|podcasts|live|watch)$/i],
  ];
  for (const [ruleType, shape] of segmentCategories) {
    segments.forEach((segment, index) => {
      if (shape.test(segment)) {
        add(ruleType, normalizedSegments[index]);
      }
    });
  }

  const urlPattern = buildUrlPattern(url);
  if (urlPattern) {
    add('url_pattern', urlPattern);
  }

  // Prefixes of depth 1..3, e.g. '/news', '/news/{year}', '/news/{year}/{num2}'.
  const maxPrefixDepth = Math.min(3, normalizedSegments.length);
  for (let depth = 1; depth <= maxPrefixDepth; depth += 1) {
    add('url_prefix_pattern', `/${normalizedSegments.slice(0, depth).join('/')}`);
  }

  for (const segment of normalizedSegments.slice(0, 5)) {
    add('path_segment', segment);
  }

  return uniqueSignals(signals);
}
|
|
|
|
/**
 * Tells whether a stored rule is satisfied by a page's signals, i.e. whether
 * some signal has the rule's exact type and value.
 *
 * @param {{rule_type: string, rule_value: string}} rule - Stored rule row.
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Page signals.
 * @returns {boolean} True when a signal matches.
 */
function matchRule(rule, signals) {
  for (const candidate of signals) {
    const sameType = candidate.ruleType === rule.rule_type;
    if (sameType && candidate.ruleValue === rule.rule_value) {
      return true;
    }
  }

  return false;
}
|
|
|
|
/**
 * Renders signals as newline-separated `type:value` lines, using at most
 * the first 120 entries.
 *
 * @param {Array<{ruleType: string, ruleValue: string}>} signals
 * @returns {string} One line per signal; '' for an empty list.
 */
function formatSignalsForPrompt(signals) {
  const lines = [];

  for (const { ruleType, ruleValue } of signals.slice(0, 120)) {
    lines.push(`${ruleType}:${ruleValue}`);
  }

  return lines.join('\n');
}
|
|
|
|
/**
 * Builds the compact plain-text page summary that is sent to the LLM:
 * URL, title, first h1, selected meta/JSON-LD fields, page metrics, up to 12
 * sample links, up to 5 paragraph excerpts and up to 40 available signals.
 * The whole summary is capped at 4200 characters.
 *
 * @param {string} url - Page URL.
 * @param {string} html - Page markup.
 * @param {{get: function(string): *}} meta - Meta-tag lookup exposing `get(name)`.
 * @param {?object} jsonLdArticle - JSON-LD article node, if any.
 * @param {Array<{url: string, text: string}>} links - Links on the page.
 * @param {{paragraphCount: number, paragraphTextLength: number, headlineLinks: number}} heuristic
 * @param {Array<{ruleType: string, ruleValue: string}>} signals - Page signals.
 * @returns {string} Newline-separated summary.
 */
function sanitizeForLlm(url, html, meta, jsonLdArticle, links, heuristic, signals) {
  const titleHit = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const h1Hit = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
  const timeHit = html.match(/<time\b[^>]*datetime\s*=\s*(["'])(.*?)\1/i);

  // First five non-empty paragraph excerpts, numbered from 1.
  const paragraphLines = [];
  for (const block of html.match(/<p\b[^>]*>[\s\S]*?<\/p>/gi) || []) {
    const excerpt = sanitizeText(block, 180);
    if (excerpt) {
      paragraphLines.push(`P${paragraphLines.length + 1}: ${excerpt}`);
    }
    if (paragraphLines.length === 5) {
      break;
    }
  }

  const linkLines = [];
  for (const link of links.slice(0, 12)) {
    linkLines.push(`${link.url} | ${sanitizeText(link.text, 120)}`);
  }
  const sampleLinks = linkLines.join('\n');

  // Published time: meta tags take precedence over the first <time datetime>.
  const publishedRaw = meta.get('article:published_time')
    || meta.get('og:article:published_time')
    || (timeHit ? timeHit[2] : '')
    || '';

  const title = titleHit ? sanitizeText(titleHit[1]) : '';
  const firstHeading = h1Hit ? sanitizeText(h1Hit[1]) : '';
  const jsonLdType = jsonLdArticle ? String(jsonLdArticle['@type'] || '').slice(0, 80) : '';
  const jsonLdHeadline = jsonLdArticle ? sanitizeText(jsonLdArticle.headline || '') : '';

  const lines = [
    `URL: ${url}`,
    `TITLE: ${title}`,
    `H1: ${firstHeading}`,
    `OG_TYPE: ${String(meta.get('og:type') || '').slice(0, 80)}`,
    `OG_TITLE: ${sanitizeText(meta.get('og:title') || '')}`,
    `PUBLISHED: ${String(publishedRaw).slice(0, 80)}`,
    `JSONLD_TYPE: ${jsonLdType}`,
    `JSONLD_HEADLINE: ${jsonLdHeadline}`,
    `LINK_COUNT: ${links.length}`,
    `PARAGRAPH_COUNT: ${heuristic.paragraphCount}`,
    `PARAGRAPH_TEXT_LENGTH: ${heuristic.paragraphTextLength}`,
    `HEADLINE_LINKS: ${heuristic.headlineLinks}`,
  ];

  // The LINKS section is omitted entirely when there are no links.
  if (sampleLinks) {
    lines.push(`LINKS:\n${sampleLinks}`);
  }
  lines.push(...paragraphLines);
  lines.push(`AVAILABLE_SIGNALS:\n${formatSignalsForPrompt(signals.slice(0, 40))}`);

  return lines.join('\n').slice(0, 4200);
}
|
|
|
|
/**
 * Asks the LLM (OpenRouter chat-completions endpoint) to classify a page and
 * to name the signals behind its decision.
 *
 * @param {string} url - Page URL. Not used by the request itself; kept for
 *   interface compatibility with existing callers.
 * @param {string} sanitizedHtml - Page summary appended to the user prompt.
 * @param {{articleScore: *, listingScore: *, shouldAskLlm: *}} heuristic
 *   Heuristic scores that are quoted in the prompt.
 * @returns {Promise<{classification: 'article'|'listing'|'other',
 *   learnedSignals: Array<{ruleType: string, ruleValue: string}>,
 *   negativeSignals: Array<{ruleType: string, ruleValue: string}>}>}
 *   Any label other than ARTICLE or LISTING (including an unparseable reply)
 *   maps to 'other'.
 * @throws {Error} On a non-2xx response; `error.status` carries the HTTP
 *   status. Network failures and the request timeout reject with the error
 *   raised by `fetch`.
 */
async function requestLlmClassification(url, sanitizedHtml, heuristic) {
  // Upper bound for the whole HTTP exchange, so a stalled connection cannot
  // block the crawler indefinitely.
  const requestTimeoutMs = 30000;

  // Keeps only well-formed entries of an allowed type; at most 8 per list.
  const toSignals = (entries) => {
    if (!Array.isArray(entries)) {
      return [];
    }

    return entries
      .filter((entry) => entry && typeof entry === 'object')
      .map((entry) => ({
        ruleType: String(entry.type || '').trim(),
        ruleValue: normalizeRuleValue(entry.value || ''),
      }))
      .filter((entry) => POSITIVE_RULE_TYPES.has(entry.ruleType) && entry.ruleValue)
      .slice(0, 8);
  };

  const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${String(config.openRouter.apiKey || '').trim()}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'openai/gpt-4.1-mini',
      messages: [
        {
          role: 'system',
          content: 'Classify pages for a news crawler. Return strict JSON only with keys classification, confidence, learnedSignals, and negativeSignals. classification must be ARTICLE, LISTING, or OTHER. learnedSignals and negativeSignals must be arrays of objects with keys type and value. Only use reusable, site-level structural signals. Allowed types: meta_og_type, meta_has_publish_time, jsonld_type, has_tag, url_pattern, path_segment, meta_presence, meta_value_pattern, selector_present, class_token_present, attr_presence, link_density_bucket, paragraph_count_bucket, headline_container_pattern, byline_signal, time_signal, body_container_signal, listing_container_signal, pagination_signal, url_prefix_pattern, canonical_pattern, shallow_text_signal, repeated_card_signal, nav_density_bucket, utility_path_signal, commercial_signal, media_signal.',
        },
        {
          role: 'user',
          content: [
            'ARTICLE = a single news story page.',
            'LISTING = homepage, topic page, category page, archive, feature hub, or page containing many article links.',
            'OTHER = anything else.',
            'learnedSignals should explain why this page belongs to its classification.',
            'negativeSignals should capture strong anti-article clues when this page is LISTING or OTHER.',
            'Return compact JSON only. Keep learnedSignals and negativeSignals short. Max 3 entries in each array.',
            'Never include exact titles, exact names, full article text, random hashes, or one-off values.',
            `HEURISTIC_ARTICLE_SCORE: ${heuristic.articleScore}`,
            `HEURISTIC_LISTING_SCORE: ${heuristic.listingScore}`,
            `HEURISTIC_SHOULD_ASK: ${heuristic.shouldAskLlm ? 'yes' : 'no'}`,
            sanitizedHtml,
          ].join('\n\n'),
        },
      ],
      temperature: 0,
      max_tokens: 220,
      response_format: { type: 'json_object' },
    }),
    signal: AbortSignal.timeout(requestTimeoutMs),
  });

  if (!response.ok) {
    let message = `crawler classification failed with ${response.status}`;

    try {
      // Prefer the provider's own error message when the body carries one.
      const errorPayload = await response.json();
      const errorMessage = errorPayload?.error?.message;
      if (errorMessage) {
        message = errorMessage;
      }
    } catch (error) {
      console.error('failed to parse crawler classification error response:', error);
    }

    const requestError = new Error(message);
    requestError.status = response.status;
    throw requestError;
  }

  const payload = await response.json();
  const content = String(payload?.choices?.[0]?.message?.content || '').trim();

  // Guard against a reply that decodes to `null` or a primitive: property
  // access on it must not throw.
  const decoded = parseJsonLoose(content);
  const parsed = decoded !== null && typeof decoded === 'object' ? decoded : {};

  const label = String(parsed.classification || '').trim().toUpperCase();
  let classification = 'other';
  if (label === 'ARTICLE') {
    classification = 'article';
  } else if (label === 'LISTING') {
    classification = 'listing';
  }

  return {
    classification,
    learnedSignals: toSignals(parsed.learnedSignals),
    negativeSignals: toSignals(parsed.negativeSignals),
  };
}
|
|
|
|
/**
 * Classifies a page, trying the cheapest source first:
 *
 *   1. the per-URL cache,
 *   2. a learned URL pattern for the site,
 *   3. a learned site rule matching one of the page's signals,
 *   4. the LLM (only when an OpenRouter API key is configured).
 *
 * Pattern, rule and LLM results are written to the per-URL cache. An LLM
 * result additionally records the URL pattern and every returned signal as
 * learned data for the site.
 *
 * @param {object} params
 * @param {string} params.siteName - Site the page belongs to.
 * @param {string} params.url - Page URL.
 * @param {string} params.html - Page markup.
 * @param {{get: function(string): *}} params.meta - Meta-tag lookup.
 * @param {?object} params.jsonLdArticle - JSON-LD article node, if any.
 * @param {object} params.heuristic - Pre-computed page metrics and scores.
 * @param {Array} params.links - Links on the page.
 * @param {number} params.minPatternHits - Minimum hit count a learned
 *   pattern or rule needs before it is trusted.
 * @returns {Promise<{classification: ?string, source: string,
 *   learnedSignals: Array, negativeSignals: Array}>} `source` is one of
 *   'cache', 'pattern', 'rule', 'disabled' or 'llm'; `classification` is null
 *   when the source is 'disabled'.
 */
async function classifyPageWithLlm({ siteName, url, html, meta, jsonLdArticle, heuristic, links, minPatternHits }) {
  // Result shape for every outcome that did not involve the LLM.
  const withoutSignals = (classification, source) => ({
    classification,
    source,
    learnedSignals: [],
    negativeSignals: [],
  });

  // 1. Exact-URL cache.
  const cachedRow = selectCachedClassification.get(url);
  if (cachedRow) {
    return withoutSignals(cachedRow.classification, 'cache');
  }

  // 2. Learned URL patterns; rows arrive ordered by hit count, first match wins.
  const pattern = buildUrlPattern(url);
  if (pattern) {
    const pathname = new URL(url).pathname || '/';

    for (const candidate of selectPatternsForSite.all(siteName, minPatternHits)) {
      if (!patternToRegex(candidate.pattern).test(pathname)) {
        continue;
      }

      upsertCachedClassification.run(url, siteName, candidate.classification, candidate.pattern);
      return withoutSignals(candidate.classification, 'pattern');
    }
  }

  // 3. Learned site rules checked against this page's signals.
  const ruleSignals = buildRuleSignals(url, meta, html, jsonLdArticle, links, heuristic);
  for (const rule of selectRulesForSite.all(siteName, minPatternHits)) {
    if (matchRule(rule, ruleSignals)) {
      upsertCachedClassification.run(url, siteName, rule.classification, pattern);
      return withoutSignals(rule.classification, 'rule');
    }
  }

  // 4. LLM fallback; skipped when no API key is configured.
  const apiKey = String(config.openRouter?.apiKey || '').trim();
  if (!apiKey) {
    return withoutSignals(null, 'disabled');
  }

  const prompt = sanitizeForLlm(url, html, meta, jsonLdArticle, links, heuristic, ruleSignals);
  const { classification, learnedSignals, negativeSignals } = await requestLlmClassification(url, prompt, heuristic);

  // Persist the verdict and everything learned from it.
  upsertCachedClassification.run(url, siteName, classification, pattern);
  if (pattern) {
    upsertPattern.run(siteName, pattern, classification);
  }
  for (const { ruleType, ruleValue } of [...learnedSignals, ...negativeSignals]) {
    upsertRule.run(siteName, ruleType, ruleValue, classification);
  }

  console.log(`[crawler-llm] ${siteName} ${classification.toUpperCase()} ${url}`);

  return {
    classification,
    source: 'llm',
    learnedSignals,
    negativeSignals,
  };
}
|
|
|
|
module.exports = {
|
|
classifyPageWithLlm,
|
|
buildUrlPattern,
|
|
};
|