const db = require('../db'); const config = require('../config'); const POSITIVE_RULE_TYPES = new Set([ 'meta_og_type', 'meta_has_publish_time', 'jsonld_type', 'has_tag', 'url_pattern', 'path_segment', 'meta_presence', 'meta_value_pattern', 'selector_present', 'class_token_present', 'attr_presence', 'link_density_bucket', 'paragraph_count_bucket', 'headline_container_pattern', 'byline_signal', 'time_signal', 'body_container_signal', 'listing_container_signal', 'pagination_signal', 'url_prefix_pattern', 'canonical_pattern', 'shallow_text_signal', 'repeated_card_signal', 'nav_density_bucket', 'utility_path_signal', 'commercial_signal', 'media_signal', ]); const selectCachedClassification = db.prepare(` SELECT classification FROM crawler_page_classifications WHERE url = ? `); const upsertCachedClassification = db.prepare(` INSERT INTO crawler_page_classifications (url, site_name, classification, pattern) VALUES (?, ?, ?, ?) ON CONFLICT(url) DO UPDATE SET site_name = excluded.site_name, classification = excluded.classification, pattern = excluded.pattern, classified_at = datetime('now') `); const selectPatternsForSite = db.prepare(` SELECT pattern, classification, hit_count FROM crawler_url_patterns WHERE site_name = ? AND hit_count >= ? ORDER BY hit_count DESC, pattern ASC `); const upsertPattern = db.prepare(` INSERT INTO crawler_url_patterns (site_name, pattern, classification, hit_count) VALUES (?, ?, ?, 1) ON CONFLICT(site_name, pattern) DO UPDATE SET classification = excluded.classification, hit_count = CASE WHEN crawler_url_patterns.classification = excluded.classification THEN crawler_url_patterns.hit_count + 1 ELSE 1 END, updated_at = datetime('now') `); const selectRulesForSite = db.prepare(` SELECT rule_type, rule_value, classification, hit_count FROM crawler_site_rules WHERE site_name = ? AND hit_count >= ? ORDER BY hit_count DESC, rule_type ASC, rule_value ASC `); const upsertRule = db.prepare(` INSERT INTO crawler_site_rules (site_name, rule_type, rule_value, classification, hit_count) VALUES (?, ?, ?, ?, 1) ON CONFLICT(site_name, rule_type, rule_value) DO UPDATE SET classification = excluded.classification, hit_count = CASE WHEN crawler_site_rules.classification = excluded.classification THEN crawler_site_rules.hit_count + 1 ELSE 1 END, updated_at = datetime('now') `); function normalizePathSegment(segment) { if (/^\d{4}$/.test(segment)) { return '{year}'; } if (/^\d{2}$/.test(segment)) { return '{num2}'; } if (/^\d+$/.test(segment)) { return '{id}'; } if (/^[a-f0-9]{8,}$/i.test(segment)) { return '{hex}'; } return String(segment || '').toLowerCase(); } function buildUrlPattern(url) { try { const parsed = new URL(url); const normalizedSegments = parsed.pathname .split('/') .filter(Boolean) .map(normalizePathSegment); return `/${normalizedSegments.join('/')}` || '/'; } catch { return null; } } function patternToRegex(pattern) { const escaped = String(pattern || '') .replace(/[.*+?^${}()|[\]\\]/g, '\\$&') .replace(/\{year\}/g, '\\d{4}') .replace(/\{num2\}/g, '\\d{2}') .replace(/\{id\}/g, '\\d+') .replace(/\{hex\}/g, '[a-f0-9]+'); return new RegExp(`^${escaped}$`, 'i'); } function sanitizeText(value, maxLength = 200) { return String(value || '') .replace(/<[^>]*>/g, ' ') .replace(/\s+/g, ' ') .trim() .slice(0, maxLength); } function normalizeRuleValue(value) { return sanitizeText(String(value || '').toLowerCase(), 160); } function pushSignal(signals, ruleType, ruleValue) { const normalizedValue = normalizeRuleValue(ruleValue); if (!ruleType || !normalizedValue) { return; } signals.push({ ruleType, ruleValue: normalizedValue }); } function uniqueSignals(signals) { const seen = new Set(); const unique = []; for (const signal of signals) { const key = `${signal.ruleType}:${signal.ruleValue}`; if (seen.has(key)) { continue; } seen.add(key); unique.push(signal); } return unique; } function extractJsonObjectString(value) { const text = String(value || '').trim(); const start = text.indexOf('{'); if (start === -1) { return ''; } let depth = 0; let inString = false; let escape = false; for (let index = start; index < text.length; index += 1) { const char = text[index]; if (escape) { escape = false; continue; } if (char === '\\') { escape = true; continue; } if (char === '"') { inString = !inString; continue; } if (inString) { continue; } if (char === '{') { depth += 1; continue; } if (char === '}') { depth -= 1; if (depth === 0) { return text.slice(start, index + 1); } } } return text.slice(start); } function repairJsonString(value) { let repaired = String(value || '').trim(); if (!repaired) { return ''; } repaired = repaired .replace(/^```(?:json)?\s*/i, '') .replace(/\s*```$/i, '') .trim(); repaired = extractJsonObjectString(repaired); if (!repaired) { return ''; } repaired = repaired .replace(/[\u0000-\u001f]+/g, ' ') .replace(/,\s*([}\]])/g, '$1') .replace(/:\s*undefined\b/g, ': null') .replace(/:\s*NaN\b/g, ': null') .replace(/:\s*Infinity\b/g, ': null'); const openCurly = (repaired.match(/\{/g) || []).length; const closeCurly = (repaired.match(/\}/g) || []).length; const openSquare = (repaired.match(/\[/g) || []).length; const closeSquare = (repaired.match(/\]/g) || []).length; if (closeSquare < openSquare) { repaired += ']'.repeat(openSquare - closeSquare); } if (closeCurly < openCurly) { repaired += '}'.repeat(openCurly - closeCurly); } return repaired; } function parseJsonLoose(value) { const direct = String(value || '').trim(); if (!direct) { return {}; } try { return JSON.parse(direct); } catch { } const repaired = repairJsonString(direct); if (!repaired) { return {}; } try { return JSON.parse(repaired); } catch (error) { console.error('failed to parse crawler classification payload:', error, direct); return {}; } } function extractClassTokens(html) { const attrs = html.match(/\bclass\s*=\s*(["'])(.*?)\1/gi) || []; const tokens = []; for (const attr of attrs) { const match = attr.match(/\bclass\s*=\s*(["'])(.*?)\1/i); const raw = match ? match[2] : ''; for (const token of raw.split(/\s+/)) { const normalized = String(token || '').trim().toLowerCase(); if (!normalized || normalized.length < 3 || normalized.length > 40) { continue; } if (!/[a-z]/.test(normalized)) { continue; } if (/^(jsx-\d+|sc-[a-z0-9]+|css-[a-z0-9]+|_[a-z0-9]+|[a-f0-9]{10,})$/i.test(normalized)) { continue; } tokens.push(normalized); for (const part of normalized.split(/[_-]+/)) { if (part.length >= 4 && part.length <= 24 && /[a-z]/.test(part) && !/^\d+$/.test(part)) { tokens.push(part); } } } } return uniqueSignals(tokens.map((token) => ({ ruleType: 'class_token_present', ruleValue: token }))).map((entry) => entry.ruleValue); } function extractTagSummary(html) { const tags = new Set(); const regex = /<([a-z0-9:-]+)\b/gi; let match; while ((match = regex.exec(html)) !== null && tags.size < 50) { tags.add(String(match[1] || '').toLowerCase()); } return [...tags]; } function extractAttributeValues(html, attrName) { const regex = new RegExp(`\\b${attrName}\\s*=\\s*(["'])(.*?)\\1`, 'gi'); const values = []; let match; while ((match = regex.exec(html)) !== null) { values.push(String(match[2] || '').trim()); } return values; } function detectLinkDensityBucket(links, paragraphTextLength) { if (!paragraphTextLength) { return links.length >= 15 ? 'high' : 'medium'; } const ratio = (links.length * 1000) / Math.max(paragraphTextLength, 1); if (ratio >= 18 || links.length >= 60) { return 'high'; } if (ratio >= 8 || links.length >= 25) { return 'medium'; } return 'low'; } function detectParagraphBucket(paragraphCount) { if (paragraphCount === 0) { return '0'; } if (paragraphCount <= 2) { return '1-2'; } if (paragraphCount <= 7) { return '3-7'; } return '8+'; } function detectHeadlineContainerPattern(html, headlineLinks) { const h1Count = (html.match(/