const db = require('./db'); const { normalizeTitle } = require('./dedup'); const { markSourceRun } = require('./state'); const sourcesById = Object.fromEntries( require('../sources.json').map((s) => [s.id, s]) ); const insertArticle = db.prepare(` INSERT INTO articles ( title, description, content, is_index_page, url, normalized_title, source, pub_date, ingested_at, pub_date_effective, language ) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?) `); const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?'); const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i; const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i; function normalizePubDate(value) { if (!value) { return null; } if (typeof value === 'number') { const parsed = new Date(value); return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); } const input = String(value).trim(); if (!input) { return null; } if (/^\d{8}T\d{6}$/.test(input)) { const normalized = `${input.slice(0, 4)}-${input.slice(4, 6)}-${input.slice(6, 8)}T${input.slice(9, 11)}:${input.slice(11, 13)}:${input.slice(13, 15)}Z`; const parsed = new Date(normalized); return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); } if (/^\d{8}T\d{6}Z$/.test(input)) { const normalized = `${input.slice(0, 4)}-${input.slice(4, 6)}-${input.slice(6, 8)}T${input.slice(9, 11)}:${input.slice(11, 13)}:${input.slice(13, 15)}Z`; const parsed = new Date(normalized); return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); } if (/^\d{4}-\d{2}-\d{2}$/.test(input)) { return `${input}T00:00:00.000Z`; } const parsed = new Date(input); return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString(); } function inferIsIndexPage(article, title, url) { if (article.isIndexPage != null) { return article.isIndexPage ? 1 : 0; } return INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title) ? 1 : 0; } function ingestArticle(article) { const title = String(article.title || '').trim(); const url = String(article.url || '').trim(); const source = String(article.source || '').trim(); if (!title || !url || !source) { return { inserted: false, reason: 'missing_required_fields' }; } const normalizedTitle = normalizeTitle(title); if (!normalizedTitle) { return { inserted: false, reason: 'empty_normalized_title' }; } const description = article.description == null ? null : String(article.description).trim() || null; const isIndexPage = inferIsIndexPage(article, title, url); const pubDate = normalizePubDate(article.pubDate); const ingestedAt = new Date().toISOString(); const language = (sourcesById[source] && sourcesById[source].language) || null; try { const result = insertArticle.run( title, description, isIndexPage, url, normalizedTitle, source, pubDate, ingestedAt, pubDate || ingestedAt, language ); // dont kick off the content fetch here — it used to be fire-and-forget which // pinned thousands of pending render promises in memory during big gdelt // backfills. the runContentLoop polls for pending rows and handles them // with proper concurrency limits return { inserted: true, id: result.lastInsertRowid }; } catch (error) { if (error.code === 'SQLITE_CONSTRAINT_UNIQUE') { const duplicateByUrl = findByUrl.get(url); if (duplicateByUrl) { return { inserted: false, reason: 'duplicate_url', id: duplicateByUrl.id }; } return { inserted: false, reason: 'duplicate' }; } throw error; } } async function ingestBatch(source, articles) { let inserted = 0; for (const article of articles) { const result = ingestArticle({ ...article, source: article.source || source }); if (result.inserted) { inserted += 1; } } markSourceRun(source); return { source, inserted, total: articles.length }; } module.exports = { ingestArticle, ingestBatch, };