// Duriin-API/src/ingest.js
//
// 143 lines
// 4.1 KiB
// JavaScript

const db = require('./db');
const { normalizeTitle } = require('./dedup');
const { markSourceRun } = require('./state');
// Lookup table keyed by source id, built once at module load from the entries
// of ../sources.json. Each value is the full entry object; ingestArticle reads
// its `language` field when inserting a row.
const sourcesById = Object.fromEntries(
require('../sources.json').map((s) => [s.id, s])
);
// Prepared INSERT for a new row in `articles`.
// The statement takes ten bound parameters, in this order:
//   title, description, is_index_page, url, normalized_title, source,
//   pub_date, ingested_at, pub_date_effective, language
// `content` is not a parameter: it is always inserted as NULL.
// NOTE(review): the .run()/.get()/lastInsertRowid usage in this file suggests
// `db` is a better-sqlite3 style Database — confirm in ./db.
const insertArticle = db.prepare(`
INSERT INTO articles (
title,
description,
content,
is_index_page,
url,
normalized_title,
source,
pub_date,
ingested_at,
pub_date_effective,
language
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?)
`);
// Prepared lookup of an article id by exact url. ingestArticle uses it after a
// unique-constraint failure to report which existing row shares the url.
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
// Case-insensitive match for URLs containing a listing-style path segment
// (/category, /tag, /topic, /section, /archive, /authors, /search and the
// listed plural forms), where the segment is followed by "/" or ends the string.
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i;
// Case-insensitive match for titles containing the whole word "category",
// "archive" or "archives", or the phrase "latest news".
const INDEX_PAGE_TITLE_HINT = /\b(category|archives?|latest news)\b/i;
/**
 * Normalize a publication date into an ISO-8601 UTC string
 * (`YYYY-MM-DDTHH:mm:ss.sssZ`), or `null` when it is missing or unparseable.
 *
 * Accepted inputs:
 *  - falsy values (`null`, `undefined`, `''`, `0`) -> `null`
 *  - `Date` instances -> serialized directly (milliseconds preserved)
 *  - numbers -> passed to `new Date(number)`, i.e. epoch milliseconds
 *  - compact `YYYYMMDDTHHMMSS` with an optional trailing `Z` -> read as UTC
 *  - date-only `YYYY-MM-DD` -> midnight UTC of that calendar day
 *  - anything else -> whatever `new Date(string)` can parse
 *
 * @param {*} value - Raw publication date from a feed item.
 * @returns {string|null} ISO-8601 UTC timestamp, or `null`.
 */
function normalizePubDate(value) {
  // Shared tail of every branch: an Invalid Date becomes null.
  const toIsoOrNull = (date) => (Number.isNaN(date.getTime()) ? null : date.toISOString());

  if (!value) {
    return null;
  }
  if (value instanceof Date) {
    return toIsoOrNull(value);
  }
  if (typeof value === 'number') {
    return toIsoOrNull(new Date(value));
  }

  const input = String(value).trim();
  if (!input) {
    return null;
  }

  // Compact form, with or without the trailing "Z"; both are treated as UTC.
  const compact = /^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z?$/.exec(input);
  if (compact) {
    const [, year, month, day, hour, minute, second] = compact;
    return toIsoOrNull(new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}Z`));
  }

  if (/^\d{4}-\d{2}-\d{2}$/.test(input)) {
    // Matching the pattern is not enough: "2024-13-45" matches but is not a
    // date. Parse it, and require the result to land on the same calendar day
    // so engines that roll "02-30" over into March are rejected too.
    const iso = toIsoOrNull(new Date(`${input}T00:00:00.000Z`));
    return iso !== null && iso.startsWith(input) ? iso : null;
  }

  return toIsoOrNull(new Date(input));
}
/**
 * Decide whether an article is a listing/index page rather than a story.
 *
 * An explicit `article.isIndexPage` (anything other than null/undefined) is
 * used as given; otherwise the URL and title are tested against the
 * module-level INDEX_PAGE_URL_HINT and INDEX_PAGE_TITLE_HINT patterns.
 *
 * @param {object} article - Raw article; only `isIndexPage` is read.
 * @param {string} title - Trimmed article title.
 * @param {string} url - Trimmed article URL.
 * @returns {0|1} 1 for an index page, 0 otherwise.
 */
function inferIsIndexPage(article, title, url) {
  const explicitFlag = article.isIndexPage;
  if (explicitFlag === null || explicitFlag === undefined) {
    // No explicit flag from the caller: fall back to the URL/title heuristics.
    const looksLikeIndex = INDEX_PAGE_URL_HINT.test(url) || INDEX_PAGE_TITLE_HINT.test(title);
    return Number(looksLikeIndex);
  }
  return Number(Boolean(explicitFlag));
}
/**
 * Validate, normalize and insert a single article.
 *
 * `title`, `url` and `source` are stringified and trimmed; all three must be
 * non-empty and the title must survive `normalizeTitle`. The row's language is
 * taken from the matching `sourcesById` entry (null when there is none), and
 * `pub_date_effective` falls back to the ingestion time when no publication
 * date could be parsed.
 *
 * @param {object} article - Raw article; reads `title`, `url`, `source`,
 *   `description`, `pubDate` and (via inferIsIndexPage) `isIndexPage`.
 * @returns {{inserted: true, id: *}|{inserted: false, reason: string, id?: *}}
 *   `reason` is one of `missing_required_fields`, `empty_normalized_title`,
 *   `duplicate_url` (with the existing row's `id`) or `duplicate`.
 * @throws Rethrows any insert error whose `code` is not
 *   `SQLITE_CONSTRAINT_UNIQUE`.
 */
function ingestArticle(article) {
  const cleanText = (raw) => String(raw || '').trim();

  const title = cleanText(article.title);
  const url = cleanText(article.url);
  const source = cleanText(article.source);
  const hasRequiredFields = Boolean(title && url && source);
  if (!hasRequiredFields) {
    return { inserted: false, reason: 'missing_required_fields' };
  }

  const normalizedTitle = normalizeTitle(title);
  if (!normalizedTitle) {
    return { inserted: false, reason: 'empty_normalized_title' };
  }

  // A whitespace-only description is stored as NULL, same as a missing one.
  let description = null;
  if (article.description != null) {
    description = String(article.description).trim() || null;
  }

  const isIndexPage = inferIsIndexPage(article, title, url);
  const pubDate = normalizePubDate(article.pubDate);
  const ingestedAt = new Date().toISOString();
  const sourceConfig = sourcesById[source];
  const language = (sourceConfig && sourceConfig.language) || null;

  // Values in the positional order the prepared INSERT expects.
  const row = [
    title,
    description,
    isIndexPage,
    url,
    normalizedTitle,
    source,
    pubDate,
    ingestedAt,
    pubDate || ingestedAt, // pub_date_effective
    language,
  ];

  let insertResult;
  try {
    insertResult = insertArticle.run(...row);
  } catch (error) {
    if (error.code !== 'SQLITE_CONSTRAINT_UNIQUE') {
      throw error;
    }
    // Report the colliding row when the clash is on url; any other unique
    // constraint is reported as a plain duplicate.
    const existing = findByUrl.get(url);
    return existing
      ? { inserted: false, reason: 'duplicate_url', id: existing.id }
      : { inserted: false, reason: 'duplicate' };
  }

  // Content fetching is deliberately NOT started here. It used to be
  // fire-and-forget, which pinned thousands of pending render promises in
  // memory during big GDELT backfills; runContentLoop polls for pending rows
  // and processes them with proper concurrency limits.
  return { inserted: true, id: insertResult.lastInsertRowid };
}
/**
 * Ingest every article of a batch for one source, then record the run.
 *
 * Each article is copied before ingestion and given `source` as a fallback
 * when it has no truthy `source` of its own. `markSourceRun(source)` is called
 * once after the whole batch has been processed.
 *
 * @param {string} source - Source id the batch was fetched for.
 * @param {Array<object>} articles - Raw articles to ingest.
 * @returns {Promise<{source: string, inserted: number, total: number}>}
 *   Count of newly inserted rows and the size of the batch.
 */
async function ingestBatch(source, articles) {
  let insertedCount = 0;
  for (const rawArticle of articles) {
    const withSource = { ...rawArticle, source: rawArticle.source || source };
    const outcome = ingestArticle(withSource);
    insertedCount += outcome.inserted ? 1 : 0;
  }

  markSourceRun(source);

  const total = articles.length;
  return { source, inserted: insertedCount, total };
}
// Public API of this module: ingest a single article, or a whole batch of
// articles for one source.
module.exports = {
ingestArticle,
ingestBatch,
};