Duriin-API/src/ingest.js

120 lines
3.2 KiB
JavaScript

const db = require('./db');
const { normalizeTitle } = require('./dedup');
const { fetchAndStoreContent } = require('./content');
const { markSourceRun } = require('./state');
// Prepared statement that inserts one row into `articles`.
// `content` and `image` are always written as NULL by this statement; the
// seven `?` placeholders bind, in order: title, description, url,
// normalized_title, source, pub_date, ingested_at.
// NOTE(review): content/image are presumably filled in later by
// fetchAndStoreContent — confirm against ./content.
const insertArticle = db.prepare(`
INSERT INTO articles (
title,
description,
content,
image,
url,
normalized_title,
source,
pub_date,
ingested_at
) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?)
`);
// Prepared statement that looks up an article's id by its exact URL.
// Used after a unique-constraint failure to identify the existing row.
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
/**
 * Convert a Date to its ISO-8601 string, or null when the Date is invalid.
 *
 * @param {Date} date - Date instance to serialize.
 * @returns {string|null} `date.toISOString()` or null for an Invalid Date.
 */
function toIsoStringOrNull(date) {
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}

/**
 * Normalize a raw publication date into an ISO-8601 UTC string.
 *
 * Accepted inputs:
 *  - a number, handed to `new Date(value)`;
 *  - a compact timestamp `YYYYMMDDTHHMMSS`, with or without a trailing `Z`,
 *    interpreted as UTC;
 *  - a date-only string `YYYY-MM-DD`, interpreted as midnight UTC;
 *  - any other value whose trimmed string form `Date` can parse.
 *
 * @param {*} value - Raw date value from an article.
 * @returns {string|null} ISO-8601 string, or null when the value is falsy,
 *   blank, or does not describe a valid date.
 */
function normalizePubDate(value) {
  // Every falsy value (null, undefined, '', 0, NaN) is treated as "no date".
  if (!value) {
    return null;
  }

  if (typeof value === 'number') {
    return toIsoStringOrNull(new Date(value));
  }

  const input = String(value).trim();
  if (!input) {
    return null;
  }

  // Compact form; the trailing Z is optional and both variants are read as UTC.
  const compact = /^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z?$/.exec(input);
  if (compact) {
    const [, year, month, day, hour, minute, second] = compact;
    return toIsoStringOrNull(new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}Z`));
  }

  if (/^\d{4}-\d{2}-\d{2}$/.test(input)) {
    // Parse rather than blindly appending a time, so a shape-valid but
    // impossible date such as "2024-13-45" yields null instead of a bogus string.
    return toIsoStringOrNull(new Date(`${input}T00:00:00.000Z`));
  }

  return toIsoStringOrNull(new Date(input));
}
/**
 * Validate, normalize and insert a single article, then trigger content
 * fetching for the new row.
 *
 * @param {object} article - Raw article; the fields read are `title`, `url`,
 *   `source`, `description` and `pubDate`.
 * @returns {{inserted: boolean, id?: *, reason?: string}} On success
 *   `{ inserted: true, id }`. Otherwise `{ inserted: false, reason }` where
 *   reason is `missing_required_fields`, `empty_normalized_title`,
 *   `duplicate_url` (with the existing row's `id`) or `duplicate`.
 * @throws Rethrows any insert error whose code is not
 *   `SQLITE_CONSTRAINT_UNIQUE`, and any error thrown synchronously by
 *   `fetchAndStoreContent`.
 */
function ingestArticle(article) {
  // A null/undefined article is reported as missing fields instead of
  // crashing with a TypeError on property access.
  const raw = article || {};
  const title = String(raw.title || '').trim();
  const url = String(raw.url || '').trim();
  const source = String(raw.source || '').trim();
  if (!title || !url || !source) {
    return { inserted: false, reason: 'missing_required_fields' };
  }

  const normalizedTitle = normalizeTitle(title);
  if (!normalizedTitle) {
    return { inserted: false, reason: 'empty_normalized_title' };
  }

  // Blank descriptions are stored as NULL rather than as empty strings.
  const description = raw.description == null ? null : String(raw.description).trim() || null;
  const pubDate = normalizePubDate(raw.pubDate);
  const ingestedAt = new Date().toISOString();

  let result;
  try {
    result = insertArticle.run(
      title,
      description,
      url,
      normalizedTitle,
      source,
      pubDate,
      ingestedAt
    );
  } catch (error) {
    if (error.code !== 'SQLITE_CONSTRAINT_UNIQUE') {
      throw error;
    }
    // Distinguish a URL collision from any other unique constraint.
    const duplicateByUrl = findByUrl.get(url);
    if (duplicateByUrl) {
      return { inserted: false, reason: 'duplicate_url', id: duplicateByUrl.id };
    }
    return { inserted: false, reason: 'duplicate' };
  }

  // Deliberately outside the try: the row already exists at this point, so a
  // constraint error raised by the content step must not be reported as a
  // duplicate article.
  // NOTE(review): the return value is ignored; if fetchAndStoreContent is
  // async, confirm it handles its own rejections.
  fetchAndStoreContent(result.lastInsertRowid, url);
  return { inserted: true, id: result.lastInsertRowid };
}
/**
 * Ingest a list of articles for one source, then record that the source ran.
 *
 * Each article is passed to `ingestArticle`; an article without its own
 * `source` falls back to the batch-level `source`. Articles are processed in
 * order, and `markSourceRun(source)` is called once after the loop.
 *
 * @param {string} source - Source identifier for the batch.
 * @param {Array<object>} articles - Articles to ingest.
 * @returns {Promise<{source: string, inserted: number, total: number}>}
 *   Count of newly inserted articles and the size of the input list.
 */
async function ingestBatch(source, articles) {
  let insertedCount = 0;

  for (const item of articles) {
    const withSource = { ...item, source: item.source || source };
    const { inserted: wasInserted } = ingestArticle(withSource);
    insertedCount += wasInserted ? 1 : 0;
  }

  markSourceRun(source);

  return {
    source,
    inserted: insertedCount,
    total: articles.length,
  };
}
module.exports = {
ingestArticle,
ingestBatch,
};