120 lines
3.2 KiB
JavaScript
120 lines
3.2 KiB
JavaScript
const db = require('./db');
|
|
const { normalizeTitle } = require('./dedup');
|
|
const { fetchAndStoreContent } = require('./content');
|
|
const { markSourceRun } = require('./state');
|
|
|
|
// SQL that adds one row to `articles`. Seven values are bound, in this order:
// title, description, url, normalized_title, source, pub_date, ingested_at.
// The `content` and `image` columns are always written as NULL here
// (presumably filled in later by the content fetcher; confirm in ./content).
const INSERT_ARTICLE_SQL = `
  INSERT INTO articles (
    title,
    description,
    content,
    image,
    url,
    normalized_title,
    source,
    pub_date,
    ingested_at
  ) VALUES (?, ?, NULL, NULL, ?, ?, ?, ?, ?)
`;

// SQL that looks up the id of the article stored under a given URL.
const FIND_BY_URL_SQL = 'SELECT id FROM articles WHERE url = ?';

// Both statements are prepared once at module load and reused on every call.
const insertArticle = db.prepare(INSERT_ARTICLE_SQL);
const findByUrl = db.prepare(FIND_BY_URL_SQL);
|
|
|
|
/**
 * Normalizes a publication date into an ISO-8601 UTC timestamp string.
 *
 * Accepted inputs:
 *  - `Date` instances,
 *  - numbers (passed to `new Date(value)`, i.e. epoch milliseconds),
 *  - compact timestamps `YYYYMMDDTHHMMSS` with an optional trailing `Z`
 *    (always interpreted as UTC),
 *  - date-only strings `YYYY-MM-DD` (interpreted as midnight UTC),
 *  - anything else the runtime's `Date` constructor can parse.
 *
 * @param {*} value - Raw publication date from a feed item.
 * @returns {string|null} `Date#toISOString()` output, or `null` when the
 *   value is falsy (including `0`), blank, or not a valid date.
 */
function normalizePubDate(value) {
  if (!value) {
    return null;
  }

  // Shared tail of every branch: invalid dates become null instead of throwing.
  const toIsoOrNull = (date) => (Number.isNaN(date.getTime()) ? null : date.toISOString());

  // Convert Date objects directly rather than round-tripping through
  // String(), which drops milliseconds and relies on non-ISO parsing.
  if (value instanceof Date) {
    return toIsoOrNull(value);
  }

  if (typeof value === 'number') {
    return toIsoOrNull(new Date(value));
  }

  const input = String(value).trim();
  if (!input) {
    return null;
  }

  // Compact form, e.g. 20240115T103000 or 20240115T103000Z; both are UTC.
  const compact = /^(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z?$/.exec(input);
  if (compact) {
    const [, year, month, day, hour, minute, second] = compact;
    return toIsoOrNull(new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}Z`));
  }

  if (/^\d{4}-\d{2}-\d{2}$/.test(input)) {
    const iso = toIsoOrNull(new Date(`${input}T00:00:00.000Z`));
    // The pattern alone also matches impossible dates such as 2024-13-45.
    // Some engines additionally roll 2024-02-30 over into March, so require
    // the parsed date to round-trip to the same calendar day.
    return iso !== null && iso.startsWith(input) ? iso : null;
  }

  return toIsoOrNull(new Date(input));
}
|
|
|
|
/**
 * Validates one article and inserts it into the `articles` table, then
 * starts the content fetch for the new row.
 *
 * @param {object} article - Raw article. Reads `title`, `url`, `source`
 *   (all required, trimmed), plus optional `description` and `pubDate`.
 * @returns {{inserted: boolean, id?: number|bigint, reason?: string}}
 *   `{ inserted: true, id }` on success; otherwise `inserted: false` with a
 *   `reason` of `missing_required_fields`, `empty_normalized_title`,
 *   `duplicate_url` (with the existing row's `id`) or `duplicate`.
 * @throws Any insert error other than `SQLITE_CONSTRAINT_UNIQUE`, and any
 *   error thrown synchronously by `fetchAndStoreContent`.
 */
function ingestArticle(article) {
  // A null/undefined article is reported like any other record that lacks
  // its required fields instead of raising a TypeError.
  const record = article || {};

  const title = String(record.title || '').trim();
  const url = String(record.url || '').trim();
  const source = String(record.source || '').trim();

  if (!title || !url || !source) {
    return { inserted: false, reason: 'missing_required_fields' };
  }

  const normalizedTitle = normalizeTitle(title);
  if (!normalizedTitle) {
    return { inserted: false, reason: 'empty_normalized_title' };
  }

  const description = record.description == null ? null : String(record.description).trim() || null;
  const pubDate = normalizePubDate(record.pubDate);
  const ingestedAt = new Date().toISOString();

  let result;
  try {
    result = insertArticle.run(
      title,
      description,
      url,
      normalizedTitle,
      source,
      pubDate,
      ingestedAt
    );
  } catch (error) {
    if (error.code !== 'SQLITE_CONSTRAINT_UNIQUE') {
      throw error;
    }

    // A unique-constraint failure means the row already exists. Report the
    // existing id when the URL is the colliding column.
    const duplicateByUrl = findByUrl.get(url);
    if (duplicateByUrl) {
      return { inserted: false, reason: 'duplicate_url', id: duplicateByUrl.id };
    }

    return { inserted: false, reason: 'duplicate' };
  }

  const id = result.lastInsertRowid;

  // The content fetch runs outside the insert's try/catch so that an error
  // it raises can never be misreported as a duplicate article.
  // NOTE(review): fetchAndStoreContent is defined in ./content and is not
  // visible here. If it returns a promise, nothing awaits it, so a rejection
  // is logged here rather than surfacing as an unhandled rejection.
  const pending = fetchAndStoreContent(id, url);
  if (pending && typeof pending.catch === 'function') {
    pending.catch((error) => {
      console.error(`Content fetch failed for article ${id} (${url}):`, error);
    });
  }

  return { inserted: true, id };
}
|
|
|
|
/**
 * Ingests every article of a batch for one source and then records the run.
 *
 * Each article is passed to `ingestArticle`; an article without its own
 * `source` falls back to the batch-level `source`. After the loop,
 * `markSourceRun(source)` is called once.
 *
 * @param {string} source - Source identifier for the batch.
 * @param {Array<object>} articles - Articles to ingest.
 * @returns {Promise<{source: string, inserted: number, total: number}>}
 *   Number of newly inserted articles and the batch size.
 */
async function ingestBatch(source, articles) {
  let newRows = 0;

  for (const entry of articles) {
    const candidate = { ...entry, source: entry.source || source };
    const { inserted: wasInserted } = ingestArticle(candidate);

    if (!wasInserted) {
      continue;
    }

    newRows += 1;
  }

  markSourceRun(source);

  return { source, inserted: newRows, total: articles.length };
}
|
|
|
|
// Public API of this module: single-article and batch ingestion.
module.exports = { ingestArticle, ingestBatch };
|