add intelligence worker and embedding generation for article processing

This commit is contained in:
ImBenji
2026-04-22 20:12:38 +01:00
parent 715172596f
commit ac7c87c6cf
11 changed files with 754 additions and 16 deletions
+27
View File
@@ -0,0 +1,27 @@
const fs = require('fs');
const path = require('path');
const config = require('../config');
async function devRoutes(fastify) {
if (!config.dev || !config.dev.enabled) return;
fastify.get('/dev/db/download', async (req, reply) => {
const dbPath = path.resolve(config.duriin_db || './archive.sqlite');
if (!fs.existsSync(dbPath)) {
return reply.code(404).send({ error: 'database file not found' });
}
const stat = fs.statSync(dbPath);
const filename = path.basename(dbPath);
reply.header('Content-Type', 'application/octet-stream');
reply.header('Content-Disposition', `attachment; filename="${filename}"`);
reply.header('Content-Length', stat.size);
return reply.send(fs.createReadStream(dbPath));
});
}
module.exports = devRoutes;
+113 -14
View File
@@ -1,4 +1,5 @@
const db = require('../db');
const { findArticlesByEmbedding, getOrCreateQueryEmbedding } = require('../embeddings');
function parseLimit(value) {
const n = Number.parseInt(value, 10);
@@ -10,10 +11,88 @@ function parseOffset(value) {
return Number.isFinite(n) && n >= 0 ? n : 0;
}
// Prepared statement returning the articles attached to one event; the
// single `?` is bound to the event id. Rows with NULL or empty content and
// rows where is_index_page is not 0 are excluded. Results are ordered by
// pub_date_effective, then id, both descending.
const getArticlesForEvent = db.prepare(`
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE event_id = ?
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
ORDER BY pub_date_effective DESC, id DESC
`);
/**
 * Loads the events with the given ids and attaches each event's articles.
 *
 * The result follows the order of `ids` rather than whatever order the
 * database returns, so callers can pass ids pre-sorted (by distance, by a
 * sort column, ...) and get events back in that same order. Ids with no
 * matching event row are dropped from the result.
 *
 * @param {Array} ids - Event ids, in the desired output order.
 * @returns {Array<object>} Event rows (`id`, `title`, `pub_date`), each with
 *   an `articles` array from `getArticlesForEvent`.
 */
function fetchEventsByIds(ids) {
  if (ids.length === 0) return [];

  // One bound parameter per requested id.
  const marks = Array.from(ids, () => '?').join(', ');
  const statement = db.prepare(`
SELECT e.id, e.title,
(SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != '' AND a.is_index_page = 0) AS pub_date
FROM events e
WHERE e.id IN (${marks})
`);

  // Index the rows by id so they can be re-emitted in the caller's order.
  const eventById = new Map();
  for (const row of statement.all(...ids)) {
    eventById.set(row.id, row);
  }

  const ordered = [];
  for (const id of ids) {
    const event = eventById.get(id);
    if (!event) continue; // id had no matching event row
    ordered.push({ ...event, articles: getArticlesForEvent.all(event.id) });
  }
  return ordered;
}
async function eventRoutes(fastify) {
fastify.get('/events', async (request, reply) => {
const query = request.query || {};
const limit = parseLimit(query.limit);
const offset = parseOffset(query.offset);
// semantic path: embed query → find nearest articles → resolve to events
if (query.semantic !== undefined) {
const embedding = await getOrCreateQueryEmbedding(query.semantic);
if (!embedding) {
reply.code(400);
return { error: 'Semantic query must not be empty' };
}
// fetch more article candidates than we need so we have enough after dedup
const neighbors = findArticlesByEmbedding(embedding, Math.min(limit * 20, 2000));
const neighborIds = neighbors.map(n => n.articleId);
if (neighborIds.length === 0) return [];
const placeholders = neighborIds.map(() => '?').join(', ');
const conditions = [
`id IN (${placeholders})`,
'event_id IS NOT NULL',
"content IS NOT NULL AND content != ''",
'is_index_page = 0',
];
const params = [...neighborIds];
if (query.from) { conditions.push('pub_date_effective >= ?'); params.push(query.from); }
if (query.to) { conditions.push('pub_date_effective <= ?'); params.push(query.to); }
const articleRows = db.prepare(
`SELECT id, event_id FROM articles WHERE ${conditions.join(' AND ')}`
).all(...params);
const articleEventMap = new Map(articleRows.map(r => [r.id, r.event_id]));
// walk neighbors in distance order and collect unique event IDs
const seen = new Set();
const orderedEventIds = [];
for (const id of neighborIds) {
const eventId = articleEventMap.get(id);
if (eventId != null && !seen.has(eventId)) {
seen.add(eventId);
orderedEventIds.push(eventId);
}
}
return fetchEventsByIds(orderedEventIds.slice(offset, offset + limit));
}
// keyword / date filter path
const conditions = [];
const params = [];
@@ -27,17 +106,46 @@ async function eventRoutes(fastify) {
params.push(id);
}
const limit = parseLimit(query.limit);
const offset = parseOffset(query.offset);
if (query.keyword) {
const keywords = [].concat(query.keyword).map(k => k.trim()).filter(Boolean);
const mode = String(query.keyword_mode || '').toLowerCase() === 'or' ? 'OR' : 'AND';
const clauses = keywords.map(() => '(a.title LIKE ? OR a.description LIKE ? OR a.content LIKE ?)');
conditions.push(`EXISTS (
SELECT 1 FROM articles a
WHERE a.event_id = e.id
AND a.content IS NOT NULL AND a.content != ''
AND a.is_index_page = 0
AND (${clauses.join(` ${mode} `)})
)`);
for (const kw of keywords) {
const like = `%${kw}%`;
params.push(like, like, like);
}
}
if (query.from) {
conditions.push(`EXISTS (
SELECT 1 FROM articles a WHERE a.event_id = e.id AND a.pub_date_effective >= ?
)`);
params.push(query.from);
}
if (query.to) {
conditions.push(`EXISTS (
SELECT 1 FROM articles a WHERE a.event_id = e.id AND a.pub_date_effective <= ?
)`);
params.push(query.to);
}
const SORT_COLUMNS = {
pub_date: '(SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != \'\' AND a.is_index_page = 0)',
pub_date: "(SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != '' AND a.is_index_page = 0)",
id: 'e.id',
};
const sortBy = SORT_COLUMNS[query.sort_by] || SORT_COLUMNS.pub_date;
const order = String(query.order || '').toLowerCase() === 'asc' ? 'ASC' : 'DESC';
const where = conditions.length ? `WHERE ${conditions.join(' AND ')}` : '';
const events = db.prepare(`
@@ -49,16 +157,7 @@ async function eventRoutes(fastify) {
LIMIT ? OFFSET ?
`).all(...params, limit, offset);
const getArticles = db.prepare(`
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE event_id = ?
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
ORDER BY pub_date_effective DESC, id DESC
`);
return events.map(e => ({ ...e, articles: getArticles.all(e.id) }));
return events.map(e => ({ ...e, articles: getArticlesForEvent.all(e.id) }));
});
}