add intelligence worker and embedding generation for article processing
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const config = require('../config');
|
||||
|
||||
|
||||
/**
 * Registers development-only routes on the given Fastify instance.
 *
 * Nothing is registered unless `config.dev.enabled` is truthy.
 *
 * Routes:
 *   GET /dev/db/download — streams the SQLite database file located at
 *   `config.duriin_db` (falling back to `./archive.sqlite`) as an attachment.
 *
 * @param {object} fastify - Fastify instance (only `fastify.get` is used).
 * @returns {Promise<void>}
 */
async function devRoutes(fastify) {
  if (!config.dev || !config.dev.enabled) return;

  fastify.get('/dev/db/download', async (req, reply) => {
    const dbPath = path.resolve(config.duriin_db || './archive.sqlite');

    // A single async stat replaces the existsSync + statSync pair: it does not
    // block the event loop and removes the window in which the file could
    // disappear between the existence check and the stat call.
    let stat;
    try {
      stat = await fs.promises.stat(dbPath);
    } catch (err) {
      if (err && (err.code === 'ENOENT' || err.code === 'ENOTDIR')) {
        return reply.code(404).send({ error: 'database file not found' });
      }
      // Anything else (e.g. permission problems) is a genuine server error.
      throw err;
    }

    // existsSync() is also true for directories; streaming one would fail with
    // EISDIR only after the download headers had already been set.
    if (!stat.isFile()) {
      return reply.code(404).send({ error: 'database file not found' });
    }

    // Keep the quoted-string in Content-Disposition well-formed even if the
    // configured file name contains quotes, backslashes or line breaks.
    const filename = path.basename(dbPath).replace(/["\\\r\n]/g, '_');

    reply.header('Content-Type', 'application/octet-stream');
    reply.header('Content-Disposition', `attachment; filename="${filename}"`);
    reply.header('Content-Length', stat.size);

    return reply.send(fs.createReadStream(dbPath));
  });
}
|
||||
|
||||
module.exports = devRoutes;
|
||||
+113
-14
@@ -1,4 +1,5 @@
|
||||
const db = require('../db');
|
||||
const { findArticlesByEmbedding, getOrCreateQueryEmbedding } = require('../embeddings');
|
||||
|
||||
function parseLimit(value) {
|
||||
const n = Number.parseInt(value, 10);
|
||||
@@ -10,10 +11,88 @@ function parseOffset(value) {
|
||||
return Number.isFinite(n) && n >= 0 ? n : 0;
|
||||
}
|
||||
|
||||
const getArticlesForEvent = db.prepare(`
|
||||
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE event_id = ?
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
ORDER BY pub_date_effective DESC, id DESC
|
||||
`);
|
||||
|
||||
/**
 * Loads the events with the given ids and attaches each event's articles.
 *
 * The result follows the order of `ids` (distance order for semantic search,
 * sort order for keyword search); ids with no matching event row are dropped.
 *
 * @param {Array} ids - Event ids, in the order the caller wants them returned.
 * @returns {Array<object>} Event rows (`id`, `title`, `pub_date`) each extended
 *   with an `articles` array.
 */
function fetchEventsByIds(ids) {
  if (ids.length === 0) {
    return [];
  }

  // One "?" bind marker per requested id.
  const bindMarkers = Array.from({ length: ids.length }, () => '?').join(', ');

  const statement = db.prepare(`
    SELECT e.id, e.title,
      (SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != '' AND a.is_index_page = 0) AS pub_date
    FROM events e
    WHERE e.id IN (${bindMarkers})
  `);
  const eventRows = statement.all(...ids);

  // Index rows by id so the caller-supplied order can be restored below.
  const eventsById = new Map();
  for (const row of eventRows) {
    eventsById.set(row.id, row);
  }

  const ordered = [];
  for (const id of ids) {
    const event = eventsById.get(id);
    if (!event) {
      continue; // no row for this id
    }
    ordered.push({ ...event, articles: getArticlesForEvent.all(event.id) });
  }
  return ordered;
}
|
||||
|
||||
async function eventRoutes(fastify) {
|
||||
fastify.get('/events', async (request, reply) => {
|
||||
const query = request.query || {};
|
||||
const limit = parseLimit(query.limit);
|
||||
const offset = parseOffset(query.offset);
|
||||
|
||||
// semantic path: embed query → find nearest articles → resolve to events
|
||||
if (query.semantic !== undefined) {
|
||||
const embedding = await getOrCreateQueryEmbedding(query.semantic);
|
||||
if (!embedding) {
|
||||
reply.code(400);
|
||||
return { error: 'Semantic query must not be empty' };
|
||||
}
|
||||
|
||||
// fetch more article candidates than we need so we have enough after dedup
|
||||
const neighbors = findArticlesByEmbedding(embedding, Math.min(limit * 20, 2000));
|
||||
const neighborIds = neighbors.map(n => n.articleId);
|
||||
if (neighborIds.length === 0) return [];
|
||||
|
||||
const placeholders = neighborIds.map(() => '?').join(', ');
|
||||
const conditions = [
|
||||
`id IN (${placeholders})`,
|
||||
'event_id IS NOT NULL',
|
||||
"content IS NOT NULL AND content != ''",
|
||||
'is_index_page = 0',
|
||||
];
|
||||
const params = [...neighborIds];
|
||||
|
||||
if (query.from) { conditions.push('pub_date_effective >= ?'); params.push(query.from); }
|
||||
if (query.to) { conditions.push('pub_date_effective <= ?'); params.push(query.to); }
|
||||
|
||||
const articleRows = db.prepare(
|
||||
`SELECT id, event_id FROM articles WHERE ${conditions.join(' AND ')}`
|
||||
).all(...params);
|
||||
|
||||
const articleEventMap = new Map(articleRows.map(r => [r.id, r.event_id]));
|
||||
|
||||
// walk neighbors in distance order and collect unique event IDs
|
||||
const seen = new Set();
|
||||
const orderedEventIds = [];
|
||||
for (const id of neighborIds) {
|
||||
const eventId = articleEventMap.get(id);
|
||||
if (eventId != null && !seen.has(eventId)) {
|
||||
seen.add(eventId);
|
||||
orderedEventIds.push(eventId);
|
||||
}
|
||||
}
|
||||
|
||||
return fetchEventsByIds(orderedEventIds.slice(offset, offset + limit));
|
||||
}
|
||||
|
||||
|
||||
// keyword / date filter path
|
||||
const conditions = [];
|
||||
const params = [];
|
||||
|
||||
@@ -27,17 +106,46 @@ async function eventRoutes(fastify) {
|
||||
params.push(id);
|
||||
}
|
||||
|
||||
const limit = parseLimit(query.limit);
|
||||
const offset = parseOffset(query.offset);
|
||||
if (query.keyword) {
|
||||
const keywords = [].concat(query.keyword).map(k => k.trim()).filter(Boolean);
|
||||
const mode = String(query.keyword_mode || '').toLowerCase() === 'or' ? 'OR' : 'AND';
|
||||
const clauses = keywords.map(() => '(a.title LIKE ? OR a.description LIKE ? OR a.content LIKE ?)');
|
||||
|
||||
conditions.push(`EXISTS (
|
||||
SELECT 1 FROM articles a
|
||||
WHERE a.event_id = e.id
|
||||
AND a.content IS NOT NULL AND a.content != ''
|
||||
AND a.is_index_page = 0
|
||||
AND (${clauses.join(` ${mode} `)})
|
||||
)`);
|
||||
|
||||
for (const kw of keywords) {
|
||||
const like = `%${kw}%`;
|
||||
params.push(like, like, like);
|
||||
}
|
||||
}
|
||||
|
||||
if (query.from) {
|
||||
conditions.push(`EXISTS (
|
||||
SELECT 1 FROM articles a WHERE a.event_id = e.id AND a.pub_date_effective >= ?
|
||||
)`);
|
||||
params.push(query.from);
|
||||
}
|
||||
|
||||
if (query.to) {
|
||||
conditions.push(`EXISTS (
|
||||
SELECT 1 FROM articles a WHERE a.event_id = e.id AND a.pub_date_effective <= ?
|
||||
)`);
|
||||
params.push(query.to);
|
||||
}
|
||||
|
||||
const SORT_COLUMNS = {
|
||||
pub_date: '(SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != \'\' AND a.is_index_page = 0)',
|
||||
pub_date: "(SELECT MIN(a.pub_date_effective) FROM articles a WHERE a.event_id = e.id AND a.content IS NOT NULL AND a.content != '' AND a.is_index_page = 0)",
|
||||
id: 'e.id',
|
||||
};
|
||||
|
||||
const sortBy = SORT_COLUMNS[query.sort_by] || SORT_COLUMNS.pub_date;
|
||||
const order = String(query.order || '').toLowerCase() === 'asc' ? 'ASC' : 'DESC';
|
||||
|
||||
const where = conditions.length ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
|
||||
const events = db.prepare(`
|
||||
@@ -49,16 +157,7 @@ async function eventRoutes(fastify) {
|
||||
LIMIT ? OFFSET ?
|
||||
`).all(...params, limit, offset);
|
||||
|
||||
const getArticles = db.prepare(`
|
||||
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
|
||||
FROM articles
|
||||
WHERE event_id = ?
|
||||
AND content IS NOT NULL AND content != ''
|
||||
AND is_index_page = 0
|
||||
ORDER BY pub_date_effective DESC, id DESC
|
||||
`);
|
||||
|
||||
return events.map(e => ({ ...e, articles: getArticles.all(e.id) }));
|
||||
return events.map(e => ({ ...e, articles: getArticlesForEvent.all(e.id) }));
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user