add event clustering feature to group articles by similarity and time proximity

This commit is contained in:
ImBenji
2026-04-21 12:45:22 +01:00
parent cb819e77ee
commit 3c238fb5f7
8 changed files with 239 additions and 102 deletions
+114
View File
@@ -0,0 +1,114 @@
const db = require('./db');
const config = require('./config');
const { findSimilarArticles } = require('./embeddings');
// Cosine-distance threshold: a neighbor whose distance is at or below this value is a
// candidate for sharing the article's event. Used when config.clustering.distanceThreshold
// is not set.
const DEFAULT_THRESHOLD = 0.25;
// Time-proximity window in hours: a neighbor published further than this from the article
// is not used to pick an event. Used when config.clustering.windowHours is not set.
const DEFAULT_WINDOW_HOURS = 72;
/**
 * Resolve the distance threshold used to decide whether two articles are similar enough
 * to share an event.
 *
 * Reads `config.clustering.distanceThreshold` and coerces it with `Number()`. Anything that
 * is not a positive finite number (missing, empty, non-numeric, zero, negative, Infinity)
 * falls back to `DEFAULT_THRESHOLD`. Without this guard a non-numeric setting becomes NaN,
 * and every `distance <= NaN` comparison is false, so each article would silently get its
 * own event.
 *
 * @returns {number} A positive, finite distance threshold.
 */
function getThreshold() {
  const configured = Number(config.clustering && config.clustering.distanceThreshold);
  return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_THRESHOLD;
}
/**
 * Resolve the time-proximity window, in hours, used when matching an article to a
 * neighbor's event.
 *
 * Reads `config.clustering.windowHours` and coerces it with `Number()`. Anything that is
 * not a positive finite number (missing, empty, non-numeric, zero, negative, Infinity)
 * falls back to `DEFAULT_WINDOW_HOURS`. Without this guard a non-numeric setting becomes
 * NaN, and `Math.abs(delta) > NaN` is always false, which silently disables the window.
 *
 * @returns {number} A positive, finite number of hours.
 */
function getWindowHours() {
  const configured = Number(config.clustering && config.clustering.windowHours);
  return Number.isFinite(configured) && configured > 0 ? configured : DEFAULT_WINDOW_HOURS;
}
// Articles still waiting for an event: embedded, not yet assigned, with non-empty content,
// and not flagged as index pages. Newest first (ties broken by id); the bound parameter is
// the row limit.
const selectPending = db.prepare(`
SELECT id, title, pub_date_effective FROM articles
WHERE has_embedding = 1
AND event_id IS NULL
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
ORDER BY pub_date_effective DESC, id DESC
LIMIT ?
`);
// Event id and effective publish date of one article, returned only when that article
// already belongs to an event (no row otherwise).
const getEventNeighbor = db.prepare(`
SELECT a.event_id, a.pub_date_effective
FROM articles a
WHERE a.id = ? AND a.event_id IS NOT NULL
`);
// Attach an article to an event. Parameters: (event id, article id).
const assignEvent = db.prepare(`
UPDATE articles SET event_id = ? WHERE id = ?
`);
// Insert a new event row with the given title.
const createEvent = db.prepare(`
INSERT INTO events (title) VALUES (?)
`);
// Re-entrancy guard: true while a clustering pass is in flight, so an overlapping call
// returns immediately instead of processing the same articles.
let clusteringRunning = false;

/**
 * Assign an event to up to `limit` pending articles.
 *
 * For each pending article, the nearest neighbors from `findSimilarArticles` are scanned
 * in the order returned. The first neighbor that is within the distance threshold, already
 * belongs to an event, and (when both publish dates are known) falls inside the time
 * window donates its event id. If no neighbor qualifies, a new event titled after the
 * article is created.
 *
 * @param {number} [limit=128] Maximum number of pending articles to examine in this pass.
 * @returns {Promise<{processed: number}>} Count of articles handled; `{ processed: 0 }`
 *   when another pass is already running or nothing is pending.
 */
async function backfillMissingClusters(limit = 128) {
  if (clusteringRunning) return { processed: 0 };
  clusteringRunning = true;
  let processed = 0;
  try {
    const candidates = selectPending.all(limit);
    if (candidates.length === 0) return { processed: 0 };

    const threshold = getThreshold();
    const windowMs = getWindowHours() * 60 * 60 * 1000;

    // Prepared once per pass instead of once per article.
    const getCurrentEvent = db.prepare(`SELECT event_id FROM articles WHERE id = ?`);

    // Creating the event and attaching the article happen in one transaction, so a failed
    // UPDATE cannot leave an orphaned row in `events`.
    const openEventFor = db.transaction((article) => {
      const { lastInsertRowid } = createEvent.run(article.title);
      assignEvent.run(lastInsertRowid, article.id);
    });

    for (const article of candidates) {
      // The candidate list was read before the loop started; re-read the row in case the
      // article has been assigned since.
      const current = getCurrentEvent.get(article.id);
      if (current && current.event_id != null) {
        processed++;
        continue;
      }

      const articleTime = article.pub_date_effective
        ? new Date(article.pub_date_effective).getTime()
        : null;

      let eventId = null;
      for (const neighbor of findSimilarArticles(article.id, 10)) {
        // Written as a negated `<=` so a missing/NaN distance is skipped as well.
        if (!(neighbor.distance <= threshold)) continue;

        const row = getEventNeighbor.get(neighbor.articleId);
        if (!row || row.event_id == null) continue;

        // The window is only enforced when both timestamps are available.
        if (articleTime && row.pub_date_effective) {
          const neighborTime = new Date(row.pub_date_effective).getTime();
          if (Math.abs(articleTime - neighborTime) > windowMs) continue;
        }

        eventId = row.event_id;
        break;
      }

      if (eventId == null) {
        openEventFor(article);
      } else {
        assignEvent.run(eventId, article.id);
      }
      processed++;
    }
  } finally {
    clusteringRunning = false;
  }
  return { processed };
}
/**
 * Tell whether at least one article still needs an event: it has an embedding, no
 * event id, non-empty content, and is not flagged as an index page.
 *
 * @returns {boolean} `true` when such an article exists, otherwise `false`.
 */
function hasPendingClusters() {
  const pendingProbe = db.prepare(`
SELECT 1 FROM articles
WHERE has_embedding = 1
AND event_id IS NULL
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
LIMIT 1
`);
  const firstPending = pendingProbe.get();
  return firstPending ? true : false;
}
module.exports = { backfillMissingClusters, hasPendingClusters };
+24 -60
View File
@@ -220,6 +220,30 @@ db.exec(`
}
}
// Events table: one row per cluster of related articles. created_at defaults to the
// insertion time via datetime('now').
db.exec(`
CREATE TABLE IF NOT EXISTS events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);
`);
// Add articles.event_id (references events.id). Each ALTER is attempted on every start;
// a "duplicate column name" error means the column already exists and is ignored, while
// any other error is rethrown.
for (const statement of [
'ALTER TABLE articles ADD COLUMN event_id INTEGER REFERENCES events(id)',
]) {
try {
db.exec(statement);
} catch (error) {
if (!String(error.message).includes('duplicate column name')) {
throw error;
}
}
}
// Index on articles.event_id, the column used to look up an event's articles.
db.exec(`
CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id);
`);
db.exec(`
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
source_id TEXT NOT NULL,
@@ -300,69 +324,9 @@ for (const statement of [
}
}
// Backfill has_embedding for existing rows: set the flag on any article that has a row in
// article_embedding_meta but is still marked 0. Safe to re-run; it only touches rows that
// need it.
db.exec(`
UPDATE articles SET has_embedding = 1
WHERE has_embedding = 0
AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id)
`);
// Backfill pub_date_effective for existing rows: use pub_date when present, otherwise
// ingested_at. Only rows where the column is still NULL are updated.
db.exec(`
UPDATE articles SET pub_date_effective = COALESCE(pub_date, ingested_at)
WHERE pub_date_effective IS NULL
`);
// Fill in articles.language from sources.json for rows that have none yet. All updates
// run inside a single transaction; sources without a language are skipped.
{
  const sourceList = require('../sources.json');
  const setMissingLanguage = db.prepare(`UPDATE articles SET language = ? WHERE source = ? AND language IS NULL`);
  const applySourceLanguages = db.transaction(() => {
    for (const { id, language } of sourceList) {
      if (!language) continue;
      setMissingLanguage.run(language, id);
    }
  });
  applySourceLanguages();
}
// Indexes on the has_embedding flag and on pub_date_effective (descending).
// IF NOT EXISTS makes this safe to run on every start.
db.exec(`
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
`);
// Flag existing rows as index/listing pages when the URL contains a listing-style path
// segment (category, tag, topic, section, archive, authors, search) or the title contains
// a listing-style phrase. Only rows currently marked 0 are touched, so it is re-runnable.
// NOTE(review): the title patterns are plain substring matches, so a genuine headline such
// as "Category 5 hurricane ..." would also be flagged; confirm this is acceptable.
// NOTE(review): '%archives%' is already covered by '%archive%'.
db.exec(`
UPDATE articles
SET is_index_page = 1
WHERE is_index_page = 0
AND (
LOWER(url) LIKE '%/category/%'
OR LOWER(url) LIKE '%/categories/%'
OR LOWER(url) LIKE '%/tag/%'
OR LOWER(url) LIKE '%/tags/%'
OR LOWER(url) LIKE '%/topic/%'
OR LOWER(url) LIKE '%/topics/%'
OR LOWER(url) LIKE '%/section/%'
OR LOWER(url) LIKE '%/sections/%'
OR LOWER(url) LIKE '%/archive%'
OR LOWER(url) LIKE '%/archives/%'
OR LOWER(url) LIKE '%/authors/%'
OR LOWER(url) LIKE '%/search%'
OR LOWER(title) LIKE '%category%'
OR LOWER(title) LIKE '%archives%'
OR LOWER(title) LIKE '%archive%'
OR LOWER(title) LIKE '%latest news%'
)
`);
// Reset articles whose stored content is Yahoo Finance's navigation shell rather than the
// article body (recognised by the leading "Today's news US Politics World Weather" text).
// Content and all content-fetch bookkeeping columns are cleared.
// NOTE(review): presumably this lets the content backfill fetch them again — confirm
// against the content worker.
db.exec(`
UPDATE articles
SET content = NULL, content_status = NULL, content_error = NULL,
content_attempted_at = NULL, content_attempt_count = 0,
content_retry_after = NULL
WHERE content LIKE 'Today''s news US Politics World Weather%'
`);
module.exports = db;
-40
View File
@@ -121,46 +121,6 @@ try {
console.error('embedding store backfill failed:', err);
}
/**
 * Remove stored embeddings whose size does not match what the current model produces.
 *
 * Requests one probe embedding to learn the real dimension count, then deletes every
 * store entry recorded under EMBEDDING_MODEL whose blob length differs from
 * `dimensions * 4` bytes (assumes 4 bytes per dimension, i.e. float32 — TODO confirm).
 * The matching rows in the meta table and the vector table are deleted as well. This
 * covers old embeddings that were stamped with the wrong model name during migration.
 *
 * Does nothing when no OpenRouter API key is configured. Failures in the probe or the
 * purge are logged and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function purgeWrongSizeEmbeddings() {
  let apiKey = '';
  if (config.openRouter && config.openRouter.apiKey) {
    apiKey = String(config.openRouter.apiKey).trim();
  }
  if (apiKey === '') return;

  try {
    const sample = await requestEmbedding('probe');
    const bytesPerVector = sample.length * 4;

    const findMismatched = db.prepare(`
SELECT article_id FROM article_embedding_store
WHERE model = ? AND LENGTH(embedding) != ?
`);
    const mismatched = findMismatched.all(EMBEDDING_MODEL, bytesPerVector);
    if (mismatched.length === 0) return;

    const removeFromStore = db.prepare(`DELETE FROM article_embedding_store WHERE article_id = ? AND model = ?`);
    const removeMeta = db.prepare(`DELETE FROM article_embedding_meta WHERE article_id = ?`);
    const removeFromIndex = db.prepare(`DELETE FROM article_embeddings WHERE article_id = ?`);

    // One transaction so the three tables stay in step.
    const purgeAll = db.transaction(() => {
      mismatched.forEach(({ article_id: articleId }) => {
        removeFromStore.run(articleId, EMBEDDING_MODEL);
        removeMeta.run(articleId);
        // The vector table is keyed with a BigInt id.
        removeFromIndex.run(BigInt(articleId));
      });
    });
    purgeAll();

    console.log(`purged ${mismatched.length} wrong-size embeddings for model ${EMBEDDING_MODEL} (expected ${sample.length} dims)`);
  } catch (err) {
    console.error('embedding size validation failed:', err);
  }
}
// Started at module load and deliberately not awaited; probe and purge failures are
// caught and logged inside the function.
purgeWrongSizeEmbeddings();
// if the config model changed, rebuild the vec0 search index from store.
// only proceeds if the store actually has embeddings for the new model.
+37
View File
@@ -0,0 +1,37 @@
const db = require('../db');
/**
 * Fastify plugin that exposes event lookups.
 *
 * GET /events?id=<n>
 *   200 — `{ id, title, created_at, articles }`, where `articles` lists the event's
 *         articles that have non-empty content and are not index pages, newest first.
 *   400 — `id` is missing, or is not a whole number (e.g. "12abc", "1.5", repeated `id`).
 *   404 — no event has that id.
 *
 * @param {object} fastify Fastify instance to register the route on.
 * @returns {Promise<void>}
 */
async function eventRoutes(fastify) {
  // Prepared once at registration instead of on every request.
  const selectEvent = db.prepare(`SELECT id, title, created_at FROM events WHERE id = ?`);
  const selectEventArticles = db.prepare(`
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
FROM articles
WHERE event_id = ?
AND content IS NOT NULL AND content != ''
AND is_index_page = 0
ORDER BY pub_date_effective DESC, id DESC
`);

  fastify.get('/events', async (request, reply) => {
    const query = request.query || {};
    if (!query.id) {
      reply.code(400);
      return { error: 'id is required' };
    }

    // parseInt would accept a numeric prefix ("12abc" -> 12) and the first element of a
    // repeated parameter; require the whole value to be an integer instead.
    const rawId = String(query.id).trim();
    const id = /^[+-]?\d+$/.test(rawId) ? Number(rawId) : NaN;
    if (!Number.isSafeInteger(id)) {
      reply.code(400);
      return { error: 'id must be a number' };
    }

    const event = selectEvent.get(id);
    if (!event) {
      reply.code(404);
      return { error: 'Event not found' };
    }

    const articles = selectEventArticles.all(id);
    return { ...event, articles };
  });
}
module.exports = eventRoutes;
+23
View File
@@ -9,6 +9,7 @@ const { fetchFinnhubArticles } = require('./sources/finnhub');
const { fetchGoogleNewsArticles } = require('./sources/googleNews');
const { backfillMissingContent, runBackfillWorker, hasPendingContent } = require('./content');
const { backfillMissingEmbeddings, hasPendingEmbeddings } = require('./embeddings');
const { backfillMissingClusters, hasPendingClusters } = require('./clustering');
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
@@ -173,7 +174,29 @@ function startScheduler() {
runContentWorker(i, workerCount);
}
// Background loop that clusters pending articles in rounds, forever.
//   - nothing pending        -> sleep 30s and check again
//   - round processed 0 rows -> sleep 5s (e.g. another pass is already running)
//   - round processed rows   -> yield via a 0ms timer, then continue immediately
//   - round threw            -> log and sleep 10s
// The 0ms yield matters: awaiting an already-settled promise only yields to the microtask
// queue, so a clustering pass that does its work without awaiting anything would otherwise
// let a large backlog starve timers, cron jobs and HTTP handling until it is drained.
const runClusteringLoop = async () => {
  while (true) {
    if (!hasPendingClusters()) {
      await sleep(30 * 1000);
      continue;
    }
    try {
      const limit = Number(config.clustering && config.clustering.perRound) || 128;
      const result = await backfillMissingClusters(limit);
      await sleep(result.processed === 0 ? 5000 : 0);
    } catch (error) {
      console.error('clustering backfill failed:', error);
      await sleep(10000);
    }
  }
};
runEmbeddingLoop();
runClusteringLoop();
cron.schedule(config.scheduler.rss, runRss);
cron.schedule(config.scheduler.edgar, runEdgar);