add event clustering feature to group articles by similarity and time proximity
This commit is contained in:
parent
cb819e77ee
commit
3c238fb5f7
8 changed files with 239 additions and 102 deletions
39
README.md
39
README.md
|
|
@ -23,7 +23,7 @@ On startup the server:
|
||||||
1. Opens the SQLite database and runs any pending migrations.
|
1. Opens the SQLite database and runs any pending migrations.
|
||||||
2. Registers routes.
|
2. Registers routes.
|
||||||
3. Starts the HTTP server.
|
3. Starts the HTTP server.
|
||||||
4. Launches continuous background loops for each source, content backfill, and embedding backfill.
|
4. Launches continuous background loops for each source, content backfill, embedding backfill, and event clustering.
|
||||||
|
|
||||||
When a new article is inserted:
|
When a new article is inserted:
|
||||||
|
|
||||||
|
|
@ -31,6 +31,7 @@ When a new article is inserted:
|
||||||
- `content` starts as `null`
|
- `content` starts as `null`
|
||||||
- content backfill workers pick it up asynchronously — plain HTTP first, Playwright fallback for JS-heavy sites
|
- content backfill workers pick it up asynchronously — plain HTTP first, Playwright fallback for JS-heavy sites
|
||||||
- vector embeddings are generated after title, description, and content are all available
|
- vector embeddings are generated after title, description, and content are all available
|
||||||
|
- the clustering worker assigns the article to an event once it has an embedding
|
||||||
- only articles with content + embedding are exposed via the API
|
- only articles with content + embedding are exposed via the API
|
||||||
|
|
||||||
Content backfill prioritises recent articles (`pub_date_effective DESC`) so newest content surfaces first regardless of ingestion order.
|
Content backfill prioritises recent articles (`pub_date_effective DESC`) so newest content surfaces first regardless of ingestion order.
|
||||||
|
|
@ -107,6 +108,41 @@ Semantic and similarity results also include `"distance": 0.1234`.
|
||||||
|
|
||||||
Returns one article by numeric ID. Same usability filter as the list endpoint — returns `404` if the article exists but has no content or embedding.
|
Returns one article by numeric ID. Same usability filter as the list endpoint — returns `404` if the article exists but has no content or embedding.
|
||||||
|
|
||||||
|
### `GET /events`
|
||||||
|
|
||||||
|
Returns a single event and its articles.
|
||||||
|
|
||||||
|
#### Query params
|
||||||
|
|
||||||
|
| Param | Description |
|
||||||
|
|---|---|
|
||||||
|
| `id` | Event ID (required) |
|
||||||
|
|
||||||
|
#### Response shape
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "...",
|
||||||
|
"created_at": "2025-01-01T12:35:10.000Z",
|
||||||
|
"articles": [
|
||||||
|
{
|
||||||
|
"id": 123,
|
||||||
|
"title": "...",
|
||||||
|
"description": "...",
|
||||||
|
"content": "...",
|
||||||
|
"url": "...",
|
||||||
|
"normalized_title": "...",
|
||||||
|
"source": "rss:BBC",
|
||||||
|
"pub_date": "2025-01-01T12:34:56.000Z",
|
||||||
|
"ingested_at": "2025-01-01T12:35:10.000Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Returns `404` if the event ID does not exist.
|
||||||
|
|
||||||
### `GET /status`
|
### `GET /status`
|
||||||
|
|
||||||
Returns archive summary. Cached for 30 seconds.
|
Returns archive summary. Cached for 30 seconds.
|
||||||
|
|
@ -149,3 +185,4 @@ Use `domains[].policy` to diagnose why a source has high `skipped` or `failed` c
|
||||||
- Embeddings use OpenRouter and are indexed in `sqlite-vec` for ANN search.
|
- Embeddings use OpenRouter and are indexed in `sqlite-vec` for ANN search.
|
||||||
- Query embeddings are cached in SQLite to avoid redundant API calls.
|
- Query embeddings are cached in SQLite to avoid redundant API calls.
|
||||||
- SEC requests use the `User-Agent` from `config.json`.
|
- SEC requests use the `User-Agent` from `config.json`.
|
||||||
|
- Event clustering groups articles by embedding similarity (cosine distance ≤ `config.clustering.distanceThreshold`, default `0.25`) and time proximity (within `config.clustering.windowHours`, default `72`). Articles outside the time window are never grouped together even if embeddings are close.
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@
|
||||||
"tickers": []
|
"tickers": []
|
||||||
},
|
},
|
||||||
"openRouter": {
|
"openRouter": {
|
||||||
"apiKey": "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",
|
"apiKey": "[OFF]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",
|
||||||
"embeddingModel": "qwen/qwen3-embedding-8b"
|
"embeddingModel": "qwen/qwen3-embedding-8b"
|
||||||
},
|
},
|
||||||
"gdelt": {
|
"gdelt": {
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ const cors = require('@fastify/cors');
|
||||||
const articleRoutes = require('./src/routes/articles');
|
const articleRoutes = require('./src/routes/articles');
|
||||||
const statusRoutes = require('./src/routes/status');
|
const statusRoutes = require('./src/routes/status');
|
||||||
const sourcesRoutes = require('./src/routes/sources');
|
const sourcesRoutes = require('./src/routes/sources');
|
||||||
|
const eventRoutes = require('./src/routes/events');
|
||||||
const config = require('./src/config');
|
const config = require('./src/config');
|
||||||
const { startScheduler } = require('./src/scheduler');
|
const { startScheduler } = require('./src/scheduler');
|
||||||
|
|
||||||
|
|
@ -12,6 +13,7 @@ app.register(cors, { origin: true });
|
||||||
app.register(articleRoutes);
|
app.register(articleRoutes);
|
||||||
app.register(statusRoutes);
|
app.register(statusRoutes);
|
||||||
app.register(sourcesRoutes);
|
app.register(sourcesRoutes);
|
||||||
|
app.register(eventRoutes);
|
||||||
|
|
||||||
app.get('/', async () => ({ ok: true }));
|
app.get('/', async () => ({ ok: true }));
|
||||||
|
|
||||||
|
|
|
||||||
114
src/clustering.js
Normal file
114
src/clustering.js
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
const db = require('./db');
|
||||||
|
const config = require('./config');
|
||||||
|
const { findSimilarArticles } = require('./embeddings');
|
||||||
|
|
||||||
|
// cosine distance threshold — articles closer than this get filed into the same event.
|
||||||
|
// tunable via config.clustering.distanceThreshold
|
||||||
|
const DEFAULT_THRESHOLD = 0.25;
|
||||||
|
|
||||||
|
const DEFAULT_WINDOW_HOURS = 72;
|
||||||
|
|
||||||
|
function getThreshold() {
|
||||||
|
return Number((config.clustering && config.clustering.distanceThreshold) || DEFAULT_THRESHOLD);
|
||||||
|
}
|
||||||
|
|
||||||
|
function getWindowHours() {
|
||||||
|
return Number((config.clustering && config.clustering.windowHours) || DEFAULT_WINDOW_HOURS);
|
||||||
|
}
|
||||||
|
|
||||||
|
const selectPending = db.prepare(`
|
||||||
|
SELECT id, title, pub_date_effective FROM articles
|
||||||
|
WHERE has_embedding = 1
|
||||||
|
AND event_id IS NULL
|
||||||
|
AND content IS NOT NULL AND content != ''
|
||||||
|
AND is_index_page = 0
|
||||||
|
ORDER BY pub_date_effective DESC, id DESC
|
||||||
|
LIMIT ?
|
||||||
|
`);
|
||||||
|
|
||||||
|
const getEventNeighbor = db.prepare(`
|
||||||
|
SELECT a.event_id, a.pub_date_effective
|
||||||
|
FROM articles a
|
||||||
|
WHERE a.id = ? AND a.event_id IS NOT NULL
|
||||||
|
`);
|
||||||
|
|
||||||
|
const assignEvent = db.prepare(`
|
||||||
|
UPDATE articles SET event_id = ? WHERE id = ?
|
||||||
|
`);
|
||||||
|
|
||||||
|
const createEvent = db.prepare(`
|
||||||
|
INSERT INTO events (title) VALUES (?)
|
||||||
|
`);
|
||||||
|
|
||||||
|
|
||||||
|
let clusteringRunning = false;
|
||||||
|
|
||||||
|
async function backfillMissingClusters(limit = 128) {
|
||||||
|
if (clusteringRunning) return { processed: 0 };
|
||||||
|
|
||||||
|
clusteringRunning = true;
|
||||||
|
let processed = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const candidates = selectPending.all(limit);
|
||||||
|
if (candidates.length === 0) return { processed: 0 };
|
||||||
|
|
||||||
|
const threshold = getThreshold();
|
||||||
|
const windowMs = getWindowHours() * 60 * 60 * 1000;
|
||||||
|
|
||||||
|
for (const article of candidates) {
|
||||||
|
|
||||||
|
// re-check in case a prev iteration in this batch already assigned it
|
||||||
|
const alreadyAssigned = db.prepare(`SELECT event_id FROM articles WHERE id = ?`).get(article.id);
|
||||||
|
if (alreadyAssigned && alreadyAssigned.event_id != null) {
|
||||||
|
processed++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const neighbors = findSimilarArticles(article.id, 10);
|
||||||
|
const closeEnough = neighbors.filter((n) => n.distance <= threshold);
|
||||||
|
|
||||||
|
const articleTime = article.pub_date_effective ? new Date(article.pub_date_effective).getTime() : null;
|
||||||
|
|
||||||
|
let eventId = null;
|
||||||
|
|
||||||
|
for (const neighbor of closeEnough) {
|
||||||
|
const row = getEventNeighbor.get(neighbor.articleId);
|
||||||
|
if (!row || row.event_id == null) continue;
|
||||||
|
|
||||||
|
if (articleTime && row.pub_date_effective) {
|
||||||
|
const neighborTime = new Date(row.pub_date_effective).getTime();
|
||||||
|
if (Math.abs(articleTime - neighborTime) > windowMs) continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
eventId = row.event_id;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (eventId == null) {
|
||||||
|
const result = createEvent.run(article.title);
|
||||||
|
eventId = result.lastInsertRowid;
|
||||||
|
}
|
||||||
|
|
||||||
|
assignEvent.run(eventId, article.id);
|
||||||
|
processed++;
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
clusteringRunning = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return { processed };
|
||||||
|
}
|
||||||
|
|
||||||
|
function hasPendingClusters() {
|
||||||
|
return Boolean(db.prepare(`
|
||||||
|
SELECT 1 FROM articles
|
||||||
|
WHERE has_embedding = 1
|
||||||
|
AND event_id IS NULL
|
||||||
|
AND content IS NOT NULL AND content != ''
|
||||||
|
AND is_index_page = 0
|
||||||
|
LIMIT 1
|
||||||
|
`).get());
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = { backfillMissingClusters, hasPendingClusters };
|
||||||
84
src/db.js
84
src/db.js
|
|
@ -220,6 +220,30 @@ db.exec(`
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
db.exec(`
|
||||||
|
CREATE TABLE IF NOT EXISTS events (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
created_at TEXT NOT NULL DEFAULT (datetime('now'))
|
||||||
|
);
|
||||||
|
`);
|
||||||
|
|
||||||
|
for (const statement of [
|
||||||
|
'ALTER TABLE articles ADD COLUMN event_id INTEGER REFERENCES events(id)',
|
||||||
|
]) {
|
||||||
|
try {
|
||||||
|
db.exec(statement);
|
||||||
|
} catch (error) {
|
||||||
|
if (!String(error.message).includes('duplicate column name')) {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
db.exec(`
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id);
|
||||||
|
`);
|
||||||
|
|
||||||
db.exec(`
|
db.exec(`
|
||||||
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
|
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
|
||||||
source_id TEXT NOT NULL,
|
source_id TEXT NOT NULL,
|
||||||
|
|
@ -300,69 +324,9 @@ for (const statement of [
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// backfill has_embedding for existing rows — safe to re-run, only touches rows that need it
|
|
||||||
db.exec(`
|
|
||||||
UPDATE articles SET has_embedding = 1
|
|
||||||
WHERE has_embedding = 0
|
|
||||||
AND EXISTS (SELECT 1 FROM article_embedding_meta WHERE article_id = articles.id)
|
|
||||||
`);
|
|
||||||
|
|
||||||
// backfill pub_date_effective for existing rows
|
|
||||||
db.exec(`
|
|
||||||
UPDATE articles SET pub_date_effective = COALESCE(pub_date, ingested_at)
|
|
||||||
WHERE pub_date_effective IS NULL
|
|
||||||
`);
|
|
||||||
|
|
||||||
// backfill language from sources.json for existing rows
|
|
||||||
{
|
|
||||||
const sources = require('../sources.json');
|
|
||||||
const updateLang = db.prepare(`UPDATE articles SET language = ? WHERE source = ? AND language IS NULL`);
|
|
||||||
const backfillLang = db.transaction(() => {
|
|
||||||
for (const src of sources) {
|
|
||||||
if (src.language) {
|
|
||||||
updateLang.run(src.language, src.id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
backfillLang();
|
|
||||||
}
|
|
||||||
|
|
||||||
db.exec(`
|
db.exec(`
|
||||||
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
|
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
|
||||||
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
|
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
db.exec(`
|
|
||||||
UPDATE articles
|
|
||||||
SET is_index_page = 1
|
|
||||||
WHERE is_index_page = 0
|
|
||||||
AND (
|
|
||||||
LOWER(url) LIKE '%/category/%'
|
|
||||||
OR LOWER(url) LIKE '%/categories/%'
|
|
||||||
OR LOWER(url) LIKE '%/tag/%'
|
|
||||||
OR LOWER(url) LIKE '%/tags/%'
|
|
||||||
OR LOWER(url) LIKE '%/topic/%'
|
|
||||||
OR LOWER(url) LIKE '%/topics/%'
|
|
||||||
OR LOWER(url) LIKE '%/section/%'
|
|
||||||
OR LOWER(url) LIKE '%/sections/%'
|
|
||||||
OR LOWER(url) LIKE '%/archive%'
|
|
||||||
OR LOWER(url) LIKE '%/archives/%'
|
|
||||||
OR LOWER(url) LIKE '%/authors/%'
|
|
||||||
OR LOWER(url) LIKE '%/search%'
|
|
||||||
OR LOWER(title) LIKE '%category%'
|
|
||||||
OR LOWER(title) LIKE '%archives%'
|
|
||||||
OR LOWER(title) LIKE '%archive%'
|
|
||||||
OR LOWER(title) LIKE '%latest news%'
|
|
||||||
)
|
|
||||||
`);
|
|
||||||
|
|
||||||
// reset articles that grabbed yahoo finance's nav shell instead of article body
|
|
||||||
db.exec(`
|
|
||||||
UPDATE articles
|
|
||||||
SET content = NULL, content_status = NULL, content_error = NULL,
|
|
||||||
content_attempted_at = NULL, content_attempt_count = 0,
|
|
||||||
content_retry_after = NULL
|
|
||||||
WHERE content LIKE 'Today''s news US Politics World Weather%'
|
|
||||||
`);
|
|
||||||
|
|
||||||
module.exports = db;
|
module.exports = db;
|
||||||
|
|
|
||||||
|
|
@ -121,46 +121,6 @@ try {
|
||||||
console.error('embedding store backfill failed:', err);
|
console.error('embedding store backfill failed:', err);
|
||||||
}
|
}
|
||||||
|
|
||||||
// probe the API to get the real dimension count for the current model, then purge
|
|
||||||
// any store entries that don't match — handles the case where old embeddings
|
|
||||||
// got stamped with the wrong model name during migration
|
|
||||||
async function purgeWrongSizeEmbeddings() {
|
|
||||||
const apiKey = config.openRouter && config.openRouter.apiKey
|
|
||||||
? String(config.openRouter.apiKey).trim()
|
|
||||||
: '';
|
|
||||||
|
|
||||||
if (!apiKey) return;
|
|
||||||
|
|
||||||
try {
|
|
||||||
const probe = await requestEmbedding('probe');
|
|
||||||
const expectedBytes = probe.length * 4;
|
|
||||||
|
|
||||||
const stale = db.prepare(`
|
|
||||||
SELECT article_id FROM article_embedding_store
|
|
||||||
WHERE model = ? AND LENGTH(embedding) != ?
|
|
||||||
`).all(EMBEDDING_MODEL, expectedBytes);
|
|
||||||
|
|
||||||
if (stale.length === 0) return;
|
|
||||||
|
|
||||||
const deleteStore = db.prepare(`DELETE FROM article_embedding_store WHERE article_id = ? AND model = ?`);
|
|
||||||
const deleteMeta = db.prepare(`DELETE FROM article_embedding_meta WHERE article_id = ?`);
|
|
||||||
const deleteVec = db.prepare(`DELETE FROM article_embeddings WHERE article_id = ?`);
|
|
||||||
|
|
||||||
db.transaction(() => {
|
|
||||||
for (const row of stale) {
|
|
||||||
deleteStore.run(row.article_id, EMBEDDING_MODEL);
|
|
||||||
deleteMeta.run(row.article_id);
|
|
||||||
deleteVec.run(BigInt(row.article_id));
|
|
||||||
}
|
|
||||||
})();
|
|
||||||
|
|
||||||
console.log(`purged ${stale.length} wrong-size embeddings for model ${EMBEDDING_MODEL} (expected ${probe.length} dims)`);
|
|
||||||
} catch (err) {
|
|
||||||
console.error('embedding size validation failed:', err);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
purgeWrongSizeEmbeddings();
|
|
||||||
|
|
||||||
// if the config model changed, rebuild the vec0 search index from store.
|
// if the config model changed, rebuild the vec0 search index from store.
|
||||||
// only proceeds if the store actually has embeddings for the new model.
|
// only proceeds if the store actually has embeddings for the new model.
|
||||||
|
|
|
||||||
37
src/routes/events.js
Normal file
37
src/routes/events.js
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
const db = require('../db');
|
||||||
|
|
||||||
|
async function eventRoutes(fastify) {
|
||||||
|
fastify.get('/events', async (request, reply) => {
|
||||||
|
const query = request.query || {};
|
||||||
|
|
||||||
|
if (!query.id) {
|
||||||
|
reply.code(400);
|
||||||
|
return { error: 'id is required' };
|
||||||
|
}
|
||||||
|
|
||||||
|
const id = Number.parseInt(query.id, 10);
|
||||||
|
if (!Number.isFinite(id)) {
|
||||||
|
reply.code(400);
|
||||||
|
return { error: 'id must be a number' };
|
||||||
|
}
|
||||||
|
|
||||||
|
const event = db.prepare(`SELECT id, title, created_at FROM events WHERE id = ?`).get(id);
|
||||||
|
if (!event) {
|
||||||
|
reply.code(404);
|
||||||
|
return { error: 'Event not found' };
|
||||||
|
}
|
||||||
|
|
||||||
|
const articles = db.prepare(`
|
||||||
|
SELECT id, title, description, content, url, normalized_title, source, pub_date, ingested_at
|
||||||
|
FROM articles
|
||||||
|
WHERE event_id = ?
|
||||||
|
AND content IS NOT NULL AND content != ''
|
||||||
|
AND is_index_page = 0
|
||||||
|
ORDER BY pub_date_effective DESC, id DESC
|
||||||
|
`).all(id);
|
||||||
|
|
||||||
|
return { ...event, articles };
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
module.exports = eventRoutes;
|
||||||
|
|
@ -9,6 +9,7 @@ const { fetchFinnhubArticles } = require('./sources/finnhub');
|
||||||
const { fetchGoogleNewsArticles } = require('./sources/googleNews');
|
const { fetchGoogleNewsArticles } = require('./sources/googleNews');
|
||||||
const { backfillMissingContent, runBackfillWorker, hasPendingContent } = require('./content');
|
const { backfillMissingContent, runBackfillWorker, hasPendingContent } = require('./content');
|
||||||
const { backfillMissingEmbeddings, hasPendingEmbeddings } = require('./embeddings');
|
const { backfillMissingEmbeddings, hasPendingEmbeddings } = require('./embeddings');
|
||||||
|
const { backfillMissingClusters, hasPendingClusters } = require('./clustering');
|
||||||
|
|
||||||
function sleep(ms) {
|
function sleep(ms) {
|
||||||
return new Promise((resolve) => setTimeout(resolve, ms));
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
|
@ -173,7 +174,29 @@ function startScheduler() {
|
||||||
runContentWorker(i, workerCount);
|
runContentWorker(i, workerCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const runClusteringLoop = async () => {
|
||||||
|
while (true) {
|
||||||
|
if (!hasPendingClusters()) {
|
||||||
|
await sleep(30 * 1000);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const limit = Number(config.clustering && config.clustering.perRound) || 128;
|
||||||
|
const result = await backfillMissingClusters(limit);
|
||||||
|
|
||||||
|
if (result.processed === 0) {
|
||||||
|
await sleep(5000);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('clustering backfill failed:', error);
|
||||||
|
await sleep(10000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
runEmbeddingLoop();
|
runEmbeddingLoop();
|
||||||
|
runClusteringLoop();
|
||||||
|
|
||||||
cron.schedule(config.scheduler.rss, runRss);
|
cron.schedule(config.scheduler.rss, runRss);
|
||||||
cron.schedule(config.scheduler.edgar, runEdgar);
|
cron.schedule(config.scheduler.edgar, runEdgar);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue