From 461699806417ac68197f10a7b9a2a4e62948b99c Mon Sep 17 00:00:00 2001
From: ImBenji
Date: Sat, 18 Apr 2026 17:19:55 +0100
Subject: [PATCH] migrate article embeddings to support multi-model architecture and enhance data integrity

---
Review notes (commentary only; this text sits between the "---" separator
and the diffstat and is not part of the commit message or the diff):

* src/db.js: when article_embeddings exists but its declared FLOAT[n]
  dimension is missing or is not 8192, rows of article_embeddings that have
  a matching article_embedding_meta row are copied into
  article_embedding_store with INSERT OR IGNORE, 500 rows per
  db.transaction call, before article_embeddings is dropped and
  article_embedding_meta is emptied. The table is then recreated.
* NOTE(review): the copy pages with LIMIT ? OFFSET ? and no ORDER BY, so
  the paging relies on the engine returning rows in the same order on every
  call. Confirm, or add an explicit ORDER BY e.article_id.
* NOTE(review): an error during the copy is caught and logged, and the
  DROP TABLE / DELETE that follow the try block still run. Confirm that
  losing un-rescued embeddings in that case is acceptable.
* NOTE(review): the copy uses an inner JOIN on article_embedding_meta, so
  article_embeddings rows without a meta row are not copied.
* src/embeddings.js: the store backfill now runs only when
  article_embedding_store has no rows. It labels the copied rows with the
  model of one article_embedding_meta row (SELECT ... LIMIT 1), falling
  back to EMBEDDING_MODEL, and it no longer inserts into
  article_embedding_meta.
* src/embeddings.js: rebuildVec0IfModelChanged now logs and returns early
  when article_embedding_store has no rows for EMBEDDING_MODEL.
* src/routes/status.js: each embeddingModels entry gains "dimensions",
  computed as LENGTH(embedding) of one stored row for that model divided
  by 4, or null when that length is falsy. Presumably embeddings are packed
  4-byte floats - TODO confirm.
* The original line breaks and indentation of this patch were lost; the
  2-space indentation inside the hunks below is reconstructed and should be
  checked against the base files before applying.

 src/db.js            | 45 ++++++++++++++++++++++++++++++++++++--------
 src/embeddings.js    | 34 +++++++++++++++++++++++----------
 src/routes/status.js |  8 ++++++--
 3 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/src/db.js b/src/db.js
index 7dc0706..c70ead0 100644
--- a/src/db.js
+++ b/src/db.js
@@ -134,17 +134,46 @@ db.exec(`
 {
   const existing = db.prepare(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'article_embeddings'`).get();
   const currentDim = existing && existing.sql && existing.sql.match(/FLOAT\[(\d+)\]/);
+  const needsMigration = existing && (!currentDim || parseInt(currentDim[1], 10) !== 8192);
+
+  if (needsMigration) {
+    // save everything in vec0 to the store before dropping it, keyed by whatever model is in meta
+    try {
+      const BATCH = 500;
+      let offset = 0;
+
+      const fetchBatch = db.prepare(`
+        SELECT e.article_id, m.model, e.embedding
+        FROM article_embeddings e
+        JOIN article_embedding_meta m ON m.article_id = e.article_id
+        LIMIT ? OFFSET ?
+      `);
+
+      const insert = db.prepare(`
+        INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
+        VALUES (?, ?, ?)
+      `);
+
+      const insertMany = db.transaction((rows) => {
+        for (const row of rows) insert.run(row.article_id, row.model, row.embedding);
+      });
+
+      while (true) {
+        const rows = fetchBatch.all(BATCH, offset);
+        if (rows.length === 0) break;
+        insertMany(rows);
+        offset += rows.length;
+        if (rows.length < BATCH) break;
+      }
+    } catch (err) {
+      console.error('failed to rescue embeddings from vec0 before migration:', err);
+    }
 
-  if (!existing) {
-    db.exec(`
-      CREATE VIRTUAL TABLE article_embeddings USING vec0(
-        article_id INTEGER PRIMARY KEY,
-        embedding FLOAT[8192]
-      );
-    `);
-  } else if (!currentDim || parseInt(currentDim[1], 10) !== 8192) {
     db.exec(`DROP TABLE article_embeddings`);
     db.exec(`DELETE FROM article_embedding_meta`);
+  }
+
+  if (!existing || needsMigration) {
     db.exec(`
       CREATE VIRTUAL TABLE article_embeddings USING vec0(
         article_id INTEGER PRIMARY KEY,
diff --git a/src/embeddings.js b/src/embeddings.js
index 0914f96..40e57c9 100644
--- a/src/embeddings.js
+++ b/src/embeddings.js
@@ -85,22 +85,27 @@ const upsertQueryEmbedding = db.prepare(`
     created_at = datetime('now')
 `);
 
-// backfill store from any embeddings that predate multi-model support
+// backfill store from vec0 for any embeddings that predate multi-model support.
+// only runs when store is completely empty, so we never stamp the wrong model on existing data.
 try {
-  db.prepare(`
-    INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
-    SELECT e.article_id, ?, e.embedding FROM article_embeddings e
-  `).run(EMBEDDING_MODEL);
+  const storeEmpty = !db.prepare(`SELECT 1 FROM article_embedding_store LIMIT 1`).get();
 
-  db.prepare(`
-    INSERT OR IGNORE INTO article_embedding_meta (article_id, model)
-    SELECT article_id, ? FROM article_embedding_store WHERE model = ?
-  `).run(EMBEDDING_MODEL, EMBEDDING_MODEL);
+  if (storeEmpty) {
+    // use model from meta if available, fall back to config model
+    const metaRow = db.prepare(`SELECT model FROM article_embedding_meta LIMIT 1`).get();
+    const legacyModel = (metaRow && metaRow.model) || EMBEDDING_MODEL;
+
+    db.prepare(`
+      INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
+      SELECT e.article_id, ?, e.embedding FROM article_embeddings e
+    `).run(legacyModel);
+  }
 } catch (err) {
   console.error('embedding store backfill failed:', err);
 }
 
-// if the config model changed, rebuild the vec0 search index from store
+// if the config model changed, rebuild the vec0 search index from store.
+// only proceeds if the store actually has embeddings for the new model.
 function rebuildVec0IfModelChanged() {
   const stale = db.prepare(`
     SELECT 1 FROM article_embedding_meta WHERE model != ?
@@ -108,6 +113,15 @@ function rebuildVec0IfModelChanged() {
 
   if (!stale) return;
 
+  const storeCount = db.prepare(`
+    SELECT COUNT(*) AS n FROM article_embedding_store WHERE model = ?
+  `).get(EMBEDDING_MODEL);
+
+  if (!storeCount || storeCount.n === 0) {
+    console.log(`embedding model changed to ${EMBEDDING_MODEL} but store has no embeddings for it yet — skipping index rebuild`);
+    return;
+  }
+
   console.log(`embedding model changed to ${EMBEDDING_MODEL}, rebuilding search index...`);
 
   db.exec('BEGIN');
diff --git a/src/routes/status.js b/src/routes/status.js
index dce57b5..36ac822 100644
--- a/src/routes/status.js
+++ b/src/routes/status.js
@@ -29,8 +29,11 @@ async function statusRoutes(fastify) {
     );
 
     const embeddingModelRows = db.prepare(`
-      SELECT model, COUNT(*) AS article_count
-      FROM article_embedding_store
+      SELECT
+        model,
+        COUNT(*) AS article_count,
+        (SELECT LENGTH(embedding) FROM article_embedding_store s2 WHERE s2.model = s.model LIMIT 1) AS sample_bytes
+      FROM article_embedding_store s
       GROUP BY model
       ORDER BY article_count DESC
     `).all();
@@ -45,6 +48,7 @@ async function statusRoutes(fastify) {
       embeddingModels: embeddingModelRows.map((row) => ({
         model: row.model,
         articles: row.article_count,
+        dimensions: row.sample_bytes ? row.sample_bytes / 4 : null,
       })),
     };
   });