diff --git a/src/embeddings.js b/src/embeddings.js index 4e87171..dbfb5fb 100644 --- a/src/embeddings.js +++ b/src/embeddings.js @@ -108,99 +108,6 @@ function padEmbeddingForVec0(values) { return Buffer.from(padded.buffer); } -// backfill store from vec0 for any embeddings that predate multi-model support. -// only runs when store is completely empty, so we never stamp the wrong model on existing data. -try { - const storeEmpty = !db.prepare(`SELECT 1 FROM article_embedding_store LIMIT 1`).get(); - - if (storeEmpty) { - // use model from meta if available, fall back to config model - const metaRow = db.prepare(`SELECT model FROM article_embedding_meta LIMIT 1`).get(); - const legacyModel = (metaRow && metaRow.model) || EMBEDDING_MODEL; - - db.prepare(` - INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding) - SELECT e.article_id, ?, e.embedding FROM article_embeddings e - `).run(legacyModel); - } -} catch (err) { - console.error('embedding store backfill failed:', err); -} - - -// if the config model changed, rebuild the vec0 search index from store. -// only proceeds if the store actually has embeddings for the new model. -function rebuildVec0IfModelChanged() { - const stale = db.prepare(` - SELECT 1 FROM article_embedding_meta WHERE model != ? LIMIT 1 - `).get(EMBEDDING_MODEL); - - // also sync any store entries that didn't make it into vec0 - const missing = db.prepare(` - SELECT article_id, embedding FROM article_embedding_store - WHERE model = ? - AND NOT EXISTS (SELECT 1 FROM article_embeddings WHERE article_id = article_embedding_store.article_id) - `).all(EMBEDDING_MODEL); - - if (missing.length > 0) { - const insertVec = db.prepare(`INSERT OR IGNORE INTO article_embeddings (article_id, embedding) VALUES (?, ?)`); - const insertMeta = db.prepare(`INSERT OR IGNORE INTO article_embedding_meta (article_id, model) VALUES (?, ?)`); - - const sync = db.transaction(() => { - for (const row of missing) { - const vals = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4); - insertVec.run(BigInt(row.article_id), padEmbeddingForVec0(vals)); - insertMeta.run(row.article_id, EMBEDDING_MODEL); - } - }); - - sync(); - console.log(`synced ${missing.length} store embeddings into vec0`); - } - - if (!stale) return; - - const storeCount = db.prepare(` - SELECT COUNT(*) AS n FROM article_embedding_store WHERE model = ? - `).get(EMBEDDING_MODEL); - - if (!storeCount || storeCount.n === 0) { - console.log(`embedding model changed to ${EMBEDDING_MODEL} but store has no embeddings for it yet — skipping index rebuild`); - return; - } - - console.log(`embedding model changed to ${EMBEDDING_MODEL}, rebuilding search index...`); - - db.exec('BEGIN'); - - try { - db.exec('DELETE FROM article_embeddings'); - db.exec('DELETE FROM article_embedding_meta'); - - const rows = db.prepare(` - SELECT article_id, embedding FROM article_embedding_store WHERE model = ? - `).all(EMBEDDING_MODEL); - - const insertVec = db.prepare(`INSERT INTO article_embeddings (article_id, embedding) VALUES (?, ?)`); - const insertMeta = db.prepare(`INSERT INTO article_embedding_meta (article_id, model) VALUES (?, ?)`); - - for (const row of rows) { - const vals = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.byteLength / 4); - const padded = new Float32Array(VEC0_DIM); - padded.set(vals); - insertVec.run(BigInt(row.article_id), Buffer.from(padded.buffer)); - insertMeta.run(row.article_id, EMBEDDING_MODEL); - } - - db.exec('COMMIT'); - console.log(`rebuilt search index with ${rows.length} embeddings`); - } catch (err) { - db.exec('ROLLBACK'); - throw err; - } -} - -rebuildVec0IfModelChanged(); let embeddingBackfillRunning = false; const embeddingJobsRunning = new Set();