migrate article embeddings to support multi-model architecture and enhance data integrity
This commit is contained in:
parent
88f465e71f
commit
4616998064
3 changed files with 67 additions and 20 deletions
45
src/db.js
45
src/db.js
|
|
@ -134,17 +134,46 @@ db.exec(`
|
||||||
{
|
{
|
||||||
const existing = db.prepare(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'article_embeddings'`).get();
|
const existing = db.prepare(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'article_embeddings'`).get();
|
||||||
const currentDim = existing && existing.sql && existing.sql.match(/FLOAT\[(\d+)\]/);
|
const currentDim = existing && existing.sql && existing.sql.match(/FLOAT\[(\d+)\]/);
|
||||||
|
const needsMigration = existing && (!currentDim || parseInt(currentDim[1], 10) !== 8192);
|
||||||
|
|
||||||
|
if (needsMigration) {
|
||||||
|
// save everything in vec0 to the store before dropping it, keyed by whatever model is in meta
|
||||||
|
try {
|
||||||
|
const BATCH = 500;
|
||||||
|
let offset = 0;
|
||||||
|
|
||||||
|
const fetchBatch = db.prepare(`
|
||||||
|
SELECT e.article_id, m.model, e.embedding
|
||||||
|
FROM article_embeddings e
|
||||||
|
JOIN article_embedding_meta m ON m.article_id = e.article_id
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
`);
|
||||||
|
|
||||||
|
const insert = db.prepare(`
|
||||||
|
INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
|
||||||
|
VALUES (?, ?, ?)
|
||||||
|
`);
|
||||||
|
|
||||||
|
const insertMany = db.transaction((rows) => {
|
||||||
|
for (const row of rows) insert.run(row.article_id, row.model, row.embedding);
|
||||||
|
});
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const rows = fetchBatch.all(BATCH, offset);
|
||||||
|
if (rows.length === 0) break;
|
||||||
|
insertMany(rows);
|
||||||
|
offset += rows.length;
|
||||||
|
if (rows.length < BATCH) break;
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error('failed to rescue embeddings from vec0 before migration:', err);
|
||||||
|
}
|
||||||
|
|
||||||
if (!existing) {
|
|
||||||
db.exec(`
|
|
||||||
CREATE VIRTUAL TABLE article_embeddings USING vec0(
|
|
||||||
article_id INTEGER PRIMARY KEY,
|
|
||||||
embedding FLOAT[8192]
|
|
||||||
);
|
|
||||||
`);
|
|
||||||
} else if (!currentDim || parseInt(currentDim[1], 10) !== 8192) {
|
|
||||||
db.exec(`DROP TABLE article_embeddings`);
|
db.exec(`DROP TABLE article_embeddings`);
|
||||||
db.exec(`DELETE FROM article_embedding_meta`);
|
db.exec(`DELETE FROM article_embedding_meta`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!existing || needsMigration) {
|
||||||
db.exec(`
|
db.exec(`
|
||||||
CREATE VIRTUAL TABLE article_embeddings USING vec0(
|
CREATE VIRTUAL TABLE article_embeddings USING vec0(
|
||||||
article_id INTEGER PRIMARY KEY,
|
article_id INTEGER PRIMARY KEY,
|
||||||
|
|
|
||||||
|
|
@ -85,22 +85,27 @@ const upsertQueryEmbedding = db.prepare(`
|
||||||
created_at = datetime('now')
|
created_at = datetime('now')
|
||||||
`);
|
`);
|
||||||
|
|
||||||
// backfill store from any embeddings that predate multi-model support
|
// backfill store from vec0 for any embeddings that predate multi-model support.
|
||||||
|
// only runs when store is completely empty, so we never stamp the wrong model on existing data.
|
||||||
try {
|
try {
|
||||||
db.prepare(`
|
const storeEmpty = !db.prepare(`SELECT 1 FROM article_embedding_store LIMIT 1`).get();
|
||||||
INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
|
|
||||||
SELECT e.article_id, ?, e.embedding FROM article_embeddings e
|
|
||||||
`).run(EMBEDDING_MODEL);
|
|
||||||
|
|
||||||
db.prepare(`
|
if (storeEmpty) {
|
||||||
INSERT OR IGNORE INTO article_embedding_meta (article_id, model)
|
// use model from meta if available, fall back to config model
|
||||||
SELECT article_id, ? FROM article_embedding_store WHERE model = ?
|
const metaRow = db.prepare(`SELECT model FROM article_embedding_meta LIMIT 1`).get();
|
||||||
`).run(EMBEDDING_MODEL, EMBEDDING_MODEL);
|
const legacyModel = (metaRow && metaRow.model) || EMBEDDING_MODEL;
|
||||||
|
|
||||||
|
db.prepare(`
|
||||||
|
INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
|
||||||
|
SELECT e.article_id, ?, e.embedding FROM article_embeddings e
|
||||||
|
`).run(legacyModel);
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error('embedding store backfill failed:', err);
|
console.error('embedding store backfill failed:', err);
|
||||||
}
|
}
|
||||||
|
|
||||||
// if the config model changed, rebuild the vec0 search index from store
|
// if the config model changed, rebuild the vec0 search index from store.
|
||||||
|
// only proceeds if the store actually has embeddings for the new model.
|
||||||
function rebuildVec0IfModelChanged() {
|
function rebuildVec0IfModelChanged() {
|
||||||
const stale = db.prepare(`
|
const stale = db.prepare(`
|
||||||
SELECT 1 FROM article_embedding_meta WHERE model != ? LIMIT 1
|
SELECT 1 FROM article_embedding_meta WHERE model != ? LIMIT 1
|
||||||
|
|
@ -108,6 +113,15 @@ function rebuildVec0IfModelChanged() {
|
||||||
|
|
||||||
if (!stale) return;
|
if (!stale) return;
|
||||||
|
|
||||||
|
const storeCount = db.prepare(`
|
||||||
|
SELECT COUNT(*) AS n FROM article_embedding_store WHERE model = ?
|
||||||
|
`).get(EMBEDDING_MODEL);
|
||||||
|
|
||||||
|
if (!storeCount || storeCount.n === 0) {
|
||||||
|
console.log(`embedding model changed to ${EMBEDDING_MODEL} but store has no embeddings for it yet — skipping index rebuild`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
console.log(`embedding model changed to ${EMBEDDING_MODEL}, rebuilding search index...`);
|
console.log(`embedding model changed to ${EMBEDDING_MODEL}, rebuilding search index...`);
|
||||||
|
|
||||||
db.exec('BEGIN');
|
db.exec('BEGIN');
|
||||||
|
|
|
||||||
|
|
@ -29,8 +29,11 @@ async function statusRoutes(fastify) {
|
||||||
);
|
);
|
||||||
|
|
||||||
const embeddingModelRows = db.prepare(`
|
const embeddingModelRows = db.prepare(`
|
||||||
SELECT model, COUNT(*) AS article_count
|
SELECT
|
||||||
FROM article_embedding_store
|
model,
|
||||||
|
COUNT(*) AS article_count,
|
||||||
|
(SELECT LENGTH(embedding) FROM article_embedding_store s2 WHERE s2.model = s.model LIMIT 1) AS sample_bytes
|
||||||
|
FROM article_embedding_store s
|
||||||
GROUP BY model
|
GROUP BY model
|
||||||
ORDER BY article_count DESC
|
ORDER BY article_count DESC
|
||||||
`).all();
|
`).all();
|
||||||
|
|
@ -45,6 +48,7 @@ async function statusRoutes(fastify) {
|
||||||
embeddingModels: embeddingModelRows.map((row) => ({
|
embeddingModels: embeddingModelRows.map((row) => ({
|
||||||
model: row.model,
|
model: row.model,
|
||||||
articles: row.article_count,
|
articles: row.article_count,
|
||||||
|
dimensions: row.sample_bytes ? row.sample_bytes / 4 : null,
|
||||||
})),
|
})),
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue