Duriin-API/workers/embeddings.js

134 lines
4.4 KiB
JavaScript

// embedding generation and cosine similarity for the intelligence layer
/**
 * Requests an embedding vector for `text` from the OpenRouter embeddings endpoint.
 *
 * @param {string} text - input to embed; sent as the `input` field of the request body
 * @param {{apiKey: string, embeddingModel: string}} openRouterConfig - `apiKey` is sent
 *   as a Bearer token, `embeddingModel` as the `model` field
 * @returns {Promise<number[]>} the non-empty array found at `data[0].embedding`
 * @throws {Error} when the response is not ok (message is the provider's `error.message`
 *   if the error body carries one, otherwise the HTTP status), or when the response
 *   contains no non-empty embedding array
 */
async function generateEmbedding(text, openRouterConfig) {
  const requestInit = {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${openRouterConfig.apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: openRouterConfig.embeddingModel,
      input: text,
    }),
  };
  const res = await fetch("https://openrouter.ai/api/v1/embeddings", requestInit);

  if (res.ok) {
    const body = await res.json();
    const vector = body?.data?.[0]?.embedding;
    if (Array.isArray(vector) && vector.length > 0) {
      return vector;
    }
    throw new Error("invalid embedding response");
  }

  // failure path: start from the status-based message and upgrade it to the
  // provider's own message when the error body can be parsed
  let reason = `embedding request failed with ${res.status}`;
  try {
    const errorBody = await res.json();
    const providerMessage = errorBody?.error?.message;
    if (providerMessage) {
      reason = providerMessage;
    }
  } catch (_) {
    // error body was not readable JSON — keep the status-based message
  }
  throw new Error(reason);
}
/**
 * Float32 BLOB -> Float32Array.
 *
 * When the blob's byteOffset is 4-byte aligned the result is a view over the same
 * memory (no copy). A Float32Array view cannot start at an unaligned offset — the
 * constructor throws RangeError — so unaligned blobs (e.g. a slice of a larger
 * Buffer) are copied into a fresh, aligned ArrayBuffer first.
 *
 * Trailing bytes that do not make up a whole 4-byte float are ignored.
 *
 * @param {Buffer|Uint8Array} buf - raw bytes holding packed float32 values
 * @returns {Float32Array} the decoded values
 */
function blobToFloat32(buf) {
  const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
  const count = Math.floor(buf.byteLength / bytesPerFloat);
  const start = buf.byteOffset;

  if (start % bytesPerFloat === 0) {
    return new Float32Array(buf.buffer, start, count);
  }

  // ArrayBuffer#slice copies into a new buffer that starts at offset 0
  const alignedCopy = buf.buffer.slice(start, start + count * bytesPerFloat);
  return new Float32Array(alignedCopy);
}
/**
 * Cosine similarity of two numeric vectors.
 *
 * If the vectors differ in length, only the overlapping prefix (the first
 * `min(a.length, b.length)` components) is compared. The inputs are never
 * modified or re-sliced, so plain arrays work as well as typed arrays.
 * NOTE(review): a length mismatch may mean the vectors came from different
 * embedding models, in which case the score is not meaningful — confirm callers.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either norm is 0
 */
function cosineSimilarity(a, b) {
  const len = Math.min(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < len; i++) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  // a zero vector has no direction; report "no similarity" instead of NaN
  return denom === 0 ? 0 : dot / denom;
}
/**
 * Generates and stores an embedding for every tracked company that does not
 * have one for the currently configured embedding model.
 *
 * A company is skipped only when its stored row was produced by the same model
 * as `openRouterConfig.embeddingModel`. Rows written by a different model are
 * regenerated and overwritten through the upsert.
 *
 * Companies are processed one at a time. A failure for one company is logged
 * and does not stop the others.
 *
 * @param {object} intelligenceDb - database handle exposing a synchronous
 *   `prepare(sql)` returning statements with `all` / `get` / `run`
 * @param {{apiKey: string, embeddingModel: string}} openRouterConfig
 * @returns {Promise<void>}
 */
async function ensureCompanyEmbeddings(intelligenceDb, openRouterConfig) {
  // null and undefined both mean "no model recorded"
  const configuredModel = openRouterConfig?.embeddingModel ?? null;

  const companies = intelligenceDb.prepare("SELECT * FROM tracked_companies").all();
  const getEmbed = intelligenceDb.prepare(
    "SELECT model FROM company_embeddings WHERE company_id = ?"
  );
  const upsertEmbed = intelligenceDb.prepare(`
    INSERT INTO company_embeddings (company_id, embedding, model, generated_at)
    VALUES (?, ?, ?, CURRENT_TIMESTAMP)
    ON CONFLICT(company_id) DO UPDATE SET
      embedding = excluded.embedding,
      model = excluded.model,
      generated_at = excluded.generated_at
  `);

  for (const company of companies) {
    const existing = getEmbed.get(company.id);
    const isCurrent = Boolean(existing) && (existing.model ?? null) === configuredModel;
    if (isCurrent) continue;

    const text = `${company.name} is a company with ticker ${company.ticker}`;
    try {
      const embedding = await generateEmbedding(text, openRouterConfig);
      // stored as packed float32 bytes
      const buf = Buffer.from(new Float32Array(embedding).buffer);
      upsertEmbed.run(company.id, buf, openRouterConfig.embeddingModel);
      console.log(`[embeddings] generated embedding for ${company.name}`);
    } catch (err) {
      // best effort: report and move on to the next company
      console.error(`[embeddings] failed for ${company.name}:`, err.message);
    }
  }
}
/**
 * Returns the tracked companies whose embedding is similar to at least one
 * article of the event.
 *
 * A company matches when the cosine similarity between its stored embedding
 * and any article embedding is >= `config.intelligence.similarityThreshold`
 * (0.35 when not configured). Article embeddings are looked up for the model
 * named in `config.openRouter.embeddingModel`; articles without a stored
 * embedding for that model are ignored.
 * NOTE(review): company embeddings are not filtered by model here.
 *
 * @param {Iterable<*>} eventArticleIds - ids of the articles in the event
 * @param {object} archiveDb - database holding `article_embedding_store`
 * @param {object} intelligenceDb - database holding `company_embeddings` and `tracked_companies`
 * @param {object} config - reads `intelligence.similarityThreshold` and `openRouter.embeddingModel`
 * @returns {Array<{id: *, name: *, ticker: *}>} matched company rows, in query order
 */
function findMatchedCompaniesByEmbedding(eventArticleIds, archiveDb, intelligenceDb, config) {
  const threshold = config.intelligence?.similarityThreshold ?? 0.35;
  const model = config.openRouter?.embeddingModel;

  const companies = intelligenceDb.prepare(
    "SELECT id, name, ticker FROM company_embeddings ce JOIN tracked_companies tc ON tc.id = ce.company_id"
  ).all();
  if (companies.length === 0) return [];

  const articleIds = Array.from(eventArticleIds);
  if (articleIds.length === 0) return [];

  // load article embeddings from archive — only articles that have one.
  // the statement is prepared once and reused for every article
  const getArticleEmbedding = archiveDb.prepare(
    "SELECT embedding FROM article_embedding_store WHERE article_id = ? AND model = ?"
  );
  const articleEmbeddings = [];
  for (const articleId of articleIds) {
    const row = getArticleEmbedding.get(articleId, model);
    if (row) articleEmbeddings.push(blobToFloat32(row.embedding));
  }
  if (articleEmbeddings.length === 0) return [];

  // likewise prepared once and reused for every company
  const getCompanyEmbedding = intelligenceDb.prepare(
    "SELECT embedding FROM company_embeddings WHERE company_id = ?"
  );
  return companies.filter((company) => {
    const companyRow = getCompanyEmbedding.get(company.id);
    if (!companyRow) return false;
    const companyVec = blobToFloat32(companyRow.embedding);
    return articleEmbeddings.some(
      (articleVec) => cosineSimilarity(companyVec, articleVec) >= threshold
    );
  });
}
module.exports = { generateEmbedding, ensureCompanyEmbeddings, findMatchedCompaniesByEmbedding };