// Embedding generation and cosine similarity for the intelligence layer.
|
|
|
|
/**
 * Requests an embedding vector for `text` from the OpenRouter embeddings endpoint.
 *
 * @param {string} text - Input to embed; sent as the `input` field of the request body.
 * @param {{ apiKey: string, embeddingModel: string }} openRouterConfig - Supplies the
 *   bearer token and the `model` field of the request body.
 * @returns {Promise<number[]>} The `data[0].embedding` array from the response.
 * @throws {Error} With the provider's `error.message` (or a status-based message) when the
 *   response is not ok, and with "invalid embedding response" when a successful response
 *   cannot be parsed or does not contain a non-empty array of finite numbers.
 */
async function generateEmbedding(text, openRouterConfig) {
  const response = await fetch("https://openrouter.ai/api/v1/embeddings", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${openRouterConfig.apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: openRouterConfig.embeddingModel,
      input: text,
    }),
  });

  if (!response.ok) {
    let msg = `embedding request failed with ${response.status}`;
    try {
      const errorPayload = await response.json();
      if (errorPayload?.error?.message) msg = errorPayload.error.message;
    } catch (_) {
      // Best effort only: the error body was not JSON, so keep the status-based message.
    }
    throw new Error(msg);
  }

  let payload;
  try {
    payload = await response.json();
  } catch (err) {
    // A 2xx reply with an unparseable body is reported the same way as any other
    // malformed reply instead of leaking a raw parse error to the caller.
    throw new Error("invalid embedding response", { cause: err });
  }

  const embedding = payload?.data?.[0]?.embedding;
  const isUsableVector =
    Array.isArray(embedding) &&
    embedding.length > 0 &&
    // Non-numeric entries would silently become NaN once packed into a Float32Array.
    embedding.every((value) => Number.isFinite(value));
  if (!isUsableVector) {
    throw new Error("invalid embedding response");
  }

  return embedding;
}
|
|
|
|
/**
 * Reinterprets a byte view (e.g. a Node Buffer) as 32-bit floats.
 *
 * When the view starts on a 4-byte boundary the result shares memory with `buf`.
 * Otherwise the bytes are copied first, because the Float32Array constructor
 * throws a RangeError for a byte offset that is not a multiple of 4.
 * Trailing bytes that do not fill a whole float are ignored.
 *
 * @param {Uint8Array} buf - Byte view exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns {Float32Array} The floats stored in `buf`, read in platform byte order.
 */
function blobToFloat32(buf) {
  const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
  const count = Math.floor(buf.byteLength / bytesPerFloat);

  if (buf.byteOffset % bytesPerFloat === 0) {
    return new Float32Array(buf.buffer, buf.byteOffset, count);
  }

  // Misaligned view: copy the relevant bytes into a fresh, aligned ArrayBuffer.
  const alignedCopy = buf.buffer.slice(buf.byteOffset, buf.byteOffset + count * bytesPerFloat);
  return new Float32Array(alignedCopy);
}
|
|
|
|
/**
 * Cosine similarity of two numeric vectors.
 *
 * Accepts plain arrays as well as typed arrays. If the lengths differ, only the
 * first `min(a.length, b.length)` components of each vector are compared.
 * NOTE(review): a length mismatch usually means the vectors came from different
 * models, in which case the score is not meaningful — callers may want to guard.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either norm is 0.
 */
function cosineSimilarity(a, b) {
  // Bounding the loop (rather than calling subarray) works for plain arrays too
  // and leaves the arguments untouched.
  const len = Math.min(a.length, b.length);

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < len; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}
|
|
|
|
/**
 * Generates and stores an embedding for every tracked company that does not yet
 * have one produced by the currently configured embedding model.
 *
 * A company is skipped only when its stored row was generated with
 * `openRouterConfig.embeddingModel`; rows from another model are regenerated and
 * overwritten through the upsert. A failure for one company is logged and does
 * not stop the remaining companies from being processed.
 *
 * @param {object} intelligenceDb - Database handle exposing a synchronous
 *   `prepare(sql)` returning statements with `all` / `get` / `run`
 *   (looks like better-sqlite3 — confirm against the caller).
 * @param {{ apiKey: string, embeddingModel: string }} openRouterConfig - Passed to
 *   `generateEmbedding`; `embeddingModel` is also stored alongside each vector.
 * @returns {Promise<void>}
 */
async function ensureCompanyEmbeddings(intelligenceDb, openRouterConfig) {
  const companies = intelligenceDb.prepare("SELECT * FROM tracked_companies").all();

  const getStoredModel = intelligenceDb.prepare(
    "SELECT model FROM company_embeddings WHERE company_id = ?"
  );
  const upsertEmbed = intelligenceDb.prepare(`
    INSERT INTO company_embeddings (company_id, embedding, model, generated_at)
    VALUES (?, ?, ?, CURRENT_TIMESTAMP)
    ON CONFLICT(company_id) DO UPDATE SET
      embedding = excluded.embedding,
      model = excluded.model,
      generated_at = excluded.generated_at
  `);

  const currentModel = openRouterConfig.embeddingModel;

  for (const company of companies) {
    const existing = getStoredModel.get(company.id);
    // Vectors from a different model are not comparable with freshly embedded
    // articles, so only an embedding from the current model counts as present.
    if (existing && existing.model === currentModel) continue;

    const text = `${company.name} is a company with ticker ${company.ticker}`;
    try {
      const embedding = await generateEmbedding(text, openRouterConfig);
      // Stored as raw Float32 bytes in platform byte order.
      const buf = Buffer.from(new Float32Array(embedding).buffer);
      upsertEmbed.run(company.id, buf, currentModel);
      console.log(`[embeddings] generated embedding for ${company.name}`);
    } catch (err) {
      console.error(`[embeddings] failed for ${company.name}:`, err.message);
    }
  }
}
|
|
|
|
|
|
/**
 * Returns the tracked companies whose stored embedding is similar to at least
 * one article embedding belonging to an event.
 *
 * Only embeddings produced by `config.openRouter.embeddingModel` take part, on
 * both the article and the company side. A company matches when the cosine
 * similarity against any article vector is >= the configured threshold.
 *
 * @param {Iterable<*>} eventArticleIds - Article ids to look up in the archive.
 * @param {object} archiveDb - Database handle holding `article_embedding_store`.
 * @param {object} intelligenceDb - Database handle holding `company_embeddings`
 *   and `tracked_companies`.
 * @param {object} config - Reads `intelligence.similarityThreshold` (default 0.35)
 *   and `openRouter.embeddingModel`.
 * @returns {Array<{id: *, name: *, ticker: *}>} Matched company rows; empty when
 *   there are no company embeddings or no article embeddings.
 */
function findMatchedCompaniesByEmbedding(eventArticleIds, archiveDb, intelligenceDb, config) {
  const threshold = config.intelligence?.similarityThreshold ?? 0.35;
  const model = config.openRouter?.embeddingModel;

  // One joined query returns each company together with its vector, replacing
  // the per-company lookup. Columns are qualified so the join stays unambiguous.
  const companyRows = intelligenceDb.prepare(`
    SELECT tc.id, tc.name, tc.ticker, ce.model, ce.embedding
    FROM company_embeddings ce
    JOIN tracked_companies tc ON tc.id = ce.company_id
  `).all();

  if (companyRows.length === 0) return [];

  // Load article embeddings from the archive — only articles that have one.
  // The statement is prepared once and reused for every id.
  const getArticleEmbedding = archiveDb.prepare(
    "SELECT embedding FROM article_embedding_store WHERE article_id = ? AND model = ?"
  );
  const articleEmbeddings = [];
  for (const articleId of eventArticleIds) {
    const row = getArticleEmbedding.get(articleId, model);
    if (row) articleEmbeddings.push(blobToFloat32(row.embedding));
  }

  if (articleEmbeddings.length === 0) return [];

  const matched = [];
  for (const { id, name, ticker, model: storedModel, embedding } of companyRows) {
    // Skip vectors from another model (not comparable with the article vectors)
    // and rows without a stored blob.
    if (storedModel !== model || !embedding) continue;

    const companyVec = blobToFloat32(embedding);
    const hit = articleEmbeddings.some(
      (articleVec) => cosineSimilarity(companyVec, articleVec) >= threshold
    );

    if (hit) matched.push({ id, name, ticker });
  }

  return matched;
}
|
|
|
|
// Public API of this module. Only these three functions are exported; the
// blobToFloat32 and cosineSimilarity helpers are not part of the export list.
module.exports = { generateEmbedding, ensureCompanyEmbeddings, findMatchedCompaniesByEmbedding };
|