// SQLite archive database: opens the connection, creates the schema and runs migrations at load time.
const path = require('path');
|
|
const Database = require('better-sqlite3');
|
|
const sqliteVec = require('sqlite-vec');
|
|
const config = require('./config');
|
|
|
|
// Location of the SQLite file: the configured path (or ./archive.sqlite when none
// is configured), resolved against the parent of this module's directory.
const configuredDbPath = config.database.path || './archive.sqlite';
const dbPath = path.resolve(__dirname, '..', configuredDbPath);

// Open the connection, then load the sqlite-vec extension into it.
const db = new Database(dbPath);
sqliteVec.load(db);

// Switch the journal to write-ahead logging.
db.pragma('journal_mode = WAL');
|
|
|
|
// Base schema for the articles table (only created when the table is missing).
// The image column is retained as a no-op for backwards compatibility with old rows:
// new code never writes to it; drop it in a future migration if you really want to.
const ARTICLES_TABLE_SQL = `
  CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    image TEXT,
    content_status TEXT,
    content_error TEXT,
    content_attempted_at TEXT,
    is_index_page INTEGER NOT NULL DEFAULT 0,
    url TEXT NOT NULL UNIQUE,
    normalized_title TEXT NOT NULL,
    source TEXT NOT NULL,
    pub_date TEXT,
    ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`;

db.exec(ARTICLES_TABLE_SQL);
|
|
|
|
/**
 * Rebuild the `articles` table without a UNIQUE constraint on `normalized_title`.
 *
 * Inspects the indexes SQLite reports for `articles`. When one of them was
 * created by a UNIQUE constraint (origin 'u') and covers exactly the
 * `normalized_title` column, the table is recreated with `normalized_title`
 * as a plain NOT NULL column and the existing rows are copied across.
 * When no such index exists the function does nothing.
 *
 * The rebuild runs through `db.transaction`, so a failure in any statement
 * rolls the whole rebuild back. A multi-statement `db.exec` with a manual
 * BEGIN would stop at the first error and leave the transaction open.
 *
 * Notes on the copy:
 * - `is_index_page` is written as 0 for every copied row.
 * - Only the columns listed in the INSERT are carried over.
 *
 * @returns {void}
 * @throws Rethrows any error raised while rebuilding, after the rollback.
 */
function rebuildArticlesTableIfNeeded() {
  // Only indexes that back a UNIQUE constraint are of interest.
  const uniqueIndexes = db
    .prepare(`PRAGMA index_list('articles')`)
    .all()
    .filter((index) => index.origin === 'u' && index.name);

  const hasUniqueNormalizedTitleIndex = uniqueIndexes.some((index) => {
    // PRAGMA arguments cannot be bound, so escape quotes in the index name by hand.
    const escapedName = index.name.replace(/'/g, "''");
    const columns = db.prepare(`PRAGMA index_info('${escapedName}')`).all();
    return columns.length === 1 && columns[0].name === 'normalized_title';
  });

  if (!hasUniqueNormalizedTitleIndex) {
    return;
  }

  const rebuild = db.transaction(() => {
    db.exec(`
      CREATE TABLE articles_rebuild (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        content TEXT,
        image TEXT,
        content_status TEXT,
        content_error TEXT,
        content_attempted_at TEXT,
        is_index_page INTEGER NOT NULL DEFAULT 0,
        url TEXT NOT NULL UNIQUE,
        normalized_title TEXT NOT NULL,
        source TEXT NOT NULL,
        pub_date TEXT,
        ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
      );

      INSERT INTO articles_rebuild (
        id,
        title,
        description,
        content,
        image,
        content_status,
        content_error,
        content_attempted_at,
        is_index_page,
        url,
        normalized_title,
        source,
        pub_date,
        ingested_at
      )
      SELECT
        id,
        title,
        description,
        content,
        image,
        content_status,
        content_error,
        content_attempted_at,
        0,
        url,
        normalized_title,
        source,
        pub_date,
        ingested_at
      FROM articles;

      DROP TABLE articles;
      ALTER TABLE articles_rebuild RENAME TO articles;
    `);
  });

  rebuild();
}
|
|
|
|
// Run the legacy-table rebuild before creating indexes: the rebuild drops and
// recreates `articles`, so the indexes are created afterwards on the final table.
rebuildArticlesTableIfNeeded();

// One single-column lookup index per listed column, named idx_articles_<column>.
for (const column of ['source', 'pub_date', 'ingested_at', 'normalized_title']) {
  db.exec(`CREATE INDEX IF NOT EXISTS idx_articles_${column} ON articles(${column});`);
}
|
|
|
|
// Embedding bookkeeping tables:
// - article_embedding_store keeps one embedding blob per (article, model) pair.
// - article_embedding_meta keeps one row per article, recording a model name.
db.exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_store (
    article_id INTEGER NOT NULL,
    model TEXT NOT NULL,
    embedding BLOB NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (article_id, model)
  );

  CREATE TABLE IF NOT EXISTS article_embedding_meta (
    article_id INTEGER PRIMARY KEY,
    model TEXT NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);
|
|
|
|
// vec0 table — fixed at 8192 dims to cover any model on openrouter, shorter embeddings get zero-padded
{
  // Single source of truth for the vec0 column width, used by both the
  // migration check and the CREATE statement.
  const VEC0_DIMENSIONS = 8192;
  const RESCUE_BATCH_SIZE = 500;

  const existing = db.prepare(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'article_embeddings'`).get();
  const currentDim = existing && existing.sql && existing.sql.match(/FLOAT\[(\d+)\]/);
  // Migrate when the table exists but its declared width is missing or differs.
  const needsMigration = existing && (!currentDim || parseInt(currentDim[1], 10) !== VEC0_DIMENSIONS);

  if (needsMigration) {
    // Save everything in vec0 to the store before dropping it, keyed by whatever model is in meta.
    // This is best-effort: a failure is logged and the migration still proceeds.
    try {
      // ORDER BY makes the LIMIT/OFFSET paging deterministic; without it the row
      // order is unspecified and a batch could skip or repeat rows.
      const fetchBatch = db.prepare(`
        SELECT e.article_id, m.model, e.embedding
        FROM article_embeddings e
        JOIN article_embedding_meta m ON m.article_id = e.article_id
        ORDER BY m.article_id
        LIMIT ? OFFSET ?
      `);

      // OR IGNORE keeps any row already present in the store for the same (article, model).
      const insert = db.prepare(`
        INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
        VALUES (?, ?, ?)
      `);

      // Each batch is written in its own transaction.
      const insertMany = db.transaction((rows) => {
        for (const row of rows) insert.run(row.article_id, row.model, row.embedding);
      });

      let offset = 0;
      while (true) {
        const rows = fetchBatch.all(RESCUE_BATCH_SIZE, offset);
        if (rows.length === 0) break;
        insertMany(rows);
        offset += rows.length;
        // A short batch means the last page has been read.
        if (rows.length < RESCUE_BATCH_SIZE) break;
      }
    } catch (err) {
      console.error('failed to rescue embeddings from vec0 before migration:', err);
    }

    // The meta rows describe the vec0 contents, so they are cleared together with it.
    db.exec(`DROP TABLE article_embeddings`);
    db.exec(`DELETE FROM article_embedding_meta`);
  }

  if (!existing || needsMigration) {
    db.exec(`
      CREATE VIRTUAL TABLE article_embeddings USING vec0(
        article_id INTEGER PRIMARY KEY,
        embedding FLOAT[${VEC0_DIMENSIONS}]
      );
    `);
  }
}
|
|
|
|
// Migrate query_embeddings to include model in the primary key.
// The table is (re)created whenever it has no `model` column. PRAGMA table_info
// returns no rows for a missing table, so this also covers a fresh database.
// Existing rows are not copied: the old table is dropped as-is.
{
  const existingColumns = db.prepare(`PRAGMA table_info(query_embeddings)`).all();
  const hasModel = existingColumns.some((column) => column.name === 'model');

  // When `model` is already present the table exists in its current shape and
  // nothing needs to be done.
  if (!hasModel) {
    // db.transaction rolls back automatically if any statement fails. A manual
    // BEGIN inside db.exec would leave the transaction open on error.
    const recreateQueryEmbeddings = db.transaction(() => {
      db.exec(`
        CREATE TABLE query_embeddings_new (
          query TEXT NOT NULL,
          model TEXT NOT NULL,
          embedding BLOB NOT NULL,
          created_at TEXT NOT NULL DEFAULT (datetime('now')),
          PRIMARY KEY (query, model)
        );

        DROP TABLE IF EXISTS query_embeddings;
        ALTER TABLE query_embeddings_new RENAME TO query_embeddings;
      `);
    });

    recreateQueryEmbeddings();
  }
}
|
|
|
|
// events table; articles point at it through the event_id column added below.
db.exec(`
  CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);

// Add articles.event_id. A "duplicate column name" error means the column is
// already there, so it is ignored; any other failure is rethrown.
try {
  db.exec('ALTER TABLE articles ADD COLUMN event_id INTEGER REFERENCES events(id)');
} catch (error) {
  const columnAlreadyExists = String(error.message).includes('duplicate column name');
  if (!columnAlreadyExists) {
    throw error;
  }
}

db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id);
`);
|
|
|
|
// Auxiliary tables, each created only when missing. They are executed one at a
// time, in the order listed.
const AUXILIARY_TABLE_STATEMENTS = [
  `
  CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
    source_id TEXT NOT NULL,
    window_start TEXT NOT NULL,
    window_end TEXT NOT NULL,
    completed_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (source_id, window_start, window_end)
  );
  `,
  `
  CREATE TABLE IF NOT EXISTS crawler_page_classifications (
    url TEXT PRIMARY KEY,
    site_name TEXT NOT NULL,
    classification TEXT NOT NULL,
    pattern TEXT,
    classified_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
  `,
  `
  CREATE TABLE IF NOT EXISTS crawler_url_patterns (
    site_name TEXT NOT NULL,
    pattern TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, pattern)
  );
  `,
  `
  CREATE TABLE IF NOT EXISTS crawler_site_rules (
    site_name TEXT NOT NULL,
    rule_type TEXT NOT NULL,
    rule_value TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, rule_type, rule_value)
  );
  `,
  // Per-domain fetch policy — caches whether plain http or a browser is needed,
  // so we don't waste a round trip on every article from a known js-only site.
  // expires_at lets us re-probe domains that may have recovered.
  `
  CREATE TABLE IF NOT EXISTS domain_fetch_policy (
    domain TEXT PRIMARY KEY,
    policy TEXT NOT NULL DEFAULT 'auto',
    consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
    consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
    plain_success_count INTEGER NOT NULL DEFAULT 0,
    browser_success_count INTEGER NOT NULL DEFAULT 0,
    expires_at TEXT,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
  `,
];

for (const createTableSql of AUXILIARY_TABLE_STATEMENTS) {
  db.exec(createTableSql);
}
|
|
|
|
// Columns added to articles over time, as "<name> <definition>" fragments.
// Each one is applied with ALTER TABLE in the order listed.
const ADDED_ARTICLE_COLUMNS = [
  'image TEXT',
  'content_status TEXT',
  'content_error TEXT',
  'content_attempted_at TEXT',
  'content_attempt_count INTEGER NOT NULL DEFAULT 0',
  'content_retry_after TEXT',
  'is_index_page INTEGER NOT NULL DEFAULT 0',
  'has_embedding INTEGER NOT NULL DEFAULT 0',
  'pub_date_effective TEXT',
  'language TEXT',
];

ADDED_ARTICLE_COLUMNS.forEach((columnDefinition) => {
  try {
    db.exec(`ALTER TABLE articles ADD COLUMN ${columnDefinition}`);
  } catch (error) {
    // A "duplicate column name" error means the column already exists and is
    // skipped; anything else is a real failure and is rethrown.
    const columnAlreadyExists = String(error.message).includes('duplicate column name');
    if (!columnAlreadyExists) {
      throw error;
    }
  }
});

// Indexes on columns that only exist once the ALTER statements above have run.
db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
  CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
`);

// The module's export is the opened, fully migrated connection.
module.exports = db;
|