/**
 * Opens the archive SQLite database, loads the sqlite-vec extension, and brings
 * the schema up to date (tables, indexes, and in-place migrations). Everything
 * here runs synchronously at require time; the module exports the open
 * better-sqlite3 connection.
 */
const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
const config = require('./config');

// vec0 table is fixed at this many dims to cover any model on openrouter;
// shorter embeddings get zero-padded.
const EMBEDDING_DIMENSIONS = 8192;

// Column list shared by the initial CREATE TABLE and the rebuild migration so
// the two definitions cannot drift apart.
// the image column is retained as a no-op for backwards compat with old rows.
// new code never writes to it; drop in a future migration if you really want
const ARTICLE_COLUMNS_SQL = `
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    image TEXT,
    content_status TEXT,
    content_error TEXT,
    content_attempted_at TEXT,
    is_index_page INTEGER NOT NULL DEFAULT 0,
    url TEXT NOT NULL UNIQUE,
    normalized_title TEXT NOT NULL,
    source TEXT NOT NULL,
    pub_date TEXT,
    ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
`;

// Column list shared by both branches of the query_embeddings migration.
const QUERY_EMBEDDING_COLUMNS_SQL = `
    query TEXT NOT NULL,
    model TEXT NOT NULL,
    embedding BLOB NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (query, model)
`;

const dbPath = path.resolve(__dirname, '..', config.database.path || './archive.sqlite');
const db = new Database(dbPath);
sqliteVec.load(db);
db.pragma('journal_mode = WAL');

/**
 * Runs each `ALTER TABLE ... ADD COLUMN` statement in order, ignoring the
 * "duplicate column name" error so the script can run on every start.
 *
 * @param {string[]} statements - ALTER TABLE statements to execute.
 * @throws Rethrows any error whose message does not contain
 *   "duplicate column name".
 */
function addColumnsIfMissing(statements) {
  for (const statement of statements) {
    try {
      db.exec(statement);
    } catch (error) {
      if (!String(error.message).includes('duplicate column name')) {
        throw error;
      }
    }
  }
}

/**
 * Rebuilds `articles` when it still carries a single-column UNIQUE constraint
 * on `normalized_title`; the replacement table has no such constraint.
 *
 * The copy runs inside a better-sqlite3 transaction so a failure part-way
 * through rolls back instead of leaving a `BEGIN` open on the connection.
 */
function rebuildArticlesTableIfNeeded() {
  const indexes = db.prepare(`PRAGMA index_list('articles')`).all();
  const hasUniqueNormalizedTitleIndex = indexes.some((index) => {
    // origin 'u' marks an index created by a UNIQUE constraint.
    if (index.origin !== 'u' || !index.name) {
      return false;
    }
    // PRAGMA arguments cannot be bound, so escape single quotes by hand.
    const escapedName = index.name.replace(/'/g, "''");
    const columns = db.prepare(`PRAGMA index_info('${escapedName}')`).all();
    return columns.length === 1 && columns[0].name === 'normalized_title';
  });

  if (!hasUniqueNormalizedTitleIndex) {
    return;
  }

  // NOTE(review): only the columns listed below are copied, and is_index_page
  // is written as a literal 0 rather than copied. Columns added later via
  // ALTER TABLE (event_id, has_embedding, ...) are not carried over; the
  // ADD COLUMN statements further down re-create them empty. Confirm this is
  // intended for databases old enough to hit this path.
  const rebuild = db.transaction(() => {
    db.exec(`
      CREATE TABLE articles_rebuild (${ARTICLE_COLUMNS_SQL});

      INSERT INTO articles_rebuild (
        id, title, description, content, image,
        content_status, content_error, content_attempted_at, is_index_page,
        url, normalized_title, source, pub_date, ingested_at
      )
      SELECT
        id, title, description, content, image,
        content_status, content_error, content_attempted_at, 0,
        url, normalized_title, source, pub_date, ingested_at
      FROM articles;

      DROP TABLE articles;
      ALTER TABLE articles_rebuild RENAME TO articles;
    `);
  });
  rebuild();
}

/**
 * Ensures the `article_embeddings` vec0 table exists with
 * EMBEDDING_DIMENSIONS dimensions. If a table with a different (or
 * unparseable) dimension exists, its rows are first copied into
 * `article_embedding_store`, then the table is dropped, the meta table is
 * cleared, and the vec0 table is re-created.
 */
function migrateArticleEmbeddingsTable() {
  const existing = db
    .prepare(`SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'article_embeddings'`)
    .get();
  const currentDim = existing && existing.sql && existing.sql.match(/FLOAT\[(\d+)\]/);
  const needsMigration =
    existing && (!currentDim || Number.parseInt(currentDim[1], 10) !== EMBEDDING_DIMENSIONS);

  if (needsMigration) {
    // save everything in vec0 to the store before dropping it, keyed by whatever model is in meta
    try {
      const BATCH = 500;
      // NOTE(review): paging with LIMIT/OFFSET and no ORDER BY relies on the
      // scan order staying the same between batches.
      const fetchBatch = db.prepare(`
        SELECT e.article_id, m.model, e.embedding
        FROM article_embeddings e
        JOIN article_embedding_meta m ON m.article_id = e.article_id
        LIMIT ? OFFSET ?
      `);
      const insert = db.prepare(`
        INSERT OR IGNORE INTO article_embedding_store (article_id, model, embedding)
        VALUES (?, ?, ?)
      `);
      const insertMany = db.transaction((rows) => {
        for (const row of rows) {
          insert.run(row.article_id, row.model, row.embedding);
        }
      });

      let offset = 0;
      while (true) {
        const rows = fetchBatch.all(BATCH, offset);
        if (rows.length === 0) {
          break;
        }
        insertMany(rows);
        offset += rows.length;
        // A short page means this was the last one.
        if (rows.length < BATCH) {
          break;
        }
      }
    } catch (err) {
      // Best effort: the rescue failure is logged and the migration continues.
      console.error('failed to rescue embeddings from vec0 before migration:', err);
    }

    db.exec(`DROP TABLE article_embeddings`);
    db.exec(`DELETE FROM article_embedding_meta`);
  }

  if (!existing || needsMigration) {
    db.exec(`
      CREATE VIRTUAL TABLE article_embeddings USING vec0(
        article_id INTEGER PRIMARY KEY,
        embedding FLOAT[${EMBEDDING_DIMENSIONS}]
      );
    `);
  }
}

/**
 * migrate query_embeddings to include model in primary key.
 *
 * When the table is missing or has no `model` column it is replaced by a
 * fresh, empty table (existing rows are not copied). The replacement runs in
 * a transaction so a failure rolls back cleanly.
 */
function migrateQueryEmbeddingsTable() {
  const cols = db.prepare(`PRAGMA table_info(query_embeddings)`).all();
  const hasModel = cols.some((c) => c.name === 'model');

  if (hasModel) {
    db.exec(`
      CREATE TABLE IF NOT EXISTS query_embeddings (${QUERY_EMBEDDING_COLUMNS_SQL});
    `);
    return;
  }

  const replaceTable = db.transaction(() => {
    db.exec(`
      CREATE TABLE query_embeddings_new (${QUERY_EMBEDDING_COLUMNS_SQL});
      DROP TABLE IF EXISTS query_embeddings;
      ALTER TABLE query_embeddings_new RENAME TO query_embeddings;
    `);
  });
  replaceTable();
}

db.exec(`
  CREATE TABLE IF NOT EXISTS articles (${ARTICLE_COLUMNS_SQL});
`);

rebuildArticlesTableIfNeeded();

db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_source ON articles(source);
  CREATE INDEX IF NOT EXISTS idx_articles_pub_date ON articles(pub_date);
  CREATE INDEX IF NOT EXISTS idx_articles_ingested_at ON articles(ingested_at);
  CREATE INDEX IF NOT EXISTS idx_articles_normalized_title ON articles(normalized_title);
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_store (
    article_id INTEGER NOT NULL,
    model TEXT NOT NULL,
    embedding BLOB NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (article_id, model)
  );
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_meta (
    article_id INTEGER PRIMARY KEY,
    model TEXT NOT NULL,
    embedded_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);

migrateArticleEmbeddingsTable();

migrateQueryEmbeddingsTable();

db.exec(`
  CREATE TABLE IF NOT EXISTS events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);

addColumnsIfMissing([
  'ALTER TABLE articles ADD COLUMN event_id INTEGER REFERENCES events(id)',
]);

db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id);
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
    source_id TEXT NOT NULL,
    window_start TEXT NOT NULL,
    window_end TEXT NOT NULL,
    completed_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (source_id, window_start, window_end)
  );
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_page_classifications (
    url TEXT PRIMARY KEY,
    site_name TEXT NOT NULL,
    classification TEXT NOT NULL,
    pattern TEXT,
    classified_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_url_patterns (
    site_name TEXT NOT NULL,
    pattern TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, pattern)
  );
`);

db.exec(`
  CREATE TABLE IF NOT EXISTS crawler_site_rules (
    site_name TEXT NOT NULL,
    rule_type TEXT NOT NULL,
    rule_value TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TEXT NOT NULL DEFAULT (datetime('now')),
    PRIMARY KEY (site_name, rule_type, rule_value)
  );
`);

// per-domain fetch policy — caches whether plain http or browser is needed
// so we don't waste a round trip on every article from a known js-only site.
// expires_at lets us re-probe domains that may have recovered
db.exec(`
  CREATE TABLE IF NOT EXISTS domain_fetch_policy (
    domain TEXT PRIMARY KEY,
    policy TEXT NOT NULL DEFAULT 'auto',
    consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
    consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
    plain_success_count INTEGER NOT NULL DEFAULT 0,
    browser_success_count INTEGER NOT NULL DEFAULT 0,
    expires_at TEXT,
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`);

// Columns added after the original articles schema; several also appear in
// the CREATE TABLE above, in which case the ALTER is a tolerated no-op.
addColumnsIfMissing([
  'ALTER TABLE articles ADD COLUMN image TEXT',
  'ALTER TABLE articles ADD COLUMN content_status TEXT',
  'ALTER TABLE articles ADD COLUMN content_error TEXT',
  'ALTER TABLE articles ADD COLUMN content_attempted_at TEXT',
  'ALTER TABLE articles ADD COLUMN content_attempt_count INTEGER NOT NULL DEFAULT 0',
  'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
  'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0',
  'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0',
  'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT',
  'ALTER TABLE articles ADD COLUMN language TEXT',
]);

db.exec(`
  CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
  CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
`);

module.exports = db;