/**
 * Archive database bootstrap.
 *
 * Opens the SQLite file named by `config.database.path` (resolved against the
 * parent of this directory, falling back to `./archive.sqlite`), loads the
 * sqlite-vec extension, switches the journal mode to WAL and then makes sure
 * every table and index listed below exists. The open connection is the
 * module's export.
 */
const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
const config = require('./config');

const dbPath = path.resolve(__dirname, '..', config.database.path || './archive.sqlite');
const db = new Database(dbPath);

// Load sqlite-vec up front: the `vec0` module used by article_embeddings
// below comes from this extension.
sqliteVec.load(db);
db.pragma('journal_mode = WAL');

// Column type shared by every timestamp column that defaults to "now".
const TEXT_NOW = "TEXT NOT NULL DEFAULT (datetime('now'))";

/**
 * Runs one `CREATE TABLE IF NOT EXISTS` statement.
 *
 * @param {string} name - Table name.
 * @param {string[]} definitions - Column definitions and table constraints,
 *   in declaration order; they are joined with ", ".
 */
function createTable(name, definitions) {
  db.exec(` CREATE TABLE IF NOT EXISTS ${name} ( ${definitions.join(', ')} ); `);
}

// --- articles ---------------------------------------------------------------

createTable('articles', [
  'id INTEGER PRIMARY KEY AUTOINCREMENT',
  'title TEXT NOT NULL',
  'description TEXT',
  'content TEXT',
  'image TEXT',
  'content_status TEXT',
  'content_error TEXT',
  'content_attempted_at TEXT',
  'content_attempt_count INTEGER NOT NULL DEFAULT 0',
  'content_retry_after TEXT',
  'is_index_page INTEGER NOT NULL DEFAULT 0',
  'has_embedding INTEGER NOT NULL DEFAULT 0',
  'url TEXT NOT NULL UNIQUE',
  'normalized_title TEXT NOT NULL',
  'source TEXT NOT NULL',
  'pub_date TEXT',
  'pub_date_effective TEXT',
  'language TEXT',
  'event_id INTEGER REFERENCES events(id)',
  `ingested_at ${TEXT_NOW}`,
]);

// [index name, indexed column expression] pairs, all on `articles`.
const articleIndexes = [
  ['idx_articles_source', 'source'],
  ['idx_articles_pub_date', 'pub_date'],
  ['idx_articles_ingested_at', 'ingested_at'],
  ['idx_articles_normalized_title', 'normalized_title'],
  ['idx_articles_event_id', 'event_id'],
  ['idx_articles_has_embedding', 'has_embedding'],
  ['idx_articles_pub_date_effective', 'pub_date_effective DESC'],
];

// All index statements are sent to SQLite in a single exec() call.
const articleIndexSql = articleIndexes
  .map(([indexName, indexedColumns]) => `CREATE INDEX IF NOT EXISTS ${indexName} ON articles(${indexedColumns});`)
  .join(' ');
db.exec(` ${articleIndexSql} `);

// --- embeddings -------------------------------------------------------------

createTable('article_embedding_store', [
  'article_id INTEGER NOT NULL',
  'model TEXT NOT NULL',
  'embedding BLOB NOT NULL',
  `embedded_at ${TEXT_NOW}`,
  'PRIMARY KEY (article_id, model)',
]);

createTable('article_embedding_meta', [
  'article_id INTEGER PRIMARY KEY',
  'model TEXT NOT NULL',
  `embedded_at ${TEXT_NOW}`,
]);

// Virtual table backed by sqlite-vec; not a regular table, so it does not go
// through createTable().
db.exec(` CREATE VIRTUAL TABLE IF NOT EXISTS article_embeddings USING vec0( \narticle_id INTEGER PRIMARY KEY, embedding FLOAT[8192] ); `);

createTable('query_embeddings', [
  'query TEXT NOT NULL',
  'model TEXT NOT NULL',
  'embedding BLOB NOT NULL',
  `created_at ${TEXT_NOW}`,
  'PRIMARY KEY (query, model)',
]);

// --- events -----------------------------------------------------------------

// Referenced by articles.event_id.
createTable('events', [
  'id INTEGER PRIMARY KEY AUTOINCREMENT',
  'title TEXT NOT NULL',
  `created_at ${TEXT_NOW}`,
]);

// --- backfill / crawler bookkeeping ----------------------------------------

createTable('gdelt_backfill_windows', [
  'source_id TEXT NOT NULL',
  'window_start TEXT NOT NULL',
  'window_end TEXT NOT NULL',
  `completed_at ${TEXT_NOW}`,
  'PRIMARY KEY (source_id, window_start, window_end)',
]);

createTable('crawler_page_classifications', [
  'url TEXT PRIMARY KEY',
  'site_name TEXT NOT NULL',
  'classification TEXT NOT NULL',
  'pattern TEXT',
  `classified_at ${TEXT_NOW}`,
]);

createTable('crawler_url_patterns', [
  'site_name TEXT NOT NULL',
  'pattern TEXT NOT NULL',
  'classification TEXT NOT NULL',
  'hit_count INTEGER NOT NULL DEFAULT 1',
  `updated_at ${TEXT_NOW}`,
  'PRIMARY KEY (site_name, pattern)',
]);

createTable('crawler_site_rules', [
  'site_name TEXT NOT NULL',
  'rule_type TEXT NOT NULL',
  'rule_value TEXT NOT NULL',
  'classification TEXT NOT NULL',
  'hit_count INTEGER NOT NULL DEFAULT 1',
  `updated_at ${TEXT_NOW}`,
  'PRIMARY KEY (site_name, rule_type, rule_value)',
]);

createTable('domain_fetch_policy', [
  'domain TEXT PRIMARY KEY',
  "policy TEXT NOT NULL DEFAULT 'auto'",
  'consecutive_plain_failures INTEGER NOT NULL DEFAULT 0',
  'consecutive_browser_failures INTEGER NOT NULL DEFAULT 0',
  'plain_success_count INTEGER NOT NULL DEFAULT 0',
  'browser_success_count INTEGER NOT NULL DEFAULT 0',
  'expires_at TEXT',
  `updated_at ${TEXT_NOW}`,
]);

module.exports = db;