migrate database from SQLite to PostgreSQL; add PostgreSQL adapter and migration script

This commit is contained in:
ImBenji
2026-04-27 14:33:46 +01:00
parent 653a58b3d8
commit b4df02da0d
13 changed files with 1978 additions and 6 deletions
+226
View File
@@ -0,0 +1,226 @@
// postgres adapter that mimics the better-sqlite3 synchronous interface.
// uses deasync to block the event loop until queries resolve so all
// existing consumers (which call .get()/.all()/.run() synchronously) keep working.
const { Pool } = require('pg');
const deasync = require('deasync');
const config = require('./config');
const pool = new Pool(config.database.postgres);
/**
 * Run a single query on the shared pool and block the caller until it settles.
 *
 * @param {string} sql - SQL text, already using postgres-style placeholders.
 * @param {Array} params - Positional parameter values for the query.
 * @returns {object} The result object the pool's query promise resolved with.
 * @throws Whatever the pool's query promise rejected with.
 */
function querySync(sql, params) {
  const outcome = { settled: false, value: undefined, error: undefined };

  pool
    .query(sql, params)
    .then((value) => {
      outcome.value = value;
      outcome.settled = true;
    })
    .catch((error) => {
      outcome.error = error;
      outcome.settled = true;
    });

  // Keep pumping the event loop until one of the two handlers above has fired.
  deasync.loopWhile(() => outcome.settled === false);

  if (outcome.error) {
    throw outcome.error;
  }
  return outcome.value;
}
// Tokens scanned while translating placeholders, in priority order:
// single-quoted literals ('' is an escaped quote), double-quoted identifiers,
// `--` line comments, `/* */` block comments, and finally a bare `?`.
// Only the last alternative is a real placeholder; everything else is copied
// through untouched so a literal such as '%?%' is not corrupted.
const PLACEHOLDER_SCAN_RE = /'(?:[^']|'')*'|"(?:[^"]|"")*"|--[^\n]*|\/\*[\s\S]*?\*\/|\?/g;

/**
 * Translate SQLite-style `?` placeholders into postgres positional
 * placeholders (`$1`, `$2`, ...), numbering them left to right.
 *
 * A `?` that appears inside a quoted string, a quoted identifier or an SQL
 * comment is left alone and does not consume a parameter number.
 *
 * @param {string} sql - SQL text using `?` placeholders.
 * @returns {string} The same SQL with each placeholder replaced by `$n`.
 */
function toPositional(sql) {
  let index = 0;
  return sql.replace(PLACEHOLDER_SCAN_RE, (token) => (token === '?' ? `$${++index}` : token));
}
// The sqlite-vec nearest-neighbour lookup has the shape
//   WHERE embedding MATCH ? AND k = ?
// whereas pgvector expresses the same lookup as
//   ORDER BY embedding <-> $1::vector LIMIT $2
// The pattern below recognises the former (case-insensitive, any whitespace).
const NEAREST_NEIGHBORS_RE = /SELECT\s+article_id\s*,\s*distance\s+FROM\s+article_embeddings\s+WHERE\s+embedding\s+MATCH\s+\?\s+AND\s+k\s*=\s*\?\s+ORDER\s+BY\s+distance/is;

// Fixed pgvector query substituted for a recognised sqlite-vec lookup.
const PGVECTOR_NEAREST_SQL = `
SELECT article_id, (embedding <-> $1::vector) AS distance
FROM article_embeddings
ORDER BY embedding <-> $1::vector
LIMIT $2
`;

/**
 * Swap a sqlite-vec nearest-neighbour query for its pgvector equivalent.
 *
 * @param {string} sql - Candidate SQL text.
 * @returns {string|null} The pgvector query when `sql` contains the
 *   sqlite-vec lookup, otherwise `null` so the caller can fall back.
 */
function rewriteNearestNeighbors(sql) {
  return NEAREST_NEIGHBORS_RE.test(sql) ? PGVECTOR_NEAREST_SQL : null;
}
/**
 * Build a prepared-statement-like object exposing synchronous
 * `get` / `all` / `run` methods.
 *
 * The SQL is translated once up front: a recognised nearest-neighbour query is
 * replaced wholesale, anything else has its `?` placeholders renumbered.
 * Parameters may be passed either as separate arguments or as a single array;
 * they are flattened one level before being sent.
 *
 * @param {string} sql - SQLite-flavoured SQL text.
 * @returns {{get: Function, all: Function, run: Function}} Statement facade.
 */
function Statement(sql) {
  const translated = rewriteNearestNeighbors(sql) || toPositional(sql);
  const execute = (bindings) => querySync(translated, bindings.flat());

  return {
    // First row of the result, or undefined when there is none.
    get: (...params) => {
      const { rows } = execute(params);
      return rows[0] ?? undefined;
    },
    // Every row of the result.
    all: (...params) => execute(params).rows,
    // Affected-row count; no insert id is reported by this adapter.
    run: (...params) => {
      const { rowCount } = execute(params);
      return { changes: rowCount, lastInsertRowid: null };
    },
  };
}
// Tokens scanned while looking for statement boundaries, in priority order:
// single-quoted literals, double-quoted identifiers, `--` line comments,
// `/* */` block comments, dollar-quoted bodies ($$...$$ or $tag$...$tag$),
// and finally a bare `;`. Only the last alternative is a real boundary.
const SQL_BOUNDARY_RE = /'(?:[^']|'')*'|"(?:[^"]|"")*"|--[^\n]*|\/\*[\s\S]*?\*\/|\$([A-Za-z_]\w*)?\$[\s\S]*?\$\1\$|;/g;

// Split a script into trimmed, non-empty statements at top-level semicolons.
function splitSqlStatements(sql) {
  const pieces = [];
  let start = 0;
  for (const match of sql.matchAll(SQL_BOUNDARY_RE)) {
    if (match[0] !== ';') continue; // quoted/commented region: not a boundary
    pieces.push(sql.slice(start, match.index));
    start = match.index + 1;
  }
  pieces.push(sql.slice(start));
  return pieces.map((piece) => piece.trim()).filter(Boolean);
}

/**
 * Run a script of one or more SQL statements (typically DDL), one at a time
 * and in order, with no parameters.
 *
 * Statements are separated by `;`, but a semicolon inside a string literal,
 * quoted identifier, comment or dollar-quoted body does not end the statement.
 *
 * @param {string} sql - One or more `;`-separated statements.
 * @returns {void}
 * @throws Whatever `querySync` throws for the first failing statement;
 *   later statements are not run.
 */
function exec(sql) {
  for (const stmt of splitSqlStatements(sql)) {
    querySync(stmt, []);
  }
}
/**
 * Wrap `fn` so that each call runs between BEGIN and COMMIT, with a ROLLBACK
 * if `fn` (or the COMMIT) throws.
 *
 * The error that caused the rollback is always the one rethrown. If the
 * ROLLBACK itself also fails, that secondary error is attached to the
 * original as `rollbackError` instead of replacing it.
 *
 * NOTE(review): BEGIN/COMMIT/ROLLBACK and the statements inside `fn` all go
 * through `querySync`, which calls `pool.query`. node-postgres documents that
 * `pool.query` may run each call on a different pooled client and should not
 * be used for transactions, so atomicity is not guaranteed here. Confirm and
 * consider pinning one client for the transaction's duration.
 *
 * @param {Function} fn - Work to perform inside the transaction.
 * @returns {Function} A function taking the same arguments as `fn` and
 *   returning its result once the transaction has committed.
 */
function transaction(fn) {
  return (...args) => {
    querySync('BEGIN', []);
    let result;
    try {
      result = fn(...args);
      querySync('COMMIT', []);
    } catch (err) {
      try {
        querySync('ROLLBACK', []);
      } catch (rollbackErr) {
        // Do not let a failed rollback mask the error that triggered it.
        if (err !== null && typeof err === 'object' && Object.isExtensible(err)) {
          err.rollbackError = rollbackErr;
        }
      }
      throw err;
    }
    return result;
  };
}
/**
 * Entry point matching the `db.prepare(sql)` call shape: hands the SQL to
 * `Statement` and returns the resulting statement object.
 *
 * @param {string} sql - SQL text to prepare.
 * @returns {object} The object produced by `Statement(sql)`.
 */
function prepare(sql) {
  const statement = Statement(sql);
  return statement;
}
// Public surface of the adapter: the three db-style entry points plus the
// underlying pool.
const pgDb = { prepare, exec, transaction, pool };
// create the schema in postgres, mirroring src/db.js
// Every statement below runs at module load and uses IF NOT EXISTS, so it is
// safe to re-run against a database that already has the objects.
// events is created first because articles.event_id references events(id).
exec(`
CREATE TABLE IF NOT EXISTS events (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
`);
// articles: one row per article, unique by url; the content_* columns and the
// pub_date columns are stored as TEXT.
exec(`
CREATE TABLE IF NOT EXISTS articles (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
description TEXT,
content TEXT,
image TEXT,
content_status TEXT,
content_error TEXT,
content_attempted_at TEXT,
content_attempt_count INTEGER NOT NULL DEFAULT 0,
content_retry_after TEXT,
is_index_page INTEGER NOT NULL DEFAULT 0,
has_embedding INTEGER NOT NULL DEFAULT 0,
url TEXT NOT NULL UNIQUE,
normalized_title TEXT NOT NULL,
source TEXT NOT NULL,
pub_date TEXT,
pub_date_effective TEXT,
language TEXT,
event_id INTEGER REFERENCES events(id),
ingested_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
`);
// Secondary indexes on articles, one per indexed column.
exec(`CREATE INDEX IF NOT EXISTS idx_articles_source ON articles(source)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_pub_date ON articles(pub_date)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_ingested_at ON articles(ingested_at)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_normalized_title ON articles(normalized_title)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC)`);
// article_embedding_store: embedding bytes (BYTEA) keyed by (article_id, model).
exec(`
CREATE TABLE IF NOT EXISTS article_embedding_store (
article_id INTEGER NOT NULL,
model TEXT NOT NULL,
embedding BYTEA NOT NULL,
embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (article_id, model)
)
`);
// article_embedding_meta: one row per article recording a model name and timestamp.
exec(`
CREATE TABLE IF NOT EXISTS article_embedding_meta (
article_id INTEGER PRIMARY KEY,
model TEXT NOT NULL,
embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
`);
// article_embeddings stores one 8192-dimension vector per article and is the
// table the nearest-neighbour query reads. Its `vector` column type comes from
// the pgvector extension, so make sure the extension is installed before the
// table is created; IF NOT EXISTS keeps this a no-op on later startups.
exec(`CREATE EXTENSION IF NOT EXISTS vector`);
exec(`
CREATE TABLE IF NOT EXISTS article_embeddings (
article_id INTEGER PRIMARY KEY,
embedding vector(8192)
)
`);
// Remaining tables. Each statement runs at module load and uses IF NOT EXISTS,
// so it is safe to re-run.
// query_embeddings: embedding bytes (BYTEA) keyed by (query, model).
exec(`
CREATE TABLE IF NOT EXISTS query_embeddings (
query TEXT NOT NULL,
model TEXT NOT NULL,
embedding BYTEA NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (query, model)
)
`);
// gdelt_backfill_windows: one row per (source_id, window_start, window_end)
// with a completion timestamp; window bounds are stored as TEXT.
exec(`
CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
source_id TEXT NOT NULL,
window_start TEXT NOT NULL,
window_end TEXT NOT NULL,
completed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (source_id, window_start, window_end)
)
`);
// crawler_page_classifications: a classification per url, with an optional pattern.
exec(`
CREATE TABLE IF NOT EXISTS crawler_page_classifications (
url TEXT PRIMARY KEY,
site_name TEXT NOT NULL,
classification TEXT NOT NULL,
pattern TEXT,
classified_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
`);
// crawler_url_patterns: a classification per (site_name, pattern); hit_count starts at 1.
exec(`
CREATE TABLE IF NOT EXISTS crawler_url_patterns (
site_name TEXT NOT NULL,
pattern TEXT NOT NULL,
classification TEXT NOT NULL,
hit_count INTEGER NOT NULL DEFAULT 1,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (site_name, pattern)
)
`);
// crawler_site_rules: a classification per (site_name, rule_type, rule_value);
// hit_count starts at 1.
exec(`
CREATE TABLE IF NOT EXISTS crawler_site_rules (
site_name TEXT NOT NULL,
rule_type TEXT NOT NULL,
rule_value TEXT NOT NULL,
classification TEXT NOT NULL,
hit_count INTEGER NOT NULL DEFAULT 1,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
PRIMARY KEY (site_name, rule_type, rule_value)
)
`);
// domain_fetch_policy: per-domain policy (default 'auto') plus failure/success
// counters that all default to 0; expires_at is stored as TEXT.
exec(`
CREATE TABLE IF NOT EXISTS domain_fetch_policy (
domain TEXT PRIMARY KEY,
policy TEXT NOT NULL DEFAULT 'auto',
consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
plain_success_count INTEGER NOT NULL DEFAULT 0,
browser_success_count INTEGER NOT NULL DEFAULT 0,
expires_at TEXT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)
`);
// Export the adapter object as this module's value.
module.exports = pgDb;
+7 -1
View File
@@ -1,7 +1,13 @@
// Backend switch: when the config selects postgres, export the pg adapter from
// ./db-pg as this module's value instead of opening a SQLite database.
const config = require('./config');
if (config.database && config.database.type === 'postgres') {
module.exports = require('./db-pg');
// Stop evaluating the rest of this module; nothing below runs for postgres.
return;
}
// SQLite path: everything from here on applies only when postgres was not selected.
const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
// NOTE(review): `config` is already declared at the top of this hunk. This
// second declaration looks like the line the diff removed (the "-1") shown
// without its marker; confirm it is absent from the real file, since two
// `const config` declarations in one scope would be a SyntaxError.
const config = require('./config');
// Resolve the database file relative to the parent directory of this file,
// falling back to ./archive.sqlite when no path is configured.
const dbPath = path.resolve(__dirname, '..', config.database.path || './archive.sqlite');
const db = new Database(dbPath);