migrate database from SQLite to PostgreSQL; add PostgreSQL adapter and migration script
This commit is contained in:
+226
@@ -0,0 +1,226 @@
|
||||
// Postgres adapter that mimics the synchronous better-sqlite3 interface.
// Uses deasync to make the calling code wait (while the event loop keeps
// turning) until each query resolves, so all existing consumers, which call
// .get()/.all()/.run() synchronously, keep working.
|
||||
|
||||
const { Pool } = require('pg');
|
||||
const deasync = require('deasync');
|
||||
const config = require('./config');
|
||||
|
||||
const pool = new Pool(config.database.postgres);
|
||||
|
||||
/**
 * Run a single query through the shared pool and return its result
 * synchronously.
 *
 * The promise returned by `pool.query` is observed through a small outcome
 * record; `deasync.loopWhile` keeps spinning until that record is marked as
 * settled, after which the result is returned or the failure is rethrown.
 *
 * @param {string} sql - SQL text passed to `pool.query` unchanged.
 * @param {Array} params - Bind values passed to `pool.query` unchanged.
 * @returns {object} Whatever `pool.query` resolved with.
 * @throws Rethrows the rejection value of `pool.query`.
 */
function querySync(sql, params) {
  const outcome = { settled: false, value: undefined, failure: undefined };

  pool
    .query(sql, params)
    .then((response) => {
      outcome.value = response;
      outcome.settled = true;
    })
    .catch((reason) => {
      outcome.failure = reason;
      outcome.settled = true;
    });

  // Wait here until one of the handlers above has fired.
  deasync.loopWhile(() => !outcome.settled);

  if (outcome.failure) {
    throw outcome.failure;
  }
  return outcome.value;
}
|
||||
|
||||
/**
 * Translate SQLite-style `?` placeholders into Postgres positional
 * parameters (`$1`, `$2`, ...), numbering them left to right.
 *
 * Question marks that are not placeholders are left untouched: anything
 * inside a single-quoted string literal, a double-quoted identifier, a
 * `--` line comment or a block comment is copied through verbatim, so it
 * neither gets rewritten nor shifts the numbering of the real placeholders.
 * Dollar-quoted bodies and backslash-escaped (E'...') strings are not
 * recognised.
 *
 * @param {string} sql - SQL text using `?` placeholders.
 * @returns {string} The same SQL with each placeholder replaced by `$n`.
 */
function toPositional(sql) {
  let position = 0;
  return sql.replace(
    // Quoted/comment alternatives come first so they swallow any `?` they
    // contain; only a bare `?` reaches the last alternative.
    /'(?:[^']|'')*'|"(?:[^"]|"")*"|--[^\n]*|\/\*[\s\S]*?\*\/|\?/g,
    (token) => (token === '?' ? `$${++position}` : token),
  );
}
|
||||
|
||||
|
||||
// Nearest-neighbour lookups are phrased differently by the two backends:
//   sqlite-vec: WHERE embedding MATCH ? AND k = ?
//   pgvector:   ORDER BY embedding <-> $1::vector LIMIT $2
const NEAREST_NEIGHBORS_RE = /SELECT\s+article_id\s*,\s*distance\s+FROM\s+article_embeddings\s+WHERE\s+embedding\s+MATCH\s+\?\s+AND\s+k\s*=\s*\?\s+ORDER\s+BY\s+distance/is;

// Fixed pgvector query substituted for any statement the pattern recognises.
const PGVECTOR_NEAREST_NEIGHBORS_SQL = `
    SELECT article_id, (embedding <-> $1::vector) AS distance
    FROM article_embeddings
    ORDER BY embedding <-> $1::vector
    LIMIT $2
  `;

/**
 * Swap the sqlite-vec nearest-neighbour query for its pgvector equivalent.
 *
 * @param {string} sql - Statement text as written for SQLite.
 * @returns {string|null} The pgvector query when `sql` matches the
 *   nearest-neighbour pattern, otherwise `null`.
 */
function rewriteNearestNeighbors(sql) {
  const isNearestNeighborQuery = NEAREST_NEIGHBORS_RE.test(sql);
  return isNearestNeighborQuery ? PGVECTOR_NEAREST_NEIGHBORS_SQL : null;
}
|
||||
|
||||
/**
 * Build a statement object exposing `get`, `all` and `run`, each of which
 * executes synchronously through `querySync`.
 *
 * The SQL is translated once, up front: the nearest-neighbour rewrite is
 * tried first and, when it does not apply, `?` placeholders are converted
 * to positional parameters.
 *
 * @param {string} sql - Statement text as written for SQLite.
 * @returns {{get: Function, all: Function, run: Function}} Statement object.
 */
function Statement(sql) {
  const translatedSql = rewriteNearestNeighbors(sql) || toPositional(sql);

  // Bind values may arrive as separate arguments or as a single array;
  // flattening one level turns both call shapes into one parameter list.
  const execute = (bindings) => querySync(translatedSql, bindings.flat());

  return {
    /** First row of the result, or `undefined` when there is none. */
    get(...params) {
      const { rows } = execute(params);
      return rows[0] ?? undefined;
    },

    /** Every row of the result. */
    all(...params) {
      return execute(params).rows;
    },

    /** Row count as `changes`; `lastInsertRowid` is always `null`. */
    run(...params) {
      const { rowCount } = execute(params);
      return { changes: rowCount, lastInsertRowid: null };
    },
  };
}
|
||||
|
||||
// Tokeniser for splitSqlStatements. Quoted strings, quoted identifiers,
// dollar-quoted bodies and comments are matched as single tokens so that a
// `;` inside them is never seen as a separator; the trailing `[\s\S]`
// alternative guarantees every character is consumed (e.g. a lone `-` or an
// unterminated quote).
const SQL_TOKEN_RE = /'(?:[^']|'')*'|"(?:[^"]|"")*"|\$(\w*)\$[\s\S]*?\$\1\$|--[^\n]*|\/\*[\s\S]*?\*\/|;|[^'"\-\/;$]+|[\s\S]/g;

/**
 * Split a SQL script into its individual statements on top-level `;`.
 *
 * @param {string} sql - One or more statements separated by semicolons.
 * @returns {string[]} Trimmed, non-empty statements in source order.
 */
function splitSqlStatements(sql) {
  const pieces = [];
  let current = '';
  for (const [token] of sql.matchAll(SQL_TOKEN_RE)) {
    if (token === ';') {
      pieces.push(current);
      current = '';
    } else {
      current += token;
    }
  }
  pieces.push(current);
  return pieces.map((piece) => piece.trim()).filter(Boolean);
}

/**
 * Execute a block of SQL (typically DDL) one statement at a time.
 *
 * Statements are separated on `;`, but only where the semicolon actually
 * terminates a statement: semicolons inside string literals, quoted
 * identifiers, dollar-quoted bodies and comments no longer cut a statement
 * in half. Each statement is run with an empty parameter list.
 *
 * @param {string} sql - One or more SQL statements.
 * @returns {void}
 * @throws Rethrows whatever `querySync` throws for a failing statement;
 *   later statements are not run.
 */
function exec(sql) {
  for (const statement of splitSqlStatements(sql)) {
    querySync(statement, []);
  }
}
|
||||
|
||||
/**
 * Wrap `fn` so that each call runs between BEGIN and COMMIT, with a
 * ROLLBACK issued when `fn` (or the COMMIT itself) throws.
 *
 * The error that caused the rollback is always the one rethrown: a failure
 * of the ROLLBACK statement is reported on stderr instead of replacing it.
 *
 * FIXME: querySync sends every statement through `pool.query`, and
 * node-postgres documents that `pool.query` must not be used for
 * transactions because each call may be served by a different pooled
 * client. BEGIN/COMMIT/ROLLBACK and the statements issued by `fn` are
 * therefore not guaranteed to share one session. Fixing this needs a
 * dedicated client threaded through querySync.
 *
 * @param {Function} fn - Work to run inside the transaction.
 * @returns {Function} Function forwarding its arguments to `fn` and
 *   returning `fn`'s result.
 */
function transaction(fn) {
  return (...args) => {
    querySync('BEGIN', []);
    let result;
    try {
      result = fn(...args);
      querySync('COMMIT', []);
    } catch (err) {
      try {
        querySync('ROLLBACK', []);
      } catch (rollbackErr) {
        // Surface the rollback failure without masking the original error.
        console.error('ROLLBACK failed after transaction error:', rollbackErr);
      }
      throw err;
    }
    return result;
  };
}
|
||||
|
||||
/**
 * Entry point matching the `db.prepare(sql)` call shape: hands the SQL to
 * `Statement` and returns the statement object it builds.
 *
 * @param {string} sql - Statement text as written for SQLite.
 * @returns {object} Object with `get`, `all` and `run` methods.
 */
function prepare(sql) {
  const statement = Statement(sql);
  return statement;
}
|
||||
|
||||
// Adapter surface handed to consumers: the three entry points defined above
// plus the underlying pool.
const pgDb = { prepare, exec, transaction, pool };

// Create the schema in postgres, mirroring src/db.js. Every statement uses
// IF NOT EXISTS, so this runs on each module load.

exec(`
  CREATE TABLE IF NOT EXISTS events (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS articles (
    id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    image TEXT,
    content_status TEXT,
    content_error TEXT,
    content_attempted_at TEXT,
    content_attempt_count INTEGER NOT NULL DEFAULT 0,
    content_retry_after TEXT,
    is_index_page INTEGER NOT NULL DEFAULT 0,
    has_embedding INTEGER NOT NULL DEFAULT 0,
    url TEXT NOT NULL UNIQUE,
    normalized_title TEXT NOT NULL,
    source TEXT NOT NULL,
    pub_date TEXT,
    pub_date_effective TEXT,
    language TEXT,
    event_id INTEGER REFERENCES events(id),
    ingested_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
  )
`);

exec(`CREATE INDEX IF NOT EXISTS idx_articles_source ON articles(source)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_pub_date ON articles(pub_date)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_ingested_at ON articles(ingested_at)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_normalized_title ON articles(normalized_title)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_event_id ON articles(event_id)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding)`);
exec(`CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC)`);

exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_store (
    article_id INTEGER NOT NULL,
    model TEXT NOT NULL,
    embedding BYTEA NOT NULL,
    embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (article_id, model)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS article_embedding_meta (
    article_id INTEGER PRIMARY KEY,
    model TEXT NOT NULL,
    embedded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
  )
`);

// The `vector` column type below comes from the pgvector extension, which has
// to be enabled in the database before the table can be created. It is placed
// here, rather than at the top, so the earlier tables are created exactly as
// before even when enabling the extension fails.
exec(`CREATE EXTENSION IF NOT EXISTS vector`);

exec(`
  CREATE TABLE IF NOT EXISTS article_embeddings (
    article_id INTEGER PRIMARY KEY,
    embedding vector(8192)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS query_embeddings (
    query TEXT NOT NULL,
    model TEXT NOT NULL,
    embedding BYTEA NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (query, model)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS gdelt_backfill_windows (
    source_id TEXT NOT NULL,
    window_start TEXT NOT NULL,
    window_end TEXT NOT NULL,
    completed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (source_id, window_start, window_end)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS crawler_page_classifications (
    url TEXT PRIMARY KEY,
    site_name TEXT NOT NULL,
    classification TEXT NOT NULL,
    pattern TEXT,
    classified_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS crawler_url_patterns (
    site_name TEXT NOT NULL,
    pattern TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (site_name, pattern)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS crawler_site_rules (
    site_name TEXT NOT NULL,
    rule_type TEXT NOT NULL,
    rule_value TEXT NOT NULL,
    classification TEXT NOT NULL,
    hit_count INTEGER NOT NULL DEFAULT 1,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    PRIMARY KEY (site_name, rule_type, rule_value)
  )
`);

exec(`
  CREATE TABLE IF NOT EXISTS domain_fetch_policy (
    domain TEXT PRIMARY KEY,
    policy TEXT NOT NULL DEFAULT 'auto',
    consecutive_plain_failures INTEGER NOT NULL DEFAULT 0,
    consecutive_browser_failures INTEGER NOT NULL DEFAULT 0,
    plain_success_count INTEGER NOT NULL DEFAULT 0,
    browser_success_count INTEGER NOT NULL DEFAULT 0,
    expires_at TEXT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
  )
`);

module.exports = pgDb;
|
||||
@@ -1,7 +1,13 @@
|
||||
const config = require('./config');
|
||||
|
||||
if (config.database && config.database.type === 'postgres') {
|
||||
module.exports = require('./db-pg');
|
||||
return;
|
||||
}
|
||||
|
||||
const path = require('path');
|
||||
const Database = require('better-sqlite3');
|
||||
const sqliteVec = require('sqlite-vec');
|
||||
const config = require('./config');
|
||||
|
||||
const dbPath = path.resolve(__dirname, '..', config.database.path || './archive.sqlite');
|
||||
const db = new Database(dbPath);
|
||||
|
||||
Reference in New Issue
Block a user