// 170 lines · 4.3 KiB · JavaScript
const path = require('path');
const Database = require('better-sqlite3');
const sqliteVec = require('sqlite-vec');
const config = require('./config');

// The configured path (or the default file name when none is configured) is
// resolved against the parent of this module's directory, so the location
// does not depend on the process working directory.
const configuredPath = config.database.path || './archive.sqlite';
const dbPath = path.resolve(__dirname, '..', configuredPath);

// Open the connection, load sqlite-vec into it, then switch the journal to
// write-ahead logging.
const db = new Database(dbPath);
sqliteVec.load(db);
db.pragma('journal_mode = WAL');

// Schema of the `articles` table. IF NOT EXISTS means an already-present
// table is left exactly as it is; only `url` carries a UNIQUE constraint here.
const createArticlesTableSql = `
  CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT NOT NULL,
    description TEXT,
    content TEXT,
    image TEXT,
    content_status TEXT,
    content_error TEXT,
    content_attempted_at TEXT,
    is_index_page INTEGER NOT NULL DEFAULT 0,
    url TEXT NOT NULL UNIQUE,
    normalized_title TEXT NOT NULL,
    source TEXT NOT NULL,
    pub_date TEXT,
    ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`;

db.exec(createArticlesTableSql);

/**
 * Removes a legacy UNIQUE constraint on `articles.normalized_title`.
 *
 * The constraint is detected through `PRAGMA index_list`: an index whose
 * origin is 'u' (created by a UNIQUE constraint) and that covers exactly the
 * `normalized_title` column. When no such index exists the function returns
 * without touching the database.
 *
 * Otherwise the table is rebuilt: a new table without that constraint is
 * created, every row is copied over (ids included), the old table is dropped
 * and the new one takes its name. The whole rebuild runs inside
 * `db.transaction()`, so a failure part-way through is rolled back instead
 * of leaving a transaction open on the connection.
 *
 * Optional columns that the legacy table does not have yet (`image`,
 * `content_status`, `content_error`, `content_attempted_at`,
 * `is_index_page`) are filled with NULL / 0; columns that do exist are
 * copied as they are.
 *
 * @returns {void}
 * @throws {Error} Any error raised by SQLite while inspecting or rebuilding
 *   the table (rethrown after the rollback).
 */
function rebuildArticlesTableIfNeeded() {
  const indexes = db.prepare(`PRAGMA index_list('articles')`).all();

  const hasUniqueNormalizedTitleIndex = indexes.some((index) => {
    // Only indexes that back a UNIQUE constraint are of interest.
    if (index.origin !== 'u' || !index.name) {
      return false;
    }

    // PRAGMA arguments cannot be bound, so the name is quoted by hand.
    const columns = db.prepare(`PRAGMA index_info('${index.name.replace(/'/g, "''")}')`).all();
    return columns.length === 1 && columns[0].name === 'normalized_title';
  });

  if (!hasUniqueNormalizedTitleIndex) {
    return;
  }

  const legacyColumns = new Set(
    db.prepare(`PRAGMA table_info('articles')`).all().map((column) => column.name)
  );

  // Columns of the rebuilt table, in declaration order.
  const targetColumns = [
    'id',
    'title',
    'description',
    'content',
    'image',
    'content_status',
    'content_error',
    'content_attempted_at',
    'is_index_page',
    'url',
    'normalized_title',
    'source',
    'pub_date',
    'ingested_at'
  ];

  // Literal to copy instead of the column when the legacy table lacks it.
  const fallbackLiterals = new Map([
    ['image', 'NULL'],
    ['content_status', 'NULL'],
    ['content_error', 'NULL'],
    ['content_attempted_at', 'NULL'],
    ['is_index_page', '0']
  ]);

  // A missing column without a fallback is still referenced by name, so
  // SQLite reports it as an error rather than silently dropping data.
  const selectExpressions = targetColumns.map((column) => {
    if (legacyColumns.has(column) || !fallbackLiterals.has(column)) {
      return column;
    }
    return fallbackLiterals.get(column);
  });

  const rebuild = db.transaction(() => {
    db.exec(`
      CREATE TABLE articles_rebuild (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        description TEXT,
        content TEXT,
        image TEXT,
        content_status TEXT,
        content_error TEXT,
        content_attempted_at TEXT,
        is_index_page INTEGER NOT NULL DEFAULT 0,
        url TEXT NOT NULL UNIQUE,
        normalized_title TEXT NOT NULL,
        source TEXT NOT NULL,
        pub_date TEXT,
        ingested_at TEXT NOT NULL DEFAULT (datetime('now'))
      );

      INSERT INTO articles_rebuild (${targetColumns.join(', ')})
      SELECT ${selectExpressions.join(', ')}
      FROM articles;

      DROP TABLE articles;
      ALTER TABLE articles_rebuild RENAME TO articles;
    `);
  });

  rebuild();
}

// The legacy-schema rebuild runs before the indexes are created: when it
// replaces the table, DROP TABLE also removes the indexes attached to it, so
// they have to be (re)created afterwards.
rebuildArticlesTableIfNeeded();

// Non-unique lookup indexes, one per column, named idx_articles_<column>.
const indexedArticleColumns = ['source', 'pub_date', 'ingested_at', 'normalized_title'];

indexedArticleColumns.forEach((column) => {
  db.exec(`CREATE INDEX IF NOT EXISTS idx_articles_${column} ON articles(${column});`);
});

// sqlite-vec `vec0` virtual table: one 1024-element float vector per
// article, keyed by article_id.
const createArticleEmbeddingsSql = `
  CREATE VIRTUAL TABLE IF NOT EXISTS article_embeddings USING vec0(
    article_id INTEGER PRIMARY KEY,
    embedding FLOAT[1024]
  );
`;

// Ordinary table keyed by the query text, with the embedding stored as a
// BLOB and a creation timestamp.
// NOTE(review): presumably a cache of query embeddings - confirm with callers.
const createQueryEmbeddingsSql = `
  CREATE TABLE IF NOT EXISTS query_embeddings (
    query TEXT PRIMARY KEY,
    embedding BLOB NOT NULL,
    created_at TEXT NOT NULL DEFAULT (datetime('now'))
  );
`;

[createArticleEmbeddingsSql, createQueryEmbeddingsSql].forEach((sql) => {
  db.exec(sql);
});

// Column definitions that an existing `articles` table may still lack.
// Each one is added with ALTER TABLE; a "duplicate column name" failure means
// the column is already there and is ignored, any other error is rethrown.
const addedColumnDefinitions = [
  'image TEXT',
  'content_status TEXT',
  'content_error TEXT',
  'content_attempted_at TEXT',
  'is_index_page INTEGER NOT NULL DEFAULT 0'
];

addedColumnDefinitions.forEach((definition) => {
  try {
    db.exec(`ALTER TABLE articles ADD COLUMN ${definition}`);
  } catch (error) {
    const columnAlreadyExists = String(error.message).includes('duplicate column name');
    if (!columnAlreadyExists) {
      throw error;
    }
  }
});

// Backfill executed on every module load: rows not yet flagged
// (is_index_page = 0) are marked as index pages when the lower-cased URL
// contains one of the listed category/tag/topic/section/archive/author/search
// path fragments, or the lower-cased title contains "category", "archive(s)"
// or "latest news".
const flagIndexPagesSql = `
  UPDATE articles
  SET is_index_page = 1
  WHERE is_index_page = 0
    AND (
      LOWER(url) LIKE '%/category/%'
      OR LOWER(url) LIKE '%/categories/%'
      OR LOWER(url) LIKE '%/tag/%'
      OR LOWER(url) LIKE '%/tags/%'
      OR LOWER(url) LIKE '%/topic/%'
      OR LOWER(url) LIKE '%/topics/%'
      OR LOWER(url) LIKE '%/section/%'
      OR LOWER(url) LIKE '%/sections/%'
      OR LOWER(url) LIKE '%/archive%'
      OR LOWER(url) LIKE '%/archives/%'
      OR LOWER(url) LIKE '%/authors/%'
      OR LOWER(url) LIKE '%/search%'
      OR LOWER(title) LIKE '%category%'
      OR LOWER(title) LIKE '%archives%'
      OR LOWER(title) LIKE '%archive%'
      OR LOWER(title) LIKE '%latest news%'
    )
`;

db.exec(flagIndexPagesSql);

// The configured connection itself is the module's export.
module.exports = db;