Enhance article processing by adding language support and adjusting embedding parameters

This commit is contained in:
ImBenji
2026-04-20 03:41:10 +01:00
parent 37d9dfb083
commit 8805d3a3fc
7 changed files with 129 additions and 9 deletions
+3 -3
View File
@@ -65,8 +65,8 @@ const markContentPending = db.prepare(`
const selectPartitionedArticlesMissingContent = db.prepare(`
SELECT id, url, title, description
FROM (
SELECT id, url, title, description, source,
ROW_NUMBER() OVER (PARTITION BY source ORDER BY ingested_at DESC, id DESC) AS rn
SELECT id, url, title, description, source, pub_date_effective,
ROW_NUMBER() OVER (PARTITION BY source ORDER BY pub_date_effective DESC, id DESC) AS rn
FROM articles
WHERE (content IS NULL OR TRIM(content) = '')
AND (content_status IS NULL OR content_status = 'pending')
@@ -74,7 +74,7 @@ const selectPartitionedArticlesMissingContent = db.prepare(`
AND (id % ?) = ?
)
WHERE rn <= ?
ORDER BY rn, source
ORDER BY pub_date_effective DESC, rn, source
`);
const selectAttemptCount = db.prepare(`
+16 -1
View File
@@ -288,7 +288,8 @@ for (const statement of [
'ALTER TABLE articles ADD COLUMN content_retry_after TEXT',
'ALTER TABLE articles ADD COLUMN is_index_page INTEGER NOT NULL DEFAULT 0',
'ALTER TABLE articles ADD COLUMN has_embedding INTEGER NOT NULL DEFAULT 0',
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT'
'ALTER TABLE articles ADD COLUMN pub_date_effective TEXT',
'ALTER TABLE articles ADD COLUMN language TEXT'
]) {
try {
db.exec(statement);
@@ -312,6 +313,20 @@ db.exec(`
WHERE pub_date_effective IS NULL
`);
// Backfill: for every entry in sources.json that declares a language, stamp
// that language onto rows of the same source whose language column is still
// NULL. Rows that already carry a language are left untouched by the WHERE
// clause, and all updates are issued inside a single transaction.
{
  const sourceList = require('../sources.json');
  const setLanguageForSource = db.prepare(`UPDATE articles SET language = ? WHERE source = ? AND language IS NULL`);
  const applyBackfill = db.transaction(() => {
    for (const { id, language } of sourceList) {
      // Entries without a (truthy) language have nothing to backfill.
      if (!language) {
        continue;
      }
      setLanguageForSource.run(language, id);
    }
  });
  applyBackfill();
}
db.exec(`
CREATE INDEX IF NOT EXISTS idx_articles_has_embedding ON articles(has_embedding);
CREATE INDEX IF NOT EXISTS idx_articles_pub_date_effective ON articles(pub_date_effective DESC);
+1 -1
View File
@@ -73,7 +73,7 @@ const selectArticlesMissingEmbeddings = db.prepare(`
SELECT 1 FROM article_embedding_store s
WHERE s.article_id = a.id AND s.model = ?
)
ORDER BY a.ingested_at ASC, a.id ASC
ORDER BY a.pub_date_effective DESC, a.id DESC
LIMIT ?
`);
+10 -3
View File
@@ -2,6 +2,10 @@ const db = require('./db');
const { normalizeTitle } = require('./dedup');
const { markSourceRun } = require('./state');
// Lookup table built once at module load: each entry of sources.json keyed
// by its `id`, with the full source entry as the value.
const sourcesById = Object.fromEntries(
  require('../sources.json').map(function toIdEntry(sourceEntry) {
    return [sourceEntry.id, sourceEntry];
  })
);
const insertArticle = db.prepare(`
INSERT INTO articles (
title,
@@ -13,8 +17,9 @@ const insertArticle = db.prepare(`
source,
pub_date,
ingested_at,
pub_date_effective
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?)
pub_date_effective,
language
) VALUES (?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?)
`);
const findByUrl = db.prepare('SELECT id FROM articles WHERE url = ?');
const INDEX_PAGE_URL_HINT = /\/(category|categories|tag|tags|topic|topics|section|sections|archive|archives|authors|search)(?:\/|$)/i;
@@ -81,6 +86,7 @@ function ingestArticle(article) {
const isIndexPage = inferIsIndexPage(article, title, url);
const pubDate = normalizePubDate(article.pubDate);
const ingestedAt = new Date().toISOString();
const language = (sourcesById[source] && sourcesById[source].language) || null;
try {
const result = insertArticle.run(
@@ -92,7 +98,8 @@ function ingestArticle(article) {
source,
pubDate,
ingestedAt,
pubDate || ingestedAt
pubDate || ingestedAt,
language
);
// don't kick off the content fetch here — it used to be fire-and-forget which