// Revisione/server/utils/generateCourse.ts

import { db } from "../db/index";
import { courses, uploads, topics } from "../db/schema";
import { eq } from "drizzle-orm";
import { randomUUID } from "crypto";
import { askAI } from "./openrouter";
import { auditCourse } from "./auditCourse";

// In-memory set of course IDs whose generation is currently running;
// generateCourseInBackground consults it to drop duplicate calls for the same course.
const inFlightCourses = new Set<string>();

// Values written to the `stage` column of a course row as generation progresses
// (see setStage and the final status updates in generateCourseInBackground).
type Stage = "parsing_pdfs" | "analysing_sources" | "building_curriculum" | "finalising" | "ready" | "error";

/**
 * Writes a progress line to stdout, tagged with the first eight characters
 * of the course ID so interleaved runs can be told apart.
 *
 * @param courseId - Full ID of the course the message belongs to.
 * @param msg - Message text to print after the tag.
 */
function log(courseId: string, msg: string) {
  const tag = courseId.substring(0, 8);
  const line = `[revisione:${tag}] ${msg}`;
  console.log(line);
}

/**
 * Persists the given stage on the course row and then logs the transition.
 *
 * @param courseId - ID of the course row to update.
 * @param stage - Stage value to store.
 */
async function setStage(courseId: string, stage: Stage) {
  const thisCourse = eq(courses.id, courseId);
  const stageUpdate = db.update(courses).set({ stage });

  await stageUpdate.where(thisCourse);

  log(courseId, `stage → ${stage}`);
}

/**
 * Parses JSON out of a raw model response.
 *
 * `<think>…</think>` sections and a leading "thought" line are removed first.
 * The remaining text is then tried, in order, as:
 *   1. the text as-is;
 *   2. the text with a leading and trailing Markdown code fence stripped;
 *   3. the contents of the first fenced code block found anywhere in the text
 *      (covers replies that put prose before or after the fence);
 *   4. the span from the first `{`/`[` to the last `}`/`]`
 *      (covers replies that wrap bare JSON in prose).
 *
 * The result is NOT validated against `T`; callers must check the shape themselves.
 *
 * @param raw - Raw text returned by the model.
 * @returns The first candidate that parses as JSON.
 * @throws SyntaxError from the first parse attempt when no candidate parses.
 */
function parseJSON<T>(raw: string): T {
  const text = raw
    .replace(/<think>[\s\S]*?<\/think>/gi, "")
    .replace(/^\s*thought\s*\n/i, "")
    .trim();

  const candidates: string[] = [
    text,
    text.replace(/^```[a-z]*\n?/i, "").replace(/\n?```$/i, "").trim(),
  ];

  const fenced = /```[a-z]*\s*([\s\S]*?)```/i.exec(text);
  const fencedBody = fenced?.[1];
  if (fencedBody !== undefined) {
    candidates.push(fencedBody.trim());
  }

  const firstBracket = text.search(/[\[{]/);
  const lastBracket = Math.max(text.lastIndexOf("}"), text.lastIndexOf("]"));
  if (firstBracket !== -1 && lastBracket > firstBracket) {
    candidates.push(text.slice(firstBracket, lastBracket + 1));
  }

  let firstError: unknown;
  for (const candidate of candidates) {
    try {
      return JSON.parse(candidate) as T;
    } catch (err: unknown) {
      // Keep the error from the unmodified text; it is the most informative one to surface.
      if (firstError === undefined) firstError = err;
    }
  }
  throw firstError;
}

/** Course metadata inferred from the uploaded documents. */
type InferredMeta = { title: string; subject: string; organisation?: string | null };

/** Shape the curriculum prompt asks the model to return for each topic. */
type CurriculumTopic = { title: string; description: string; difficulty: number; relevantFiles?: string[] };

/** Returns a printable message for any thrown value. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Builds the prompt that asks the model for the course title, subject and organisation. */
function buildInferencePrompt(allExtracted: string): string {
  return `You are analysing a set of university course documents including lecture slides, past exam papers, and lab worksheets.

Based on the content, return a JSON object with:
- "title": a concise course name (e.g. "Computer Vision", "Thermodynamics", "Microeconomics")
- "subject": the broader academic discipline (e.g. "Computer Science", "Physics", "Economics")
- "organisation": the university or institution these materials are from (e.g. "University of Essex", "Imperial College London"). Infer this from headers, exam paper footers, logos described in text, or module codes. Return null if you genuinely cannot determine it.

Return only valid JSON, no markdown.

DOCUMENTS:
${allExtracted}`;
}

/** Builds the prompt that asks the model to design the full topic list. */
function buildCurriculumPrompt(availableFilesBlock: string, primaryText: string, secondaryText: string): string {
  return `You are designing a complete revision course from scratch.

Your only measure of success is this: a student who completes every lesson in this course must be able to:
- Answer every question in every past paper provided, including calculation questions, pseudocode questions, diagram questions, and scenario questions
- Perform every procedure, method, and algorithm named in the source material — not just describe them, but actually do them
- Fully understand every concept present in the source material, with no gaps

This is a non-negotiable standard. Do not summarise. Do not compress topics together if doing so would leave any gap in the student's ability to answer a past paper question. If meeting this standard requires 50 topics, generate 50 topics. If it requires 8, generate 8. There is no limit in either direction.

BEFORE generating topics, do the following analysis mentally:
1. Read every past paper question carefully. For each question, ask: what does a student need to know and be able to DO to answer this? List every distinct skill, concept, calculation method, algorithm, and procedure required.
2. Read every lab worksheet. For each task, ask: what does a student need to know and be able to DO to complete this? Add any new skills, concepts, or procedures to the list.
3. Read the lecture slides. Add any concept or topic that appears in the slides but is not yet in the list.
4. Now organise the list into topics, ordered from simplest to most complex, such that each topic assumes only the knowledge of topics before it.

TOPIC REQUIREMENTS:
- Every distinct algorithm named in the source material must have at least one dedicated topic that teaches it to implementation level — the student must be able to apply it step by step, not just name it
- Every calculation that appears in a past paper must be covered in a topic that teaches the student to perform that exact type of calculation by hand, with worked examples matching the exam style
- Every procedure that appears in a lab worksheet must be covered in a topic that teaches the student to carry out that procedure
- If a past paper asks for pseudocode, the corresponding topic must teach the student to write that pseudocode
- Conceptual understanding alone is never sufficient. Every topic must result in a student who can DO something, not just know something

AVAILABLE SOURCE FILES (you must reference these exact filenames in relevantFiles):
${availableFilesBlock}

PRIMARY SOURCES (past papers + lab worksheets — these define what the student must be able to do):
${primaryText || "(none provided)"}

SECONDARY SOURCES (lecture slides — use for additional concepts and explanations):
${secondaryText || "(none provided)"}

Return only valid JSON — an array of topics with no markdown:
[{ "title": "...", "description": "...", "difficulty": 1-5, "order": 1, "relevantFiles": ["filename.pdf"] }]

relevantFiles must list only filenames from the AVAILABLE SOURCE FILES list that directly contain content for this topic. Include at minimum the files that have past paper questions or lab tasks this topic must prepare the student for.

The description must be specific about what the student will be able to DO after completing this topic, not just what it covers.`;
}

/**
 * Parses and validates the metadata-inference response.
 * Throws when the response is not a JSON object with a non-blank string title,
 * so the caller's retry path is taken; a missing subject falls back to `fallback.subject`.
 */
function parseInferredMeta(raw: string, fallback: InferredMeta): InferredMeta {
  const parsed = parseJSON<unknown>(raw);
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error("inference response is not a JSON object");
  }
  const { title, subject, organisation } = parsed as Record<string, unknown>;
  if (typeof title !== "string" || title.trim() === "") {
    throw new Error("inference response has no usable title");
  }
  return {
    title,
    subject: typeof subject === "string" && subject.trim() !== "" ? subject : fallback.subject,
    organisation: typeof organisation === "string" && organisation.trim() !== "" ? organisation : null,
  };
}

/** True when a model-returned topic is an object with non-blank string title and description. */
function isUsableTopic(topic: unknown): boolean {
  if (typeof topic !== "object" || topic === null) return false;
  const { title, description } = topic as Record<string, unknown>;
  return (
    typeof title === "string" && title.trim() !== "" &&
    typeof description === "string" && description.trim() !== ""
  );
}

/**
 * Generates the curriculum (topic list) for a course from its uploads.
 *
 * Steps: load uploads and split their extracted text into primary
 * (past papers, lab worksheets) and secondary sources; ask the model for the
 * course title/subject/organisation; ask the model for the topic list unless
 * topics are already saved; store the topics; mark the course ready; run the audit.
 *
 * A second call for a course that is already being processed returns immediately.
 * Failures are logged and recorded on the course row as status/stage "error";
 * the returned promise does not reject. A failure of the post-generation audit
 * is logged only and leaves the course marked ready.
 *
 * @param courseId - ID of the course row to generate topics for.
 */
export async function generateCourseInBackground(courseId: string): Promise<void> {
  if (inFlightCourses.has(courseId)) {
    log(courseId, "already in flight, skipping duplicate call");
    return;
  }
  inFlightCourses.add(courseId);

  try {
    const course = await db.query.courses.findFirst({ where: eq(courses.id, courseId) });
    if (!course) throw new Error(`Course ${courseId} not found`);

    const costs = { ai: 0 };

    log(courseId, `starting generation for "${course.title}"`);

    // ── STEP 1 — load uploads ───────────────────────────────────────────────
    await setStage(courseId, "parsing_pdfs");

    const uploadRows = await db.query.uploads.findMany({
      where: eq(uploads.courseId, courseId),
    });

    if (uploadRows.length === 0) throw new Error("No uploads found for this course");

    log(courseId, `found ${uploadRows.length} upload(s)`);

    const primaryParts: string[] = [];
    const secondaryParts: string[] = [];

    for (const upload of uploadRows) {
      if (!upload.extractedText) {
        log(courseId, `  skipping "${upload.filename}" — no extracted text`);
        continue;
      }
      const snippet = `--- ${upload.filename} ---\n${upload.extractedText}`;
      log(courseId, `  loaded "${upload.filename}" (${upload.type}, ${upload.extractedText.length} chars)`);

      if (upload.type === "past_paper" || upload.type === "lab_worksheet") {
        primaryParts.push(snippet);
      } else {
        secondaryParts.push(snippet);
      }
    }

    log(courseId, `source split — primary: ${primaryParts.length}, secondary: ${secondaryParts.length}`);

    // Without any extracted text the model would be asked to invent a course from nothing.
    if (primaryParts.length === 0 && secondaryParts.length === 0) {
      throw new Error("None of the uploads for this course have extracted text");
    }

    // ── STEP 1b — infer title, subject, organisation ───────────────────────
    await setStage(courseId, "analysing_sources");

    const allExtracted = [primaryParts.join("\n\n"), secondaryParts.join("\n\n")].join("\n\n");
    const inferencePrompt = buildInferencePrompt(allExtracted);

    log(courseId, "inferring course title and subject from documents…");

    const inferenceResult = await askAI([{ role: "user", content: inferencePrompt }]);
    costs.ai += inferenceResult.cost;

    const fallbackMeta: InferredMeta = {
      title: course.title,
      subject: course.subject,
    };
    let inferredMeta: InferredMeta = fallbackMeta;
    let inferenceWarning = false;

    try {
      inferredMeta = parseInferredMeta(inferenceResult.text, fallbackMeta);
    } catch {
      log(courseId, `inference parse failed on first attempt, raw text: ${inferenceResult.text}`);

      // retry once with the same prompt
      try {
        const retryResult = await askAI([{ role: "user", content: inferencePrompt }]);
        costs.ai += retryResult.cost;
        inferredMeta = parseInferredMeta(retryResult.text, fallbackMeta);
        log(courseId, "inference retry succeeded");
      } catch (retryErr: unknown) {
        log(courseId, `inference retry also failed: ${describeError(retryErr)} — using defaults`);
        inferenceWarning = true;
      }
    }

    log(courseId, `inferred → title: "${inferredMeta.title}", subject: "${inferredMeta.subject}"`);

    await db.update(courses)
      .set({
        title: inferredMeta.title,
        subject: inferredMeta.subject,
        ...(inferredMeta.organisation != null ? { organisation: inferredMeta.organisation } : {}),
        ...(inferenceWarning ? { inferenceWarning: true } : {}),
      })
      .where(eq(courses.id, courseId));

    // ── STEP 2 — generate topic list (skip if topics already saved) ─────────

    let savedTopics = await db.query.topics.findMany({
      where: eq(topics.courseId, courseId),
      orderBy: (t, { asc }) => asc(t.order),
    });

    // only skip curriculum if we have topics AND we've moved past building_curriculum
    // NOTE(review): when topics exist and the stored stage is "building_curriculum", the
    // new topics are inserted alongside the existing rows — confirm whether the old rows
    // should be cleared first.
    if (savedTopics.length > 0 && course.stage !== "building_curriculum") {
      log(courseId, `resuming — found ${savedTopics.length} existing topic(s), skipping curriculum generation`);
    } else {
      const primaryText = primaryParts.join("\n\n");
      const secondaryText = secondaryParts.join("\n\n");

      const knownFilenames = new Set(uploadRows.map((u) => u.filename));

      const availableFilesBlock = uploadRows
        .map((u) => `- ${u.filename} (${u.type})`)
        .join("\n");

      const curriculumPrompt = buildCurriculumPrompt(availableFilesBlock, primaryText, secondaryText);

      await setStage(courseId, "building_curriculum");
      const curriculumModel = (useRuntimeConfig() as any).openrouterCurriculumModel || undefined;
      log(courseId, `calling OpenRouter for curriculum${curriculumModel ? ` (model: ${curriculumModel})` : ""}…`);

      const curriculumResult = await askAI([{ role: "user", content: curriculumPrompt }], { model: curriculumModel });
      costs.ai += curriculumResult.cost;
      let curriculum = parseJSON<CurriculumTopic[]>(curriculumResult.text);

      if (!Array.isArray(curriculum) || curriculum.length === 0) {
        throw new Error("AI returned an empty curriculum");
      }

      // check for blank required fields, retry once if any found
      const hasBlankFields = curriculum.some((t) => !isUsableTopic(t));
      if (hasBlankFields) {
        log(courseId, "some topics had blank title/description, retrying curriculum generation…");
        try {
          // Use the same model as the first attempt so the retry is comparable.
          const retryResult = await askAI([{ role: "user", content: curriculumPrompt }], { model: curriculumModel });
          costs.ai += retryResult.cost;
          const retryCurriculum = parseJSON<CurriculumTopic[]>(retryResult.text);

          if (Array.isArray(retryCurriculum) && retryCurriculum.length > 0) {
            curriculum = retryCurriculum;
          } else {
            log(courseId, "retry also had issues, proceeding with original curriculum");
          }
        } catch (retryErr: unknown) {
          // A failed retry must not discard the first, partially usable curriculum.
          log(courseId, `curriculum retry failed: ${describeError(retryErr)} — proceeding with original curriculum`);
        }
      }

      log(courseId, `curriculum received — ${curriculum.length} topics:`);
      curriculum.forEach((t, i) => log(courseId, `  ${i + 1}. ${t?.title} (difficulty ${t?.difficulty})`));

      await setStage(courseId, "finalising");

      // better-sqlite3 transactions are synchronous — build rows first, then insert
      const topicRows: any[] = [];
      curriculum.forEach((t, i) => {
        if (!isUsableTopic(t)) {
          log(courseId, `  skipping topic ${i + 1} — missing title or description`);
          return;
        }

        const rawFiles: unknown[] = Array.isArray(t.relevantFiles) ? t.relevantFiles : [];
        const validFiles = rawFiles.filter((f): f is string => {
          const ok = typeof f === "string" && knownFilenames.has(f);
          if (!ok) log(courseId, `  topic "${t.title}" — hallucinated filename filtered out: "${f}"`);
          return ok;
        });

        // Model output is untyped at runtime; anything non-numeric falls back to difficulty 1.
        const rawDifficulty = Number(t.difficulty);
        const difficulty = Number.isFinite(rawDifficulty) ? Math.min(5, Math.max(1, rawDifficulty)) : 1;

        topicRows.push({
          id: randomUUID(),
          courseId,
          title: t.title,
          description: t.description,
          order: i,
          difficulty,
          prerequisiteTopicIds: "[]",
          relevantFiles: JSON.stringify(validFiles),
          status: "pending",
        });
      });

      // Never mark a course ready when every returned topic was unusable.
      if (topicRows.length === 0) {
        throw new Error("AI returned no usable topics");
      }

      for (const row of topicRows) {
        await db.insert(topics).values(row);
      }

      savedTopics = await db.query.topics.findMany({
        where: eq(topics.courseId, courseId),
        orderBy: (t, { asc }) => asc(t.order),
      });

      log(courseId, `saved ${savedTopics.length} topics to DB`);
    }

    // ── STEP 3 — mark course ready immediately after topics are saved ────────
    await db.update(courses)
      .set({ status: "ready", stage: "ready", costAI: costs.ai })
      .where(eq(courses.id, courseId));

    log(courseId, `✓ curriculum ready — ${savedTopics.length} topics | AI cost: $${costs.ai.toFixed(4)}`);

    // ── STEP 4 — post-generation audit ──────────────────────────────────────
    // The course is already marked ready; an audit failure must not flip it to "error".
    try {
      await auditCourse(courseId);
    } catch (auditErr: unknown) {
      log(courseId, `post-generation audit failed (course remains ready): ${describeError(auditErr)}`);
    }
  } catch (err: unknown) {
    console.error(`[revisione:${courseId.slice(0, 8)}] ✗ generation failed: ${describeError(err)}`);
    try {
      await db.update(courses).set({ status: "error", stage: "error" }).where(eq(courses.id, courseId));
    } catch (dbErr: unknown) {
      // Nothing more can be done here; log so the failure is not lost as an unhandled rejection.
      console.error(`[revisione:${courseId.slice(0, 8)}] ✗ could not record error state: ${describeError(dbErr)}`);
    }
  } finally {
    inFlightCourses.delete(courseId);
  }
}