// Revisione/server/utils/generateCourse.ts

import { db } from "../db/index";
import { courses, uploads, topics } from "../db/schema";
import { eq } from "drizzle-orm";
import { randomUUID } from "crypto";
import { askAI } from "./openrouter";
import { auditCourse } from "./auditCourse";

/** Pipeline stage values written to the course row while generation runs. */
type Stage =
  | "parsing_pdfs"
  | "analysing_sources"
  | "building_curriculum"
  | "finalising"
  | "ready"
  | "error";

/**
 * Print a progress message to stdout, tagged with the first eight
 * characters of the course id so interleaved runs can be told apart.
 *
 * @param courseId - Full course identifier; only its 8-character prefix is printed.
 * @param msg - Message text appended after the tag.
 */
function log(courseId: string, msg: string) {
  console.log("[revisione:" + courseId.slice(0, 8) + "] " + msg);
}

/**
 * Persist the given pipeline stage on the course row, then log the transition.
 *
 * @param courseId - Id of the course row to update.
 * @param stage - Stage value to store.
 */
async function setStage(courseId: string, stage: Stage) {
  const matchesCourse = eq(courses.id, courseId);
  await db.update(courses).set({ stage: stage }).where(matchesCourse);
  log(courseId, "stage → " + stage);
}

/**
 * Parse a JSON payload from model output, tolerating common wrapping noise.
 *
 * Attempts, in order:
 *   1. the trimmed text as-is;
 *   2. the text with a leading/trailing Markdown code fence removed;
 *   3. the outermost `{…}` / `[…]` span, for replies that wrap the JSON in prose.
 *
 * The result is NOT validated against `T`; callers must check the shape.
 *
 * @param raw - Raw text expected to contain a single JSON value.
 * @returns The parsed value, typed as `T` without runtime verification.
 * @throws SyntaxError when no parseable JSON can be recovered.
 */
function parseJSON<T>(raw: string): T {
  // Trim first: the fence patterns below are anchored to the string edges,
  // so stray whitespace or a trailing newline would otherwise defeat them.
  const trimmed = raw.trim();
  try {
    return JSON.parse(trimmed);
  } catch {
    // Not bare JSON — fall through to the recovery attempts.
  }

  const unfenced = trimmed
    .replace(/^```[a-z]*\s*/i, "")
    .replace(/\s*```$/, "")
    .trim();
  try {
    return JSON.parse(unfenced);
  } catch (fenceErr) {
    // Last resort: take everything from the first opening bracket to the last
    // closing bracket, which drops any prose placed around the payload.
    const start = unfenced.search(/[{[]/);
    const end = Math.max(unfenced.lastIndexOf("}"), unfenced.lastIndexOf("]"));
    if (start === -1 || end <= start) throw fenceErr;
    return JSON.parse(unfenced.slice(start, end + 1));
  }
}

/** Render an unknown thrown value as a log-friendly message. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Coerce a model-supplied difficulty to an integer in [1, 5].
 * Missing or non-numeric values fall back to 1. Fractions are rounded because
 * the prompt asks for whole numbers 1-5 (column type not visible here — confirm).
 */
function normaliseDifficulty(value: unknown): number {
  const n = typeof value === "number" ? value : Number(value);
  if (!Number.isFinite(n)) return 1;
  return Math.min(5, Math.max(1, Math.round(n)));
}

/**
 * Build a course's curriculum from its uploaded documents.
 *
 * Pipeline:
 *   1. Load the course's uploads and split their extracted text into primary
 *      sources (past papers, lab worksheets) and secondary sources (everything else).
 *   2. Ask the AI to infer title / subject / organisation and store them on the course.
 *   3. Unless topics already exist for the course (resume case), ask the AI for a
 *      topic list and insert it.
 *   4. Mark the course ready and record the AI cost of this run.
 *   5. Run the post-generation audit; an audit failure is logged and does not
 *      revert the ready state.
 *
 * Never rejects: any failure in steps 1–4 is logged and recorded on the course
 * row as `status: "error"`, `stage: "error"`.
 *
 * @param courseId - Id of the course row to generate a curriculum for.
 */
export async function generateCourseInBackground(courseId: string): Promise<void> {
  try {
    const course = await db.query.courses.findFirst({ where: eq(courses.id, courseId) });
    if (!course) throw new Error(`Course ${courseId} not found`);

    const costs = { ai: 0 };

    log(courseId, `starting generation for "${course.title}"`);

    // ── STEP 1 — load uploads ───────────────────────────────────────────────
    await setStage(courseId, "parsing_pdfs");

    const uploadRows = await db.query.uploads.findMany({
      where: eq(uploads.courseId, courseId),
    });

    if (uploadRows.length === 0) throw new Error("No uploads found for this course");

    log(courseId, `found ${uploadRows.length} upload(s)`);

    const primaryParts: string[] = [];
    const secondaryParts: string[] = [];

    for (const upload of uploadRows) {
      if (!upload.extractedText) {
        log(courseId, `  skipping "${upload.filename}" — no extracted text`);
        continue;
      }
      const snippet = `--- ${upload.filename} ---\n${upload.extractedText}`;
      log(courseId, `  loaded "${upload.filename}" (${upload.type}, ${upload.extractedText.length} chars)`);

      // Past papers and lab worksheets define what the student must be able to do.
      if (upload.type === "past_paper" || upload.type === "lab_worksheet") {
        primaryParts.push(snippet);
      } else {
        secondaryParts.push(snippet);
      }
    }

    log(courseId, `source split — primary: ${primaryParts.length}, secondary: ${secondaryParts.length}`);

    // Without any text the model would invent a curriculum from nothing.
    if (primaryParts.length === 0 && secondaryParts.length === 0) {
      throw new Error("No extracted text found in any upload");
    }

    // ── STEP 1b — infer title, subject, organisation ───────────────────────
    await setStage(courseId, "analysing_sources");

    // Spread the part arrays, not the joined strings: spreading a string yields
    // its individual characters, which would garble the documents.
    const allExtracted = [...primaryParts, ...secondaryParts].join("\n\n");

    log(courseId, "inferring course title and subject from documents…");

    const inferenceResult = await askAI([{
      role: "user",
      content: `You are analysing a set of university course documents including lecture slides, past exam papers, and lab worksheets.

Based on the content, return a JSON object with:
- "title": a concise course name (e.g. "Computer Vision", "Thermodynamics", "Microeconomics")
- "subject": the broader academic discipline (e.g. "Computer Science", "Physics", "Economics")
- "organisation": the university or institution these materials are from (e.g. "University of Essex", "Imperial College London"). Infer this from headers, exam paper footers, logos described in text, or module codes. Return null if you genuinely cannot determine it.

Return only valid JSON, no markdown.

DOCUMENTS:
${allExtracted}`,
    }]);
    costs.ai += inferenceResult.cost;

    // Start from the existing course values and overwrite only fields the model
    // returned as non-empty strings, so a partial or malformed reply cannot
    // blank out the title/subject.
    const inferredMeta: { title: string; subject: string; organisation?: string | null } = {
      title: course.title,
      subject: course.subject,
    };
    try {
      const parsed = parseJSON<unknown>(inferenceResult.text);
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        const { title, subject, organisation } = parsed as Record<string, unknown>;
        if (typeof title === "string" && title.trim() !== "") inferredMeta.title = title;
        if (typeof subject === "string" && subject.trim() !== "") inferredMeta.subject = subject;
        if (typeof organisation === "string" && organisation.trim() !== "") {
          inferredMeta.organisation = organisation;
        }
      } else {
        log(courseId, "inference returned a non-object, using defaults");
      }
    } catch {
      log(courseId, "inference parse failed, using defaults");
    }

    log(courseId, `inferred → title: "${inferredMeta.title}", subject: "${inferredMeta.subject}"`);

    await db.update(courses)
      .set({
        title: inferredMeta.title,
        subject: inferredMeta.subject,
        ...(inferredMeta.organisation != null ? { organisation: inferredMeta.organisation } : {}),
      })
      .where(eq(courses.id, courseId));

    // ── STEP 2 — generate topic list (skip if topics already saved) ─────────

    let savedTopics = await db.query.topics.findMany({
      where: eq(topics.courseId, courseId),
      orderBy: (t, { asc }) => asc(t.order),
    });

    if (savedTopics.length > 0) {
      log(courseId, `resuming — found ${savedTopics.length} existing topic(s), skipping curriculum generation`);
    } else {
      const primaryText = primaryParts.join("\n\n");
      const secondaryText = secondaryParts.join("\n\n");

      const availableFilesBlock = uploadRows
        .map((u) => `- ${u.filename} (${u.type})`)
        .join("\n");

      const curriculumPrompt = `You are designing a complete revision course from scratch.

Your only measure of success is this: a student who completes every lesson in this course must be able to:
- Answer every question in every past paper provided, including calculation questions, pseudocode questions, diagram questions, and scenario questions
- Perform every procedure, method, and algorithm named in the source material — not just describe them, but actually do them
- Fully understand every concept present in the source material, with no gaps

This is a non-negotiable standard. Do not summarise. Do not compress topics together if doing so would leave any gap in the student's ability to answer a past paper question. If meeting this standard requires 50 topics, generate 50 topics. If it requires 8, generate 8. There is no limit in either direction.

BEFORE generating topics, do the following analysis mentally:
1. Read every past paper question carefully. For each question, ask: what does a student need to know and be able to DO to answer this? List every distinct skill, concept, calculation method, algorithm, and procedure required.
2. Read every lab worksheet. For each task, ask: what does a student need to know and be able to DO to complete this? Add any new skills, concepts, or procedures to the list.
3. Read the lecture slides. Add any concept or topic that appears in the slides but is not yet in the list.
4. Now organise the list into topics, ordered from simplest to most complex, such that each topic assumes only the knowledge of topics before it.

TOPIC REQUIREMENTS:
- Every distinct algorithm named in the source material must have at least one dedicated topic that teaches it to implementation level — the student must be able to apply it step by step, not just name it
- Every calculation that appears in a past paper must be covered in a topic that teaches the student to perform that exact type of calculation by hand, with worked examples matching the exam style
- Every procedure that appears in a lab worksheet must be covered in a topic that teaches the student to carry out that procedure
- If a past paper asks for pseudocode, the corresponding topic must teach the student to write that pseudocode
- Conceptual understanding alone is never sufficient. Every topic must result in a student who can DO something, not just know something

AVAILABLE SOURCE FILES (you must reference these exact filenames in relevantFiles):
${availableFilesBlock}

PRIMARY SOURCES (past papers + lab worksheets — these define what the student must be able to do):
${primaryText || "(none provided)"}

SECONDARY SOURCES (lecture slides — use for additional concepts and explanations):
${secondaryText || "(none provided)"}

Return only valid JSON — an array of topics with no markdown:
[{ "title": "...", "description": "...", "difficulty": 1-5, "order": 1, "relevantFiles": ["filename.pdf"] }]

relevantFiles must list only filenames from the AVAILABLE SOURCE FILES list that directly contain content for this topic. Include at minimum the files that have past paper questions or lab tasks this topic must prepare the student for.

The description must be specific about what the student will be able to DO after completing this topic, not just what it covers.`;

      await setStage(courseId, "building_curriculum");
      log(courseId, "calling OpenRouter for curriculum…");
      const curriculumResult = await askAI([{ role: "user", content: curriculumPrompt }]);
      costs.ai += curriculumResult.cost;
      const rawCurriculum = parseJSON<{ title: string; description: string; difficulty: number; relevantFiles?: string[] }[]>(curriculumResult.text);

      if (!Array.isArray(rawCurriculum) || rawCurriculum.length === 0) {
        throw new Error("AI returned an empty curriculum");
      }

      // The parsed JSON is unvalidated model output: drop entries without a
      // usable title up front so a malformed item cannot abort the inserts
      // half-way through.
      const curriculum = rawCurriculum.filter(
        (t) => t != null && typeof t.title === "string" && t.title.trim() !== "",
      );
      if (curriculum.length === 0) {
        throw new Error("AI returned an empty curriculum");
      }
      if (curriculum.length < rawCurriculum.length) {
        log(courseId, `dropped ${rawCurriculum.length - curriculum.length} malformed topic(s) from AI response`);
      }

      log(courseId, `curriculum received — ${curriculum.length} topics:`);
      curriculum.forEach((t, i) => log(courseId, `  ${i + 1}. ${t.title} (difficulty ${t.difficulty})`));

      await setStage(courseId, "finalising");

      // NOTE(review): rows are inserted one at a time; a failure part-way leaves
      // a partial topic list that the resume check above will treat as complete.
      // Consider a transaction or single multi-row insert once the driver is confirmed.
      for (let i = 0; i < curriculum.length; i++) {
        const t = curriculum[i];
        await db.insert(topics).values({
          id: randomUUID(),
          courseId,
          title: t.title,
          description: t.description,
          // Position in the returned array is authoritative; the model's own
          // "order" field is ignored.
          order: i,
          difficulty: normaliseDifficulty(t.difficulty),
          prerequisiteTopicIds: "[]",
          relevantFiles: JSON.stringify(Array.isArray(t.relevantFiles) ? t.relevantFiles : []),
          status: "pending",
        });
      }

      savedTopics = await db.query.topics.findMany({
        where: eq(topics.courseId, courseId),
        orderBy: (t, { asc }) => asc(t.order),
      });

      log(courseId, `saved ${savedTopics.length} topics to DB`);
    }

    // ── STEP 3 — mark course ready immediately after topics are saved ────────
    await db.update(courses)
      .set({ status: "ready", stage: "ready", costAI: costs.ai })
      .where(eq(courses.id, courseId));

    log(courseId, `✓ curriculum ready — ${savedTopics.length} topics | AI cost: $${costs.ai.toFixed(4)}`);

    // ── STEP 4 — post-generation audit (must not affect the ready state) ────
    // The course is already marked ready, so an audit failure is only logged;
    // letting it reach the outer catch would flip a usable course to "error".
    try {
      await auditCourse(courseId);
    } catch (auditErr: unknown) {
      console.error(`[revisione:${courseId.slice(0, 8)}] ✗ audit failed (course remains ready): ${describeError(auditErr)}`);
    }
  } catch (err: unknown) {
    console.error(`[revisione:${courseId.slice(0, 8)}] ✗ generation failed: ${describeError(err)}`);
    try {
      await db.update(courses).set({ status: "error", stage: "error" }).where(eq(courses.id, courseId));
    } catch (dbErr: unknown) {
      // If the failure was the database itself, this write can fail too; log it
      // rather than surface an unhandled rejection from a background task.
      console.error(`[revisione:${courseId.slice(0, 8)}] ✗ could not record error state: ${describeError(dbErr)}`);
    }
  }
}