Revisione/server/utils/generateTTS.ts

import { mkdir, writeFile, access } from "fs/promises";
import { dirname, resolve } from "path";

/**
 * A short run of spoken text together with the time window in which it is
 * heard in the generated audio.
 */
export interface AudioChunk {
  /** The words covered by this chunk, joined with single spaces. */
  text: string;
  /**
   * Time at which the chunk begins. The ElevenLabs path fills this from the
   * response's `character_start_times_seconds`, i.e. seconds.
   */
  start: number;
  /**
   * Time at which the chunk ends. The ElevenLabs path fills this from the
   * response's `character_end_times_seconds`, i.e. seconds.
   */
  end: number;
}

/**
 * Outcome of synthesizing one piece of text and writing it to disk.
 */
export interface TTSResult {
  /**
   * Path of the written MP3 in the form `/audio/lessons/<lessonId>/<filename>`
   * (presumably the URL under which the `public/` directory is served —
   * confirm against the consumer).
   */
  audioPath: string;
  /** Timed text chunks for the audio; empty when the provider is Fish Audio. */
  audioChunks: AudioChunk[];
  /**
   * Estimated cost of the request. The ElevenLabs path computes it as the
   * `character-cost` response header times 0.0003; the Fish Audio path
   * reports 0. Logged with a `$` prefix, so presumably US dollars.
   */
  cost: number;
}


/**
 * Synthesizes `text` through the ElevenLabs "with-timestamps" endpoint and
 * groups the returned character alignment into chunks of up to three words.
 *
 * @param text    Text to speak. Words are the whitespace-separated runs of it.
 * @param apiKey  Value sent in the `xi-api-key` header.
 * @param voiceId Voice identifier interpolated into the endpoint URL.
 * @returns The decoded MP3 bytes, the timed chunks and the estimated cost, or
 *          `null` (after logging) when the API answers with a non-2xx status.
 *          When the response carries no usable alignment the audio is still
 *          returned, with an empty `chunks` array.
 * @throws Whatever `fetch` / `res.json()` throw on network or parse failures.
 */
async function callElevenLabs(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  // NOTE(review): the ElevenLabs reference appears to list `output_format` as
  // a query parameter rather than a body field — confirm it is honoured here.
  const res = await fetch(
    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
    {
      method: "POST",
      headers: {
        "xi-api-key": apiKey,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        text,
        model_id: "eleven_turbo_v2_5",
        output_format: "mp3_44100_128",
      }),
    }
  );

  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    console.error(`[tts] ElevenLabs error ${res.status}: ${errText}`);
    return null;
  }

  // A missing header counts as 0; a malformed one must not turn the cost
  // (and every total derived from it) into NaN.
  const charCost = Number.parseInt(res.headers.get("character-cost") ?? "0", 10);
  const cost = Number.isFinite(charCost) ? charCost * 0.0003 : 0;

  const data = (await res.json()) as {
    audio_base64: string;
    alignment?: {
      characters?: string[];
      character_start_times_seconds?: number[];
      character_end_times_seconds?: number[];
    } | null;
  };

  const audio = Buffer.from(data.audio_base64, "base64");

  const characters = data.alignment?.characters ?? [];
  const starts = data.alignment?.character_start_times_seconds ?? [];
  const ends = data.alignment?.character_end_times_seconds ?? [];

  // Without alignment there is nothing to time the words against. Keep the
  // audio (it has already been generated) and report no chunks instead of
  // throwing and discarding it.
  if (characters.length === 0) {
    return { audio, chunks: [] as AudioChunk[], cost };
  }

  // Alignment entries may span more than one UTF-16 code unit, while word
  // positions below are code-unit offsets into `text`. Build the
  // offset -> alignment-entry table once instead of rescanning per lookup.
  const entryAtOffset: number[] = [];
  characters.forEach((entry, entryIdx) => {
    for (let unit = 0; unit < entry.length; unit++) entryAtOffset.push(entryIdx);
  });

  // Offsets past the aligned text fall back to the last known time (or 0).
  const timeAt = (offset: number, times: number[]): number => {
    const entryIdx = entryAtOffset[offset];
    const time = entryIdx === undefined ? times[times.length - 1] : times[entryIdx];
    return time ?? 0;
  };

  // Every maximal run of non-whitespace is a word; remember where it sits.
  const words = Array.from(text.matchAll(/\S+/g), (match) => {
    const first = match.index ?? 0;
    return { word: match[0], first, last: first + match[0].length - 1 };
  });

  const WORDS_PER_CHUNK = 3;
  const chunks: AudioChunk[] = [];
  for (let wi = 0; wi < words.length; wi += WORDS_PER_CHUNK) {
    const group = words.slice(wi, wi + WORDS_PER_CHUNK);
    chunks.push({
      text: group.map((w) => w.word).join(" "),
      start: timeAt(group[0].first, starts),
      end: timeAt(group[group.length - 1].last, ends),
    });
  }

  return { audio, chunks, cost };
}


/**
 * Requests a 128 kbps MP3 rendering of `text` from the Fish Audio TTS API.
 *
 * Fish Audio doesn't return alignment data, so the result always carries an
 * empty `chunks` array; `cost` is reported as 0.
 *
 * @param text    Text to speak.
 * @param apiKey  Bearer token sent in the `Authorization` header.
 * @param voiceId Sent as the request's `reference_id`.
 * @returns The audio bytes with empty chunks and zero cost, or `null` (after
 *          logging the status and body) when the API answers non-2xx.
 */
async function callFishAudio(
  text: string,
  apiKey: string,
  voiceId: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const payload = {
    text,
    reference_id: voiceId,
    format: "mp3",
    mp3_bitrate: 128,
    streaming: false,
  };

  const response = await fetch("https://api.fish.audio/v1/tts", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    const bytes = await response.arrayBuffer();
    return { audio: Buffer.from(bytes), chunks: [], cost: 0 };
  }

  const detail = await response.text().catch(() => "");
  console.error(`[tts] Fish Audio error ${response.status}: ${detail}`);
  return null;
}


/**
 * Reads the configured TTS provider name from the runtime config.
 *
 * The value is trimmed and lower-cased so that e.g. `" FishAudio "` from an
 * environment variable still selects Fish Audio. Anything that is not a
 * non-empty string (unset, `""`, whitespace only, a non-string value) falls
 * back to `"elevenlabs"`.
 *
 * @returns The normalized provider name.
 */
function getProvider(): string {
  const raw: unknown = useRuntimeConfig().ttsProvider;
  const normalized = typeof raw === "string" ? raw.trim().toLowerCase() : "";
  // `||` on purpose: an empty string must also select the default.
  return normalized || "elevenlabs";
}

/**
 * Synthesizes `text` with whichever provider `getProvider()` selects, taking
 * the API key and voice id for that provider from the runtime config.
 *
 * @param text Text to speak.
 * @returns The provider's result, or `null` when the provider's API key is
 *          not configured (an error is logged so the failure is diagnosable)
 *          or when the provider call itself returns `null`.
 */
async function callTTS(
  text: string
): Promise<{ audio: Buffer; chunks: AudioChunk[]; cost: number } | null> {
  const config = useRuntimeConfig();
  const provider = getProvider();

  if (provider === "fishaudio") {
    const apiKey = config.fishAudioApiKey as string;
    const voiceId = config.fishAudioVoiceId as string;
    if (!apiKey) {
      // Previously a silent null: callers had no hint why no audio appeared.
      console.error("[tts] Fish Audio API key is not configured; skipping synthesis");
      return null;
    }
    return callFishAudio(text, apiKey, voiceId);
  }

  // Every other provider value is handled by ElevenLabs.
  const apiKey = config.elevenlabsApiKey as string;
  const voiceId = config.elevenlabsVoiceId as string;
  if (!apiKey) {
    console.error("[tts] ElevenLabs API key is not configured; skipping synthesis");
    return null;
  }
  return callElevenLabs(text, apiKey, voiceId);
}


/**
 * Synthesizes the narration for one lesson step and stores it as
 * `step_<stepIndex>.mp3` inside `public/audio/lessons/<lessonId>` under the
 * current working directory (the directory is created when missing).
 *
 * @param text       Text to speak.
 * @param lessonId   Lesson identifier; used as the directory name.
 * @param stepIndex  Step number; used in the file name.
 * @returns The audio path, timed chunks and cost, or `null` when `callTTS`
 *          yields nothing or anything throws (the error is logged).
 */
export async function generateStepTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const tts = await callTTS(text);
    if (!tts) return null;

    const fileName = `step_${stepIndex}.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);

    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, tts.audio);

    console.log(`[tts] step ${stepIndex} for lesson ${lessonId} — ${tts.chunks.length} chunks | $${tts.cost.toFixed(4)}`);
    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      audioChunks: tts.chunks,
      cost: tts.cost,
    };
  } catch (err: unknown) {
    // Same fallback as before: prefer a `message` property, else the value itself.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] step ${stepIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}

/**
 * Synthesizes the question belonging to a lesson step and stores it as
 * `step_<stepIndex>_question.mp3` inside `public/audio/lessons/<lessonId>`
 * under the current working directory (the directory is created when missing).
 *
 * @param text       Question text to speak.
 * @param lessonId   Lesson identifier; used as the directory name.
 * @param stepIndex  Step number; used in the file name.
 * @returns The audio path, timed chunks and cost, or `null` when `callTTS`
 *          yields nothing or anything throws (the error is logged).
 */
export async function generateQuestionTTS(
  text: string,
  lessonId: string,
  stepIndex: number
): Promise<TTSResult | null> {
  try {
    const tts = await callTTS(text);
    if (!tts) return null;

    const fileName = `step_${stepIndex}_question.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);

    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, tts.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      audioChunks: tts.chunks,
      cost: tts.cost,
    };
  } catch (err: unknown) {
    // Same fallback as before: prefer a `message` property, else the value itself.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] question ${stepIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}

/**
 * Synthesizes one answer option of a lesson step and stores it as
 * `step_<stepIndex>_option_<optionIndex>.mp3` inside
 * `public/audio/lessons/<lessonId>` under the current working directory
 * (the directory is created when missing).
 *
 * @param text         Option text to speak.
 * @param lessonId     Lesson identifier; used as the directory name.
 * @param stepIndex    Step number; used in the file name.
 * @param optionIndex  Option number; used in the file name.
 * @returns The audio path and cost (no chunks), or `null` when `callTTS`
 *          yields nothing or anything throws (the error is logged).
 */
export async function generateOptionTTS(
  text: string,
  lessonId: string,
  stepIndex: number,
  optionIndex: number
): Promise<{ audioPath: string; cost: number } | null> {
  try {
    const tts = await callTTS(text);
    if (!tts) return null;

    const fileName = `step_${stepIndex}_option_${optionIndex}.mp3`;
    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);

    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${fileName}`, tts.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${fileName}`,
      cost: tts.cost,
    };
  } catch (err: unknown) {
    // Same fallback as before: prefer a `message` property, else the value itself.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] option ${stepIndex}/${optionIndex} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}

/**
 * Generates a plain audio clip with no timestamp data (used for label clips)
 * and writes it to `outPath`, creating `outPath`'s parent directory first.
 *
 * The provider is chosen by `getProvider()`:
 * - Fish Audio: the API key comes from the runtime config and the voice from
 *   the runtime config with `voiceId` as fallback; `apiKey` and `opts` are
 *   ignored.
 * - ElevenLabs (any other provider value): uses `apiKey`, `voiceId` and the
 *   optional `opts`.
 *
 * @param text     Text to speak.
 * @param outPath  Destination file; relative paths resolve against the
 *                 current working directory.
 * @param apiKey   ElevenLabs API key.
 * @param voiceId  ElevenLabs voice id, also the Fish Audio fallback voice.
 * @param opts     ElevenLabs-only overrides: `model` replaces the default
 *                 `eleven_turbo_v2_5`; `voice_settings` is forwarded as is.
 * @returns `{ cost }` on success (always 0 for Fish Audio), or `null` when
 *          the required API key is missing, the provider answers non-2xx, or
 *          anything throws. Every failure is logged.
 */
export async function generateClip(
  text: string,
  outPath: string,
  apiKey: string,
  voiceId: string,
  opts?: { model?: string; voice_settings?: Record<string, number> }
): Promise<{ cost: number } | null> {
  const provider = getProvider();

  try {
    let buffer: Buffer;
    let cost = 0;

    if (provider === "fishaudio") {
      const config = useRuntimeConfig();
      const fishKey = config.fishAudioApiKey as string;
      const fishVoice = (config.fishAudioVoiceId as string) || voiceId;

      // Without a key the request would go out as "Bearer undefined" and be
      // rejected anyway; fail fast with a clearer message.
      if (!fishKey) {
        console.error(`[tts] Fish Audio API key is not configured; skipping clip ${outPath}`);
        return null;
      }

      const res = await fetch("https://api.fish.audio/v1/tts", {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${fishKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          text,
          reference_id: fishVoice,
          format: "mp3",
          mp3_bitrate: 128,
          streaming: false,
        }),
      });

      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] Fish Audio clip error ${res.status}: ${errText}`);
        return null;
      }

      buffer = Buffer.from(await res.arrayBuffer());
    } else {
      if (!apiKey) {
        console.error(`[tts] ElevenLabs API key is missing; skipping clip ${outPath}`);
        return null;
      }

      const res = await fetch(
        `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
        {
          method: "POST",
          headers: {
            "xi-api-key": apiKey,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            text,
            model_id: opts?.model ?? "eleven_turbo_v2_5",
            output_format: "mp3_44100_128",
            ...(opts?.voice_settings ? { voice_settings: opts.voice_settings } : {}),
          }),
        }
      );

      if (!res.ok) {
        const errText = await res.text().catch(() => "");
        console.error(`[tts] clip error ${res.status}: ${errText}`);
        return null;
      }

      // A malformed header must not turn the cost into NaN.
      const charCost = Number.parseInt(res.headers.get("character-cost") ?? "0", 10);
      cost = Number.isFinite(charCost) ? charCost * 0.0003 : 0;
      buffer = Buffer.from(await res.arrayBuffer());
    }

    // Create the directory the clip is actually written to. Previously only
    // public/audio/labels was created, so any other `outPath` failed with
    // ENOENT after the audio had already been generated.
    await mkdir(dirname(resolve(outPath)), { recursive: true });
    await writeFile(outPath, buffer);
    return { cost };
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[tts] clip failed for ${outPath}: ${reason}`);
    return null;
  }
}

/**
 * Synthesizes `text` and stores it under a caller-chosen file name inside
 * `public/audio/lessons/<lessonId>` under the current working directory
 * (the directory is created when missing).
 *
 * @param text      Text to speak.
 * @param lessonId  Lesson identifier; used as the directory name.
 * @param filename  Name of the file to write inside the lesson directory.
 * @returns The audio path, timed chunks and cost, or `null` when `callTTS`
 *          yields nothing or anything throws (the error is logged).
 */
export async function generateTTSToPath(
  text: string,
  lessonId: string,
  filename: string
): Promise<TTSResult | null> {
  try {
    const tts = await callTTS(text);
    if (!tts) return null;

    const lessonDir = resolve(process.cwd(), `public/audio/lessons/${lessonId}`);
    await mkdir(lessonDir, { recursive: true });
    await writeFile(`${lessonDir}/${filename}`, tts.audio);

    return {
      audioPath: `/audio/lessons/${lessonId}/${filename}`,
      audioChunks: tts.chunks,
      cost: tts.cost,
    };
  } catch (err: unknown) {
    // Same fallback as before: prefer a `message` property, else the value itself.
    const reason = (err as { message?: unknown } | null | undefined)?.message ?? err;
    console.error(`[tts] ${filename} for lesson ${lessonId} failed: ${reason}`);
    return null;
  }
}

/**
 * Reports whether `path` can be accessed.
 *
 * @param path Path handed to `access`.
 * @returns `true` when `access(path)` succeeds, `false` when it rejects for
 *          any reason.
 */
export async function fileExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false
  );
}