// Auger/lib/utils/signal_generator.dart

import "dart:convert";

import "package:capstone_project/models/event_signal.dart";
import "package:capstone_project/utils/agrigator.dart";
import "package:capstone_project/utils/event_clusterer.dart";
import "package:capstone_project/utils/openrouter.dart";


// per-article content cap in the prompt, measured with String.length (UTF-16
// code units). bodies longer than this are cut and suffixed with "...".
// higher = more context, more tokens.
// tuned so an 8-article cluster stays well under 20k input tokens.
const int _kContentCharCap = 1500;


// turns clusters of related news articles into EventSignals by sending each
// cluster to an LLM through OpenRouter and parsing the JSON it returns.
class SignalGenerator {
  // api key handed to every OpenRouter client this generator creates.
  final String apiKey;

  SignalGenerator({required this.apiKey});

  // scores every non-empty cluster concurrently and returns the signals
  // ordered by impact, highest first. clusters with no articles are skipped.
  // a failing cluster does not fail the batch: generateSignal converts any
  // error into a neutral fallback signal.
  Future<List<EventSignal>> generateSignals(
    List<EventCluster> clusters, {
    required String ticker,
    required String companyName,
  }) async {
    final nonEmpty = clusters.where((c) => c.articles.isNotEmpty).toList();

    final results = await Future.wait(
      nonEmpty.map(
        (c) => generateSignal(c, ticker: ticker, companyName: companyName),
      ),
    );

    // sort by impact descending — impact is "how much should i care about this"
    // probability is the credibility gate, not the headline number
    results.sort((a, b) => b.impact.compareTo(a.impact));
    return results;
  }

  // scores a single cluster for the given asset.
  //
  // never throws: any failure (request error, empty choices, missing or
  // malformed JSON) is printed and replaced by a neutral signal with
  // probability 0.0 and impact 0.0. the OpenRouter client is created per call
  // and always disposed in the finally block.
  Future<EventSignal> generateSignal(
    EventCluster cluster, {
    required String ticker,
    required String companyName,
  }) async {
    final openRouter = OpenRouter(apiKey: apiKey);
    final articles = cluster.articles;
    final eventId = _eventIdFor(articles);

    try {
      final response = await openRouter.chat.completions.create(
        model: "openai/gpt-4.1-mini",
        messages: [
          ChatMessage.system(_systemPrompt).toJson(),
          ChatMessage.user(_buildPrompt(cluster, ticker: ticker, companyName: companyName)).toJson(),
        ],
        temperature: 0.2,
      ) as ChatCompletion;

      // the model is told to return bare JSON, but _extractJson still strips
      // anything outside the outermost braces before decoding.
      final rawContent = response.choices.first.message.content.trim();
      final parsed = jsonDecode(_extractJson(rawContent)) as Map<String, dynamic>;
      final signal = EventSignal.fromJson(parsed, articles, eventIdOverride: eventId);

      // override createdAt with median pub date from the article cluster
      return EventSignal(
        eventId: signal.eventId,
        eventSummary: signal.eventSummary,
        direction: signal.direction,
        nature: signal.nature,
        probability: signal.probability,
        impact: signal.impact,
        rationale: signal.rationale,
        articles: signal.articles,
        createdAt: medianPubDate(articles),
      );
    } catch (e) {
      // deliberate best-effort: one bad cluster must not sink the whole batch.
      print("Error generating signal: $e");
      return EventSignal(
        eventId: eventId,
        eventSummary: _fallbackSummary(articles),
        direction: "neutral",
        probability: 0.0,
        impact: 0.0,
        rationale: "Signal generation failed.",
        articles: articles,
        createdAt: medianPubDate(articles),
      );
    } finally {
      openRouter.dispose();
    }
  }

  // system message sent with every request. defines the six output keys and
  // how the model should reason about each of them.
  static const String _systemPrompt = """
You analyze clusters of business and finance news articles that plausibly concern a single asset (a stock, commodity, etc). Each cluster has been assembled via semantic similarity search — the articles are believed to be reporting on the same underlying event, not merely from the same time window. For each cluster you estimate how likely the event is real, how much the asset's price is likely to move in the short term, and the direction of that move.

Return valid JSON only, with exactly these keys:
  event_summary, direction, nature, probability, impact, rationale

Field definitions:

- probability (number, 0.0–1.0): likelihood that the underlying event is real / actually happening as reported. Grounded primarily in coverage — number of articles, number of distinct publishers, reputability of those publishers, and how tightly the cluster hangs together semantically (lower avg distance = stronger corroboration). This is NOT the probability that the price moves, and it is NOT conditional on direction. Anchors:
  * 0.1 — single article, unknown blog, speculative framing, no corroboration
  * 0.5 — several articles from mid-tier outlets, or mixed/conflicting accounts across publishers
  * 0.9 — wide coverage: many articles across multiple major wire services or papers of record (Reuters, Bloomberg, AP, WSJ, FT, NYT, etc.) converging on the same core facts

- impact (number, 0.0–1.0): expected magnitude of the asset's immediate price reaction over roughly the next few trading days, ASSUMING the event is real. Reasoned from what the event actually is, applied to this specific asset. Coverage volume is only a weak prior here — loud news is not the same as impactful news, and long-term significance is not the same as short-term reaction. Anchors (illustrative, oil-related asset):
  * 0.1 — an OPEC minister makes a vague forward-looking comment about prices
  * 0.5 — a refinery outage in a secondary producing region; a mid-sized earnings beat
  * 0.9 — Strait of Hormuz closure; a surprise OPEC+ production cut of material size; a major sanctions announcement hitting the asset's supply or demand

- direction (string, enum): "positive" | "negative" | "neutral" — expected directional bias of the immediate price reaction for this specific asset. Kept separate from impact so a large negative and a large positive are both high-impact.

- nature (string, enum): "forecasting" if the cluster is predicting or anticipating a future event, "reactive" if it is reporting on something that already happened.

- event_summary (string): one neutral sentence describing the event itself. No hedging, no direction words.

- rationale (string): a short paragraph (2–4 sentences) covering two things, in this order:
    1. What's actually happening in the cluster — more substantive than event_summary. Pull out the concrete facts: who did what, specific numbers, quoted figures, timelines, named actors. This is the reader learning what the news IS.
    2. The causal chain from event → price reaction for THIS asset. Why does this move the asset in the chosen direction, and why by the chosen magnitude? Reference the mechanism (supply, demand, competition, margins, guidance, sentiment, regulatory exposure, etc).
  DO NOT restate probability, the nature label, publisher count, source reputability, corroboration strength, or semantic tightness. All of that is shown in the UI alongside the rationale — repeating it wastes the only place the reader learns anything new. Focus on event substance and causal reasoning, not meta-commentary about the input data.

Important:
  * probability is about the event being real, not about price movement.
  * impact is about magnitude of short-term reaction, not long-term significance.
  * direction is separate from impact.
  * Return only the JSON object, no prose, no code fences.
""";

  // builds the user message for one cluster: the asset line, precomputed
  // coverage stats (article count, distinct publishers, distance stats), then
  // one numbered entry per article with title, publisher, description,
  // clipped content and link.
  String _buildPrompt(
    EventCluster cluster, {
    required String ticker,
    required String companyName,
  }) {
    final articles = cluster.articles;

    // distinct publisher names, whitespace-trimmed; blank sources are dropped.
    final publishers = articles
        .map((a) => (a.source ?? "").trim())
        .where((s) => s.isNotEmpty)
        .toSet()
        .toList();

    final buffer = StringBuffer();
    buffer.writeln("Asset: $companyName ($ticker)");
    buffer.writeln();

    buffer.writeln("Coverage stats (computed, do not recount):");
    buffer.writeln("  Articles: ${articles.length}");
    buffer.writeln("  Distinct publishers: ${publishers.length}");
    if (publishers.isNotEmpty) {
      buffer.writeln("  Publishers: ${publishers.join(", ")}");
    } else {
      buffer.writeln("  Publishers: (none identified)");
    }

    // a null min is treated as "no distance data", reported as a singleton.
    final stats = cluster.distanceStats();
    if (stats.min != null) {
      buffer.writeln(
        "  Semantic tightness (distance from seed, 0=identical): "
        "min ${stats.min!.toStringAsFixed(3)}, "
        "avg ${stats.avg!.toStringAsFixed(3)}, "
        "max ${stats.max!.toStringAsFixed(3)}",
      );
    } else {
      buffer.writeln("  Semantic tightness: singleton cluster (no neighbours)");
    }

    buffer.writeln();

    buffer.writeln("Articles:");
    for (int i = 0; i < articles.length; i++) {
      final article = articles[i];
      buffer.writeln("${i + 1}. Title: ${article.title}");

      // print the trimmed name so each entry matches the "Publishers:" list
      // above, which is built from trimmed sources.
      final publisher = (article.source ?? "").trim();
      if (publisher.isNotEmpty) {
        buffer.writeln("   Publisher: $publisher");
      }

      final desc = article.description.trim();
      if (desc.isNotEmpty) {
        buffer.writeln("   Description: $desc");
      }

      final body = _clipContent(article.content);
      if (body.isNotEmpty) {
        buffer.writeln("   Content: $body");
      }

      buffer.writeln("   Link: ${article.link}");
    }

    buffer.writeln();
    buffer.writeln("Return a single JSON object with keys: event_summary, direction, nature, probability, impact, rationale.");

    return buffer.toString();
  }


  // trim + truncate article body to the char cap. returns empty string if
  // theres nothing useful to include. truncated bodies end with "...".
  String _clipContent(String content) {
    final trimmed = content.trim();
    if (trimmed.isEmpty) return "";

    if (trimmed.length <= _kContentCharCap) {
      return trimmed;
    }

    // String indices are UTF-16 code units, so a fixed cut can land between
    // the two halves of a surrogate pair (emoji, some CJK). back off by one
    // in that case so no lone high surrogate ends up in the prompt.
    var end = _kContentCharCap;
    if (end > 0) {
      final last = trimmed.codeUnitAt(end - 1);
      if (last >= 0xD800 && last <= 0xDBFF) {
        end--;
      }
    }

    return "${trimmed.substring(0, end)}...";
  }

  // returns the text from the first "{" to the last "}" in the model reply,
  // which drops any prose or code fences around the JSON object.
  // throws FormatException when no such span exists.
  String _extractJson(String content) {
    final start = content.indexOf("{");
    final end = content.lastIndexOf("}");

    if (start == -1 || end == -1 || end < start) {
      throw const FormatException("No JSON object found in model response.");
    }

    return content.substring(start, end + 1);
  }

  // summary used when generation fails: the first article's title, or
  // "Unknown event" for an empty cluster.
  String _fallbackSummary(List<FeedItem> articles) {
    if (articles.isEmpty) {
      return "Unknown event";
    }

    return articles.first.title;
  }

  // deterministic id from the sorted link set — same cluster re-run produces
  // the same id, which is handy for dedupe later.
  //
  // uses an explicit 31-bit polynomial hash rather than String.hashCode,
  // because hashCode is not guaranteed to be stable across program runs or
  // platforms. the intermediate value stays below 2^36, so the arithmetic is
  // exact on web (double-backed ints) as well as on the VM.
  String _eventIdFor(List<FeedItem> articles) {
    final links = articles.map((a) => a.link).toList()..sort();
    final joined = links.join("|");

    var h = 0;
    for (final unit in joined.codeUnits) {
      h = (h * 31 + unit) & 0x7FFFFFFF;
    }

    return "evt_${h.toRadixString(36)}";
  }
}