// Roadbound-BRR/lib/parsers/arriva_schedule_parser.dart

import "dart:typed_data";
import "package:docx_to_text/docx_to_text.dart";
import "../models/trip.dart";
import "../exceptions/schedule_parse_exception.dart";
import "schedule_parser.dart";

class ArrivaScheduleParser implements ScheduleParser {
  // OUTBOUND FORMAT: HHMM_HHMM times... EC### duty running trip
  // Times appear on the LEFT, trip info on the RIGHT.
  // Groups: 1-2 leading time pair, 3 remaining time text, 4 EC code, 5 duty,
  // 6 optional N/R/F prefix, 7 running number, 8 trip number.
  static final _outboundPattern = RegExp(
    r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+([NRF]?)(\d+)\s+(\d+)",
  );

  // Outbound line whose running-number column is a bare "F".
  // Groups: 1-2 leading time pair, 3 remaining time text, 4 EC code, 5 duty.
  static final _outboundFinishingPattern = RegExp(
    r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+F",
  );

  // INBOUND FORMAT: trip duty running EC### ... HHMM_HHMM times...
  // Trip info appears on the LEFT, times on the RIGHT.
  // Note: the running number may carry an N/R/F prefix that is either attached
  // ("N503") or detached ("N 503"). The `\s*` between groups 3 and 4 accepts
  // both spellings, matching what the inbound line detector already allows.
  // Groups: 1 trip, 2 duty, 3 optional N/R/F prefix, 4 running number,
  // 5 EC code, 6-7 first time pair, 8 remaining digits/whitespace.
  static final _inboundPattern = RegExp(
    r"^(\d+)\s+(\d+)\s+([NRF]?)\s*(\d+)\s+(EC\d+)\s+.*?(\d{4})_(\d{4})\s+([\d\s]+)",
  );

  // Inbound line whose running-number column is a bare "F".
  // Groups: 1 trip, 2 duty, 3 EC code, 4-5 first time pair, 6 rest of line.
  static final _inboundFinishingPattern = RegExp(
    r"^(\d+)\s+(\d+)\s+F\s+(EC\d+)\s+.*?(\d{4})_(\d{4})(.*)",
  );

  /// Reports whether [content] looks like an Arriva schedule.
  ///
  /// All three markers must be present: the substring "EC5", the phrase
  /// "FINISH AT POINT", and at least one `HHMM_HHMM` digit pair.
  @override
  bool canParse(String content) {
    if (!content.contains("EC5")) return false;
    if (!content.contains("FINISH AT POINT")) return false;

    final timePair = RegExp(r"\d{4}_\d{4}");
    return timePair.hasMatch(content);
  }

  /// Parses the bytes of a .docx schedule into a list of trips ordered by
  /// their `scheduledTime`.
  ///
  /// The document text is extracted, split on newlines, grouped into
  /// sections, and every section's trip lines are parsed. Progress and the
  /// full extracted text are written with `print` for debugging.
  ///
  /// Throws a [ScheduleParseException] when the text cannot be extracted or
  /// when no section containing trip lines is found.
  @override
  Future<List<Trip>> parseBytes(Uint8List bytes) async {
    final extracted = _extractTextFromDocx(bytes);

    // DEBUG: dump the raw document text.
    print("=== EXTRACTED TEXT START ===");
    print(extracted);
    print("=== EXTRACTED TEXT END ===");

    // Group the text into station-header sections with their trip lines.
    final sections = _parseDocumentSections(extracted.split("\n"));

    print("=== FOUND ${sections.length} SECTIONS ===");
    for (final s in sections) {
      print("Section: ${s.direction}, ${s.stations.length} stations, ${s.tripLines.length} trips");
    }

    if (sections.isEmpty) {
      throw ScheduleParseException("No trip data found in schedule");
    }

    // Collect the trips of every section into one list.
    final allTrips = <Trip>[];
    for (final s in sections) {
      final parsed = _parseSectionTrips(s);
      allTrips.addAll(parsed);
      print("✓ Parsed ${parsed.length} trips from ${s.direction} section");
    }

    // Order the combined list by scheduled time.
    allTrips.sort(
      (left, right) => left.scheduledTime.compareTo(right.scheduledTime),
    );
    return allTrips;
  }

  /// Converts the .docx [bytes] to plain text with [docxToText].
  ///
  /// Anything thrown during conversion is replaced by a
  /// [ScheduleParseException] whose message embeds the original error.
  String _extractTextFromDocx(Uint8List bytes) {
    try {
      final text = docxToText(bytes);
      return text;
    } catch (failure) {
      throw ScheduleParseException("Failed to read document: $failure");
    }
  }


  /// Inserts a colon into a four-character time, e.g. "0512" -> "05:12".
  ///
  /// Only the length of [rawTime] is checked; the characters themselves are
  /// not validated. Throws a [FormatException] when the length is not 4.
  String _formatTime(String rawTime) {
    if (rawTime.length == 4) {
      final hours = rawTime.substring(0, 2);
      final minutes = rawTime.substring(2);
      return "$hours:$minutes";
    }
    throw FormatException("Invalid time format: $rawTime");
  }

  /// Groups [lines] into sections, each made of one station-header line and
  /// the trip lines that follow it.
  ///
  /// Every line is trimmed first. A line recognised by
  /// [_extractStationHeader] opens a new section; lines recognised by
  /// [_isTripLine] are appended to the currently open section. Trip lines
  /// seen before the first header are ignored, and sections that never
  /// receive a trip line are not returned. A section's direction starts as
  /// "unknown" and is set from its first trip line.
  List<_DocumentSection> _parseDocumentSections(List<String> lines) {
    final collected = <_DocumentSection>[];
    _DocumentSection? open;

    for (var lineNo = 0; lineNo < lines.length; lineNo++) {
      final text = lines[lineNo].trim();
      final header = _extractStationHeader(text);

      if (header != null && header.isNotEmpty) {
        // A new header closes the section that was open; keep it only if it
        // gathered at least one trip line.
        final previous = open;
        if (previous != null && previous.tripLines.isNotEmpty) {
          collected.add(previous);
        }

        // The direction is filled in once the first trip line shows up.
        open = _DocumentSection(
          stations: header,
          direction: "unknown",
          tripLines: [],
        );

        print("Found station header at line $lineNo with ${header.length} stations");
        print("  Stations: ${header.take(3).join(", ")} ... ${header.skip(header.length - 2).join(", ")}");
      } else {
        final target = open;
        if (target == null || !_isTripLine(text)) continue;

        // Only the first trip line of a section decides its direction.
        if (target.direction == "unknown") {
          target.direction = _inferDirectionFromTripLine(text);
          print("  Direction inferred: ${target.direction}");
        }

        target.tripLines.add(text);
      }
    }

    // The last section has no following header to close it.
    final last = open;
    if (last != null && last.tripLines.isNotEmpty) {
      collected.add(last);
    }

    return collected;
  }

  /// Returns the station codes found in [line] when it looks like a station
  /// header, or `null` otherwise.
  ///
  /// A header line is at least 10 characters long and contains no digit, no
  /// underscore and no "EC" substring. It must hold at least 8
  /// whitespace-separated tokens of 3 to 8 uppercase letters, of which at
  /// least 5 remain after known metadata words are removed. The remaining
  /// tokens are returned in their original order.
  List<String>? _extractStationHeader(String line) {
    // Cheap rejections first: too short, or characters that only occur on
    // trip/metadata lines.
    if (line.length < 10) return null;
    if (line.contains("_") || line.contains("EC")) return null;
    if (RegExp(r"\d").hasMatch(line)) return null;

    // A candidate code is 3-8 uppercase ASCII letters and nothing else.
    final codeShape = RegExp(r"^[A-Z]{3,8}$");
    final candidates = <String>[
      for (final token in line.split(RegExp(r"\s+")))
        if (codeShape.hasMatch(token)) token,
    ];

    if (candidates.length < 8) return null;

    // Words that have the shape of a code but are column labels or legend
    // text rather than stations.
    const nonStationWords = {
      "TRP", "DUTY", "BUS", "START", "END", "GAR", "DEP", "ARR",
      "DENOTES", "FINISHES", "RELIEF", "TRIP", "NEXT", "NO", "AT",
      "SPELL", "HOURS", "TOTAL", "LAYOVER", "MILES", "LIVE", "DEAD",
      "MILEAGE", "TIME", "SIGN", "FORM", "NXT", "THIS", "HAS", "OR",
      "FOR", "CHANGE", "SERVICE", "POINT", "LSN", "MAN", "RUI", "SN",
      "ROUTE", "RUNNING", "PREV", "FIN", "ENTOD", "SOALL", "USHRS",
      "ADDTL", "CASH", "TODAYS", "REL", "IEF",
    };

    final stations = <String>[];
    for (final candidate in candidates) {
      if (!nonStationWords.contains(candidate)) {
        stations.add(candidate);
      }
    }

    // The density of code-shaped tokens is what marks a header; fewer than
    // 5 survivors means this was some other all-caps line.
    if (stations.length < 5) return null;
    return stations;
  }

  /// Infers a section's direction label from the layout of one trip [line].
  ///
  /// Follows the same convention as the trip patterns and the per-line
  /// dispatch in this parser: lines whose times come first (`HHMM_HHMM ...`,
  /// or a run of underscores followed by four digits) are "outbound", and
  /// lines that open with the trip and duty numbers are "inbound". Any other
  /// line falls back to "outbound".
  String _inferDirectionFromTripLine(String line) {
    // Times on the left: outbound layout.
    if (RegExp(r"^\d{4}_\d{4}").hasMatch(line)) return "outbound";
    // Leading underscore run followed by a time: still times-first.
    if (RegExp(r"^_+\d{4}").hasMatch(line)) return "outbound";
    // Trip and duty numbers on the left: inbound layout.
    if (RegExp(r"^\d+\s+\d+").hasMatch(line)) return "inbound";
    return "outbound";
  }

  /// Whether [line] carries trip data: it must contain the substring "EC"
  /// and at least one `HHMM_HHMM` digit pair.
  bool _isTripLine(String line) {
    if (!line.contains("EC")) return false;
    return line.contains(RegExp(r"\d{4}_\d{4}"));
  }

  /// Parses every trip line of [section] and returns the trips that could be
  /// built, in line order.
  ///
  /// Each line is classified by its own layout, not by the section's
  /// direction label: a line starting with `HHMM_HHMM` goes to
  /// [_parseOutboundTrip]; a line starting with trip, duty and running
  /// columns followed by "EC" goes to [_parseInboundTrip]. Lines that match
  /// neither layout, or that the chosen parser rejects, are skipped and
  /// reported with `print`.
  List<Trip> _parseSectionTrips(_DocumentSection section) {
    // Outbound layout: the line opens with a time pair.
    final timeFirst = RegExp(r"^\d{4}_\d{4}");
    // Inbound layout: trip, duty, then a running column written as "N503",
    // "N 503", a bare "F", or plain digits, followed by the EC code.
    final numbersFirst = RegExp(r"^\d+\s+\d+\s+(?:[NRF]\d+\s+|[NRF]\s+\d+\s+|F\s+|\d+\s+)EC");

    final parsed = <Trip>[];

    for (final line in section.tripLines) {
      final looksOutbound = timeFirst.hasMatch(line);
      final looksInbound = numbersFirst.hasMatch(line);

      final Trip? trip;
      if (looksOutbound) {
        trip = _parseOutboundTrip(line, section.stations);
      } else if (looksInbound) {
        trip = _parseInboundTrip(line, section.stations);
      } else {
        trip = null;
      }

      if (trip == null) {
        final String format;
        if (looksOutbound) {
          format = "outbound";
        } else if (looksInbound) {
          format = "inbound";
        } else {
          format = "unknown";
        }
        // Log at most the first 80 characters of the offending line.
        final preview = line.length > 80 ? line.substring(0, 80) : line;
        print("Failed to parse $format line: ${preview}...");
        continue;
      }

      parsed.add(trip);
    }

    return parsed;
  }

  /// Builds a [Trip] from a line in the inbound layout
  /// (`trip duty running EC### ... HHMM_HHMM times...`), or returns `null`
  /// when neither inbound pattern matches.
  ///
  /// The regular pattern is tried first, then the finishing pattern (bare
  /// "F" in the running column). In both cases the time list is the first
  /// time pair followed by every further four-digit time, mapped
  /// positionally onto [stations]; `scheduledTime` is the first time of the
  /// pair. The trip's direction comes from the trip number's parity: odd is
  /// "inbound", even or unparsable is "outbound".
  Trip? _parseInboundTrip(
    String line,
    List<String> stations,
  ) {
    final regular = _inboundPattern.firstMatch(line);
    if (regular != null) {
      final tripNo = regular.group(1)!;
      final startTime = regular.group(6)!;

      // First pair, then whatever other times trail it.
      final allTimes = <String>[
        startTime,
        regular.group(7)!,
        ..._extractTimesFromString(regular.group(8) ?? ""),
      ];
      final perStation = _mapStationsToTimes(stations, allTimes);
      final oddTrip = (int.tryParse(tripNo) ?? 0).isOdd;

      // NOTE(review): an "F" prefix captured here still yields
      // isFinishing: false, while the finishing pattern below sets it to
      // true - confirm this difference is intended.
      return Trip(
        tripNumber: tripNo,
        dutyNumber: regular.group(2)!,
        runningNumber: regular.group(4)!,
        scheduledTime: _formatTime(startTime),
        tripType: regular.group(3) ?? "",
        isFinishing: false,
        stationTimes: perStation,
        stationOrder: stations,
        direction: oddTrip ? "inbound" : "outbound",
      );
    }

    final finishing = _inboundFinishingPattern.firstMatch(line);
    if (finishing == null) return null;

    final tripNo = finishing.group(1)!;
    final dutyNo = finishing.group(2)!;
    final startTime = finishing.group(4)!;

    final allTimes = <String>[
      startTime,
      finishing.group(5)!,
      ..._extractTimesFromString(finishing.group(6) ?? ""),
    ];
    final perStation = _mapStationsToTimes(stations, allTimes);
    final oddTrip = (int.tryParse(tripNo) ?? 0).isOdd;

    // A finishing line has no running number of its own, so the duty number
    // is reused for it.
    return Trip(
      tripNumber: tripNo,
      dutyNumber: dutyNo,
      runningNumber: dutyNo,
      scheduledTime: _formatTime(startTime),
      tripType: "F",
      isFinishing: true,
      stationTimes: perStation,
      stationOrder: stations,
      direction: oddTrip ? "inbound" : "outbound",
    );
  }

  /// Builds a [Trip] from a line in the outbound layout
  /// (`HHMM_HHMM times... EC### duty running trip`), or returns `null` when
  /// neither outbound pattern matches.
  ///
  /// The regular pattern is tried first, then the finishing pattern (an "F"
  /// right after the duty number). The time list is the leading time pair
  /// followed by every further four-digit time found before the EC code,
  /// mapped positionally onto [stations]; `scheduledTime` is the first time
  /// of the pair. Direction comes from number parity: odd is "inbound", even
  /// or unparsable is "outbound".
  Trip? _parseOutboundTrip(
    String line,
    List<String> stations,
  ) {
    final regular = _outboundPattern.firstMatch(line);
    if (regular != null) {
      final startTime = regular.group(1)!;
      final tripNo = regular.group(8)!;

      // Leading pair, then the remaining times in between.
      final allTimes = <String>[
        startTime,
        regular.group(2)!,
        ..._extractTimesFromString(regular.group(3) ?? ""),
      ];
      final perStation = _mapStationsToTimes(stations, allTimes);
      final oddTrip = (int.tryParse(tripNo) ?? 0).isOdd;

      return Trip(
        tripNumber: tripNo,
        dutyNumber: regular.group(5)!,
        runningNumber: regular.group(7)!,
        scheduledTime: _formatTime(startTime),
        tripType: regular.group(6) ?? "",
        isFinishing: false,
        stationTimes: perStation,
        stationOrder: stations,
        direction: oddTrip ? "inbound" : "outbound",
      );
    }

    final finishing = _outboundFinishingPattern.firstMatch(line);
    if (finishing == null) return null;

    final startTime = finishing.group(1)!;
    final dutyNo = finishing.group(5)!;

    final allTimes = <String>[
      startTime,
      finishing.group(2)!,
      ..._extractTimesFromString(finishing.group(3) ?? ""),
    ];
    final perStation = _mapStationsToTimes(stations, allTimes);

    // The finishing pattern captures no trip or running number, so the duty
    // number stands in for both - and therefore also decides the parity.
    final oddDuty = (int.tryParse(dutyNo) ?? 0).isOdd;

    return Trip(
      tripNumber: dutyNo,
      dutyNumber: dutyNo,
      runningNumber: dutyNo,
      scheduledTime: _formatTime(startTime),
      tripType: "F",
      isFinishing: true,
      stationTimes: perStation,
      stationOrder: stations,
      direction: oddDuty ? "inbound" : "outbound",
    );
  }

  /// Returns every standalone four-digit run in [timesString], in order of
  /// appearance.
  ///
  /// "Standalone" means delimited by word boundaries. Since `_` counts as a
  /// word character, the halves of an `HHMM_HHMM` pair are not returned.
  List<String> _extractTimesFromString(String timesString) {
    final fourDigits = RegExp(r"\b(\d{4})\b");
    return [
      for (final hit in fourDigits.allMatches(timesString)) hit.group(1)!,
    ];
  }

  /// Returns every four-digit time in [line], in order of appearance,
  /// including both halves of `HHMM_HHMM` pairs.
  List<String> _extractAllTimes(String line) {
    // A match is either a lone HHMM or an HHMM_HHMM pair; group 2 is only
    // set for pairs.
    final singleOrPair = RegExp(r"\b(\d{4})(?:_(\d{4}))?\b");
    final collected = <String>[];

    for (final hit in singleOrPair.allMatches(line)) {
      final second = hit.group(2);
      collected.add(hit.group(1)!);
      if (second != null) {
        collected.add(second);
      }
    }

    return collected;
  }

  /// Pairs [stations] with [times] by position and returns a map from
  /// station code to the formatted time.
  ///
  /// Pairing stops at the end of the shorter list. Entries whose time is not
  /// exactly four digits are left out. If a station code occurs more than
  /// once, the later position overwrites the earlier one.
  Map<String, String> _mapStationsToTimes(
    List<String> stations,
    List<String> times,
  ) {
    final fourDigits = RegExp(r"^\d{4}$");
    final pairCount =
        stations.length < times.length ? stations.length : times.length;
    final result = <String, String>{};

    for (var idx = 0; idx < pairCount; idx++) {
      final candidate = times[idx];
      // Skip anything that is not a real time (e.g. "____" or similar).
      if (!fourDigits.hasMatch(candidate)) continue;
      result[stations[idx]] = _formatTime(candidate);
    }

    return result;
  }
}

/// One station-header block of the schedule together with the trip lines
/// collected beneath it.
class _DocumentSection {
  _DocumentSection({
    required this.stations,
    required this.direction,
    required this.tripLines,
  });

  /// Station codes taken from the section's header line, in header order.
  final List<String> stations;

  /// Direction label for the section.
  ///
  /// Mutable: the parser creates sections with "unknown" and overwrites the
  /// value once the first trip line has been seen.
  String direction;

  /// Trip lines that belong to this section; the list itself is appended to
  /// while the document is scanned.
  final List<String> tripLines;
}