import "dart:typed_data";

import "package:docx_to_text/docx_to_text.dart";

import "../exceptions/schedule_parse_exception.dart";
import "../models/operations/trip.dart";
import "schedule_parser.dart";

/// Parses Arriva duty schedules supplied as `.docx` bytes into [Trip] records.
///
/// The extracted text is split into sections. A section starts at a station
/// header line (a row of upper-case station codes) and collects the trip lines
/// that follow it. Trip lines come in two layouts:
///
/// * times first: `HHMM_HHMM times... EC### duty running trip`
/// * times last:  `trip duty running EC### ... HHMM_HHMM times...`
///
/// Throughout this file the times-first layout is called "outbound" and the
/// times-last layout "inbound". The `direction` stored on each [Trip] is
/// derived separately, from the parity of its trip number.
class ArrivaScheduleParser implements ScheduleParser {
  // OUTBOUND FORMAT: HHMM_HHMM times... EC### duty running trip
  // Times appear on the LEFT, trip info on the RIGHT.
  static final _outboundPattern = RegExp(
    r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+([NRF]?)(\d+)\s+(\d+)",
  );
  static final _outboundFinishingPattern = RegExp(
    r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+F",
  );

  // INBOUND FORMAT: trip duty running EC### ... HHMM_HHMM times...
  // Trip info appears on the LEFT, times on the RIGHT.
  // The running number may carry an N/R/F prefix either attached ("N503") or
  // separated by whitespace ("N 503"); the optional `\s*` accepts both.
  static final _inboundPattern = RegExp(
    r"^(\d+)\s+(\d+)\s+([NRF]?)\s*(\d+)\s+(EC\d+)\s+.*?(\d{4})_(\d{4})\s+([\d\s]+)",
  );
  static final _inboundFinishingPattern = RegExp(
    r"^(\d+)\s+(\d+)\s+F\s+(EC\d+)\s+.*?(\d{4})_(\d{4})(.*)",
  );

  // Helper patterns, compiled once instead of on every call.
  static final _timePairPattern = RegExp(r"\d{4}_\d{4}");
  static final _timePairAtStartPattern = RegExp(r"^\d{4}_\d{4}");
  static final _leadingBlankTimePattern = RegExp(r"^_+\d{4}");
  static final _leadingNumbersPattern = RegExp(r"^\d+\s+\d+");
  static final _inboundPrefixPattern = RegExp(
    r"^\d+\s+\d+\s+(?:[NRF]\d+\s+|[NRF]\s+\d+\s+|F\s+|\d+\s+)EC",
  );
  static final _digitPattern = RegExp(r"\d");
  static final _whitespacePattern = RegExp(r"\s+");
  static final _upperCaseWordPattern = RegExp(r"^[A-Z]+$");
  static final _fourDigitTokenPattern = RegExp(r"\b(\d{4})\b");
  static final _exactTimePattern = RegExp(r"^\d{4}$");

  /// Upper-case words that show up in header/metadata rows but are not
  /// station codes.
  static const _nonStationWords = <String>{
    "TRP",
    "DUTY",
    "BUS",
    "START",
    "END",
    "GAR",
    "DEP",
    "ARR",
    "DENOTES",
    "FINISHES",
    "RELIEF",
    "TRIP",
    "NEXT",
    "NO",
    "AT",
    "SPELL",
    "HOURS",
    "TOTAL",
    "LAYOVER",
    "MILES",
    "LIVE",
    "DEAD",
    "MILEAGE",
    "TIME",
    "SIGN",
    "FORM",
    "NXT",
    "THIS",
    "HAS",
    "OR",
    "FOR",
    "CHANGE",
    "SERVICE",
    "POINT",
    "LSN",
    "MAN",
    "RUI",
    "SN",
    "ROUTE",
    "RUNNING",
    "PREV",
    "FIN",
    "ENTOD",
    "SOALL",
    "USHRS",
    "ADDTL",
    "CASH",
    "TODAYS",
    "REL",
    "IEF",
  };

  /// Whether [content] looks like an Arriva schedule.
  ///
  /// Requires the literal markers `EC5` and `FINISH AT POINT` plus at least
  /// one `HHMM_HHMM` time pair.
  @override
  bool canParse(String content) {
    return content.contains("EC5") &&
        content.contains("FINISH AT POINT") &&
        _timePairPattern.hasMatch(content);
  }

  /// Extracts text from the `.docx` [bytes] and returns every trip that could
  /// be parsed, sorted by scheduled time.
  ///
  /// Throws [ScheduleParseException] when the document cannot be read or when
  /// no section containing trip lines is found.
  @override
  Future<List<Trip>> parseBytes(Uint8List bytes) async {
    // Step 1: Extract text
    final text = _extractTextFromDocx(bytes);

    // DEBUG: Print extracted text
    print("=== EXTRACTED TEXT START ===");
    print(text);
    print("=== EXTRACTED TEXT END ===");

    // Step 2: Parse document structure (headers and trips)
    final documentSections = _parseDocumentSections(text.split("\n"));

    print("=== FOUND ${documentSections.length} SECTIONS ===");
    for (final section in documentSections) {
      print(
        "Section: ${section.direction}, ${section.stations.length} stations, ${section.tripLines.length} trips",
      );
    }

    if (documentSections.isEmpty) {
      throw ScheduleParseException("No trip data found in schedule");
    }

    // Step 3: Parse trips from all sections
    final trips = <Trip>[];
    for (final section in documentSections) {
      final sectionTrips = _parseSectionTrips(section);
      trips.addAll(sectionTrips);
      print(
        "✓ Parsed ${sectionTrips.length} trips from ${section.direction} section",
      );
    }

    // Step 4: Sort by scheduled time ("HH:MM" strings compare chronologically)
    trips.sort((a, b) => a.scheduledTime.compareTo(b.scheduledTime));

    return trips;
  }

  /// Runs `docxToText` on [bytes], converting any failure into a
  /// [ScheduleParseException] after logging it.
  String _extractTextFromDocx(Uint8List bytes) {
    try {
      return docxToText(bytes);
    } catch (e, stackTrace) {
      print("Arriva parser document read failed: $e");
      print(stackTrace);
      throw ScheduleParseException("Failed to read document: $e");
    }
  }

  /// Converts a 4-character `HHMM` string to `HH:MM`.
  ///
  /// Throws [FormatException] when [rawTime] is not exactly four characters.
  String _formatTime(String rawTime) {
    if (rawTime.length != 4) {
      throw FormatException("Invalid time format: $rawTime");
    }
    return "${rawTime.substring(0, 2)}:${rawTime.substring(2, 4)}";
  }

  /// Groups [lines] into sections, each a station header plus the trip lines
  /// that follow it.
  ///
  /// Sections that collected no trip lines are dropped. Trip lines that appear
  /// before the first station header are ignored.
  List<_DocumentSection> _parseDocumentSections(List<String> lines) {
    final sections = <_DocumentSection>[];
    _DocumentSection? currentSection;

    for (var i = 0; i < lines.length; i++) {
      final line = lines[i].trim();

      // Check if this is a station header line
      final stations = _extractStationHeader(line);
      if (stations != null && stations.isNotEmpty) {
        // Save previous section if it has content
        if (currentSection != null && currentSection.tripLines.isNotEmpty) {
          sections.add(currentSection);
        }

        // Direction is determined later from the first trip line
        currentSection = _DocumentSection(
          stations: stations,
          direction: "unknown",
          tripLines: [],
        );

        print(
          "Found station header at line $i with ${stations.length} stations",
        );
        print(
          " Stations: ${stations.take(3).join(", ")} ... "
          "${stations.skip(stations.length - 2).join(", ")}",
        );
        continue;
      }

      // Check if this is a trip line
      if (currentSection != null && _isTripLine(line)) {
        // Infer direction from first trip line if not yet set
        if (currentSection.direction == "unknown") {
          currentSection.direction = _inferDirectionFromTripLine(line);
          print(" Direction inferred: ${currentSection.direction}");
        }
        currentSection.tripLines.add(line);
      }
    }

    // Add final section
    if (currentSection != null && currentSection.tripLines.isNotEmpty) {
      sections.add(currentSection);
    }

    return sections;
  }

  /// Returns the station codes in [line] when it looks like a station header,
  /// otherwise `null`.
  ///
  /// A header has no digits, no underscores, no "EC" substring, is at least 10
  /// characters long, contains at least 8 upper-case words of 3-8 letters, and
  /// still has at least 5 of them after known metadata words are removed.
  List<String>? _extractStationHeader(String line) {
    if (line.contains(_digitPattern) ||
        line.contains("EC") ||
        line.contains("_") ||
        line.length < 10) {
      return null;
    }

    // Potential station codes: 3-8 upper-case letters
    final potentialStations = line
        .split(_whitespacePattern)
        .where(
          (part) =>
              part.length >= 3 &&
              part.length <= 8 &&
              _upperCaseWordPattern.hasMatch(part),
        )
        .toList();

    if (potentialStations.length < 8) return null;

    final stations = potentialStations
        .where((code) => !_nonStationWords.contains(code))
        .toList();

    // Need at least 5 actual station-like codes remaining - the structural
    // density of codes is what marks this as a station header, not a known list
    return stations.length >= 5 ? stations : null;
  }

  /// Classifies the layout of the first trip [line] of a section.
  ///
  /// Uses the same convention as the trip patterns above: lines that begin
  /// with a time (`HHMM_HHMM`, or underscores followed by a time) are
  /// "outbound"; lines that begin with the trip and duty numbers are
  /// "inbound". Anything else falls back to "outbound".
  String _inferDirectionFromTripLine(String line) {
    if (_timePairAtStartPattern.hasMatch(line)) return "outbound";
    if (_leadingBlankTimePattern.hasMatch(line)) return "outbound";
    if (_leadingNumbersPattern.hasMatch(line)) return "inbound";
    return "outbound";
  }

  /// Whether [line] contains both an `HHMM_HHMM` time pair and the text "EC".
  bool _isTripLine(String line) {
    return _timePairPattern.hasMatch(line) && line.contains("EC");
  }

  /// Parses every trip line of [section], logging the lines that match
  /// neither layout or fail their layout's patterns.
  List<Trip> _parseSectionTrips(_DocumentSection section) {
    final trips = <Trip>[];

    for (final line in section.tripLines) {
      // Detect the actual layout from the line itself:
      // outbound starts with HHMM_HHMM; inbound starts with
      // "trip duty running EC###" or "trip duty F EC###".
      final isOutboundFormat = _timePairAtStartPattern.hasMatch(line);
      final isInboundFormat = _inboundPrefixPattern.hasMatch(line);

      Trip? trip;
      if (isOutboundFormat) {
        trip = _parseOutboundTrip(line, section.stations);
      } else if (isInboundFormat) {
        trip = _parseInboundTrip(line, section.stations);
      }

      if (trip != null) {
        trips.add(trip);
        continue;
      }

      final format = isOutboundFormat
          ? "outbound"
          : isInboundFormat
              ? "inbound"
              : "unknown";
      print(
        "Failed to parse $format line: ${line.substring(0, line.length > 80 ? 80 : line.length)}...",
      );
    }

    return trips;
  }

  /// Parses a times-last line (`trip duty running EC### ... HHMM_HHMM ...`).
  ///
  /// Tries the regular pattern first, then the finishing (`F`) pattern.
  /// Returns `null` when neither matches.
  Trip? _parseInboundTrip(String line, List<String> stations) {
    final regular = _inboundPattern.firstMatch(line);
    if (regular != null) {
      return _buildTrip(
        tripNumber: regular.group(1)!,
        dutyNumber: regular.group(2)!,
        busWorkNumber: regular.group(4)!,
        tripType: regular.group(3) ?? "",
        isFinishing: false,
        firstTime: regular.group(6)!,
        secondTime: regular.group(7)!,
        timesString: regular.group(8) ?? "",
        stations: stations,
      );
    }

    final finishing = _inboundFinishingPattern.firstMatch(line);
    if (finishing != null) {
      final dutyNumber = finishing.group(2)!;
      return _buildTrip(
        tripNumber: finishing.group(1)!,
        dutyNumber: dutyNumber,
        // Finishing lines carry no running number; the duty number is reused.
        busWorkNumber: dutyNumber,
        tripType: "F",
        isFinishing: true,
        firstTime: finishing.group(4)!,
        secondTime: finishing.group(5)!,
        timesString: finishing.group(6) ?? "",
        stations: stations,
      );
    }

    return null;
  }

  /// Parses a times-first line (`HHMM_HHMM times... EC### duty running trip`).
  ///
  /// Tries the regular pattern first, then the finishing (`F`) pattern.
  /// Returns `null` when neither matches.
  Trip? _parseOutboundTrip(String line, List<String> stations) {
    final regular = _outboundPattern.firstMatch(line);
    if (regular != null) {
      return _buildTrip(
        tripNumber: regular.group(8)!,
        dutyNumber: regular.group(5)!,
        busWorkNumber: regular.group(7)!,
        tripType: regular.group(6) ?? "",
        isFinishing: false,
        firstTime: regular.group(1)!,
        secondTime: regular.group(2)!,
        timesString: regular.group(3) ?? "",
        stations: stations,
      );
    }

    final finishing = _outboundFinishingPattern.firstMatch(line);
    if (finishing != null) {
      final dutyNumber = finishing.group(5)!;
      return _buildTrip(
        // Finishing trips may not have a separate trip number, so the duty
        // number stands in for it (and therefore also drives the direction).
        tripNumber: dutyNumber,
        dutyNumber: dutyNumber,
        busWorkNumber: dutyNumber,
        tripType: "F",
        isFinishing: true,
        firstTime: finishing.group(1)!,
        secondTime: finishing.group(2)!,
        timesString: finishing.group(3) ?? "",
        stations: stations,
      );
    }

    return null;
  }

  /// Assembles a [Trip] from the fields captured by one of the trip patterns.
  ///
  /// The time list is [firstTime], [secondTime], then every 4-digit token in
  /// [timesString]; it is mapped positionally onto [stations]. The scheduled
  /// time is [firstTime] formatted as `HH:MM`. Direction is "inbound" when
  /// [tripNumber] parses to an odd integer, otherwise "outbound".
  Trip _buildTrip({
    required String tripNumber,
    required String dutyNumber,
    required String busWorkNumber,
    required String tripType,
    required bool isFinishing,
    required String firstTime,
    required String secondTime,
    required String timesString,
    required List<String> stations,
  }) {
    final times = [
      firstTime,
      secondTime,
      ..._extractTimesFromString(timesString),
    ];

    return Trip(
      tripNumber: tripNumber,
      dutyNumber: dutyNumber,
      busWorkNumber: busWorkNumber,
      scheduledTime: _formatTime(firstTime),
      tripType: tripType,
      isFinishing: isFinishing,
      stationTimes: _mapStationsToTimes(stations, times),
      stationOrder: stations,
      direction:
          (int.tryParse(tripNumber) ?? 0).isOdd ? "inbound" : "outbound",
    );
  }

  /// Returns every standalone 4-digit token in [timesString], in order.
  ///
  /// Tokens adjacent to an underscore are not matched, because `_` is a word
  /// character and so no `\b` boundary exists there.
  List<String> _extractTimesFromString(String timesString) {
    return _fourDigitTokenPattern
        .allMatches(timesString)
        .map((m) => m.group(1)!)
        .toList();
  }

  /// Returns every 4-digit time in [line], expanding `HHMM_HHMM` pairs into
  /// two entries.
  // NOTE(review): not referenced anywhere in the visible file - confirm
  // whether it is still needed.
  List<String> _extractAllTimes(String line) {
    final timePattern = RegExp(r"\b(\d{4})(?:_(\d{4}))?\b");
    final times = <String>[];

    for (final match in timePattern.allMatches(line)) {
      times.add(match.group(1)!);
      // Second half of an HHMM_HHMM pair, when present
      final second = match.group(2);
      if (second != null) {
        times.add(second);
      }
    }

    return times;
  }

  /// Pairs [stations] with [times] by position and returns
  /// `station -> "HH:MM"`.
  ///
  /// Stops at the shorter of the two lists and skips entries that are not
  /// exactly four digits.
  Map<String, String> _mapStationsToTimes(
    List<String> stations,
    List<String> times,
  ) {
    final stationTimes = <String, String>{};

    for (var i = 0; i < stations.length && i < times.length; i++) {
      final time = times[i];
      // Only add non-empty times (not "____" or similar)
      if (_exactTimePattern.hasMatch(time)) {
        stationTimes[stations[i]] = _formatTime(time);
      }
    }

    return stationTimes;
  }
}

/// One station header together with the trip lines found beneath it.
class _DocumentSection {
  /// Station codes taken from the header line, in header order.
  final List<String> stations;

  /// Layout label for the section; starts as "unknown" and is set from the
  /// first trip line.
  String direction;

  /// Trimmed trip lines collected for this section.
  final List<String> tripLines;

  _DocumentSection({
    required this.stations,
    required this.direction,
    required this.tripLines,
  });
}