471 lines
14 KiB
Dart
471 lines
14 KiB
Dart
import "dart:typed_data";
|
|
import "package:docx_to_text/docx_to_text.dart";
|
|
import "../models/operations/trip.dart";
|
|
import "../exceptions/schedule_parse_exception.dart";
|
|
import "schedule_parser.dart";
|
|
|
|
class ArrivaScheduleParser implements ScheduleParser {
|
|
// OUTBOUND FORMAT: HHMM_HHMM times... EC### duty running trip
// Times appear on the LEFT, trip info on the RIGHT.
//
// Capture groups:
//   1, 2  the leading HHMM_HHMM pair
//   3     whatever text sits between that pair and the EC code
//   4     the EC code
//   5     duty number
//   6     optional N/R/F prefix of the running number (may be empty)
//   7     running number
//   8     trip number
static final _outboundPattern = RegExp(
  r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+([NRF]?)(\d+)\s+(\d+)",
);

// Outbound line whose running column is a bare "F".
// Groups 1-5 have the same meaning as in _outboundPattern; nothing after
// the "F" is captured.
static final _outboundFinishingPattern = RegExp(
  r"^(\d{4})_(\d{4})\s+(.*?)\s+(EC\d+)\s+(\d+)\s+F",
);

// INBOUND FORMAT: trip duty running EC### ... HHMM_HHMM times...
// Trip info appears on the LEFT, times on the RIGHT.
// Note: the running number may carry its N/R/F prefix attached ("N503") or
// separated by whitespace ("N 503"); the `\s*` between groups 3 and 4 accepts
// both spellings, matching what _parseSectionTrips already treats as an
// inbound line.
//
// Capture groups:
//   1     trip number
//   2     duty number
//   3     optional N/R/F prefix of the running number (may be empty)
//   4     running number
//   5     the EC code
//   6, 7  the first HHMM_HHMM pair found after the EC code
//   8     the run of digits and whitespace that follows that pair
static final _inboundPattern = RegExp(
  r"^(\d+)\s+(\d+)\s+([NRF]?)\s*(\d+)\s+(EC\d+)\s+.*?(\d{4})_(\d{4})\s+([\d\s]+)",
);

// Inbound line whose running column is a bare "F".
//
// Capture groups:
//   1     trip number
//   2     duty number
//   3     the EC code
//   4, 5  the first HHMM_HHMM pair found after the EC code
//   6     the rest of the line
static final _inboundFinishingPattern = RegExp(
  r"^(\d+)\s+(\d+)\s+F\s+(EC\d+)\s+.*?(\d{4})_(\d{4})(.*)",
);
|
|
|
|
/// Whether [content] looks like an Arriva schedule.
///
/// All three markers must be present: the literal text "EC5", the literal
/// text "FINISH AT POINT", and at least one `HHMM_HHMM` time pair.
@override
bool canParse(String content) {
  if (!content.contains("EC5")) return false;
  if (!content.contains("FINISH AT POINT")) return false;
  return RegExp(r"\d{4}_\d{4}").hasMatch(content);
}
|
|
|
|
/// Parses a .docx schedule held in [bytes] into a list of trips.
///
/// The document text is extracted, split into sections (a station header
/// followed by its trip lines), every section is parsed, and the combined
/// list is ordered with `scheduledTime.compareTo`.
///
/// Throws a [ScheduleParseException] when the document cannot be read or
/// when no section containing trip lines is found.
@override
Future<List<Trip>> parseBytes(Uint8List bytes) async {
  // Stage 1: pull the plain text out of the document.
  final documentText = _extractTextFromDocx(bytes);

  // DEBUG: dump the whole extracted text.
  print("=== EXTRACTED TEXT START ===");
  print(documentText);
  print("=== EXTRACTED TEXT END ===");

  // Stage 2: group the lines into header + trip-line sections.
  final sections = _parseDocumentSections(documentText.split("\n"));

  print("=== FOUND ${sections.length} SECTIONS ===");
  for (final section in sections) {
    final stationCount = section.stations.length;
    final tripLineCount = section.tripLines.length;
    print(
      "Section: ${section.direction}, $stationCount stations, $tripLineCount trips",
    );
  }

  if (sections.isEmpty) {
    throw ScheduleParseException("No trip data found in schedule");
  }

  // Stage 3: turn every section's trip lines into Trip objects.
  final allTrips = <Trip>[];
  for (final section in sections) {
    final parsed = _parseSectionTrips(section);
    allTrips.addAll(parsed);
    print("✓ Parsed ${parsed.length} trips from ${section.direction} section");
  }

  // Stage 4: order by scheduled time before handing the list back.
  return allTrips
    ..sort((left, right) => left.scheduledTime.compareTo(right.scheduledTime));
}
|
|
|
|
/// Returns the text that `docxToText` extracts from [bytes].
///
/// Anything thrown during extraction is logged together with its stack trace
/// and replaced by a [ScheduleParseException] carrying the original message.
String _extractTextFromDocx(Uint8List bytes) {
  try {
    final extracted = docxToText(bytes);
    return extracted;
  } catch (error, trace) {
    print("Arriva parser document read failed: $error");
    print(trace);
    throw ScheduleParseException("Failed to read document: $error");
  }
}
|
|
|
|
/// Inserts a colon in the middle of a four-character time ("0612" -> "06:12").
///
/// Only the length is checked; the characters themselves are not validated.
/// Throws a [FormatException] when [rawTime] is not exactly four characters.
String _formatTime(String rawTime) {
  if (rawTime.length == 4) {
    final hours = rawTime.substring(0, 2);
    final minutes = rawTime.substring(2);
    return "$hours:$minutes";
  }
  throw FormatException("Invalid time format: $rawTime");
}
|
|
|
|
/// Groups [lines] into sections, each made of a station header and the trip
/// lines that follow it.
///
/// Every line is trimmed first. A station header starts a new section; trip
/// lines are attached to the most recent header, and trip lines seen before
/// any header are ignored. Sections that never received a trip line are
/// dropped. A section's direction label is filled in from its first trip
/// line.
List<_DocumentSection> _parseDocumentSections(List<String> lines) {
  final completed = <_DocumentSection>[];
  _DocumentSection? open;

  // Stores the section being built, but only if it gathered any trip lines.
  void closeOpenSection() {
    final finished = open;
    if (finished != null && finished.tripLines.isNotEmpty) {
      completed.add(finished);
    }
  }

  var lineIndex = -1;
  for (final rawLine in lines) {
    lineIndex++;
    final text = rawLine.trim();

    // A station header closes the previous section and opens a new one.
    final header = _extractStationHeader(text);
    if (header != null && header.isNotEmpty) {
      closeOpenSection();

      // The direction stays "unknown" until the first trip line arrives.
      open = _DocumentSection(
        stations: header,
        direction: "unknown",
        tripLines: [],
      );

      print(
        "Found station header at line $lineIndex with ${header.length} stations",
      );
      final firstCodes = header.take(3).join(", ");
      final lastCodes = header.skip(header.length - 2).join(", ");
      print(" Stations: $firstCodes ... $lastCodes");
      continue;
    }

    // Everything else only matters if it is a trip line inside a section.
    final target = open;
    if (target == null || !_isTripLine(text)) continue;

    if (target.direction == "unknown") {
      target.direction = _inferDirectionFromTripLine(text);
      print(" Direction inferred: ${target.direction}");
    }
    target.tripLines.add(text);
  }

  // The last section has no following header to close it.
  closeOpenSection();
  return completed;
}
|
|
|
|
/// Returns the station codes of [line] when it looks like a station header,
/// or `null` when it does not.
///
/// A header line is at least 10 characters long, holds no digit, no
/// underscore and no "EC", and contains at least 8 whitespace-separated
/// tokens made of 3 to 8 uppercase ASCII letters. Known metadata words are
/// then removed, and at least 5 codes must remain.
List<String>? _extractStationHeader(String line) {
  // Quick rejections: too short, or carrying characters that only appear on
  // non-header lines.
  // NOTE(review): the "EC" test is a plain substring match, so a header that
  // contains a code such as "BECKTON" is rejected as well - confirm that this
  // is intended.
  if (line.length < 10) return null;
  if (line.contains("_") || line.contains("EC")) return null;
  if (line.contains(RegExp(r"\d"))) return null;

  // Keep only tokens shaped like a station code (3-8 uppercase letters).
  final uppercaseOnly = RegExp(r"^[A-Z]+$");
  final candidates = <String>[
    for (final token in line.split(RegExp(r"\s+")))
      if (token.length >= 3 &&
          token.length <= 8 &&
          uppercaseOnly.hasMatch(token))
        token,
  ];

  if (candidates.length < 8) return null;

  // Column titles and legend words that share the shape of a station code.
  const nonStationWords = {
    "TRP",
    "DUTY",
    "BUS",
    "START",
    "END",
    "GAR",
    "DEP",
    "ARR",
    "DENOTES",
    "FINISHES",
    "RELIEF",
    "TRIP",
    "NEXT",
    "NO",
    "AT",
    "SPELL",
    "HOURS",
    "TOTAL",
    "LAYOVER",
    "MILES",
    "LIVE",
    "DEAD",
    "MILEAGE",
    "TIME",
    "SIGN",
    "FORM",
    "NXT",
    "THIS",
    "HAS",
    "OR",
    "FOR",
    "CHANGE",
    "SERVICE",
    "POINT",
    "LSN",
    "MAN",
    "RUI",
    "SN",
    "ROUTE",
    "RUNNING",
    "PREV",
    "FIN",
    "ENTOD",
    "SOALL",
    "USHRS",
    "ADDTL",
    "CASH",
    "TODAYS",
    "REL",
    "IEF",
  };

  final stations = <String>[
    for (final code in candidates)
      if (!nonStationWords.contains(code)) code,
  ];

  // It is the density of code-shaped tokens that marks a header, not a list
  // of known stations, so require at least 5 survivors.
  if (stations.length < 5) return null;
  return stations;
}
|
|
|
|
/// Labels a section by the layout of its first trip line.
///
/// The labels follow the layouts used by the pattern definitions and by
/// [_parseSectionTrips] in this file:
///
/// * a line that opens with times (`HHMM_HHMM`, or underscore filler followed
///   by a time) is the "outbound" layout;
/// * a line that opens with two whitespace-separated numbers (the trip and
///   duty columns) is the "inbound" layout;
/// * anything else keeps the default label "outbound".
///
/// Previously the two explicit cases were labelled the other way round, which
/// contradicted the rest of the parser. Within this file the label is only
/// read by log statements; each [Trip]'s own direction is set separately when
/// the trip is parsed.
String _inferDirectionFromTripLine(String line) {
  final opensWithTimes = RegExp(r"^(?:\d{4}_\d{4}|_+\d{4})").hasMatch(line);
  if (opensWithTimes) return "outbound";

  final opensWithTripColumns = RegExp(r"^\d+\s+\d+").hasMatch(line);
  if (opensWithTripColumns) return "inbound";

  return "outbound";
}
|
|
|
|
/// Whether [line] carries trip data: it must contain the text "EC" and at
/// least one `HHMM_HHMM` time pair, anywhere in the line.
bool _isTripLine(String line) {
  if (!line.contains("EC")) return false;
  return RegExp(r"\d{4}_\d{4}").hasMatch(line);
}
|
|
|
|
/// Parses every trip line of [section] and returns the trips that could be
/// built.
///
/// The parser is chosen per line from the line's own layout, not from the
/// section's direction label:
///
/// * times first (`HHMM_HHMM` at the start) -> [_parseOutboundTrip]
/// * trip, duty and running columns followed by "EC" -> [_parseInboundTrip]
///
/// Lines that fit neither layout, or that the chosen parser rejects, are
/// logged (first 80 characters) and skipped.
List<Trip> _parseSectionTrips(_DocumentSection section) {
  // Outbound layout: the line opens with HHMM_HHMM.
  final timesFirst = RegExp(r"^\d{4}_\d{4}");
  // Inbound layout: trip duty running EC###, where the running column may be
  // "N503", "N 503", a bare "F", or just "503".
  final tripColumnsFirst = RegExp(
    r"^\d+\s+\d+\s+(?:[NRF]\d+\s+|[NRF]\s+\d+\s+|F\s+|\d+\s+)EC",
  );

  final parsed = <Trip>[];

  for (final row in section.tripLines) {
    final looksOutbound = timesFirst.hasMatch(row);
    final looksInbound = tripColumnsFirst.hasMatch(row);

    Trip? trip;
    if (looksOutbound) {
      trip = _parseOutboundTrip(row, section.stations);
    } else if (looksInbound) {
      trip = _parseInboundTrip(row, section.stations);
    }

    if (trip != null) {
      parsed.add(trip);
      continue;
    }

    // Report which layout was attempted, with a shortened copy of the line.
    String layout;
    if (looksOutbound) {
      layout = "outbound";
    } else if (looksInbound) {
      layout = "inbound";
    } else {
      layout = "unknown";
    }
    final preview = row.length > 80 ? row.substring(0, 80) : row;
    print("Failed to parse $layout line: $preview...");
  }

  return parsed;
}
|
|
|
|
/// Builds a [Trip] from a line in the inbound layout
/// (`trip duty running EC### ... HHMM_HHMM times...`).
///
/// [_inboundPattern] is tried first. If it does not match,
/// [_inboundFinishingPattern] (running column is a bare "F") is tried; such a
/// trip is marked as finishing, gets trip type "F", and reuses the duty
/// number as its bus work number. Returns `null` when neither pattern
/// matches.
///
/// The trip's direction is "inbound" when the trip number parses to an odd
/// integer and "outbound" otherwise (including when it does not parse).
Trip? _parseInboundTrip(String line, List<String> stations) {
  // Shared assembly step for both line variants.
  Trip assemble({
    required String tripNumber,
    required String dutyNumber,
    required String busWorkNumber,
    required String tripType,
    required bool isFinishing,
    required String firstTime,
    required String secondTime,
    required String laterTimes,
  }) {
    // Time list: the HHMM_HHMM pair first, then the times that follow it.
    final times = [
      firstTime,
      secondTime,
      ..._extractTimesFromString(laterTimes),
    ];
    final stationTimes = _mapStationsToTimes(stations, times);
    final scheduledTime = _formatTime(firstTime);
    final tripNumberIsOdd = (int.tryParse(tripNumber) ?? 0).isOdd;

    return Trip(
      tripNumber: tripNumber,
      dutyNumber: dutyNumber,
      busWorkNumber: busWorkNumber,
      scheduledTime: scheduledTime,
      tripType: tripType,
      isFinishing: isFinishing,
      stationTimes: stationTimes,
      stationOrder: stations,
      direction: tripNumberIsOdd ? "inbound" : "outbound",
    );
  }

  final regular = _inboundPattern.firstMatch(line);
  if (regular != null) {
    return assemble(
      tripNumber: regular.group(1)!,
      dutyNumber: regular.group(2)!,
      tripType: regular.group(3) ?? "",
      busWorkNumber: regular.group(4)!,
      firstTime: regular.group(6)!,
      secondTime: regular.group(7)!,
      laterTimes: regular.group(8) ?? "",
      isFinishing: false,
    );
  }

  // Fall back to the variant whose running column is just "F".
  final finishing = _inboundFinishingPattern.firstMatch(line);
  if (finishing == null) return null;

  final duty = finishing.group(2)!;
  return assemble(
    tripNumber: finishing.group(1)!,
    dutyNumber: duty,
    busWorkNumber: duty,
    tripType: "F",
    firstTime: finishing.group(4)!,
    secondTime: finishing.group(5)!,
    laterTimes: finishing.group(6) ?? "",
    isFinishing: true,
  );
}
|
|
|
|
/// Builds a [Trip] from a line in the outbound layout
/// (`HHMM_HHMM times... EC### duty running trip`).
///
/// [_outboundPattern] is tried first. If it does not match,
/// [_outboundFinishingPattern] (running column is a bare "F") is tried; that
/// pattern captures no trip number, so the duty number is used as the trip
/// number and the bus work number, the trip type is "F", and the trip is
/// marked as finishing. Returns `null` when neither pattern matches.
///
/// The trip's direction is "inbound" when the trip number parses to an odd
/// integer and "outbound" otherwise (including when it does not parse).
Trip? _parseOutboundTrip(String line, List<String> stations) {
  // Shared assembly step for both line variants.
  Trip assemble({
    required String tripNumber,
    required String dutyNumber,
    required String busWorkNumber,
    required String tripType,
    required bool isFinishing,
    required String firstTime,
    required String secondTime,
    required String laterTimes,
  }) {
    // Time list: the HHMM_HHMM pair first, then the times that follow it.
    final times = [
      firstTime,
      secondTime,
      ..._extractTimesFromString(laterTimes),
    ];
    final stationTimes = _mapStationsToTimes(stations, times);
    final scheduledTime = _formatTime(firstTime);
    final tripNumberIsOdd = (int.tryParse(tripNumber) ?? 0).isOdd;

    return Trip(
      tripNumber: tripNumber,
      dutyNumber: dutyNumber,
      busWorkNumber: busWorkNumber,
      scheduledTime: scheduledTime,
      tripType: tripType,
      isFinishing: isFinishing,
      stationTimes: stationTimes,
      stationOrder: stations,
      direction: tripNumberIsOdd ? "inbound" : "outbound",
    );
  }

  final regular = _outboundPattern.firstMatch(line);
  if (regular != null) {
    return assemble(
      firstTime: regular.group(1)!,
      secondTime: regular.group(2)!,
      laterTimes: regular.group(3) ?? "",
      dutyNumber: regular.group(5)!,
      tripType: regular.group(6) ?? "",
      busWorkNumber: regular.group(7)!,
      tripNumber: regular.group(8)!,
      isFinishing: false,
    );
  }

  // Fall back to the variant whose running column is just "F".
  final finishing = _outboundFinishingPattern.firstMatch(line);
  if (finishing == null) return null;

  // Finishing trips may not have a separate trip number, so the duty number
  // stands in for it.
  final duty = finishing.group(5)!;
  return assemble(
    firstTime: finishing.group(1)!,
    secondTime: finishing.group(2)!,
    laterTimes: finishing.group(3) ?? "",
    tripNumber: duty,
    dutyNumber: duty,
    busWorkNumber: duty,
    tripType: "F",
    isFinishing: true,
  );
}
|
|
|
|
/// Returns every standalone four-digit number in [timesString], in order of
/// appearance.
///
/// "Standalone" means delimited by word boundaries. Because an underscore is
/// a word character, the two halves of an `HHMM_HHMM` pair are not picked
/// up, and neither are parts of longer digit runs.
List<String> _extractTimesFromString(String timesString) {
  final standaloneTime = RegExp(r"\b(\d{4})\b");
  return [
    for (final hit in standaloneTime.allMatches(timesString)) hit.group(1)!,
  ];
}
|
|
|
|
/// Returns every four-digit time in [line], in order of appearance, with
/// each `HHMM_HHMM` pair contributing both of its halves.
///
/// No call to this helper appears in the visible part of this file.
List<String> _extractAllTimes(String line) {
  final singleOrPair = RegExp(r"\b(\d{4})(?:_(\d{4}))?\b");

  return singleOrPair.allMatches(line).expand((hit) {
    // Group 2 is only set when the match is an HHMM_HHMM pair.
    final secondHalf = hit.group(2);
    return [hit.group(1)!, if (secondHalf != null) secondHalf];
  }).toList();
}
|
|
|
|
/// Pairs [stations] with [times] by position and returns station -> "HH:MM".
///
/// Pairing stops at the end of the shorter list. A position whose time is not
/// exactly four digits (for example a "____" filler) is left out of the
/// result; it still occupies its position.
Map<String, String> _mapStationsToTimes(
  List<String> stations,
  List<String> times,
) {
  final fourDigits = RegExp(r"^\d{4}$");
  final pairCount =
      stations.length < times.length ? stations.length : times.length;

  return {
    for (var position = 0; position < pairCount; position++)
      if (fourDigits.hasMatch(times[position]))
        stations[position]: _formatTime(times[position]),
  };
}
|
|
}
|
|
|
|
/// One block of the schedule text: a station header plus the trip lines
/// collected under it.
class _DocumentSection {
  _DocumentSection({
    required this.stations,
    required this.direction,
    required this.tripLines,
  });

  /// Station codes taken from the header line, in header order.
  final List<String> stations;

  /// Trip lines gathered for this section; the list itself is appended to
  /// while the document is scanned.
  final List<String> tripLines;

  /// Direction label of the section. Reassignable so that it can be filled
  /// in after construction.
  String direction;
}
|