import "package:capstone_project/services/duriin_service.dart";
import "package:capstone_project/utils/agrigator.dart";

/// Key type used for article ids throughout this file (the non-null type of
/// `FeedItem.id`).
///
/// NOTE(review): the concrete id type is not visible from this file; `String`
/// is an assumption. If `FeedItem.id` is an `int?`, change this one line.
typedef ArticleId = String;

/// One event cluster built from a seed article plus its semantic neighbours.
///
/// Distances are measured from the seed. The seed itself is not stored in
/// [distancesFromSeed]; only the neighbours that were admitted are.
class EventCluster {
  /// The article the cluster was grown from.
  final FeedItem seed;

  /// Every article in the cluster, seed first.
  final List<FeedItem> articles;

  /// Maps article id -> distance from [seed].
  ///
  /// A missing id means we don't have a number for that article; an empty map
  /// makes [distanceStats] report nulls.
  final Map<ArticleId, double> distancesFromSeed;

  EventCluster({
    required this.seed,
    required this.articles,
    required this.distancesFromSeed,
  });

  /// Returns summary stats over the known distances in this cluster.
  ///
  /// All three fields are null when there are no distances to report (for
  /// example a singleton cluster).
  ({double? min, double? avg, double? max}) distanceStats() {
    final vals = distancesFromSeed.values.toList();
    if (vals.isEmpty) return (min: null, avg: null, max: null);

    var lo = vals.first;
    var hi = vals.first;
    var sum = 0.0;
    for (final v in vals) {
      if (v < lo) lo = v;
      if (v > hi) hi = v;
      sum += v;
    }
    return (min: lo, avg: sum / vals.length, max: hi);
  }
}

/// Groups articles into [EventCluster]s by asking the similarity service for
/// each seed's neighbours and keeping the ones that are close enough.
class EventClusterer {
  final DuriinService _duriin;

  /// Neighbours whose distance from the seed is strictly greater than this
  /// are dropped as off-topic.
  ///
  /// Calibrated against observed api distances: genuinely-same-event pairs
  /// land around 0.58–0.62, different-event-same-topic pairs start around
  /// 0.70+. Tighten if clusters start merging distinct events, loosen if
  /// obvious same-event stories end up as singletons.
  final double distanceThreshold;

  /// Hard cap on articles per cluster (seed included) — keeps prompt size
  /// predictable. The seed is always kept, so values below 1 behave like 1.
  final int maxClusterSize;

  /// How many neighbours to ask the api for per seed.
  final int neighbourFetchLimit;

  EventClusterer({
    DuriinService? duriin,
    this.distanceThreshold = 0.60,
    this.maxClusterSize = 10,
    this.neighbourFetchLimit = 25,
  }) : _duriin = duriin ?? DuriinService();

  /// Clusters [articles] into events.
  ///
  /// Articles with an id are processed newest first; each one that has not
  /// already been placed in a cluster becomes a seed, and its neighbours (as
  /// returned by `findSimilar`) join it when they are part of [articles], have
  /// a known distance, and that distance is at most [distanceThreshold].
  /// Articles without an id each become their own singleton cluster, appended
  /// after the id-bearing clusters.
  ///
  /// Returns an empty list for empty input. Any error thrown by the
  /// similarity lookup propagates to the caller.
  Future<List<EventCluster>> cluster(List<FeedItem> articles) async {
    if (articles.isEmpty) return [];

    // Index by id for fast membership checks when neighbours come back.
    // If two articles share an id, the later one in the input wins.
    final byId = <ArticleId, FeedItem>{};
    final withoutId = <FeedItem>[];
    for (final article in articles) {
      final id = article.id;
      if (id == null) {
        withoutId.add(article);
      } else {
        byId[id] = article;
      }
    }

    // Work through newest first so the first signal surfaced is the freshest.
    // Articles with no publication date sort as if published at the epoch.
    final epoch = DateTime.fromMillisecondsSinceEpoch(0);
    final queue = byId.values.toList()
      ..sort((a, b) => (b.pubDate ?? epoch).compareTo(a.pubDate ?? epoch));

    final clustered = <ArticleId>{};
    final clusters = <EventCluster>[];

    for (final seed in queue) {
      // Every queued article came out of byId, so its id is non-null.
      final seedId = seed.id!;
      if (clustered.contains(seedId)) continue;

      final neighbours = await _duriin.findSimilar(
        seedId,
        limit: neighbourFetchLimit,
      );

      // Keep only neighbours we actually fetched (same ticker / window)
      // and that are close enough to count as the same event.
      final members = <FeedItem>[seed];
      final distances = <ArticleId, double>{};
      // Starts with the seed id, so this also filters the seed out if the api
      // returns it in its own neighbour list, as well as repeated hits.
      final memberIds = <ArticleId>{seedId};

      // NOTE(review): neighbours are not checked against `clustered`, so an
      // article already placed in an earlier cluster can be admitted here too
      // and clusters may overlap. Confirm whether that is intended.
      for (final hit in neighbours) {
        // Enforce the cap before admitting anything, so a cap of 1 (or less)
        // yields a seed-only cluster instead of seed + one neighbour.
        if (members.length >= maxClusterSize) break;

        final nid = hit.item.id;
        if (nid == null || memberIds.contains(nid)) continue;

        final inWindow = byId[nid];
        if (inWindow == null) continue;

        final d = hit.distance;
        if (d == null || d > distanceThreshold) continue;

        members.add(inWindow);
        distances[nid] = d;
        memberIds.add(nid);
      }

      clustered.addAll(memberIds);
      clusters.add(EventCluster(
        seed: seed,
        articles: members,
        distancesFromSeed: distances,
      ));
    }

    // Articles with no id (shouldn't happen post-api-update, but just in case)
    // each become their own singleton cluster so we never silently drop them.
    for (final orphan in withoutId) {
      clusters.add(EventCluster(
        seed: orphan,
        articles: [orphan],
        distancesFromSeed: const {},
      ));
    }

    return clusters;
  }
}