Auger/lib/utils/event_clusterer.dart

154 lines
4.4 KiB
Dart
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import "package:capstone_project/services/duriin_service.dart";
import "package:capstone_project/utils/agrigator.dart";
/// A single event cluster: one seed article plus its semantic neighbours.
///
/// Distances are measured from [seed]; the seed itself is at distance 0.
class EventCluster {
  /// The article this cluster was grown from.
  final FeedItem seed;

  /// The articles that make up the cluster.
  final List<FeedItem> articles;

  /// Maps an article id to that article's distance from [seed].
  ///
  /// An id with no entry means there is no number for that article. When the
  /// map is empty, [distanceStats] reports nulls.
  final Map<int, double> distancesFromSeed;

  EventCluster({
    required this.seed,
    required this.articles,
    required this.distancesFromSeed,
  });

  /// Returns the smallest, mean and largest of the known distances.
  ///
  /// All three fields are null when there are no distances to report
  /// (e.g. a singleton cluster).
  ({double? min, double? avg, double? max}) distanceStats() {
    final known = distancesFromSeed.values.toList();
    if (known.isEmpty) {
      return (min: null, avg: null, max: null);
    }

    final smallest = known.reduce((best, d) => d < best ? d : best);
    final largest = known.reduce((best, d) => d > best ? d : best);
    final total = known.fold<double>(0.0, (running, d) => running + d);

    return (min: smallest, avg: total / known.length, max: largest);
  }
}
/// Groups feed articles into [EventCluster]s using similarity lookups from
/// a [DuriinService].
class EventClusterer {
  final DuriinService _duriin;

  /// Neighbours whose distance from the seed is strictly greater than this
  /// are dropped as off-topic.
  ///
  /// Calibrated against observed api distances: genuinely-same-event pairs
  /// land around 0.58-0.62, different-event-same-topic pairs start around
  /// 0.70+. Tighten if clusters start merging distinct events, loosen if
  /// obvious same-event stories end up as singletons.
  final double distanceThreshold;

  /// Hard cap on articles per cluster, seed included; keeps prompt size
  /// predictable.
  ///
  /// A value of 1 or less yields seed-only clusters.
  final int maxClusterSize;

  /// How many neighbours to ask the api for per seed.
  final int neighbourFetchLimit;

  /// Creates a clusterer.
  ///
  /// A fresh [DuriinService] is constructed when [duriin] is omitted.
  EventClusterer({
    DuriinService? duriin,
    this.distanceThreshold = 0.60,
    this.maxClusterSize = 10,
    this.neighbourFetchLimit = 25,
  }) : _duriin = duriin ?? DuriinService();

  /// Partitions [articles] into event clusters.
  ///
  /// Articles that have an id are visited newest-first (a missing `pubDate`
  /// sorts as the epoch). Each article not yet claimed by a cluster becomes
  /// a seed; its neighbours are fetched with `findSimilar` and kept when they
  /// are part of [articles], have a known distance no greater than
  /// [distanceThreshold], and have not already been claimed. Every article
  /// therefore ends up in exactly one cluster.
  ///
  /// Articles without an id each become their own singleton cluster, appended
  /// after the id-based clusters. Returns an empty list for empty input.
  /// Errors thrown by the similarity lookup are not caught here.
  Future<List<EventCluster>> cluster(List<FeedItem> articles) async {
    if (articles.isEmpty) return [];

    // index by id for fast membership checks when neighbours come back
    final byId = <int, FeedItem>{};
    final withoutId = <FeedItem>[];
    for (final article in articles) {
      final id = article.id;
      if (id == null) {
        withoutId.add(article);
      } else {
        byId[id] = article;
      }
    }

    // work through newest first so the first signal surfaced is the freshest.
    // sorting the entries keeps the non-null id alongside each article.
    final epoch = DateTime.fromMillisecondsSinceEpoch(0);
    final queue = byId.entries.toList()
      ..sort((a, b) {
        final da = a.value.pubDate ?? epoch;
        final db = b.value.pubDate ?? epoch;
        return db.compareTo(da);
      });

    // ids already assigned to some cluster (as seed or as member)
    final clustered = <int>{};
    final clusters = <EventCluster>[];

    for (final entry in queue) {
      final seedId = entry.key;
      final seed = entry.value;
      if (clustered.contains(seedId)) continue;

      final neighbours = await _duriin.findSimilar(
        seedId,
        limit: neighbourFetchLimit,
      );

      clustered.add(seedId);
      final members = <FeedItem>[seed];
      final distances = <int, double>{};

      for (final hit in neighbours) {
        // enforce the cap before adding, so a cap of 1 really means seed-only
        if (members.length >= maxClusterSize) break;

        final nid = hit.item.id;
        if (nid == null) continue;
        // one check covers three cases: the api echoing the seed back,
        // a duplicate hit in the same response, and an article that an
        // earlier cluster already claimed.
        if (clustered.contains(nid)) continue;

        // keep only neighbours we actually fetched (same ticker / window)
        final inWindow = byId[nid];
        if (inWindow == null) continue;

        // ...and that are close enough to count as the same event
        final d = hit.distance;
        if (d == null || d > distanceThreshold) continue;

        members.add(inWindow);
        distances[nid] = d;
        clustered.add(nid);
      }

      clusters.add(EventCluster(
        seed: seed,
        articles: members,
        distancesFromSeed: distances,
      ));
    }

    // articles with no id (shouldn't happen post-api-update, but just in case)
    // each becomes its own singleton cluster so we never silently drop them.
    for (final orphan in withoutId) {
      clusters.add(EventCluster(
        seed: orphan,
        articles: [orphan],
        distancesFromSeed: const {},
      ));
    }

    return clusters;
  }
}