154 lines
4.4 KiB
Dart
154 lines
4.4 KiB
Dart
import "package:capstone_project/services/duriin_service.dart";
|
||
import "package:capstone_project/utils/agrigator.dart";
|
||
|
||
|
||
/// One event cluster: a seed article together with its semantic neighbours.
///
/// All distances are measured from [seed]; the seed itself counts as
/// distance 0.
class EventCluster {
  /// The article the cluster was grown from.
  final FeedItem seed;

  /// The articles that make up this cluster.
  final List<FeedItem> articles;

  /// Maps an article id to that article's distance from [seed].
  ///
  /// An id that is absent from the map has no known distance and is simply
  /// left out of [distanceStats].
  final Map<int, double> distancesFromSeed;

  EventCluster({
    required this.seed,
    required this.articles,
    required this.distancesFromSeed,
  });

  /// Returns the smallest, mean and largest known distance in this cluster.
  ///
  /// All three fields are `null` when [distancesFromSeed] is empty (for
  /// example a cluster that only holds its seed).
  ({double? min, double? avg, double? max}) distanceStats() {
    final known = distancesFromSeed.values;
    if (known.isEmpty) {
      return (min: null, avg: null, max: null);
    }

    var smallest = known.first;
    var largest = known.first;
    var total = 0.0;
    var count = 0;
    for (final distance in known) {
      // plain comparisons on purpose: a value only replaces the running
      // extreme when it is strictly smaller / larger.
      smallest = distance < smallest ? distance : smallest;
      largest = distance > largest ? distance : largest;
      total += distance;
      count++;
    }

    return (min: smallest, avg: total / count, max: largest);
  }
}
|
||
|
||
|
||
/// Greedily groups feed articles into [EventCluster]s.
///
/// Articles are visited newest first. Each one that is not yet part of a
/// cluster becomes a seed, its semantic neighbours are fetched through
/// `DuriinService.findSimilar`, and the neighbours that pass the filters in
/// [cluster] join the seed's cluster.
class EventClusterer {
  /// Service used to look up the semantic neighbours of a seed article.
  final DuriinService _duriin;

  /// Largest distance from the seed a neighbour may have and still join.
  ///
  /// Neighbours whose distance is strictly greater than this are dropped as
  /// off-topic. Calibrated against observed api distances: genuinely
  /// same-event pairs land around 0.58–0.62, different-event-same-topic pairs
  /// start around 0.70+. Tighten if clusters start merging distinct events,
  /// loosen if obvious same-event stories end up as singletons.
  final double distanceThreshold;

  /// Hard cap on articles per cluster — keeps prompt size predictable.
  ///
  /// The seed counts towards the cap and is always kept, so a value of 1 or
  /// less produces seed-only clusters.
  final int maxClusterSize;

  /// How many neighbours to ask the api for per seed.
  final int neighbourFetchLimit;

  /// Creates a clusterer.
  ///
  /// A fresh [DuriinService] is constructed when [duriin] is omitted.
  EventClusterer({
    DuriinService? duriin,
    this.distanceThreshold = 0.60,
    this.maxClusterSize = 10,
    this.neighbourFetchLimit = 25,
  }) : _duriin = duriin ?? DuriinService();

  /// Clusters [articles] and returns the resulting clusters.
  ///
  /// Clusters seeded by articles that have an id come first, in newest-first
  /// seed order; every article without an id follows as its own singleton
  /// cluster, in input order. Returns an empty list for empty input.
  ///
  /// One `findSimilar` call is awaited per seed. Errors thrown by that call
  /// are not caught here and propagate to the caller.
  Future<List<EventCluster>> cluster(List<FeedItem> articles) async {
    if (articles.isEmpty) return [];

    // index by id for fast membership checks when neighbours come back.
    // NOTE(review): when two input articles share an id, the later one
    // replaces the earlier one in this index.
    final byId = <int, FeedItem>{};
    final withoutId = <FeedItem>[];
    for (final article in articles) {
      final id = article.id;
      if (id != null) {
        byId[id] = article;
      } else {
        withoutId.add(article);
      }
    }

    // work through newest first so the first signal surfaced is the
    // freshest; articles without a pubDate are ordered as if published at
    // the epoch.
    final epoch = DateTime.fromMillisecondsSinceEpoch(0);
    final queue = byId.values.toList()
      ..sort((a, b) => (b.pubDate ?? epoch).compareTo(a.pubDate ?? epoch));

    final clustered = <int>{};
    final clusters = <EventCluster>[];

    for (final seed in queue) {
      // every queued article came out of byId, so its id is non-null.
      final seedId = seed.id!;
      if (clustered.contains(seedId)) continue;

      final neighbours = await _duriin.findSimilar(
        seedId,
        limit: neighbourFetchLimit,
      );

      final members = <FeedItem>[seed];
      final distances = <int, double>{};

      // starts with the seed so it is skipped if the api returns the seed in
      // its own neighbour list; also dedupes repeated hits.
      final memberIds = <int>{seedId};

      for (final hit in neighbours) {
        // enforce the cap before admitting anyone, so a cap of 1 (or less)
        // really yields a seed-only cluster instead of seed + one neighbour.
        if (members.length >= maxClusterSize) break;

        final nid = hit.item.id;
        if (nid == null || memberIds.contains(nid)) continue;

        // keep only neighbours that are among the articles we were given
        // (same ticker / window).
        // NOTE(review): a neighbour that already belongs to an earlier
        // cluster is still admitted here — only seeds are checked against
        // `clustered` — so one article can appear in several clusters.
        // Confirm that overlap is intended.
        final inWindow = byId[nid];
        if (inWindow == null) continue;

        // ...and that are close enough to count as the same event.
        final d = hit.distance;
        if (d == null) continue;
        if (d > distanceThreshold) continue;

        members.add(inWindow);
        distances[nid] = d;
        memberIds.add(nid);
      }

      // members never get to seed a cluster of their own later on.
      clustered.addAll(memberIds);

      clusters.add(EventCluster(
        seed: seed,
        articles: members,
        distancesFromSeed: distances,
      ));
    }

    // articles with no id (shouldn't happen post-api-update, but just in
    // case) each become their own singleton cluster so we never silently
    // drop them.
    for (final orphan in withoutId) {
      clusters.add(EventCluster(
        seed: orphan,
        articles: [orphan],
        distancesFromSeed: const {},
      ));
    }

    return clusters;
  }
}
|