Similar versions: surface-overlap metric + endpoint + UI panel
Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.
- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
endpoint + min filter) -> 28/28
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,30 @@ def _get_or_create_game(db: Session, name: str) -> models.Game:
|
||||
return game
|
||||
|
||||
|
||||
def similar_snapshots(
    db: Session, snapshot_id: int, minimum: int = 0
) -> list[tuple[models.Snapshot, dict]] | None:
    """Rank every other catalogued snapshot against #snapshot_id by surface similarity.

    Args:
        db: Open database session used to load the target and the candidates.
        snapshot_id: Primary key of the snapshot to compare everything else to.
        minimum: Lowest `overall` score a candidate must reach to be kept
            (inclusive). NOTE(review): presumably on the same scale as
            `score["overall"]` (a percentage?) -- confirm against `similarity`.

    Returns:
        A list of `(snapshot, score)` pairs, where `score` is the dict returned
        by `similarity` for that candidate, sorted by `score["overall"]` in
        descending order; candidates scoring below `minimum` are dropped.
        Returns None (not an empty list) if no snapshot with `snapshot_id`
        exists, so callers can tell "unknown target" apart from "no matches".
    """
    # Imported at call time rather than at module level.
    # NOTE(review): presumably to avoid an import cycle -- confirm before hoisting.
    from ..similarity import similarity
    from ..snapshot import Snapshot as Surface

    target = db.get(models.Snapshot, snapshot_id)
    if target is None:
        return None

    # Build the target's surface once; it is reused for every comparison below.
    t_surface = Surface(target.data)

    hits: list[tuple[models.Snapshot, dict]] = []
    # The target itself is excluded in the query, so it never appears in the result.
    for other in db.scalars(select(models.Snapshot).where(models.Snapshot.id != snapshot_id)):
        score = similarity(t_surface, Surface(other.data))
        if score["overall"] >= minimum:
            hits.append((other, score))

    # Best match first; list.sort is stable, so ties keep the query's order.
    hits.sort(key=lambda pair: pair[1]["overall"], reverse=True)
    return hits
|
||||
|
||||
|
||||
def import_snapshot(db: Session, data: dict[str, Any], game_name: str | None = None) -> models.Snapshot:
|
||||
"""Upsert a snapshot, deduped by the binary's sha256 (falling back to a content hash)."""
|
||||
sha = data.get("binary", {}).get("sha256") or _content_sha(data)
|
||||
|
||||
Reference in New Issue
Block a user