Similar versions: surface-overlap metric + endpoint + UI panel

Ranks catalogued engine versions by how much of their CMC_* surface they share, which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.

- similarity.py: jaccard, surface_similarity (per-axis + pooled overall), fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy, endpoint + min filter) -> 28/28

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 12:33:50 +02:00
parent 30b2b1011e
commit 38be932abc
8 changed files with 275 additions and 1 deletions
--- a/ams/api/routes/snapshots.py
+++ b/ams/api/routes/snapshots.py
@@ -39,3 +39,20 @@ def get_snapshot(snapshot_id: int, db: Session = Depends(get_db)) -> models.Snap
    if snap is None:
        raise HTTPException(404, "snapshot not found")
    return snap
+
+
+@router.get("/{snapshot_id}/similar", response_model=list[schemas.SimilarHit])
+def similar_snapshots(
+    snapshot_id: int,
+    min: int = Query(0, ge=0, le=100, description="drop hits below this overall score"),
+    db: Session = Depends(get_db),
+) -> list[schemas.SimilarHit]:
+    # Delegates the ranking to service.similar_snapshots, forwarding `min` as its `minimum`
+    # cut-off; a None result means the target snapshot does not exist -> 404.
+    ranked = service.similar_snapshots(db, snapshot_id, minimum=min)
+    if ranked is None:
+        raise HTTPException(404, "snapshot not found")
+
+    # Wrap each (snapshot row, score report) pair in the response schema, keeping the
+    # order the service returned.
+    response: list[schemas.SimilarHit] = []
+    for row, report in ranked:
+        response.append(
+            schemas.SimilarHit(
+                snapshot=schemas.SnapshotOut.model_validate(row),
+                overall=report["overall"],
+                fuzzy=report["fuzzy"],
+                axes=report["axes"],
+            )
+        )
+    return response
--- a/ams/api/schemas.py
+++ b/ams/api/schemas.py
@@ -43,6 +43,13 @@ class GameDetail(GameOut):
    snapshots: list[SnapshotOut] = []


+class SimilarHit(BaseModel):
+    # One entry of the GET /snapshots/{id}/similar response: another snapshot plus the
+    # scores it obtained against the requested one.
+    snapshot: SnapshotOut   # the other snapshot this entry describes
+    overall: int            # pooled surface-overlap score 0–100
+    fuzzy: int | None       # ssdeep similarity of the raw binary, when available
+    axes: dict              # per-axis {shared, only_a, only_b, score}
+
+
 class JobOut(BaseModel):
    model_config = ConfigDict(from_attributes=True)
    id: int
--- a/ams/api/service.py
+++ b/ams/api/service.py
@@ -42,6 +42,30 @@ def _get_or_create_game(db: Session, name: str) -> models.Game:
    return game


+def similar_snapshots(
+    db: Session, snapshot_id: int, minimum: int = 0
+) -> list[tuple[models.Snapshot, dict]] | None:
+    """Rank every other catalogued snapshot against #snapshot_id by surface similarity.
+
+    Args:
+        db: session used to load the target and to iterate the remaining snapshots.
+        snapshot_id: primary key of the snapshot to compare everything else against.
+        minimum: hits whose `overall` score is below this value are dropped.
+
+    Returns:
+        (snapshot, score) pairs, where score is the report produced by
+        `ams.similarity.similarity`, sorted by `overall` descending (the sort is stable, so
+        ties keep the order the query yielded them in); or None if the target doesn't exist.
+    """
+    # Function-scope imports kept as in the original block (presumably to dodge an import
+    # cycle — TODO confirm).
+    from ..similarity import similarity
+    from ..snapshot import Snapshot as Surface
+
+    target = db.get(models.Snapshot, snapshot_id)
+    if target is None:
+        return None
+    # Build the target's surface once; it is reused for every comparison below.
+    t_surface = Surface(target.data)
+
+    others = db.scalars(select(models.Snapshot).where(models.Snapshot.id != snapshot_id))
+    scored = ((other, similarity(t_surface, Surface(other.data))) for other in others)
+    hits = [(other, score) for other, score in scored if score["overall"] >= minimum]
+    hits.sort(key=lambda pair: pair[1]["overall"], reverse=True)
+    return hits
+
+
 def import_snapshot(db: Session, data: dict[str, Any], game_name: str | None = None) -> models.Snapshot:
    """Upsert a snapshot, deduped by the binary's sha256 (falling back to a content hash)."""
    sha = data.get("binary", {}).get("sha256") or _content_sha(data)
--- a/ams/api/static/app.js
+++ b/ams/api/static/app.js
@@ -273,6 +273,9 @@ async function browse(id) {
  ];
  out.innerHTML = "";
  out.append(el("div", { class: "diff-head" }, "Przegląd: ", el("b", {}, `${snap.binary_name} [${snap.engine}/${snap.compiler}]`)));
+  const simBox = el("div", { class: "similar" });
+  out.append(simBox);
+  loadSimilar(id, simBox);
  const filter = el("input", { class: "owner browse-filter", placeholder: "filtruj…", oninput: () => render() });
  const tabbar = el("div", {});
  const list = el("div", {});
@@ -292,6 +295,27 @@ async function browse(id) {
  render();
 }

+// Fetch the similarity hits for `targetId` and render at most six of them into `box`;
+// `box` is left untouched when the request fails or yields no hits.
+async function loadSimilar(targetId, box) {
+  let hits;
+  try {
+    hits = await jget(`/snapshots/${targetId}/similar`);
+  } catch {
+    return;  // endpoint absent / single-snapshot catalog — just show nothing
+  }
+  if (!hits.length) return;
+  box.append(el("div", { class: "similar-title" }, "Podobne wersje (overlap powierzchni)"));
+  for (const hit of hits.slice(0, 6)) {
+    const other = hit.snapshot;
+    // Overlap bar: the inner fill's width is the overall score in percent.
+    const fill = el("span", { class: "simfill", style: `width:${hit.overall}%` });
+    const bar = el("span", { class: "simbar" }, fill);
+    const score = el("span", { class: "simscore" }, `${hit.overall}%`);
+    // Clicking the name opens that snapshot in the browser view.
+    const name = el("span", { class: "simname", title: "przejrzyj", onclick: () => browse(other.id) },
+      `${other.binary_name} [${other.engine || "?"}]`);
+    // The fuzzy badge only exists when the server supplied a value.
+    const fuzzy = hit.fuzzy != null
+      ? el("span", { class: "simfuzzy", title: "ssdeep binarki" }, `fuzzy ${hit.fuzzy}`)
+      : null;
+    // Select (target, other) as the A/B pair and run the comparison.
+    const diffLink = el("span", {
+      class: "simdiff",
+      title: "porównaj tę wersję z aktualną",
+      onclick: () => {
+        state.a = targetId;
+        state.b = other.id;
+        refreshSelection();
+        compare();
+      },
+    }, "⇄ diff");
+    box.append(el("div", { class: "simrow" }, score, bar, name, fuzzy, diffLink));
+  }
+}
+
 // --- boot -------------------------------------------------------------------------------------
 $("compare").addEventListener("click", compare);
 $("owner").addEventListener("keydown", (e) => { if (e.key === "Enter") compare(); });
--- a/ams/api/static/style.css
+++ b/ams/api/static/style.css
@@ -100,6 +100,19 @@ body { background: var(--bg); color: var(--fg); font: 13px/1.45 var(--mono); }
 .empty { color: var(--dim); font-style: italic; }
 .moved { color: var(--accent); }

+/* "Similar versions" panel: one .simrow per hit = score, overlap bar, name, optional fuzzy badge, diff link. */
+.similar { margin: 4px 0 16px; }
+.similar-title { color: var(--dim); text-transform: uppercase; font-size: 11px; letter-spacing: 1px; margin-bottom: 6px; }
+.simrow { display: flex; align-items: center; gap: 10px; padding: 4px 0; }
+.simscore { width: 38px; text-align: right; color: var(--accent); font-weight: 600; }
+.simbar { flex: 0 0 120px; height: 7px; background: #16202c; border: 1px solid var(--border);
+  border-radius: 4px; overflow: hidden; }
+.simfill { display: block; height: 100%; background: linear-gradient(90deg, var(--accent2), var(--add)); }
+.simname { flex: 1; min-width: 0; color: var(--fg); cursor: pointer; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
+.simname:hover { color: var(--accent); text-decoration: underline; }
+.simfuzzy { color: var(--dim); font-size: 11px; }
+.simdiff { color: var(--accent); cursor: pointer; font-size: 11px; }
+.simdiff:hover { text-decoration: underline; }
+
 .browse-filter { margin-bottom: 10px; }
 .btab { display: inline-block; padding: 4px 10px; margin-right: 6px; border: 1px solid var(--border);
  border-radius: 6px; cursor: pointer; color: var(--dim); }
--- a/ams/similarity.py
+++ b/ams/similarity.py
@@ -0,0 +1,86 @@
+"""How similar are two engine versions?
+
+Two complementary signals:
+
+* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
+  keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
+  compiler-agnostic, so a MSVC6 build and a MSVC8 build of the same engine still score high.
+* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
+  files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
+  flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
+"""
+
+from __future__ import annotations
+
+from typing import Hashable
+
+from .diff import _owner_name_key, _type_key
+from .snapshot import Snapshot
+
+# axis -> (list accessor, identity-key fn) — mirrors ams.diff's keying so similarity and diff agree.
+# The accessor pulls the axis' item list off a Snapshot; the key fn (imported from .diff) maps
+# one item to its identity key. Iteration order here is the order axes appear in the report.
+_AXES = {
+    "types": (lambda s: s.types, _type_key),
+    "methods": (lambda s: s.methods, _owner_name_key),
+    "events": (lambda s: s.events, _owner_name_key),
+    "fields": (lambda s: s.fields, _owner_name_key),
+}
+
+
+def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
+    """Return the set of identity keys of `snap`'s items on `axis` (a key of `_AXES`)."""
+    items_of, identity = _AXES[axis]
+    return set(map(identity, items_of(snap)))
+
+
+def jaccard(a: set, b: set) -> float:
+    """Jaccard index |A∩B| / |A∪B|; by convention two empty sets count as identical (1.0)."""
+    combined = a | b
+    if not combined:
+        return 1.0
+    return len(a & b) / len(combined)
+
+
+def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
+    """Compare two snapshots axis by axis and pool the result into one 0–100 score.
+
+    Returns `{"overall": int, "axes": {axis: {"shared", "only_a", "only_b", "score"}}}`.
+    `overall` is a micro-average: intersection and union sizes are summed over all axes before
+    dividing, so larger axes weigh more. When every axis is empty on both sides it is 100.
+    """
+    breakdown: dict[str, dict] = {}
+    shared_sum = 0
+    union_sum = 0
+    for axis in _AXES:
+        left = _keys(a, axis)
+        right = _keys(b, axis)
+        common = left & right
+        breakdown[axis] = {
+            "shared": len(common),
+            "only_a": len(left - right),
+            "only_b": len(right - left),
+            "score": round(100 * jaccard(left, right)),
+        }
+        shared_sum += len(common)
+        union_sum += len(left | right)
+    if union_sum:
+        overall = round(100 * shared_sum / union_sum)
+    else:
+        overall = 100
+    return {"overall": overall, "axes": breakdown}
+
+
+def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
+    """ssdeep comparison (0–100) of the two binaries' fuzzy hashes.
+
+    Reads the `"fuzzy"` entry of each snapshot's `binary` mapping and compares the two with the
+    first importable backend out of `ppdeep`, `ssdeep` (in that order).
+
+    Returns:
+        The backend's `compare` result as an int, or None when either hash is missing/empty,
+        no backend can be imported, or the comparison itself fails.
+    """
+    from importlib import import_module
+
+    ha, hb = a.binary.get("fuzzy"), b.binary.get("fuzzy")
+    if not ha or not hb:
+        return None
+    for name in ("ppdeep", "ssdeep"):
+        try:
+            backend = import_module(name)
+        except ImportError:
+            continue
+        break
+    else:
+        # Neither implementation is installed.
+        return None
+    try:
+        return int(backend.compare(ha, hb))
+    except Exception:
+        # Deliberately best-effort: fuzzy is only a secondary signal, so a malformed hash or a
+        # backend error must not break the surface ranking.
+        return None
+
+
+def similarity(a: Snapshot, b: Snapshot) -> dict:
+    """Merge both signals into one report with keys `overall`, `axes` and `fuzzy`.
+
+    `overall` and `axes` come from `surface_similarity`; `fuzzy` is whatever
+    `fuzzy_similarity` returns (possibly None)."""
+    surface = surface_similarity(a, b)
+    report = {"overall": surface["overall"], "axes": surface["axes"]}
+    report["fuzzy"] = fuzzy_similarity(a, b)
+    return report