Similar versions: surface-overlap metric + endpoint + UI panel

Ranks catalogued engine versions by how much of their CMC_* surface they share, which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.

- similarity.py: jaccard, surface_similarity (per-axis + pooled overall), fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy, endpoint + min filter) -> 28/28

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 12:33:50 +02:00
parent 30b2b1011e
commit 38be932abc
8 changed files with 275 additions and 1 deletions
--- a/ams/similarity.py
+++ b/ams/similarity.py
@@ -0,0 +1,86 @@
+"""How similar are two engine versions?
+
+Two complementary signals:
+
+* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
+  keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
+  compiler-agnostic, so an MSVC6 build and an MSVC8 build of the same engine still score high.
+* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
+  files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
+  flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
+"""
+
+from __future__ import annotations
+
+from typing import Hashable
+
+from .diff import _owner_name_key, _type_key
+from .snapshot import Snapshot
+
+# Axis name -> (accessor returning that axis' item list from a snapshot, identity-key function).
+# The key functions are the ones imported from .diff, so this module keys items the way it does.
+_AXES = {
+    "types": (lambda snap: snap.types, _type_key),
+    "methods": (lambda snap: snap.methods, _owner_name_key),
+    "events": (lambda snap: snap.events, _owner_name_key),
+    "fields": (lambda snap: snap.fields, _owner_name_key),
+}
+
+
+def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
+    """Collect the identity keys of every item `snap` holds on `axis`.
+
+    `axis` must be a key of `_AXES`; a `KeyError` propagates otherwise. Items that map to the
+    same identity key collapse into one entry, since the result is a set.
+    """
+    list_items, identity_of = _AXES[axis]
+    return set(map(identity_of, list_items(snap)))
+
+
+def jaccard(a: set, b: set) -> float:
+    """Return the Jaccard index ``len(a & b) / len(a | b)`` of two sets.
+
+    When both sets are empty the union is empty as well; that case is defined as 1.0, i.e.
+    two empty sets count as identical.
+    """
+    combined = a | b
+    if not combined:
+        return 1.0
+    return len(a & b) / len(combined)
+
+
+def _percent(shared: int, total: int) -> int:
+    """Return `shared / total` as a whole percentage; an empty union (`total == 0`) is 100."""
+    # Multiply before dividing: 100 * shared is an exact int, so the single division is the only
+    # float rounding step. 100 * (shared / total) can land just below a .5 boundary instead
+    # (23/40 -> 57.49999999999999) and then rounds differently from the same ratio computed here.
+    return round(100 * shared / total) if total else 100
+
+
+def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
+    """Compare two snapshots axis by axis and as one pooled surface.
+
+    For every axis in `_AXES`, the identity keys of both snapshots are compared as sets.
+
+    Returns:
+        ``{"overall": int, "axes": {axis: {"shared", "only_a", "only_b", "score"}}}``.
+        ``shared`` / ``only_a`` / ``only_b`` are key counts; ``score`` is that axis' Jaccard
+        overlap as a whole percentage (100 when neither snapshot has keys on the axis).
+        ``overall`` is the micro-average — summed intersections over summed unions across all
+        axes — so it reads as "this share of the whole surface is common" and larger axes weigh
+        more. It is 100 when every axis is empty on both sides.
+
+    Per-axis ``score`` and ``overall`` are produced by the same `_percent` helper, so a pair
+    with a single populated axis always reports the same number in both places.
+    """
+    per_axis: dict[str, dict] = {}
+    shared_total = union_total = 0
+    for axis in _AXES:
+        keys_a, keys_b = _keys(a, axis), _keys(b, axis)
+        shared = len(keys_a & keys_b)
+        # Derive the remaining counts arithmetically instead of building more sets:
+        # |A - B| = |A| - |A ∩ B| and |A ∪ B| = |A ∩ B| + |A - B| + |B - A|.
+        only_a = len(keys_a) - shared
+        only_b = len(keys_b) - shared
+        union = shared + only_a + only_b
+        per_axis[axis] = {
+            "shared": shared,
+            "only_a": only_a,
+            "only_b": only_b,
+            "score": _percent(shared, union),
+        }
+        shared_total += shared
+        union_total += union
+    return {"overall": _percent(shared_total, union_total), "axes": per_axis}
+
+
+def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
+    """Compare the stored fuzzy hashes of two snapshots' binaries.
+
+    Reads the ``"fuzzy"`` entry of each snapshot's ``binary`` mapping and passes both values to
+    the ``compare`` function of the first importable module among ``ppdeep`` and ``ssdeep``.
+
+    Returns:
+        The comparison result converted to ``int`` (presumably the 0–100 ssdeep score; the
+        range is not enforced here), or ``None`` when either hash is missing or empty, when
+        neither module can be imported, or when the comparison itself raises.
+    """
+    hash_a = a.binary.get("fuzzy")
+    hash_b = b.binary.get("fuzzy")
+    if not (hash_a and hash_b):
+        return None
+
+    # ppdeep is tried first, ssdeep second; the else branch runs only when both imports failed.
+    for module_name in ("ppdeep", "ssdeep"):
+        try:
+            backend = __import__(module_name)
+        except ImportError:
+            continue
+        break
+    else:
+        return None
+
+    try:
+        return int(backend.compare(hash_a, hash_b))
+    except Exception:  # best effort: any failure while comparing is reported as "no score"
+        return None
+
+
+def similarity(a: Snapshot, b: Snapshot) -> dict:
+    """Build the combined similarity report for two snapshots.
+
+    Returns:
+        A dict with ``overall`` and ``axes`` taken from `surface_similarity(a, b)` and
+        ``fuzzy`` set to the result of `fuzzy_similarity(a, b)`, which may be ``None``.
+    """
+    surface = surface_similarity(a, b)
+    report = {field: surface[field] for field in ("overall", "axes")}
+    report["fuzzy"] = fuzzy_similarity(a, b)
+    return report