From 38be932abcf400943ab411bf80d99ba9489e15a9 Mon Sep 17 00:00:00 2001 From: Patryk Gensch <43010113+patryk025@users.noreply.github.com> Date: Sun, 31 May 2026 12:33:50 +0200 Subject: [PATCH] Similar versions: surface-overlap metric + endpoint + UI panel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ranks catalogued engine versions by how much of their CMC_* surface they share, which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%. - similarity.py: jaccard, surface_similarity (per-axis + pooled overall), fuzzy_similarity (ssdeep via ppdeep, secondary signal) - service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit) - UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff) - tests: 6 new (jaccard, identical/disjoint, golden pair 0 < overall < 100, fuzzy, endpoint); 28/28 pass Co-Authored-By: Claude Opus 4.8 --- README.md | 13 +++++- ams/api/routes/snapshots.py | 17 +++++++ ams/api/schemas.py | 7 +++ ams/api/service.py | 24 ++++++++++ ams/api/static/app.js | 24 ++++++++++ ams/api/static/style.css | 13 ++++++ ams/similarity.py | 86 ++++++++++++++++++++++++++++++++++ tests/test_similarity.py | 92 +++++++++++++++++++++++++++++++++++++ 8 files changed, 275 insertions(+), 1 deletion(-) create mode 100644 ams/similarity.py create mode 100644 tests/test_similarity.py diff --git a/README.md b/README.md index e3304e1..d913d15 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,16 @@ python -m ams.api.importer --game "Reksio i UFO" snapshots/PIKLIB8.dll.snapshot. uvicorn ams.api.app:create_app --factory --reload # serwer ``` Endpointy: `POST/GET /games`, `POST/GET /snapshots` (import deduplikowany po sha256), -`GET /diff?old=&new=[&owner=]`, `GET /health`. Testy: `pytest` (11, w tym integracyjne na golden pair). +`GET /diff?old=&new=[&owner=]`, `GET /snapshots/{id}/similar`, `POST/GET /jobs`, `GET /health`. 
+Testy: `pytest` (28, w tym integracyjne na golden pair). + +### Podobne wersje + +`GET /snapshots/{id}/similar[?min=N]` rankuje pozostałe wersje w katalogu po **overlapie +powierzchni** — Jaccard zbiorów tożsamości (te same klucze co diff) per oś, plus pula `overall`. +Miara jest *cross-compiler*: golden pair PIKLIB8 (MSVC6) ↔ bloomoodll (MSVC8) wychodzi **85%** +(types 95% / methods 87% / events 77% / fields 90%), tam gdzie fuzzy-hash binarki daje 0. +Fuzzy (ssdeep) leci jako sygnał poboczny „prawie ten sam plik", gdy snapshot ma `binary.fuzzy`. ## Front — Command Center @@ -127,6 +136,8 @@ Po starcie serwera otwórz **http://127.0.0.1:8000/** (`/` → `/ui/`). Statyczn wersji (A/B), wizualny diff po 4 osiach z filtrem klasy i przeglądarka pojedynczej powierzchni. Przycisk **+ wgraj** w panelu gier otwiera upload ISO/ZIP/DLL → `POST /jobs`; status zadania (queued→started→finished/failed) jest odpytywany na żywo, a po zakończeniu lista wersji odświeża się sama. +W przeglądarce pojedynczej wersji widać panel **Podobne wersje** (pasek overlapu + `⇄ diff` ustawiający +A/B i odpalający porównanie). 
## Format snapshotu diff --git a/ams/api/routes/snapshots.py b/ams/api/routes/snapshots.py index 35f2770..cb60bbe 100644 --- a/ams/api/routes/snapshots.py +++ b/ams/api/routes/snapshots.py @@ -39,3 +39,20 @@ def get_snapshot(snapshot_id: int, db: Session = Depends(get_db)) -> models.Snap if snap is None: raise HTTPException(404, "snapshot not found") return snap + + +@router.get("/{snapshot_id}/similar", response_model=list[schemas.SimilarHit]) +def similar_snapshots( + snapshot_id: int, + min: int = Query(0, ge=0, le=100, description="drop hits below this overall score"), + db: Session = Depends(get_db), +) -> list[schemas.SimilarHit]: + hits = service.similar_snapshots(db, snapshot_id, minimum=min) + if hits is None: + raise HTTPException(404, "snapshot not found") + return [ + schemas.SimilarHit( + snapshot=schemas.SnapshotOut.model_validate(snap), + overall=score["overall"], fuzzy=score["fuzzy"], axes=score["axes"]) + for snap, score in hits + ] diff --git a/ams/api/schemas.py b/ams/api/schemas.py index c86c675..83e7943 100644 --- a/ams/api/schemas.py +++ b/ams/api/schemas.py @@ -43,6 +43,13 @@ class GameDetail(GameOut): snapshots: list[SnapshotOut] = [] +class SimilarHit(BaseModel): + snapshot: SnapshotOut + overall: int # pooled surface-overlap score 0–100 + fuzzy: int | None # ssdeep similarity of the raw binary, when available + axes: dict # per-axis {shared, only_a, only_b, score} + + class JobOut(BaseModel): model_config = ConfigDict(from_attributes=True) id: int diff --git a/ams/api/service.py b/ams/api/service.py index 5a2fe1f..952ca2f 100644 --- a/ams/api/service.py +++ b/ams/api/service.py @@ -42,6 +42,30 @@ def _get_or_create_game(db: Session, name: str) -> models.Game: return game +def similar_snapshots( + db: Session, snapshot_id: int, minimum: int = 0 +) -> list[tuple[models.Snapshot, dict]]: + """Rank every other catalogued snapshot against #snapshot_id by surface similarity. 
+ + Returns (snapshot, score) pairs (score = ams.similarity report) sorted by `overall` desc, + dropping anything below `minimum`. Returns None if the target doesn't exist.""" + from ..similarity import similarity + from ..snapshot import Snapshot as Surface + + target = db.get(models.Snapshot, snapshot_id) + if target is None: + return None + t_surface = Surface(target.data) + + hits: list[tuple[models.Snapshot, dict]] = [] + for other in db.scalars(select(models.Snapshot).where(models.Snapshot.id != snapshot_id)): + score = similarity(t_surface, Surface(other.data)) + if score["overall"] >= minimum: + hits.append((other, score)) + hits.sort(key=lambda pair: pair[1]["overall"], reverse=True) + return hits + + def import_snapshot(db: Session, data: dict[str, Any], game_name: str | None = None) -> models.Snapshot: """Upsert a snapshot, deduped by the binary's sha256 (falling back to a content hash).""" sha = data.get("binary", {}).get("sha256") or _content_sha(data) diff --git a/ams/api/static/app.js b/ams/api/static/app.js index 40ef282..fb68919 100644 --- a/ams/api/static/app.js +++ b/ams/api/static/app.js @@ -273,6 +273,9 @@ async function browse(id) { ]; out.innerHTML = ""; out.append(el("div", { class: "diff-head" }, "Przegląd: ", el("b", {}, `${snap.binary_name} [${snap.engine}/${snap.compiler}]`))); + const simBox = el("div", { class: "similar" }); + out.append(simBox); + loadSimilar(id, simBox); const filter = el("input", { class: "owner browse-filter", placeholder: "filtruj…", oninput: () => render() }); const tabbar = el("div", {}); const list = el("div", {}); @@ -292,6 +295,27 @@ async function browse(id) { render(); } +async function loadSimilar(targetId, box) { + let hits; + try { hits = await jget("/snapshots/" + targetId + "/similar"); } + catch { return; } // endpoint absent / single-snapshot catalog — just show nothing + if (!hits.length) return; + box.append(el("div", { class: "similar-title" }, "Podobne wersje (overlap powierzchni)")); + for 
(const h of hits.slice(0, 6)) { + const s = h.snapshot; + const bar = el("span", { class: "simbar" }, + el("span", { class: "simfill", style: "width:" + h.overall + "%" })); + box.append(el("div", { class: "simrow" }, + el("span", { class: "simscore" }, h.overall + "%"), + bar, + el("span", { class: "simname", title: "przejrzyj", onclick: () => browse(s.id) }, + `${s.binary_name} [${s.engine || "?"}]`), + h.fuzzy != null ? el("span", { class: "simfuzzy", title: "ssdeep binarki" }, "fuzzy " + h.fuzzy) : null, + el("span", { class: "simdiff", title: "porównaj tę wersję z aktualną", + onclick: () => { state.a = targetId; state.b = s.id; refreshSelection(); compare(); } }, "⇄ diff"))); + } +} + // --- boot ------------------------------------------------------------------------------------- $("compare").addEventListener("click", compare); $("owner").addEventListener("keydown", (e) => { if (e.key === "Enter") compare(); }); diff --git a/ams/api/static/style.css b/ams/api/static/style.css index 66811ef..1dc428c 100644 --- a/ams/api/static/style.css +++ b/ams/api/static/style.css @@ -100,6 +100,19 @@ body { background: var(--bg); color: var(--fg); font: 13px/1.45 var(--mono); } .empty { color: var(--dim); font-style: italic; } .moved { color: var(--accent); } +.similar { margin: 4px 0 16px; } +.similar-title { color: var(--dim); text-transform: uppercase; font-size: 11px; letter-spacing: 1px; margin-bottom: 6px; } +.simrow { display: flex; align-items: center; gap: 10px; padding: 4px 0; } +.simscore { width: 38px; text-align: right; color: var(--accent); font-weight: 600; } +.simbar { flex: 0 0 120px; height: 7px; background: #16202c; border: 1px solid var(--border); + border-radius: 4px; overflow: hidden; } +.simfill { display: block; height: 100%; background: linear-gradient(90deg, var(--accent2), var(--add)); } +.simname { flex: 1; min-width: 0; color: var(--fg); cursor: pointer; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } +.simname:hover { color: 
var(--accent); text-decoration: underline; } +.simfuzzy { color: var(--dim); font-size: 11px; } +.simdiff { color: var(--accent); cursor: pointer; font-size: 11px; } +.simdiff:hover { text-decoration: underline; } + .browse-filter { margin-bottom: 10px; } .btab { display: inline-block; padding: 4px 10px; margin-right: 6px; border: 1px solid var(--border); border-radius: 6px; cursor: pointer; color: var(--dim); } diff --git a/ams/similarity.py b/ams/similarity.py new file mode 100644 index 0000000..7d5e76a --- /dev/null +++ b/ams/similarity.py @@ -0,0 +1,86 @@ +"""How similar are two engine versions? + +Two complementary signals: + +* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity + keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is + compiler-agnostic, so a MSVC6 build and a MSVC8 build of the same engine still score high. +* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical + files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL" + flag, present only when ppdeep/ssdeep produced a hash at acquisition time. +""" + +from __future__ import annotations + +from typing import Hashable + +from .diff import _owner_name_key, _type_key +from .snapshot import Snapshot + +# axis -> (list accessor, identity-key fn) — mirrors ams.diff's keying so similarity and diff agree. 
+_AXES = { + "types": (lambda s: s.types, _type_key), + "methods": (lambda s: s.methods, _owner_name_key), + "events": (lambda s: s.events, _owner_name_key), + "fields": (lambda s: s.fields, _owner_name_key), +} + + +def _keys(snap: Snapshot, axis: str) -> set[Hashable]: + getter, key = _AXES[axis] + return {key(it) for it in getter(snap)} + + +def jaccard(a: set, b: set) -> float: + """|A∩B| / |A∪B|; two empty sets are defined as identical (1.0).""" + union = a | b + return (len(a & b) / len(union)) if union else 1.0 + + +def surface_similarity(a: Snapshot, b: Snapshot) -> dict: + """Per-axis overlap plus a pooled overall score (0–100). + + `overall` is the micro-average — one Jaccard over every identity key from all axes — so it + reads as "this share of the whole engine surface is common", naturally weighting by axis size.""" + per: dict[str, dict] = {} + inter_total = union_total = 0 + for axis in _AXES: + ka, kb = _keys(a, axis), _keys(b, axis) + inter, union = len(ka & kb), len(ka | kb) + per[axis] = { + "shared": inter, + "only_a": len(ka - kb), + "only_b": len(kb - ka), + "score": round(100 * jaccard(ka, kb)), + } + inter_total += inter + union_total += union + overall = round(100 * inter_total / union_total) if union_total else 100 + return {"overall": overall, "axes": per} + + +def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None: + """ssdeep comparison (0–100) of the two binaries' fuzzy hashes, or None if either is missing + or no ssdeep implementation is installed.""" + ha, hb = a.binary.get("fuzzy"), b.binary.get("fuzzy") + if not ha or not hb: + return None + mod = None + for name in ("ppdeep", "ssdeep"): + try: + mod = __import__(name) + break + except ImportError: + continue + if mod is None: + return None + try: + return int(mod.compare(ha, hb)) + except Exception: + return None + + +def similarity(a: Snapshot, b: Snapshot) -> dict: + """Combined report: pooled `overall`, per-axis breakdown, and the secondary `fuzzy` flag.""" + surf = 
surface_similarity(a, b) + return {"overall": surf["overall"], "axes": surf["axes"], "fuzzy": fuzzy_similarity(a, b)} diff --git a/tests/test_similarity.py b/tests/test_similarity.py new file mode 100644 index 0000000..3b53c2b --- /dev/null +++ b/tests/test_similarity.py @@ -0,0 +1,92 @@ +"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity +from ams.snapshot import Snapshot + +SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots" +PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json" +BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json" + +pytestmark = pytest.mark.skipif(not PIKLIB.exists(), reason="golden snapshots not present") + + +def _snap(path: Path) -> Snapshot: + with open(path, encoding="utf-8") as fh: + return Snapshot(json.load(fh)) + + +def test_jaccard_basics(): + assert jaccard(set(), set()) == 1.0 + assert jaccard({1, 2}, {1, 2}) == 1.0 + assert jaccard({1, 2}, {3, 4}) == 0.0 + assert jaccard({1, 2}, {2, 3}) == pytest.approx(1 / 3) + + +def test_identical_is_100(): + s = _snap(PIKLIB) + surf = surface_similarity(s, s) + assert surf["overall"] == 100 + assert all(ax["score"] == 100 for ax in surf["axes"].values()) + + +def test_disjoint_is_0(): + a = Snapshot({"types": [{"script_name": "AAA"}], "methods": [], "events": [], "fields": []}) + b = Snapshot({"types": [{"script_name": "ZZZ"}], "methods": [], "events": [], "fields": []}) + assert surface_similarity(a, b)["overall"] == 0 + + +def test_golden_pair_is_similar_not_identical(): + # PIKLIB (MSVC6) vs BlooMoo (MSVC8): sibling engines — high surface overlap, but not equal. 
+ rep = similarity(_snap(PIKLIB), _snap(BLOOMOO)) + assert 0 < rep["overall"] < 100 + assert rep["axes"]["methods"]["shared"] > 0 + # the golden snapshots carry no fuzzy hash (extractor doesn't compute one) + assert rep["fuzzy"] is None + + +def test_fuzzy_matches_identical_hash(): + pytest.importorskip("ppdeep") + import ppdeep + h = ppdeep.hash(b"the quick brown fox " * 50) + a = Snapshot({"binary": {"fuzzy": h}}) + b = Snapshot({"binary": {"fuzzy": h}}) + assert fuzzy_similarity(a, b) == 100 + assert fuzzy_similarity(a, Snapshot({"binary": {}})) is None + + +# --- endpoint --------------------------------------------------------------------------------- + +@pytest.fixture() +def client(tmp_path): + pytest.importorskip("fastapi") + from fastapi.testclient import TestClient + + from ams.api.app import create_app + return TestClient(create_app(database_url="sqlite:///{0}/sim.db".format(tmp_path))) + + +def _load(path: Path) -> dict: + with open(path, encoding="utf-8") as fh: + return json.load(fh) + + +def test_similar_endpoint(client): + a = client.post("/snapshots", params={"game": "Reksio i UFO"}, json=_load(PIKLIB)).json()["id"] + b = client.post("/snapshots", params={"game": "Reksio i Kapitan Nemo"}, json=_load(BLOOMOO)).json()["id"] + + hits = client.get("/snapshots/{0}/similar".format(a)).json() + assert len(hits) == 1 + assert hits[0]["snapshot"]["id"] == b + assert hits[0]["overall"] > 0 + assert "methods" in hits[0]["axes"] + + # a 100-only filter drops the cross-compiler pair + assert client.get("/snapshots/{0}/similar".format(a), params={"min": 100}).json() == [] + assert client.get("/snapshots/999/similar").status_code == 404