Similar versions: surface-overlap metric + endpoint + UI panel
Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.
- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
endpoint + min filter) -> 28/28
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
13
README.md
13
README.md
@@ -118,7 +118,16 @@ python -m ams.api.importer --game "Reksio i UFO" snapshots/PIKLIB8.dll.snapshot.
|
|||||||
uvicorn ams.api.app:create_app --factory --reload # serwer
|
uvicorn ams.api.app:create_app --factory --reload # serwer
|
||||||
```
|
```
|
||||||
Endpointy: `POST/GET /games`, `POST/GET /snapshots` (import deduplikowany po sha256),
|
Endpointy: `POST/GET /games`, `POST/GET /snapshots` (import deduplikowany po sha256),
|
||||||
`GET /diff?old=&new=[&owner=]`, `GET /health`. Testy: `pytest` (11, w tym integracyjne na golden pair).
|
`GET /diff?old=&new=[&owner=]`, `GET /snapshots/{id}/similar`, `POST/GET /jobs`, `GET /health`.
|
||||||
|
Testy: `pytest` (28, w tym integracyjne na golden pair).
|
||||||
|
|
||||||
|
### Podobne wersje
|
||||||
|
|
||||||
|
`GET /snapshots/{id}/similar[?min=N]` rankuje pozostałe wersje w katalogu po **overlapie
|
||||||
|
powierzchni** — Jaccard zbiorów tożsamości (te same klucze co diff) per oś, plus pula `overall`.
|
||||||
|
Miara jest *cross-compiler*: golden pair PIKLIB8 (MSVC6) ↔ bloomoodll (MSVC8) wychodzi **85%**
|
||||||
|
(types 95% / methods 87% / events 77% / fields 90%), tam gdzie fuzzy-hash binarki daje 0.
|
||||||
|
Fuzzy (ssdeep) leci jako sygnał poboczny „prawie ten sam plik", gdy snapshot ma `binary.fuzzy`.
|
||||||
|
|
||||||
## Front — Command Center
|
## Front — Command Center
|
||||||
|
|
||||||
@@ -127,6 +136,8 @@ Po starcie serwera otwórz **http://127.0.0.1:8000/** (`/` → `/ui/`). Statyczn
|
|||||||
wersji (A/B), wizualny diff po 4 osiach z filtrem klasy i przeglądarka pojedynczej powierzchni.
|
wersji (A/B), wizualny diff po 4 osiach z filtrem klasy i przeglądarka pojedynczej powierzchni.
|
||||||
Przycisk **+ wgraj** w panelu gier otwiera upload ISO/ZIP/DLL → `POST /jobs`; status zadania
|
Przycisk **+ wgraj** w panelu gier otwiera upload ISO/ZIP/DLL → `POST /jobs`; status zadania
|
||||||
(queued→started→finished/failed) jest odpytywany na żywo, a po zakończeniu lista wersji odświeża się sama.
|
(queued→started→finished/failed) jest odpytywany na żywo, a po zakończeniu lista wersji odświeża się sama.
|
||||||
|
W przeglądarce pojedynczej wersji widać panel **Podobne wersje** (pasek overlapu + `⇄ diff` ustawiający
|
||||||
|
A/B i odpalający porównanie).
|
||||||
|
|
||||||
## Format snapshotu
|
## Format snapshotu
|
||||||
|
|
||||||
|
|||||||
@@ -39,3 +39,20 @@ def get_snapshot(snapshot_id: int, db: Session = Depends(get_db)) -> models.Snap
|
|||||||
if snap is None:
|
if snap is None:
|
||||||
raise HTTPException(404, "snapshot not found")
|
raise HTTPException(404, "snapshot not found")
|
||||||
return snap
|
return snap
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{snapshot_id}/similar", response_model=list[schemas.SimilarHit])
def similar_snapshots(
    snapshot_id: int,
    min: int = Query(0, ge=0, le=100, description="drop hits below this overall score"),
    db: Session = Depends(get_db),
) -> list[schemas.SimilarHit]:
    # Delegates the ranking to the service layer; `min` is forwarded as its `minimum` cut-off.
    ranked = service.similar_snapshots(db, snapshot_id, minimum=min)
    if ranked is None:
        # The service signals an unknown target snapshot with None.
        raise HTTPException(404, "snapshot not found")

    # Convert each (ORM row, score report) pair into the response schema, keeping the order.
    response: list[schemas.SimilarHit] = []
    for row, report in ranked:
        response.append(
            schemas.SimilarHit(
                snapshot=schemas.SnapshotOut.model_validate(row),
                overall=report["overall"],
                fuzzy=report["fuzzy"],
                axes=report["axes"],
            )
        )
    return response
|
||||||
|
|||||||
@@ -43,6 +43,13 @@ class GameDetail(GameOut):
|
|||||||
snapshots: list[SnapshotOut] = []
|
snapshots: list[SnapshotOut] = []
|
||||||
|
|
||||||
|
|
||||||
|
class SimilarHit(BaseModel):
    # One ranked entry returned by GET /snapshots/{id}/similar.
    snapshot: SnapshotOut  # the catalogued snapshot that was compared against the target
    overall: int  # pooled surface-overlap score 0–100
    fuzzy: int | None  # ssdeep similarity of the raw binary, when available
    # Per-axis breakdown keyed by axis name; each value holds the integer counters
    # {shared, only_a, only_b, score}. Typed so the OpenAPI schema documents the shape.
    axes: dict[str, dict[str, int]]
|
||||||
|
|
||||||
|
|
||||||
class JobOut(BaseModel):
|
class JobOut(BaseModel):
|
||||||
model_config = ConfigDict(from_attributes=True)
|
model_config = ConfigDict(from_attributes=True)
|
||||||
id: int
|
id: int
|
||||||
|
|||||||
@@ -42,6 +42,30 @@ def _get_or_create_game(db: Session, name: str) -> models.Game:
|
|||||||
return game
|
return game
|
||||||
|
|
||||||
|
|
||||||
|
def similar_snapshots(
    db: Session, snapshot_id: int, minimum: int = 0
) -> list[tuple[models.Snapshot, dict]] | None:
    """Rank every other catalogued snapshot against #snapshot_id by surface similarity.

    Args:
        db: open database session used to load the target and the candidates.
        snapshot_id: primary key of the snapshot to compare the others against.
        minimum: candidates whose `overall` score is below this value are dropped.

    Returns:
        (snapshot, score) pairs sorted by `overall`, highest first, where score is the
        report produced by ams.similarity.similarity (`overall`, `axes`, `fuzzy`).
        None if no snapshot with `snapshot_id` exists.
    """
    # Imported at call time, as in the original implementation.
    from ..similarity import similarity
    from ..snapshot import Snapshot as Surface

    target = db.get(models.Snapshot, snapshot_id)
    if target is None:
        return None
    # Parse the target once; it is reused for every comparison below.
    t_surface = Surface(target.data)

    hits: list[tuple[models.Snapshot, dict]] = []
    for other in db.scalars(select(models.Snapshot).where(models.Snapshot.id != snapshot_id)):
        score = similarity(t_surface, Surface(other.data))
        if score["overall"] >= minimum:
            hits.append((other, score))
    hits.sort(key=lambda pair: pair[1]["overall"], reverse=True)
    return hits
|
||||||
|
|
||||||
|
|
||||||
def import_snapshot(db: Session, data: dict[str, Any], game_name: str | None = None) -> models.Snapshot:
|
def import_snapshot(db: Session, data: dict[str, Any], game_name: str | None = None) -> models.Snapshot:
|
||||||
"""Upsert a snapshot, deduped by the binary's sha256 (falling back to a content hash)."""
|
"""Upsert a snapshot, deduped by the binary's sha256 (falling back to a content hash)."""
|
||||||
sha = data.get("binary", {}).get("sha256") or _content_sha(data)
|
sha = data.get("binary", {}).get("sha256") or _content_sha(data)
|
||||||
|
|||||||
@@ -273,6 +273,9 @@ async function browse(id) {
|
|||||||
];
|
];
|
||||||
out.innerHTML = "";
|
out.innerHTML = "";
|
||||||
out.append(el("div", { class: "diff-head" }, "Przegląd: ", el("b", {}, `${snap.binary_name} [${snap.engine}/${snap.compiler}]`)));
|
out.append(el("div", { class: "diff-head" }, "Przegląd: ", el("b", {}, `${snap.binary_name} [${snap.engine}/${snap.compiler}]`)));
|
||||||
|
const simBox = el("div", { class: "similar" });
|
||||||
|
out.append(simBox);
|
||||||
|
loadSimilar(id, simBox);
|
||||||
const filter = el("input", { class: "owner browse-filter", placeholder: "filtruj…", oninput: () => render() });
|
const filter = el("input", { class: "owner browse-filter", placeholder: "filtruj…", oninput: () => render() });
|
||||||
const tabbar = el("div", {});
|
const tabbar = el("div", {});
|
||||||
const list = el("div", {});
|
const list = el("div", {});
|
||||||
@@ -292,6 +295,27 @@ async function browse(id) {
|
|||||||
render();
|
render();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fetch the versions most similar to snapshot `targetId` and render at most six of them into `box`.
async function loadSimilar(targetId, box) {
  let hits;
  try {
    hits = await jget(`/snapshots/${targetId}/similar`);
  } catch {
    // endpoint absent / single-snapshot catalog — just show nothing
    return;
  }
  if (!hits.length) return;

  box.append(el("div", { class: "similar-title" }, "Podobne wersje (overlap powierzchni)"));

  for (const hit of hits.slice(0, 6)) {
    const snap = hit.snapshot;

    // Clicking the name browses that version; "⇄ diff" selects target/hit as A/B and compares.
    const openSnapshot = () => browse(snap.id);
    const diffAgainstTarget = () => {
      state.a = targetId;
      state.b = snap.id;
      refreshSelection();
      compare();
    };

    const fill = el("span", { class: "simfill", style: `width:${hit.overall}%` });
    const bar = el("span", { class: "simbar" }, fill);
    const score = el("span", { class: "simscore" }, `${hit.overall}%`);
    const name = el("span", { class: "simname", title: "przejrzyj", onclick: openSnapshot },
      `${snap.binary_name} [${snap.engine || "?"}]`);
    // The fuzzy badge is only present when the API returned a value for it.
    const fuzzy = hit.fuzzy != null
      ? el("span", { class: "simfuzzy", title: "ssdeep binarki" }, `fuzzy ${hit.fuzzy}`)
      : null;
    const diff = el("span", { class: "simdiff", title: "porównaj tę wersję z aktualną",
      onclick: diffAgainstTarget }, "⇄ diff");

    box.append(el("div", { class: "simrow" }, score, bar, name, fuzzy, diff));
  }
}
|
||||||
|
|
||||||
// --- boot -------------------------------------------------------------------------------------
|
// --- boot -------------------------------------------------------------------------------------
|
||||||
$("compare").addEventListener("click", compare);
|
$("compare").addEventListener("click", compare);
|
||||||
$("owner").addEventListener("keydown", (e) => { if (e.key === "Enter") compare(); });
|
$("owner").addEventListener("keydown", (e) => { if (e.key === "Enter") compare(); });
|
||||||
|
|||||||
@@ -100,6 +100,19 @@ body { background: var(--bg); color: var(--fg); font: 13px/1.45 var(--mono); }
|
|||||||
.empty { color: var(--dim); font-style: italic; }
|
.empty { color: var(--dim); font-style: italic; }
|
||||||
.moved { color: var(--accent); }
|
.moved { color: var(--accent); }
|
||||||
|
|
||||||
|
.similar { margin: 4px 0 16px; }
|
||||||
|
.similar-title { color: var(--dim); text-transform: uppercase; font-size: 11px; letter-spacing: 1px; margin-bottom: 6px; }
|
||||||
|
.simrow { display: flex; align-items: center; gap: 10px; padding: 4px 0; }
|
||||||
|
.simscore { width: 38px; text-align: right; color: var(--accent); font-weight: 600; }
|
||||||
|
.simbar { flex: 0 0 120px; height: 7px; background: #16202c; border: 1px solid var(--border);
|
||||||
|
border-radius: 4px; overflow: hidden; }
|
||||||
|
.simfill { display: block; height: 100%; background: linear-gradient(90deg, var(--accent2), var(--add)); }
|
||||||
|
.simname { flex: 1; min-width: 0; color: var(--fg); cursor: pointer; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }
|
||||||
|
.simname:hover { color: var(--accent); text-decoration: underline; }
|
||||||
|
.simfuzzy { color: var(--dim); font-size: 11px; }
|
||||||
|
.simdiff { color: var(--accent); cursor: pointer; font-size: 11px; }
|
||||||
|
.simdiff:hover { text-decoration: underline; }
|
||||||
|
|
||||||
.browse-filter { margin-bottom: 10px; }
|
.browse-filter { margin-bottom: 10px; }
|
||||||
.btab { display: inline-block; padding: 4px 10px; margin-right: 6px; border: 1px solid var(--border);
|
.btab { display: inline-block; padding: 4px 10px; margin-right: 6px; border: 1px solid var(--border);
|
||||||
border-radius: 6px; cursor: pointer; color: var(--dim); }
|
border-radius: 6px; cursor: pointer; color: var(--dim); }
|
||||||
|
|||||||
86
ams/similarity.py
Normal file
86
ams/similarity.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""How similar are two engine versions?
|
||||||
|
|
||||||
|
Two complementary signals:
|
||||||
|
|
||||||
|
* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
|
||||||
|
keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
|
||||||
|
compiler-agnostic, so a MSVC6 build and a MSVC8 build of the same engine still score high.
|
||||||
|
* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
|
||||||
|
files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
|
||||||
|
flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Hashable
|
||||||
|
|
||||||
|
from .diff import _owner_name_key, _type_key
|
||||||
|
from .snapshot import Snapshot
|
||||||
|
|
||||||
|
# Each axis name maps to a pair: (accessor returning that axis's items from a snapshot,
# function turning one item into its identity key). The key functions come from ams.diff,
# so similarity and diff agree on what counts as "the same" item.
_AXES = {
    "types": (lambda snap: snap.types, _type_key),
    "methods": (lambda snap: snap.methods, _owner_name_key),
    "events": (lambda snap: snap.events, _owner_name_key),
    "fields": (lambda snap: snap.fields, _owner_name_key),
}
|
||||||
|
|
||||||
|
|
||||||
|
def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
    """Return the set of identity keys that `snap` exposes on `axis`."""
    accessor, identity = _AXES[axis]
    return set(map(identity, accessor(snap)))
|
||||||
|
|
||||||
|
|
||||||
|
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard index |A∩B| / |A∪B| of the two sets.

    Two empty sets are treated as identical and yield 1.0.
    """
    combined = a | b
    if not combined:
        return 1.0
    return len(a & b) / len(combined)
|
||||||
|
|
||||||
|
|
||||||
|
def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
    """Compare two snapshots axis by axis and pool the result into one score.

    Returns a dict with:
      * ``axes`` — for every axis in ``_AXES``: ``shared`` / ``only_a`` / ``only_b`` key
        counts and ``score``, the axis's Jaccard index scaled to 0–100 and rounded.
      * ``overall`` — the micro-average: summed intersections over summed unions across
        all axes, scaled to 0–100 and rounded, so larger axes weigh more. It is 100 when
        neither snapshot has any keys at all.
    """
    axes: dict[str, dict] = {}
    shared_sum = 0
    union_sum = 0
    for axis in _AXES:
        left = _keys(a, axis)
        right = _keys(b, axis)
        common = left & right
        axes[axis] = {
            "shared": len(common),
            "only_a": len(left - right),
            "only_b": len(right - left),
            "score": round(100 * jaccard(left, right)),
        }
        shared_sum += len(common)
        union_sum += len(left | right)

    if union_sum:
        overall = round(100 * shared_sum / union_sum)
    else:
        overall = 100
    return {"overall": overall, "axes": axes}
|
||||||
|
|
||||||
|
|
||||||
|
def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
    """Compare the fuzzy (ssdeep) hashes recorded for the two binaries.

    Reads ``binary["fuzzy"]`` from each snapshot and compares them with the first
    importable backend out of ``ppdeep`` and ``ssdeep``.

    Returns:
        The backend's comparison score converted to ``int`` (0–100), or None when either
        snapshot has no fuzzy hash, no backend is installed, or the comparison fails.
    """
    # Local imports keep this block self-contained; both modules are standard library.
    import importlib
    import logging

    ha, hb = a.binary.get("fuzzy"), b.binary.get("fuzzy")
    if not ha or not hb:
        return None

    backend = None
    for name in ("ppdeep", "ssdeep"):
        try:
            backend = importlib.import_module(name)
        except ImportError:
            continue
        break
    if backend is None:
        return None

    try:
        return int(backend.compare(ha, hb))
    except Exception:
        # Deliberately best-effort: fuzzy is a secondary signal and must not break the
        # ranking, but leave a trace instead of swallowing the failure silently.
        logging.getLogger(__name__).debug(
            "fuzzy hash comparison via %s failed", name, exc_info=True
        )
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def similarity(a: Snapshot, b: Snapshot) -> dict:
    """Build the combined report for two snapshots.

    The result carries the pooled ``overall`` score and the per-axis ``axes`` breakdown
    from surface_similarity, plus ``fuzzy`` from fuzzy_similarity (which may be None).
    """
    surface = surface_similarity(a, b)
    report = {field: surface[field] for field in ("overall", "axes")}
    report["fuzzy"] = fuzzy_similarity(a, b)
    return report
|
||||||
92
tests/test_similarity.py
Normal file
92
tests/test_similarity.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity
|
||||||
|
from ams.snapshot import Snapshot
|
||||||
|
|
||||||
|
# Golden snapshot fixtures live in the repository-level "snapshots" directory.
SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots"
PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json"
BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json"

# Tests in this module load both halves of the golden pair, so skip unless both files exist
# (checking only PIKLIB would turn a missing BLOOMOO file into errors instead of skips).
pytestmark = pytest.mark.skipif(
    not (PIKLIB.exists() and BLOOMOO.exists()), reason="golden snapshots not present"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _snap(path: Path) -> Snapshot:
    """Read the UTF-8 JSON file at `path` and wrap the parsed data in a Snapshot."""
    raw = Path(path).read_text(encoding="utf-8")
    return Snapshot(json.loads(raw))
|
||||||
|
|
||||||
|
|
||||||
|
def test_jaccard_basics():
    """jaccard: empty/identical sets give 1.0, disjoint sets 0.0, partial overlap the ratio."""
    exact_cases = [
        (set(), set(), 1.0),
        ({1, 2}, {1, 2}, 1.0),
        ({1, 2}, {3, 4}, 0.0),
    ]
    for left, right, expected in exact_cases:
        assert jaccard(left, right) == expected
    # one shared element out of three distinct ones
    assert jaccard({1, 2}, {2, 3}) == pytest.approx(1 / 3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_identical_is_100():
    """A snapshot compared with itself scores 100 overall and on every axis."""
    snap = _snap(PIKLIB)
    report = surface_similarity(snap, snap)
    assert report["overall"] == 100
    axis_scores = [axis["score"] for axis in report["axes"].values()]
    assert all(value == 100 for value in axis_scores)
|
||||||
|
|
||||||
|
|
||||||
|
def test_disjoint_is_0():
    """Two snapshots sharing no identity key score 0 overall."""

    def lone_type(name):
        # a surface consisting of exactly one type and nothing on the other axes
        return Snapshot({"types": [{"script_name": name}], "methods": [], "events": [], "fields": []})

    report = surface_similarity(lone_type("AAA"), lone_type("ZZZ"))
    assert report["overall"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_golden_pair_is_similar_not_identical():
    """PIKLIB (MSVC6) vs BlooMoo (MSVC8) are sibling engines: overlapping but not equal."""
    report = similarity(_snap(PIKLIB), _snap(BLOOMOO))
    overall = report["overall"]
    assert overall > 0 and overall < 100
    shared_methods = report["axes"]["methods"]["shared"]
    assert shared_methods > 0
    # the golden snapshots carry no fuzzy hash (extractor doesn't compute one)
    assert report["fuzzy"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_fuzzy_matches_identical_hash():
    """Equal fuzzy hashes compare as 100; a snapshot without a hash yields None."""
    # importorskip returns the imported module, or skips the test when ppdeep is missing
    ppdeep = pytest.importorskip("ppdeep")
    digest = ppdeep.hash(b"the quick brown fox " * 50)
    first = Snapshot({"binary": {"fuzzy": digest}})
    second = Snapshot({"binary": {"fuzzy": digest}})
    hashless = Snapshot({"binary": {}})
    assert fuzzy_similarity(first, second) == 100
    assert fuzzy_similarity(first, hashless) is None
|
||||||
|
|
||||||
|
|
||||||
|
# --- endpoint ---------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
def client(tmp_path):
    """TestClient for an app backed by a SQLite file inside pytest's tmp_path."""
    pytest.importorskip("fastapi")
    from fastapi.testclient import TestClient

    from ams.api.app import create_app

    db_url = f"sqlite:///{tmp_path}/sim.db"
    app = create_app(database_url=db_url)
    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
def _load(path: Path) -> dict:
|
||||||
|
with open(path, encoding="utf-8") as fh:
|
||||||
|
return json.load(fh)
|
||||||
|
|
||||||
|
|
||||||
|
def test_similar_endpoint(client):
    """Import the golden pair, then exercise /snapshots/{id}/similar, its min filter and 404."""

    def upload(game, path):
        # POST the snapshot under the given game name and return the id the API reports
        return client.post("/snapshots", params={"game": game}, json=_load(path)).json()["id"]

    a = upload("Reksio i UFO", PIKLIB)
    b = upload("Reksio i Kapitan Nemo", BLOOMOO)
    similar_url = f"/snapshots/{a}/similar"

    hits = client.get(similar_url).json()
    assert len(hits) == 1
    top = hits[0]
    assert top["snapshot"]["id"] == b
    assert top["overall"] > 0
    assert "methods" in top["axes"]

    # a 100-only filter drops the cross-compiler pair
    filtered = client.get(similar_url, params={"min": 100}).json()
    assert filtered == []
    missing = client.get("/snapshots/999/similar")
    assert missing.status_code == 404
|
||||||
Reference in New Issue
Block a user