Files
Aidem-Media-DLL-Analysis/tests/test_similarity.py
Patryk Gensch 38be932abc Similar versions: surface-overlap metric + endpoint + UI panel
Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.

- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
  fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
  endpoint + min filter) -> 28/28

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 12:33:50 +02:00

93 lines
3.1 KiB
Python

"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint."""
from __future__ import annotations
import json
from pathlib import Path
import pytest
from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity
from ams.snapshot import Snapshot
# Golden snapshot fixtures: the "snapshots" directory sits next to this file's parent directory.
SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots"
PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json"
BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json"

# Skip the module unless BOTH golden files are available. Several tests open BLOOMOO as well as
# PIKLIB, so guarding on PIKLIB alone would turn a missing fixture into a FileNotFoundError
# instead of a clean skip.
pytestmark = pytest.mark.skipif(
    not (PIKLIB.exists() and BLOOMOO.exists()),
    reason="golden snapshots not present",
)
def _snap(path: Path) -> Snapshot:
    """Parse the UTF-8 JSON file at *path* and wrap the result in a ``Snapshot``."""
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    return Snapshot(payload)
def test_jaccard_basics():
    """Pin ``jaccard`` on empty, identical, disjoint and partially overlapping sets."""
    exact_cases = [
        (set(), set(), 1.0),        # two empty sets count as a full match
        ({1, 2}, {1, 2}, 1.0),      # identical
        ({1, 2}, {3, 4}, 0.0),      # nothing in common
    ]
    for left, right, expected in exact_cases:
        assert jaccard(left, right) == expected

    # one shared element out of three distinct ones
    assert jaccard({1, 2}, {2, 3}) == pytest.approx(1 / 3)
def test_identical_is_100():
    """A snapshot compared with itself scores 100 overall and on every axis."""
    snapshot = _snap(PIKLIB)
    report = surface_similarity(snapshot, snapshot)

    assert report["overall"] == 100
    for axis in report["axes"].values():
        assert axis["score"] == 100
def test_disjoint_is_0():
    """Two snapshots whose only content is a differently named type score 0 overall."""

    def single_type(script_name):
        # A snapshot with exactly one type and empty methods/events/fields.
        return Snapshot(
            {"types": [{"script_name": script_name}], "methods": [], "events": [], "fields": []}
        )

    left = single_type("AAA")
    right = single_type("ZZZ")
    result = surface_similarity(left, right)
    assert result["overall"] == 0
def test_golden_pair_is_similar_not_identical():
    """The PIKLIB/BlooMoo golden pair overlaps partially: above 0, below 100."""
    # PIKLIB (MSVC6) vs BlooMoo (MSVC8): sibling engines, so the surface overlap is high
    # but the two are not equal.
    piklib = _snap(PIKLIB)
    bloomoo = _snap(BLOOMOO)
    report = similarity(piklib, bloomoo)

    overall = report["overall"]
    assert 0 < overall
    assert overall < 100

    shared_methods = report["axes"]["methods"]["shared"]
    assert shared_methods > 0

    # The golden snapshots carry no fuzzy hash (the extractor doesn't compute one).
    assert report["fuzzy"] is None
def test_fuzzy_matches_identical_hash():
    """Equal fuzzy hashes score 100; a snapshot without a hash yields ``None``."""
    # importorskip returns the imported module, or skips the test when ppdeep is missing.
    ppdeep = pytest.importorskip("ppdeep")

    digest = ppdeep.hash(b"the quick brown fox " * 50)

    def with_hash():
        # Build a fresh payload per snapshot so the two inputs share no dict objects.
        return Snapshot({"binary": {"fuzzy": digest}})

    first = with_hash()
    second = with_hash()
    assert fuzzy_similarity(first, second) == 100

    hashless = Snapshot({"binary": {}})
    assert fuzzy_similarity(first, hashless) is None
# --- endpoint ---------------------------------------------------------------------------------
@pytest.fixture()
def client(tmp_path):
    """Return a FastAPI ``TestClient`` for an app backed by a SQLite file under *tmp_path*.

    The test is skipped when ``fastapi`` is not installed.
    """
    pytest.importorskip("fastapi")
    from fastapi.testclient import TestClient

    from ams.api.app import create_app

    database_url = f"sqlite:///{tmp_path}/sim.db"
    app = create_app(database_url=database_url)
    return TestClient(app)
def _load(path: Path) -> dict:
    """Return the decoded JSON content of the UTF-8 file at *path*."""
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text)
def test_similar_endpoint(client):
    """Exercise GET /snapshots/{id}/similar: single hit, ``min`` filter, unknown id."""

    def upload(game, snapshot_path):
        # POST one golden snapshot and return the id the API assigned to it.
        response = client.post("/snapshots", params={"game": game}, json=_load(snapshot_path))
        return response.json()["id"]

    a = upload("Reksio i UFO", PIKLIB)
    b = upload("Reksio i Kapitan Nemo", BLOOMOO)

    similar_url = f"/snapshots/{a}/similar"
    hits = client.get(similar_url).json()
    assert len(hits) == 1

    only_hit = hits[0]
    assert only_hit["snapshot"]["id"] == b
    assert only_hit["overall"] > 0
    assert "methods" in only_hit["axes"]

    # A 100-only filter drops the cross-compiler pair.
    filtered = client.get(similar_url, params={"min": 100}).json()
    assert filtered == []

    missing = client.get("/snapshots/999/similar")
    assert missing.status_code == 404