Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.
- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
endpoint + min filter) -> 28/28
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity
|
|
from ams.snapshot import Snapshot
|
|
|
|
# Golden snapshot fixtures live in a "snapshots" directory one level above the
# directory that contains this file.
SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots"
PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json"
BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json"

# Skip the module unless BOTH golden files are available: the golden-pair and
# endpoint tests read BLOOMOO as well as PIKLIB, so checking PIKLIB alone would
# let them fail with a file error when only the second fixture is missing.
# NOTE: as a module-level mark this also skips the tests in this file that build
# their snapshots inline and do not touch the golden files.
pytestmark = pytest.mark.skipif(
    not (PIKLIB.exists() and BLOOMOO.exists()),
    reason="golden snapshots not present",
)
|
|
|
|
|
|
def _snap(path: Path) -> Snapshot:
    """Build a ``Snapshot`` from the JSON document stored at *path*.

    The file is read as UTF-8 and the parsed JSON value is passed to the
    ``Snapshot`` constructor unchanged.
    """
    with open(path, encoding="utf-8") as stream:
        payload = json.load(stream)
    return Snapshot(payload)
|
|
|
|
|
|
def test_jaccard_basics():
    """Pin ``jaccard`` on empty, identical, disjoint and partially overlapping sets."""
    # Two empty sets are expected to count as a perfect match.
    both_empty = jaccard(set(), set())
    assert both_empty == 1.0

    same_members = jaccard({1, 2}, {1, 2})
    assert same_members == 1.0

    nothing_shared = jaccard({1, 2}, {3, 4})
    assert nothing_shared == 0.0

    # One shared element out of three distinct ones -> 1/3 (compared approximately).
    one_of_three = jaccard({1, 2}, {2, 3})
    assert one_of_three == pytest.approx(1 / 3)
|
|
|
|
|
|
def test_identical_is_100():
    """A snapshot compared with itself scores 100 overall and on every axis."""
    snapshot = _snap(PIKLIB)
    report = surface_similarity(snapshot, snapshot)

    assert report["overall"] == 100
    for axis in report["axes"].values():
        assert axis["score"] == 100
|
|
|
|
|
|
def test_disjoint_is_0():
    """Snapshots whose only type names differ share nothing and score 0 overall."""

    def single_type_snapshot(script_name):
        # One type, and empty method/event/field lists.
        return Snapshot(
            {
                "types": [{"script_name": script_name}],
                "methods": [],
                "events": [],
                "fields": [],
            }
        )

    left = single_type_snapshot("AAA")
    right = single_type_snapshot("ZZZ")

    report = surface_similarity(left, right)
    assert report["overall"] == 0
|
|
|
|
|
|
def test_golden_pair_is_similar_not_identical():
    """The golden pair overlaps strictly between 0 and 100 and reports no fuzzy score."""
    # PIKLIB (MSVC6) vs BlooMoo (MSVC8): sibling engines, so the surface overlap
    # is high, but the two are not equal.
    piklib = _snap(PIKLIB)
    bloomoo = _snap(BLOOMOO)
    report = similarity(piklib, bloomoo)

    overall = report["overall"]
    assert 0 < overall
    assert overall < 100

    shared_methods = report["axes"]["methods"]["shared"]
    assert shared_methods > 0

    # The golden snapshots carry no fuzzy hash (the extractor doesn't compute one).
    assert report["fuzzy"] is None
|
|
|
|
|
|
def test_fuzzy_matches_identical_hash():
    """Equal fuzzy hashes score 100; a snapshot without a hash yields ``None``."""
    # importorskip skips the test when ppdeep is missing, otherwise returns the module.
    ppdeep = pytest.importorskip("ppdeep")

    digest = ppdeep.hash(b"the quick brown fox " * 50)
    first = Snapshot({"binary": {"fuzzy": digest}})
    second = Snapshot({"binary": {"fuzzy": digest}})
    assert fuzzy_similarity(first, second) == 100

    # No "fuzzy" entry on one side -> no fuzzy score at all.
    unhashed = Snapshot({"binary": {}})
    assert fuzzy_similarity(first, unhashed) is None
|
|
|
|
|
|
# --- endpoint ---------------------------------------------------------------------------------
|
|
|
|
@pytest.fixture()
def client(tmp_path):
    """Return a FastAPI ``TestClient`` for an app backed by a SQLite file in *tmp_path*.

    The test is skipped when FastAPI is not installed.
    """
    pytest.importorskip("fastapi")
    from fastapi.testclient import TestClient

    from ams.api.app import create_app

    # The database file sits in pytest's per-test temporary directory.
    database_url = "sqlite:///{0}/sim.db".format(tmp_path)
    app = create_app(database_url=database_url)
    return TestClient(app)
|
|
|
|
|
|
def _load(path: Path) -> dict:
    """Return the parsed JSON content of the UTF-8 encoded file at *path*."""
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text)
|
|
|
|
|
|
def test_similar_endpoint(client):
    """Exercise GET /snapshots/{id}/similar: one hit, the ``min`` filter, and a 404."""

    def upload(game, snapshot_path):
        # POST the snapshot JSON under the given game name and return the new id.
        response = client.post("/snapshots", params={"game": game}, json=_load(snapshot_path))
        return response.json()["id"]

    a = upload("Reksio i UFO", PIKLIB)
    b = upload("Reksio i Kapitan Nemo", BLOOMOO)

    similar_url = "/snapshots/{0}/similar".format(a)

    hits = client.get(similar_url).json()
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["snapshot"]["id"] == b
    assert only_hit["overall"] > 0
    assert "methods" in only_hit["axes"]

    # a 100-only filter drops the cross-compiler pair
    strict = client.get(similar_url, params={"min": 100})
    assert strict.json() == []

    # An id that was never uploaded is reported as not found.
    missing = client.get("/snapshots/999/similar")
    assert missing.status_code == 404
|