"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint.""" from __future__ import annotations import json from pathlib import Path import pytest from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity from ams.snapshot import Snapshot SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots" PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json" BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json" pytestmark = pytest.mark.skipif(not PIKLIB.exists(), reason="golden snapshots not present") def _snap(path: Path) -> Snapshot: with open(path, encoding="utf-8") as fh: return Snapshot(json.load(fh)) def test_jaccard_basics(): assert jaccard(set(), set()) == 1.0 assert jaccard({1, 2}, {1, 2}) == 1.0 assert jaccard({1, 2}, {3, 4}) == 0.0 assert jaccard({1, 2}, {2, 3}) == pytest.approx(1 / 3) def test_identical_is_100(): s = _snap(PIKLIB) surf = surface_similarity(s, s) assert surf["overall"] == 100 assert all(ax["score"] == 100 for ax in surf["axes"].values()) def test_disjoint_is_0(): a = Snapshot({"types": [{"script_name": "AAA"}], "methods": [], "events": [], "fields": []}) b = Snapshot({"types": [{"script_name": "ZZZ"}], "methods": [], "events": [], "fields": []}) assert surface_similarity(a, b)["overall"] == 0 def test_golden_pair_is_similar_not_identical(): # PIKLIB (MSVC6) vs BlooMoo (MSVC8): sibling engines — high surface overlap, but not equal. rep = similarity(_snap(PIKLIB), _snap(BLOOMOO)) assert 0 < rep["overall"] < 100 assert rep["axes"]["methods"]["shared"] > 0 # the golden snapshots carry no fuzzy hash (extractor doesn't compute one) assert rep["fuzzy"] is None def test_fuzzy_matches_identical_hash(): pytest.importorskip("ppdeep") import ppdeep h = ppdeep.hash(b"the quick brown fox " * 50) a = Snapshot({"binary": {"fuzzy": h}}) b = Snapshot({"binary": {"fuzzy": h}}) assert fuzzy_similarity(a, b) == 100 assert fuzzy_similarity(a, Snapshot({"binary": {}})) is None # --- endpoint --------------------------------------------------------------------------------- @pytest.fixture() def client(tmp_path): pytest.importorskip("fastapi") from fastapi.testclient import TestClient from ams.api.app import create_app return TestClient(create_app(database_url="sqlite:///{0}/sim.db".format(tmp_path))) def _load(path: Path) -> dict: with open(path, encoding="utf-8") as fh: return json.load(fh) def test_similar_endpoint(client): a = client.post("/snapshots", params={"game": "Reksio i UFO"}, json=_load(PIKLIB)).json()["id"] b = client.post("/snapshots", params={"game": "Reksio i Kapitan Nemo"}, json=_load(BLOOMOO)).json()["id"] hits = client.get("/snapshots/{0}/similar".format(a)).json() assert len(hits) == 1 assert hits[0]["snapshot"]["id"] == b assert hits[0]["overall"] > 0 assert "methods" in hits[0]["axes"] # a 100-only filter drops the cross-compiler pair assert client.get("/snapshots/{0}/similar".format(a), params={"min": 100}).json() == [] assert client.get("/snapshots/999/similar").status_code == 404