Aidem-Media-DLL-Analysis/tests/test_similarity.py

"""Similarity: pure surface/fuzzy scoring + the /snapshots/{id}/similar endpoint."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from ams.similarity import fuzzy_similarity, jaccard, similarity, surface_similarity
from ams.snapshot import Snapshot

# Golden snapshot fixtures live in the "snapshots" directory one level above this tests directory.
SNAP_DIR = Path(__file__).resolve().parents[1] / "snapshots"
PIKLIB = SNAP_DIR / "PIKLIB8.dll.snapshot.json"
BLOOMOO = SNAP_DIR / "bloomoodll.dll.snapshot.json"

# Skip the whole module unless BOTH golden snapshots are present: the pair and endpoint
# tests read BLOOMOO as well, and would otherwise fail with FileNotFoundError instead of
# being skipped when only that file is missing.
pytestmark = pytest.mark.skipif(
    not (PIKLIB.exists() and BLOOMOO.exists()),
    reason="golden snapshots not present",
)


def _snap(path: Path) -> Snapshot:
    """Parse the UTF-8 JSON file at *path* and wrap the result in a ``Snapshot``."""
    document = json.loads(Path(path).read_text(encoding="utf-8"))
    return Snapshot(document)


def test_jaccard_basics():
    """Check ``jaccard`` on empty, equal, disjoint and partially overlapping sets."""
    # Cases whose result is compared exactly: (left, right, expected).
    exact_cases = [
        (set(), set(), 1.0),    # two empty sets are expected to score 1.0
        ({1, 2}, {1, 2}, 1.0),  # equal sets
        ({1, 2}, {3, 4}, 0.0),  # nothing in common
    ]
    for left, right, expected in exact_cases:
        assert jaccard(left, right) == expected

    # One shared element out of three distinct ones; compared with a float tolerance.
    assert jaccard({1, 2}, {2, 3}) == pytest.approx(1 / 3)


def test_identical_is_100():
    """A snapshot compared against itself scores 100 overall and on every axis."""
    snap = _snap(PIKLIB)
    report = surface_similarity(snap, snap)
    assert report["overall"] == 100

    # Collect any axis that is not a perfect match so a failure names the offender.
    imperfect = {
        name: axis["score"]
        for name, axis in report["axes"].items()
        if axis["score"] != 100
    }
    assert not imperfect


def test_disjoint_is_0():
    """Two snapshots with a single, different type name and nothing else score 0 overall."""

    def only_type(script_name):
        # Minimal snapshot: one type, with empty methods/events/fields.
        return Snapshot(
            {"types": [{"script_name": script_name}], "methods": [], "events": [], "fields": []}
        )

    result = surface_similarity(only_type("AAA"), only_type("ZZZ"))
    assert result["overall"] == 0


def test_golden_pair_is_similar_not_identical():
    """The two golden snapshots overlap partially: strictly between 0 and 100 overall."""
    # PIKLIB (MSVC6) vs BlooMoo (MSVC8): sibling engines — high surface overlap, but not equal.
    piklib = _snap(PIKLIB)
    bloomoo = _snap(BLOOMOO)
    report = similarity(piklib, bloomoo)

    overall = report["overall"]
    assert 0 < overall < 100

    shared_methods = report["axes"]["methods"]["shared"]
    assert shared_methods > 0

    # the golden snapshots carry no fuzzy hash (extractor doesn't compute one)
    assert report["fuzzy"] is None


def test_fuzzy_matches_identical_hash():
    """Equal fuzzy hashes score 100; a snapshot without a hash yields ``None``."""
    # importorskip returns the imported module, or skips the test when ppdeep is unavailable.
    ppdeep = pytest.importorskip("ppdeep")

    digest = ppdeep.hash(b"the quick brown fox " * 50)
    first = Snapshot({"binary": {"fuzzy": digest}})
    second = Snapshot({"binary": {"fuzzy": digest}})
    assert fuzzy_similarity(first, second) == 100

    # No fuzzy hash on one side: there is nothing to compare.
    hashless = Snapshot({"binary": {}})
    assert fuzzy_similarity(first, hashless) is None


# --- endpoint ---------------------------------------------------------------------------------

@pytest.fixture()
def client(tmp_path):
    """Return a FastAPI ``TestClient`` for an app backed by a SQLite file under *tmp_path*.

    The test is skipped when FastAPI is not installed.
    """
    pytest.importorskip("fastapi")
    from fastapi.testclient import TestClient

    from ams.api.app import create_app

    database_url = f"sqlite:///{tmp_path}/sim.db"
    app = create_app(database_url=database_url)
    return TestClient(app)


def _load(path: Path) -> dict:
    """Return the parsed contents of the UTF-8 JSON file at *path*."""
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text)


def test_similar_endpoint(client):
    """Upload both golden snapshots and query ``/snapshots/{id}/similar`` for the first."""

    def upload(game, snapshot_path):
        # POST the snapshot JSON under the given game name and return the id from the response.
        response = client.post("/snapshots", params={"game": game}, json=_load(snapshot_path))
        return response.json()["id"]

    piklib_id = upload("Reksio i UFO", PIKLIB)
    bloomoo_id = upload("Reksio i Kapitan Nemo", BLOOMOO)
    similar_url = f"/snapshots/{piklib_id}/similar"

    # With only two snapshots stored, the other one is the single hit.
    hits = client.get(similar_url).json()
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["snapshot"]["id"] == bloomoo_id
    assert only_hit["overall"] > 0
    assert "methods" in only_hit["axes"]

    # a 100-only filter drops the cross-compiler pair
    strict_hits = client.get(similar_url, params={"min": 100}).json()
    assert strict_hits == []

    # Unknown snapshot id.
    missing = client.get("/snapshots/999/similar")
    assert missing.status_code == 404