Aidem-Media-DLL-Analysis/ams/similarity.py

"""How similar are two engine versions?

Two complementary signals:

* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
  keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
  compiler-agnostic, so an MSVC6 build and an MSVC8 build of the same engine still score high.
* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
  files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
  flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
"""

from __future__ import annotations

from typing import Hashable

from .diff import _owner_name_key, _type_key
from .snapshot import Snapshot

# Each axis name maps to a pair: (how to pull that axis's item list off a snapshot, the
# identity-key function applied to each item). The key functions are the ones imported from
# ams.diff, so similarity and diff agree on what counts as "the same" item.
# Insertion order matters: it is the order in which axes are visited and reported.
_AXES = {
    "types": (lambda snap: snap.types, _type_key),
    "methods": (lambda snap: snap.methods, _owner_name_key),
    "events": (lambda snap: snap.events, _owner_name_key),
    "fields": (lambda snap: snap.fields, _owner_name_key),
}


def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
    """Return the set of identity keys that *snap* exposes on *axis*.

    *axis* must be one of the `_AXES` names; an unknown name raises `KeyError`.
    """
    list_of, identity = _AXES[axis]
    return set(map(identity, list_of(snap)))


def jaccard(a: set, b: set) -> float:
    """Return the Jaccard index |A∩B| / |A∪B| of two sets, in the range 0.0–1.0.

    Two empty sets have no union to divide by; they are treated as identical (1.0).
    """
    combined = a | b
    if not combined:
        return 1.0
    return len(a & b) / len(combined)


def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
    """Compare the identity-key sets of two snapshots, axis by axis.

    Returns ``{"overall": int, "axes": {axis: {...}}}`` where each axis entry holds:

    * ``shared`` — number of keys present in both snapshots,
    * ``only_a`` / ``only_b`` — number of keys present on one side only,
    * ``score`` — that axis's Jaccard index as a rounded percentage (0–100).

    ``overall`` is the micro-average: the summed per-axis intersections divided by the summed
    per-axis unions, as a rounded percentage. Larger axes therefore weigh more. When every
    axis is empty on both sides, ``overall`` is 100.
    """
    axes: dict[str, dict] = {}
    shared_sum = 0
    union_sum = 0
    for name in _AXES:
        left = _keys(a, name)
        right = _keys(b, name)
        common = left & right
        axes[name] = {
            "shared": len(common),
            "only_a": len(left - right),
            "only_b": len(right - left),
            "score": round(100 * jaccard(left, right)),
        }
        # Accumulate raw counts so the pooled score is a single ratio over all axes.
        shared_sum += len(common)
        union_sum += len(left | right)
    if union_sum:
        overall = round(100 * shared_sum / union_sum)
    else:
        # Nothing on either side in any axis: treated as identical.
        overall = 100
    return {"overall": overall, "axes": axes}


def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
    """Compare the ``"fuzzy"`` hashes stored in each snapshot's ``binary`` mapping.

    Returns the backend's comparison result converted to ``int``, or ``None`` when:

    * either snapshot has no (or an empty) fuzzy hash,
    * neither ``ppdeep`` nor ``ssdeep`` can be imported, or
    * the backend's ``compare`` call (or the ``int`` conversion) raises.
    """
    left_hash = a.binary.get("fuzzy")
    right_hash = b.binary.get("fuzzy")
    if not left_hash:
        return None
    if not right_hash:
        return None

    # Take the first backend that imports; ppdeep is tried before ssdeep.
    for candidate in ("ppdeep", "ssdeep"):
        try:
            backend = __import__(candidate)
        except ImportError:
            continue
        break
    else:
        # Loop finished without a successful import.
        return None

    # Best-effort: any failure inside the backend is reported as "no score".
    try:
        score = backend.compare(left_hash, right_hash)
        return int(score)
    except Exception:
        return None


def similarity(a: Snapshot, b: Snapshot) -> dict:
    """Build the combined report for two snapshots.

    The result carries the surface comparison's ``overall`` score and ``axes`` breakdown,
    plus ``fuzzy``: the fuzzy-hash comparison result, or ``None`` when it is unavailable.
    """
    surface = surface_similarity(a, b)
    report = {"overall": surface["overall"], "axes": surface["axes"]}
    report["fuzzy"] = fuzzy_similarity(a, b)
    return report