Files
Aidem-Media-DLL-Analysis/ams/similarity.py
Patryk Gensch 38be932abc Similar versions: surface-overlap metric + endpoint + UI panel
Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.

- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
  fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
  endpoint + min filter) -> 28/28

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 12:33:50 +02:00

87 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""How similar are two engine versions?
Two complementary signals:
* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
compiler-agnostic, so an MSVC6 build and an MSVC8 build of the same engine still score high.
* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
"""
from __future__ import annotations
from typing import Hashable
from .diff import _owner_name_key, _type_key
from .snapshot import Snapshot
# Axis table: each axis name maps to a pair of (function returning that axis's item list from
# a snapshot, function deriving an item's identity key). The key functions are the ones
# imported from ams.diff, so similarity and diff agree on which items count as "the same".
_AXES = dict(
    types=(lambda snap: snap.types, _type_key),
    methods=(lambda snap: snap.methods, _owner_name_key),
    events=(lambda snap: snap.events, _owner_name_key),
    fields=(lambda snap: snap.fields, _owner_name_key),
)
def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
    """Return the set of identity keys of every item `snap` holds on `axis`.

    `axis` is looked up in `_AXES` (a KeyError propagates for an unknown name); the table
    supplies both the item-list accessor and the identity-key function.
    """
    items_of, identity = _AXES[axis]
    return set(map(identity, items_of(snap)))
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    When both sets are empty the union is empty too; that case is defined as
    identical and yields 1.0 instead of dividing by zero.
    """
    combined = a | b
    if not combined:
        return 1.0
    return len(a & b) / len(combined)
def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
    """Compare two snapshots axis by axis and pool the result into one overall score.

    Returns ``{"overall": int, "axes": {axis: {...}}}``. Each axis entry carries the counts
    ``shared`` / ``only_a`` / ``only_b`` and a ``score`` (0–100, rounded Jaccard of the two
    identity-key sets). ``overall`` is the micro-average: the intersection and union sizes are
    summed over all axes before dividing, so larger axes weigh more. If no axis has any key on
    either side, ``overall`` is 100.
    """
    axes_report: dict[str, dict] = {}
    shared_sum = 0
    union_sum = 0
    for name in _AXES:
        left = _keys(a, name)
        right = _keys(b, name)
        common = left & right
        axes_report[name] = {
            "shared": len(common),
            "only_a": len(left - right),
            "only_b": len(right - left),
            "score": round(100 * jaccard(left, right)),
        }
        shared_sum += len(common)
        union_sum += len(left | right)

    if union_sum:
        overall = round(100 * shared_sum / union_sum)
    else:
        # Nothing to compare on any axis: treated as fully similar.
        overall = 100
    return {"overall": overall, "axes": axes_report}
def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
    """Compare the fuzzy hashes stored under ``binary["fuzzy"]`` of both snapshots.

    Returns the integer result of the backend's ``compare`` call, or None when either
    snapshot has no (or an empty) fuzzy hash, when neither ``ppdeep`` nor ``ssdeep`` can be
    imported, or when the comparison itself raises.
    """
    hash_a = a.binary.get("fuzzy")
    hash_b = b.binary.get("fuzzy")
    if not (hash_a and hash_b):
        return None

    # Take the first backend that imports; ppdeep is tried before ssdeep.
    for candidate in ("ppdeep", "ssdeep"):
        try:
            backend = __import__(candidate)
        except ImportError:
            continue
        break
    else:
        return None

    # Best-effort: a failing comparison is reported as "no fuzzy score", not as an error.
    try:
        score = backend.compare(hash_a, hash_b)
        return int(score)
    except Exception:
        return None
def similarity(a: Snapshot, b: Snapshot) -> dict:
    """Build the combined report for two snapshots.

    The result has three keys: ``overall`` and ``axes`` taken from `surface_similarity`, and
    ``fuzzy`` holding whatever `fuzzy_similarity` returns (an int or None).
    """
    surface = surface_similarity(a, b)
    report = {key: surface[key] for key in ("overall", "axes")}
    report["fuzzy"] = fuzzy_similarity(a, b)
    return report