Ranks catalogued engine versions by how much of their CMC_* surface they share,
which (unlike a binary fuzzy hash) stays meaningful across compilers — the golden
pair PIKLIB8/MSVC6 vs bloomoodll/MSVC8 scores 85%.
- similarity.py: jaccard, surface_similarity (per-axis + pooled overall),
fuzzy_similarity (ssdeep via ppdeep, secondary signal)
- service.similar_snapshots + GET /snapshots/{id}/similar?min=N (SimilarHit)
- UI: "Podobne wersje" panel in the snapshot browser (overlap bar + ⇄ diff)
- tests: 6 new (jaccard, identical/disjoint, golden pair 0<x<100, fuzzy,
endpoint + min filter) -> 28/28
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
87 lines
3.1 KiB
Python
87 lines
3.1 KiB
Python
"""How similar are two engine versions?
|
||
|
||
Two complementary signals:

* **Surface similarity** — Jaccard overlap of the *identity sets* per axis (the same identity
  keys the diff engine uses). This is the meaningful one for "is this a sibling version": it is
  compiler-agnostic, so an MSVC6 build and an MSVC8 build of the same engine still score high.
* **Fuzzy (ssdeep)** — context-triggered hash of the *raw binary*. Only catches near-identical
  files (it collapses to 0 across recompiles), so it is a secondary "is this almost the same DLL"
  flag, present only when ppdeep/ssdeep produced a hash at acquisition time.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from typing import Hashable
|
||
|
||
from .diff import _owner_name_key, _type_key
|
||
from .snapshot import Snapshot
|
||
|
||
# axis -> (list accessor, identity-key fn) — mirrors ams.diff's keying so similarity and diff agree.
|
||
# Each entry pairs an accessor that pulls one item list off a snapshot with the function
# that turns an item of that list into its identity key. Insertion order is the order in
# which the axes are iterated when the per-axis report is built.
_AXES = dict(
    types=(lambda snap: snap.types, _type_key),
    methods=(lambda snap: snap.methods, _owner_name_key),
    events=(lambda snap: snap.events, _owner_name_key),
    fields=(lambda snap: snap.fields, _owner_name_key),
)
|
||
|
||
|
||
def _keys(snap: Snapshot, axis: str) -> set[Hashable]:
    """Return the set of identity keys of ``snap``'s items on ``axis``.

    ``axis`` must be one of the names registered in ``_AXES``; an unknown name
    raises ``KeyError`` from the table lookup.
    """
    items_of, identity = _AXES[axis]
    return set(map(identity, items_of(snap)))
|
||
|
||
|
||
def jaccard(a: set, b: set) -> float:
    """Return the Jaccard index |A∩B| / |A∪B| of two sets.

    Two empty sets have an empty union; they are treated as identical and
    score 1.0 instead of dividing by zero.
    """
    combined = a | b
    if not combined:
        return 1.0
    return len(a & b) / len(combined)
|
||
|
||
|
||
def surface_similarity(a: Snapshot, b: Snapshot) -> dict:
    """Compare two snapshots axis by axis and as a whole (scores are 0–100 integers).

    Returns ``{"overall": int, "axes": {axis: {...}}}`` where each axis entry holds
    ``shared`` / ``only_a`` / ``only_b`` key counts and a rounded Jaccard ``score``.

    ``overall`` pools the axes before dividing: the sum of the per-axis intersection
    sizes over the sum of the per-axis union sizes. Larger axes therefore weigh more
    than small ones. If every axis is empty on both sides the result is 100.
    """
    breakdown: dict[str, dict] = {}
    shared_sum = 0
    union_sum = 0

    for axis_name in _AXES:
        left = _keys(a, axis_name)
        right = _keys(b, axis_name)
        common = left & right

        shared_sum += len(common)
        union_sum += len(left | right)
        breakdown[axis_name] = {
            "shared": len(common),
            "only_a": len(left - right),
            "only_b": len(right - left),
            "score": round(100 * jaccard(left, right)),
        }

    if union_sum:
        overall = round(100 * shared_sum / union_sum)
    else:
        # Nothing to compare on any axis: same convention as jaccard() for empty sets.
        overall = 100
    return {"overall": overall, "axes": breakdown}
|
||
|
||
|
||
def fuzzy_similarity(a: Snapshot, b: Snapshot) -> int | None:
    """Compare the ``"fuzzy"`` hashes stored in the two snapshots' ``binary`` mappings.

    Returns the backend's ``compare`` result converted to ``int``, or ``None`` when
    either hash is missing/empty, when neither ``ppdeep`` nor ``ssdeep`` can be
    imported, or when the comparison itself raises.
    """
    hash_a = a.binary.get("fuzzy")
    hash_b = b.binary.get("fuzzy")
    if not (hash_a and hash_b):
        return None

    # Use the first backend that imports; ppdeep is tried before ssdeep.
    backend = None
    for candidate in ("ppdeep", "ssdeep"):
        try:
            backend = __import__(candidate)
        except ImportError:
            continue
        else:
            break
    if backend is None:
        return None

    # Best effort: any failure inside the backend is reported as "no score".
    try:
        score = int(backend.compare(hash_a, hash_b))
    except Exception:
        score = None
    return score
|
||
|
||
|
||
def similarity(a: Snapshot, b: Snapshot) -> dict:
    """Build the combined report for two snapshots.

    The result carries the pooled ``overall`` score and the per-axis ``axes``
    breakdown from ``surface_similarity``, plus ``fuzzy`` — the
    ``fuzzy_similarity`` result, which may be ``None``.
    """
    surface = surface_similarity(a, b)
    report = {field: surface[field] for field in ("overall", "axes")}
    report["fuzzy"] = fuzzy_similarity(a, b)
    return report
|