# Aidem-Media-DLL-Analysis/ams/normalize.py

"""Body-level normalisation: *how much* did a method's implementation change?

The dispatch axis gives each method an ordered list of leaf CALL anchors (its body
fingerprint). Here we turn two such lists into a single similarity score plus a precise
leaf-level delta, so a body diff reads as "SHOW 82% — +vtbl+0x58, -CMC_Foo::bar" instead
of a binary changed/unchanged.

`canonical()` collapses consecutive duplicate anchors — the strlen+memcpy / load-twice idioms
emit the same leaf back-to-back, and that doubling is a codegen artefact, not a real call — so
the comparison doesn't punish it. Order is otherwise preserved; matching is via
difflib.SequenceMatcher (stdlib, deterministic), which is insertion/deletion aware.
"""

from __future__ import annotations

from difflib import SequenceMatcher


def canonical(calls: list[str]) -> list[str]:
    """Drop consecutive duplicate anchors (load-twice artefacts); keep order otherwise.

    Only back-to-back repeats are collapsed; an anchor that recurs later, after a different
    anchor, is kept. A falsy `calls` (``None`` or empty) yields an empty list.
    """
    out: list[str] = []
    for c in calls or []:
        if not out or out[-1] != c:
            out.append(c)
    return out


def _align(a: list[str], b: list[str]) -> tuple[list[str], list[str], SequenceMatcher]:
    """Canonicalise both bodies and build the one matcher both public functions rely on."""
    ca, cb = canonical(a), canonical(b)
    # autojunk=False: difflib's default heuristic marks any element that makes up more than ~1%
    # of a 200+ item sequence as "popular" and refuses to start a match on it. Call anchors
    # repeat heavily in long bodies, so the default can report near-identical bodies as 0%
    # similar with every anchor both added and removed. Exact matching is worth the extra work
    # at the sizes a single method body reaches.
    return ca, cb, SequenceMatcher(None, ca, cb, autojunk=False)


def _percent(ca: list[str], cb: list[str], sm: SequenceMatcher) -> int:
    """Turn the matcher ratio into a 0–100 integer; two empty bodies count as identical."""
    if not ca and not cb:
        return 100
    return round(100 * sm.ratio())


def body_similarity(a: list[str], b: list[str]) -> int:
    """Order-aware similarity of two body fingerprints, 0–100. Two empty bodies are identical.

    Both lists are passed through `canonical()` first, so back-to-back duplicate anchors do not
    affect the score. The value is the rounded percentage of `SequenceMatcher.ratio()` and
    always equals `body_delta(a, b)["similarity"]`.
    """
    return _percent(*_align(a, b))


def body_delta(a: list[str], b: list[str]) -> dict:
    """Leaf-level change between two bodies: {similarity, added, removed}.

    `added`/`removed` are the anchors present only on the new/old side (in order), derived from
    the alignment — i.e. exactly which calls appeared or vanished. `a` is the old body and `b`
    the new one; both are canonicalised before alignment."""
    ca, cb, sm = _align(a, b)
    added: list[str] = []
    removed: list[str] = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        # A "replace" contributes to both sides: the old run vanished and the new run appeared.
        if tag in ("replace", "delete"):
            removed.extend(ca[i1:i2])
        if tag in ("replace", "insert"):
            added.extend(cb[j1:j2])
    return {
        "similarity": _percent(ca, cb, sm),
        "added": added,
        "removed": removed,
    }