Body normalisation: per-method similarity score + leaf delta

Turns the dispatch axis from a binary changed/unchanged into a "how much" measure of code change — the original goal. ams.normalize compares two body fingerprints (the ordered leaf-call anchors) with difflib after collapsing consecutive-duplicate anchors (a load-twice codegen artefact), yielding a 0-100 similarity and the exact leaves that appeared/vanished. Every dispatch `changed` entry now carries body={similarity, added, removed}, and the block carries a summary={shared, identical, changed, mean_similarity}. Golden pair (cross-compiler): 470 shared bodies, 131 identical, mean 66% similar; Animo SHOW/HIDE/PAUSE/RESUME come out 100% despite MSVC6 vs MSVC8, LOAD 50% with the swapped leaves spelled out. - normalize.py: canonical / body_similarity / body_delta - diff: _dispatch_diff enriches changed with body + adds summary - render: METHOD BODIES shows %, leaf delta, summary line - UI: similarity % + leaf delta + axis summary - tests: 5 new -> 34/34 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 13:23:15 +02:00
parent 27399a52b1
commit b0d3d22445
7 changed files with 198 additions and 18 deletions
--- a/ams/api/static/app.js
+++ b/ams/api/static/app.js
@@ -240,18 +240,34 @@ function axisCard(ax, block) {
  const sortByName = (arr) => arr.slice().sort((x, y) => ax.name(x).localeCompare(ax.name(y)));
  for (const it of sortByName(block.added)) body.append(el("div", { class: "row r-add" }, ax.fmt(it)));
  for (const it of sortByName(block.removed)) body.append(el("div", { class: "row r-del" }, ax.fmt(it)));
+  const leaves = (arr) => "[" + arr.slice(0, 4).join(", ") + (arr.length > 4 ? "…+" + (arr.length - 4) : "") + "]";
  for (const ch of block.changed.slice().sort((x, y) => ax.name(x.item).localeCompare(ax.name(y.item)))) {
-    const deltas = Object.entries(ch.changes).map(([f, v]) =>
-      (Array.isArray(v[0]) || Array.isArray(v[1]))
-        ? `${f}: ${(v[0] || []).length} → ${(v[1] || []).length}`
-        : `${f}: ${v[0]} → ${v[1]}`).join(", ");
-    body.append(el("div", { class: "row r-chg" }, ax.name(ch.item), "  ", el("span", { class: "delta" }, deltas)));
+    let deltas, sim = null;
+    if (ch.body) {  // method-body diff: similarity score + leaf-level delta
+      sim = ch.body.similarity;
+      const parts = [];
+      if (ch.body.added && ch.body.added.length) parts.push("+" + leaves(ch.body.added));
+      if (ch.body.removed && ch.body.removed.length) parts.push("−" + leaves(ch.body.removed));
+      deltas = parts.join("  ") || (ch.changes.impl ? `impl ${ch.changes.impl[0]} → ${ch.changes.impl[1]}` : "");
+    } else {
+      deltas = Object.entries(ch.changes).map(([f, v]) =>
+        (Array.isArray(v[0]) || Array.isArray(v[1]))
+          ? `${f}: ${(v[0] || []).length} → ${(v[1] || []).length}`
+          : `${f}: ${v[0]} → ${v[1]}`).join(", ");
+    }
+    const row = el("div", { class: "row r-chg" }, ax.name(ch.item), "  ");
+    if (sim != null) row.append(el("span", { class: "simpct" }, sim + "%"), " ");
+    row.append(el("span", { class: "delta" }, deltas));
+    body.append(row);
  }
+  const sum = block.summary
+    ? el("span", { class: "axsum" }, `śr. ${block.summary.mean_similarity}% · ${block.summary.changed}/${block.summary.shared} zmienionych`)
+    : null;
  return el("details", { class: "axis", open: true },
    el("summary", {}, el("span", { class: "title" }, ax.title),
      badge("b-add", "+", block.added.length),
      badge("b-del", "−", block.removed.length),
-      badge("b-chg", "~", block.changed.length)),
+      badge("b-chg", "~", block.changed.length), sum),
    body);
 }

--- a/ams/api/static/style.css
+++ b/ams/api/static/style.css
@@ -97,6 +97,8 @@ body { background: var(--bg); color: var(--fg); font: 13px/1.45 var(--mono); }
 .r-chg::before { content: "~"; color: var(--chg); }
 .r-del { color: var(--dim); }
 .delta { color: var(--chg); }
+.simpct { color: var(--accent); font-weight: 600; }
+.axsum { margin-left: auto; color: var(--dim); font-size: 11px; }
 .empty { color: var(--dim); font-style: italic; }
 .moved { color: var(--accent); }

--- a/ams/diff.py
+++ b/ams/diff.py
@@ -85,6 +85,33 @@ def _dispatch_with_names(snap: Snapshot) -> list[Item]:
    return out


+def _dispatch_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
+    """Diff the dispatch axis and annotate the result with body-level similarity.
+    Each `changed` entry gains `body` = body_delta(old calls, new calls); the block gains
+    `summary` = {shared, identical, changed, mean_similarity} over keys present on both sides."""
+    from .normalize import body_delta, body_similarity
+
+    do = _dispatch_with_names(old)
+    dn = _dispatch_with_names(new)
+    block = keyed_diff(do, dn, _dispatch_key, ["impl", "calls"])
+
+    old_calls = {_dispatch_key(r): r.get("calls", []) for r in do}  # key -> call anchors ([] if absent)
+    new_calls = {_dispatch_key(r): r.get("calls", []) for r in dn}
+    for ch in block["changed"]:  # enrich the entries in place
+        k = _dispatch_key(ch["item"])
+        ch["body"] = body_delta(old_calls.get(k, []), new_calls.get(k, []))
+
+    shared = set(old_calls) & set(new_calls)  # keys that exist in both snapshots
+    sims = [body_similarity(old_calls[k], new_calls[k]) for k in shared]  # scored for every shared key
+    block["summary"] = {
+        "shared": len(shared),
+        "identical": sum(1 for s in sims if s == 100),
+        "changed": sum(1 for s in sims if s < 100),  # NOTE(review): by body score; may differ from len(block["changed"])
+        "mean_similarity": int(round(sum(sims) / len(sims))) if sims else 100,  # 100 when nothing is shared
+    }
+    return block
+
+
 def compute_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
    return {
        "binary": {"from": old.binary, "to": new.binary},
@@ -94,8 +121,7 @@ def compute_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
        "fields": keyed_diff(old.fields, new.fields, _owner_name_key, ["type"]),
        "struct_layout": keyed_diff(old.struct_layout, new.struct_layout, _layout_key,
                                    ["size", "is_vtable"]),
-        "method_dispatch": keyed_diff(_dispatch_with_names(old), _dispatch_with_names(new),
-                                      _dispatch_key, ["impl", "calls"]),
+        "method_dispatch": _dispatch_diff(old, new),
        "method_inheritance": keyed_diff(old.method_inheritance, new.method_inheritance,
                                         lambda x: x["runner"], ["base_runner"]),
        "field_inheritance": keyed_diff(old.field_inheritance, new.field_inheritance,
--- a/ams/normalize.py
+++ b/ams/normalize.py
@@ -0,0 +1,54 @@
+"""Body-level normalisation: *how much* did a method's implementation change?
+
+The dispatch axis gives each method an ordered list of leaf CALL anchors (its body
+fingerprint). Here we turn two such lists into a single similarity score plus a precise
+leaf-level delta, so a body diff reads as "SHOW 82% — +vtbl+0x58, -CMC_Foo::bar" instead
+of a binary changed/unchanged.
+
+`canonical()` collapses consecutive duplicate anchors — the strlen+memcpy / load-twice idioms
+emit the same leaf back-to-back, and that doubling is a codegen artefact, not a real call — so
+the comparison doesn't punish it. Order is otherwise preserved; matching is via
+difflib.SequenceMatcher (stdlib, deterministic), which is insertion/deletion aware.
+"""
+
+from __future__ import annotations
+
+from difflib import SequenceMatcher
+
+
+def canonical(calls: list[str]) -> list[str]:
+    """Collapse each run of consecutive identical anchors to one; None/empty input yields []."""
+    out: list[str] = []
+    for c in calls or []:
+        if not out or out[-1] != c:  # keep an anchor only when it differs from the one just kept
+            out.append(c)
+    return out
+
+
+def body_similarity(a: list[str], b: list[str]) -> int:
+    """Order-aware similarity of two body fingerprints, 0–100; 100 only when canonically equal."""
+    ca, cb = canonical(a), canonical(b)
+    if ca == cb:  # exact match (incl. two empty bodies); anything else is capped at 99 below
+        return 100
+    return min(99, int(round(100 * SequenceMatcher(None, ca, cb).ratio())))
+
+
+def body_delta(a: list[str], b: list[str]) -> dict:
+    """Leaf-level change between two bodies: {similarity, added, removed}.
+
+    `removed`/`added` collect, in order, the anchors the SequenceMatcher alignment marks as
+    deleted/inserted (a `replace` feeds both), so a reordered anchor can appear in both lists."""
+    ca, cb = canonical(a), canonical(b)
+    sm = SequenceMatcher(None, ca, cb)
+    added: list[str] = []
+    removed: list[str] = []
+    for tag, i1, i2, j1, j2 in sm.get_opcodes():
+        if tag in ("replace", "delete"):
+            removed.extend(ca[i1:i2])
+        if tag in ("replace", "insert"):
+            added.extend(cb[j1:j2])
+    return {
+        "similarity": 100 if ca == cb else min(99, int(round(100 * sm.ratio()))),  # same rule as body_similarity
+        "added": added,
+        "removed": removed,
+    }
--- a/ams/render.py
+++ b/ams/render.py
@@ -82,11 +82,23 @@ def _dispatch_name(r: dict) -> str:
    return r.get("name") or "id {0}".format(r.get("id"))


+def _leaves(items: list, cap: int = 4) -> str:  # first `cap` items comma-joined, then "…+N" for the rest
+    overflow = len(items) - cap
+    tail = "…+{0}".format(overflow) if overflow > 0 else ""
+    return ", ".join(items[:cap]) + tail
+
+
 def _section_dispatch(out: list[str], block: dict) -> None:
-    """Method-body fingerprints (per owner+id). `calls` deltas are summarised by length so the
-    line stays readable; the full anchor lists live in the JSON."""
+    """Method bodies (per owner+id), normalised. Each changed entry shows a similarity score and
+    the leaf-level delta (which calls appeared/vanished); a summary line gives the overall drift."""
    out.append("")
-    out.append("{0:<16} {1}".format("METHOD BODIES", _counts(block)))
+    summ = block.get("summary")
+    head = "METHOD BODIES"
+    if summ:
+        head = "{0}  (shared {1}, ~{2} changed, mean {3}% similar)".format(
+            "METHOD BODIES", summ["shared"], summ["changed"], summ["mean_similarity"])
+    out.append("{0}".format(head))
+    out.append("{0:<16} {1}".format("", _counts(block)))
    owner_of = lambda r: r["owner"]
    added = _group_by(block["added"], owner_of)
    removed = _group_by(block["removed"], owner_of)
@@ -99,14 +111,18 @@ def _section_dispatch(out: list[str], block: dict) -> None:
        for it in sorted(removed.get(owner, []), key=_dispatch_name):
            out.append("    - {0}".format(_dispatch_name(it)))
        for it in sorted(changed.get(owner, []), key=_dispatch_name):
-            ch = change_by_id[id(it)]["changes"]
+            ch = change_by_id[id(it)]
+            body = ch.get("body", {})
+            sim = body.get("similarity")
            bits = []
-            if "impl" in ch:
-                bits.append("impl {0} -> {1}".format(ch["impl"][0], ch["impl"][1]))
-            if "calls" in ch:
-                a, b = ch["calls"]
-                bits.append("calls {0} -> {1}".format(len(a or []), len(b or [])))
-            out.append("    ~ {0:<22} {1}".format(_dispatch_name(it), "; ".join(bits)))
+            if body.get("added"):
+                bits.append("+[{0}]".format(_leaves(body["added"])))
+            if body.get("removed"):
+                bits.append("-[{0}]".format(_leaves(body["removed"])))
+            if not bits and "impl" in ch["changes"]:
+                bits.append("impl {0} -> {1}".format(*ch["changes"]["impl"]))
+            label = "{0} {1}%".format(_dispatch_name(it), sim) if sim is not None else _dispatch_name(it)
+            out.append("    ~ {0:<26} {1}".format(label, "  ".join(bits)))


 _EMPTY = {"added": [], "removed": [], "changed": []}