Body normalisation: per-method similarity score + leaf delta
Turns the dispatch axis from a binary changed/unchanged into a "how much" measure
of code change — the original goal. ams.normalize compares two body fingerprints
(the ordered leaf-call anchors) with difflib after collapsing consecutive-duplicate
anchors (a load-twice codegen artefact), yielding a 0-100 similarity and the exact
leaves that appeared/vanished.
Every dispatch `changed` entry now carries body={similarity, added, removed}, and the
block carries a summary={shared, identical, changed, mean_similarity}.
Golden pair (cross-compiler): 470 shared bodies, 131 identical, mean 66% similar;
Animo SHOW/HIDE/PAUSE/RESUME come out 100% despite MSVC6 vs MSVC8, LOAD 50% with the
swapped leaves spelled out.
- normalize.py: canonical / body_similarity / body_delta
- diff: _dispatch_diff enriches changed with body + adds summary
- render: METHOD BODIES shows %, leaf delta, summary line
- UI: similarity % + leaf delta + axis summary
- tests: 5 new -> 34/34
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -110,6 +110,13 @@ każdej metody — wykrywa **zmiany ciała** między wersjami; jak `struct_layou
|
|||||||
między wersjami tego samego kompilatora (cross-compiler proste metody i tak się zgadzają,
|
między wersjami tego samego kompilatora (cross-compiler proste metody i tak się zgadzają,
|
||||||
np. Animo `SHOW`→`vtbl+0xa0` na MSVC6 i MSVC8).
|
np. Animo `SHOW`→`vtbl+0xa0` na MSVC6 i MSVC8).
|
||||||
|
|
||||||
|
**Normalizacja ciał** (`ams.normalize`): każda zmiana w osi `dispatch` niesie wynik
|
||||||
|
`body = {similarity, added, removed}` — podobieństwo 0–100% sekwencji liści (`difflib`,
|
||||||
|
po zwinięciu sąsiednich duplikatów = artefaktów codegenu) oraz *które* wywołania doszły/zniknęły.
|
||||||
|
Blok dostaje też `summary` (wspólne / identyczne / zmienione / średnie podobieństwo). Na golden
|
||||||
|
pair (cross-compiler): 470 wspólnych ciał, 131 identycznych, średnio 66% — a
|
||||||
|
`SHOW/HIDE/PAUSE/RESUME` Animo wychodzą 100% mimo MSVC6↔MSVC8. To jest miara „na ile się zmieniło” na poziomie kodu.
|
||||||
|
|
||||||
## Backend (FastAPI + katalog)
|
## Backend (FastAPI + katalog)
|
||||||
|
|
||||||
Modularny monolit nad SQLAlchemy — domyślnie SQLite (zero setupu), gotowy pod Postgres
|
Modularny monolit nad SQLAlchemy — domyślnie SQLite (zero setupu), gotowy pod Postgres
|
||||||
|
|||||||
@@ -240,18 +240,34 @@ function axisCard(ax, block) {
|
|||||||
const sortByName = (arr) => arr.slice().sort((x, y) => ax.name(x).localeCompare(ax.name(y)));
|
const sortByName = (arr) => arr.slice().sort((x, y) => ax.name(x).localeCompare(ax.name(y)));
|
||||||
for (const it of sortByName(block.added)) body.append(el("div", { class: "row r-add" }, ax.fmt(it)));
|
for (const it of sortByName(block.added)) body.append(el("div", { class: "row r-add" }, ax.fmt(it)));
|
||||||
for (const it of sortByName(block.removed)) body.append(el("div", { class: "row r-del" }, ax.fmt(it)));
|
for (const it of sortByName(block.removed)) body.append(el("div", { class: "row r-del" }, ax.fmt(it)));
|
||||||
|
const leaves = (arr) => "[" + arr.slice(0, 4).join(", ") + (arr.length > 4 ? "…+" + (arr.length - 4) : "") + "]";
|
||||||
for (const ch of block.changed.slice().sort((x, y) => ax.name(x.item).localeCompare(ax.name(y.item)))) {
|
for (const ch of block.changed.slice().sort((x, y) => ax.name(x.item).localeCompare(ax.name(y.item)))) {
|
||||||
const deltas = Object.entries(ch.changes).map(([f, v]) =>
|
let deltas, sim = null;
|
||||||
(Array.isArray(v[0]) || Array.isArray(v[1]))
|
if (ch.body) { // method-body diff: similarity score + leaf-level delta
|
||||||
? `${f}: ${(v[0] || []).length} → ${(v[1] || []).length}`
|
sim = ch.body.similarity;
|
||||||
: `${f}: ${v[0]} → ${v[1]}`).join(", ");
|
const parts = [];
|
||||||
body.append(el("div", { class: "row r-chg" }, ax.name(ch.item), " ", el("span", { class: "delta" }, deltas)));
|
if (ch.body.added && ch.body.added.length) parts.push("+" + leaves(ch.body.added));
|
||||||
|
if (ch.body.removed && ch.body.removed.length) parts.push("−" + leaves(ch.body.removed));
|
||||||
|
deltas = parts.join(" ") || (ch.changes.impl ? `impl ${ch.changes.impl[0]} → ${ch.changes.impl[1]}` : "");
|
||||||
|
} else {
|
||||||
|
deltas = Object.entries(ch.changes).map(([f, v]) =>
|
||||||
|
(Array.isArray(v[0]) || Array.isArray(v[1]))
|
||||||
|
? `${f}: ${(v[0] || []).length} → ${(v[1] || []).length}`
|
||||||
|
: `${f}: ${v[0]} → ${v[1]}`).join(", ");
|
||||||
|
}
|
||||||
|
const row = el("div", { class: "row r-chg" }, ax.name(ch.item), " ");
|
||||||
|
if (sim != null) row.append(el("span", { class: "simpct" }, sim + "%"), " ");
|
||||||
|
row.append(el("span", { class: "delta" }, deltas));
|
||||||
|
body.append(row);
|
||||||
}
|
}
|
||||||
|
const sum = block.summary
|
||||||
|
? el("span", { class: "axsum" }, `śr. ${block.summary.mean_similarity}% · ${block.summary.changed}/${block.summary.shared} zmienionych`)
|
||||||
|
: null;
|
||||||
return el("details", { class: "axis", open: true },
|
return el("details", { class: "axis", open: true },
|
||||||
el("summary", {}, el("span", { class: "title" }, ax.title),
|
el("summary", {}, el("span", { class: "title" }, ax.title),
|
||||||
badge("b-add", "+", block.added.length),
|
badge("b-add", "+", block.added.length),
|
||||||
badge("b-del", "−", block.removed.length),
|
badge("b-del", "−", block.removed.length),
|
||||||
badge("b-chg", "~", block.changed.length)),
|
badge("b-chg", "~", block.changed.length), sum),
|
||||||
body);
|
body);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -97,6 +97,8 @@ body { background: var(--bg); color: var(--fg); font: 13px/1.45 var(--mono); }
|
|||||||
.r-chg::before { content: "~"; color: var(--chg); }
|
.r-chg::before { content: "~"; color: var(--chg); }
|
||||||
.r-del { color: var(--dim); }
|
.r-del { color: var(--dim); }
|
||||||
.delta { color: var(--chg); }
|
.delta { color: var(--chg); }
|
||||||
|
.simpct { color: var(--accent); font-weight: 600; }
|
||||||
|
.axsum { margin-left: auto; color: var(--dim); font-size: 11px; }
|
||||||
.empty { color: var(--dim); font-style: italic; }
|
.empty { color: var(--dim); font-style: italic; }
|
||||||
.moved { color: var(--accent); }
|
.moved { color: var(--accent); }
|
||||||
|
|
||||||
|
|||||||
30
ams/diff.py
30
ams/diff.py
@@ -85,6 +85,33 @@ def _dispatch_with_names(snap: Snapshot) -> list[Item]:
|
|||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _dispatch_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
    """Diff the dispatch axis and attach body-level normalisation results.

    Every entry of the block's `changed` list gains a `body` mapping
    ({similarity, added, removed}, produced by ams.normalize.body_delta). The block
    itself gains a `summary` holding the number of keys present on both sides, how
    many of those score exactly 100 / below 100, and the rounded mean similarity
    (100 when no key is shared).
    """
    from .normalize import body_delta, body_similarity

    old_rows = _dispatch_with_names(old)
    new_rows = _dispatch_with_names(new)
    block = keyed_diff(old_rows, new_rows, _dispatch_key, ["impl", "calls"])

    # Index each side's leaf-call list by dispatch key.
    calls_before = {}
    for row in old_rows:
        calls_before[_dispatch_key(row)] = row.get("calls", [])
    calls_after = {}
    for row in new_rows:
        calls_after[_dispatch_key(row)] = row.get("calls", [])

    # Per-entry leaf delta for everything keyed_diff reported as changed.
    for entry in block["changed"]:
        key = _dispatch_key(entry["item"])
        entry["body"] = body_delta(calls_before.get(key, []), calls_after.get(key, []))

    # Block-level summary over every key that exists on both sides.
    common = calls_before.keys() & calls_after.keys()
    scores = [body_similarity(calls_before[key], calls_after[key]) for key in common]
    mean = int(round(sum(scores) / len(scores))) if scores else 100
    block["summary"] = {
        "shared": len(common),
        "identical": scores.count(100),
        "changed": len([score for score in scores if score < 100]),
        "mean_similarity": mean,
    }
    return block
|
||||||
|
|
||||||
|
|
||||||
def compute_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
|
def compute_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"binary": {"from": old.binary, "to": new.binary},
|
"binary": {"from": old.binary, "to": new.binary},
|
||||||
@@ -94,8 +121,7 @@ def compute_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
|
|||||||
"fields": keyed_diff(old.fields, new.fields, _owner_name_key, ["type"]),
|
"fields": keyed_diff(old.fields, new.fields, _owner_name_key, ["type"]),
|
||||||
"struct_layout": keyed_diff(old.struct_layout, new.struct_layout, _layout_key,
|
"struct_layout": keyed_diff(old.struct_layout, new.struct_layout, _layout_key,
|
||||||
["size", "is_vtable"]),
|
["size", "is_vtable"]),
|
||||||
"method_dispatch": keyed_diff(_dispatch_with_names(old), _dispatch_with_names(new),
|
"method_dispatch": _dispatch_diff(old, new),
|
||||||
_dispatch_key, ["impl", "calls"]),
|
|
||||||
"method_inheritance": keyed_diff(old.method_inheritance, new.method_inheritance,
|
"method_inheritance": keyed_diff(old.method_inheritance, new.method_inheritance,
|
||||||
lambda x: x["runner"], ["base_runner"]),
|
lambda x: x["runner"], ["base_runner"]),
|
||||||
"field_inheritance": keyed_diff(old.field_inheritance, new.field_inheritance,
|
"field_inheritance": keyed_diff(old.field_inheritance, new.field_inheritance,
|
||||||
|
|||||||
54
ams/normalize.py
Normal file
54
ams/normalize.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
"""Body-level normalisation: *how much* did a method's implementation change?
|
||||||
|
|
||||||
|
The dispatch axis gives each method an ordered list of leaf CALL anchors (its body
|
||||||
|
fingerprint). Here we turn two such lists into a single similarity score plus a precise
|
||||||
|
leaf-level delta, so a body diff reads as "SHOW 82% — +vtbl+0x58, -CMC_Foo::bar" instead
|
||||||
|
of a binary changed/unchanged.
|
||||||
|
|
||||||
|
`canonical()` collapses consecutive duplicate anchors — the strlen+memcpy / load-twice idioms
|
||||||
|
emit the same leaf back-to-back, and that doubling is a codegen artefact, not a real call — so
|
||||||
|
the comparison doesn't punish it. Order is otherwise preserved; matching is via
|
||||||
|
difflib.SequenceMatcher (stdlib, deterministic), which is insertion/deletion aware.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
|
|
||||||
|
def canonical(calls: list[str]) -> list[str]:
    """Drop consecutive duplicate anchors (load-twice artefacts); keep order otherwise.

    `None` or an empty list yields an empty list; the input is never mutated.
    """
    out: list[str] = []
    for c in calls or []:
        if not out or out[-1] != c:
            out.append(c)
    return out


def _aligned(a: list[str], b: list[str]) -> tuple[list[str], list[str], SequenceMatcher]:
    """Canonicalise both fingerprints and build the matcher that aligns them."""
    ca, cb = canonical(a), canonical(b)
    # autojunk must be off. With the default (on), difflib treats any anchor that makes
    # up more than 1% of a 200+ item sequence as "popular" and drops it from the match
    # index, so a large body that differs by a few leading calls would score 0% with
    # every leaf reported as added/removed.
    return ca, cb, SequenceMatcher(None, ca, cb, autojunk=False)


def _percent(ca: list[str], cb: list[str], sm: SequenceMatcher) -> int:
    """Matcher ratio as an integer 0-100; two empty bodies count as identical."""
    if not ca and not cb:
        return 100
    return round(100 * sm.ratio())


def body_similarity(a: list[str], b: list[str]) -> int:
    """Order-aware similarity of two body fingerprints, 0-100.

    Both lists are canonicalised first (consecutive duplicates collapsed). Two empty
    bodies are identical (100). The score is the rounded difflib ratio, so it can
    read 100 for bodies that differ only marginally relative to their length.
    """
    ca, cb, sm = _aligned(a, b)
    return _percent(ca, cb, sm)


def body_delta(a: list[str], b: list[str]) -> dict:
    """Leaf-level change between two bodies: {similarity, added, removed}.

    `removed` lists the old-side anchors and `added` the new-side anchors that fall in
    the unmatched regions of the alignment (replace/delete and replace/insert opcodes),
    each in body order. An anchor that merely moved can therefore appear in both lists.
    `similarity` is computed exactly as in `body_similarity`.
    """
    ca, cb, sm = _aligned(a, b)
    added: list[str] = []
    removed: list[str] = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag in ("replace", "delete"):
            removed.extend(ca[i1:i2])
        if tag in ("replace", "insert"):
            added.extend(cb[j1:j2])
    return {
        "similarity": _percent(ca, cb, sm),
        "added": added,
        "removed": removed,
    }
|
||||||
@@ -82,11 +82,23 @@ def _dispatch_name(r: dict) -> str:
|
|||||||
return r.get("name") or "id {0}".format(r.get("id"))
|
return r.get("name") or "id {0}".format(r.get("id"))
|
||||||
|
|
||||||
|
|
||||||
|
def _leaves(items: list, cap: int = 4) -> str:
|
||||||
|
shown = items[:cap]
|
||||||
|
extra = "…+{0}".format(len(items) - cap) if len(items) > cap else ""
|
||||||
|
return ", ".join(shown) + extra
|
||||||
|
|
||||||
|
|
||||||
def _section_dispatch(out: list[str], block: dict) -> None:
|
def _section_dispatch(out: list[str], block: dict) -> None:
|
||||||
"""Method-body fingerprints (per owner+id). `calls` deltas are summarised by length so the
|
"""Method bodies (per owner+id), normalised. Each changed entry shows a similarity score and
|
||||||
line stays readable; the full anchor lists live in the JSON."""
|
the leaf-level delta (which calls appeared/vanished); a summary line gives the overall drift."""
|
||||||
out.append("")
|
out.append("")
|
||||||
out.append("{0:<16} {1}".format("METHOD BODIES", _counts(block)))
|
summ = block.get("summary")
|
||||||
|
head = "METHOD BODIES"
|
||||||
|
if summ:
|
||||||
|
head = "{0} (shared {1}, ~{2} changed, mean {3}% similar)".format(
|
||||||
|
"METHOD BODIES", summ["shared"], summ["changed"], summ["mean_similarity"])
|
||||||
|
out.append("{0}".format(head))
|
||||||
|
out.append("{0:<16} {1}".format("", _counts(block)))
|
||||||
owner_of = lambda r: r["owner"]
|
owner_of = lambda r: r["owner"]
|
||||||
added = _group_by(block["added"], owner_of)
|
added = _group_by(block["added"], owner_of)
|
||||||
removed = _group_by(block["removed"], owner_of)
|
removed = _group_by(block["removed"], owner_of)
|
||||||
@@ -99,14 +111,18 @@ def _section_dispatch(out: list[str], block: dict) -> None:
|
|||||||
for it in sorted(removed.get(owner, []), key=_dispatch_name):
|
for it in sorted(removed.get(owner, []), key=_dispatch_name):
|
||||||
out.append(" - {0}".format(_dispatch_name(it)))
|
out.append(" - {0}".format(_dispatch_name(it)))
|
||||||
for it in sorted(changed.get(owner, []), key=_dispatch_name):
|
for it in sorted(changed.get(owner, []), key=_dispatch_name):
|
||||||
ch = change_by_id[id(it)]["changes"]
|
ch = change_by_id[id(it)]
|
||||||
|
body = ch.get("body", {})
|
||||||
|
sim = body.get("similarity")
|
||||||
bits = []
|
bits = []
|
||||||
if "impl" in ch:
|
if body.get("added"):
|
||||||
bits.append("impl {0} -> {1}".format(ch["impl"][0], ch["impl"][1]))
|
bits.append("+[{0}]".format(_leaves(body["added"])))
|
||||||
if "calls" in ch:
|
if body.get("removed"):
|
||||||
a, b = ch["calls"]
|
bits.append("-[{0}]".format(_leaves(body["removed"])))
|
||||||
bits.append("calls {0} -> {1}".format(len(a or []), len(b or [])))
|
if not bits and "impl" in ch["changes"]:
|
||||||
out.append(" ~ {0:<22} {1}".format(_dispatch_name(it), "; ".join(bits)))
|
bits.append("impl {0} -> {1}".format(*ch["changes"]["impl"]))
|
||||||
|
label = "{0} {1}%".format(_dispatch_name(it), sim) if sim is not None else _dispatch_name(it)
|
||||||
|
out.append(" ~ {0:<26} {1}".format(label, " ".join(bits)))
|
||||||
|
|
||||||
|
|
||||||
_EMPTY = {"added": [], "removed": [], "changed": []}
|
_EMPTY = {"added": [], "removed": [], "changed": []}
|
||||||
|
|||||||
59
tests/test_normalize.py
Normal file
59
tests/test_normalize.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
"""Body-level normalisation: similarity score + leaf delta, and its wiring into the diff."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from ams.normalize import body_delta, body_similarity, canonical
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonical_collapses_consecutive_dups():
    """Runs of a repeated anchor shrink to one occurrence; an empty body stays empty."""
    cases = [
        (["a", "a", "b", "b", "a"], ["a", "b", "a"]),
        ([], []),
    ]
    for raw, expected in cases:
        assert canonical(raw) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_body_similarity_bounds():
    """Exact scores at the extremes, plus a range check for a single inserted leaf."""
    for left, right, expected in (
        ([], [], 100),
        (["a", "b"], ["a", "b"], 100),
        (["a", "b", "c"], ["x", "y", "z"], 0),
    ):
        assert body_similarity(left, right) == expected
    # one inserted leaf out of a shared core -> high but < 100
    score = body_similarity(["a", "b", "c"], ["a", "b", "X", "c"])
    assert score >= 70 and score < 100
|
||||||
|
|
||||||
|
|
||||||
|
def test_body_similarity_ignores_load_twice_artifact():
    """A back-to-back repeated anchor must not lower the score."""
    # the doubled anchor is a codegen artefact; canonical() makes the two bodies identical
    doubled = ["getAnimo", "vtbl+0x3c", "vtbl+0x3c"]
    single = ["getAnimo", "vtbl+0x3c"]
    assert body_similarity(doubled, single) == 100
|
||||||
|
|
||||||
|
|
||||||
|
def test_body_delta_added_removed():
    """Swapping one leaf reports it as removed, its replacement as added, and score < 100."""
    delta = body_delta(["a", "b", "c"], ["a", "X", "c"])
    assert delta["removed"] == ["b"]
    assert delta["added"] == ["X"]
    assert delta["similarity"] < 100
|
||||||
|
|
||||||
|
|
||||||
|
def test_dispatch_diff_carries_body_and_summary():
    """compute_diff's method_dispatch block exposes both the summary and per-entry body."""
    from ams.diff import compute_diff
    from ams.snapshot import Snapshot

    def make_snapshot(show_calls, load_calls):
        # Two Animo methods: SHOW (id 1) and LOAD (id 7); only their call lists vary.
        payload = {
            "binary": {}, "types": [], "events": [], "fields": [],
            "struct_layout": [], "method_inheritance": [], "field_inheritance": [],
            "methods": [{"owner": "CMC_Animo", "name": "SHOW", "id": 1},
                        {"owner": "CMC_Animo", "name": "LOAD", "id": 7}],
            "method_dispatch": [
                {"owner": "CMC_Animo", "id": 1, "impl": None, "calls": show_calls},
                {"owner": "CMC_Animo", "id": 7, "impl": None, "calls": load_calls},
            ],
        }
        return Snapshot(payload)

    # SHOW identical, LOAD changed
    before = make_snapshot(["getAnimo", "vtbl+0xa0"], ["a", "b", "c"])
    after = make_snapshot(["getAnimo", "vtbl+0xa0"], ["a", "X", "c"])
    block = compute_diff(before, after)["method_dispatch"]

    summary = block["summary"]
    assert summary["shared"] == 2
    assert summary["identical"] == 1
    assert summary["changed"] == 1
    # mean of 100 (SHOW) and ~67 (LOAD)
    assert 75 <= summary["mean_similarity"] <= 92

    changed = block["changed"]
    assert len(changed) == 1
    assert changed[0]["item"]["name"] == "LOAD"
    assert changed[0]["body"]["added"] == ["X"]
    assert changed[0]["body"]["removed"] == ["b"]
|
||||||
Reference in New Issue
Block a user