diff --git a/README.md b/README.md index c1b72d6..f67fb04 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,13 @@ każdej metody — wykrywa **zmiany ciała** między wersjami; jak `struct_layou między wersjami tego samego kompilatora (cross-compiler proste metody i tak się zgadzają, np. Animo `SHOW`→`vtbl+0xa0` na MSVC6 i MSVC8). +**Normalizacja ciał** (`ams.normalize`): każda zmiana w osi `dispatch` niesie wynik +`body = {similarity, added, removed}` — podobieństwo 0–100% sekwencji liści (`difflib`, +po zwinięciu sąsiednich duplikatów = artefaktów codegenu) oraz *które* wywołania doszły/zniknęły. +Blok dostaje też `summary` (wspólne / identyczne / zmienione / średnie podobieństwo). Na golden +pair (cross-compiler): 470 wspólnych ciał, 131 identycznych, średnio 66% — a `SHOW/HIDE/PAUSE/ +RESUME` Animo wychodzą 100% mimo MSVC6↔MSVC8. To jest miara „na ile się zmieniło" na poziomie kodu. + ## Backend (FastAPI + katalog) Modularny monolit nad SQLAlchemy — domyślnie SQLite (zero setupu), gotowy pod Postgres diff --git a/ams/api/static/app.js b/ams/api/static/app.js index c737660..33ae86c 100644 --- a/ams/api/static/app.js +++ b/ams/api/static/app.js @@ -240,18 +240,34 @@ function axisCard(ax, block) { const sortByName = (arr) => arr.slice().sort((x, y) => ax.name(x).localeCompare(ax.name(y))); for (const it of sortByName(block.added)) body.append(el("div", { class: "row r-add" }, ax.fmt(it))); for (const it of sortByName(block.removed)) body.append(el("div", { class: "row r-del" }, ax.fmt(it))); + const leaves = (arr) => "[" + arr.slice(0, 4).join(", ") + (arr.length > 4 ? "…+" + (arr.length - 4) : "") + "]"; for (const ch of block.changed.slice().sort((x, y) => ax.name(x.item).localeCompare(ax.name(y.item)))) { - const deltas = Object.entries(ch.changes).map(([f, v]) => - (Array.isArray(v[0]) || Array.isArray(v[1])) - ? 
`${f}: ${(v[0] || []).length} → ${(v[1] || []).length}` - : `${f}: ${v[0]} → ${v[1]}`).join(", "); - body.append(el("div", { class: "row r-chg" }, ax.name(ch.item), " ", el("span", { class: "delta" }, deltas))); + let deltas, sim = null; + if (ch.body) { // method-body diff: similarity score + leaf-level delta + sim = ch.body.similarity; + const parts = []; + if (ch.body.added && ch.body.added.length) parts.push("+" + leaves(ch.body.added)); + if (ch.body.removed && ch.body.removed.length) parts.push("−" + leaves(ch.body.removed)); + deltas = parts.join(" ") || (ch.changes.impl ? `impl ${ch.changes.impl[0]} → ${ch.changes.impl[1]}` : ""); + } else { + deltas = Object.entries(ch.changes).map(([f, v]) => + (Array.isArray(v[0]) || Array.isArray(v[1])) + ? `${f}: ${(v[0] || []).length} → ${(v[1] || []).length}` + : `${f}: ${v[0]} → ${v[1]}`).join(", "); + } + const row = el("div", { class: "row r-chg" }, ax.name(ch.item), " "); + if (sim != null) row.append(el("span", { class: "simpct" }, sim + "%"), " "); + row.append(el("span", { class: "delta" }, deltas)); + body.append(row); } + const sum = block.summary + ? el("span", { class: "axsum" }, `śr. 
def _dispatch_diff(old: Snapshot, new: Snapshot) -> dict[str, Any]:
    """Diff the dispatch axis and annotate it with body-level similarity data.

    The keyed diff over ``impl`` and ``calls`` is computed first. Every entry
    of its ``changed`` list then receives a ``body`` value produced by
    ``body_delta`` for that method's old and new ``calls`` lists, and the block
    receives a ``summary`` dict with the keys ``shared``, ``identical``,
    ``changed`` and ``mean_similarity``.

    The summary is computed over every dispatch key present in both snapshots;
    its ``changed`` count is derived from similarity scores below 100, not
    from the length of ``block["changed"]``. With no shared keys the mean
    similarity is reported as 100.
    """
    from .normalize import body_delta, body_similarity

    before = _dispatch_with_names(old)
    after = _dispatch_with_names(new)
    block = keyed_diff(before, after, _dispatch_key, ["impl", "calls"])

    # Index the call lists of both sides by dispatch key.
    old_calls = {}
    for row in before:
        old_calls[_dispatch_key(row)] = row.get("calls", [])
    new_calls = {}
    for row in after:
        new_calls[_dispatch_key(row)] = row.get("calls", [])

    # Attach the leaf-level delta to each changed entry.
    for entry in block["changed"]:
        key = _dispatch_key(entry["item"])
        entry["body"] = body_delta(old_calls.get(key, []), new_calls.get(key, []))

    # Overall drift across the bodies that exist on both sides.
    shared = set(old_calls).intersection(new_calls)
    scores = [body_similarity(old_calls[key], new_calls[key]) for key in shared]
    if scores:
        mean = int(round(sum(scores) / len(scores)))
    else:
        mean = 100
    block["summary"] = {
        "shared": len(shared),
        "identical": len([score for score in scores if score == 100]),
        "changed": len([score for score in scores if score < 100]),
        "mean_similarity": mean,
    }
    return block
# --- ams/normalize.py -------------------------------------------------------

from difflib import SequenceMatcher
from itertools import groupby


def canonical(calls: list[str]) -> list[str]:
    """Return *calls* with each run of consecutive duplicate anchors collapsed.

    Back-to-back repeats of the same leaf are treated as codegen artefacts
    (load-twice idioms) rather than separate calls. Non-adjacent repeats are
    kept and the relative order is preserved. ``None`` or an empty list
    yields ``[]``.
    """
    return [anchor for anchor, _run in groupby(calls or [])]


def _align(ca: list[str], cb: list[str]) -> SequenceMatcher:
    """Build the matcher for two canonical bodies with difflib's autojunk off."""
    # autojunk defaults to True: once the second sequence holds 200+ items,
    # every element that makes up more than ~1% of it is removed from the
    # match index. Long bodies that keep calling the same few anchors would
    # then lose their common core (similarity collapses towards 0 and the
    # delta lists the whole body), so the heuristic is disabled here.
    return SequenceMatcher(None, ca, cb, autojunk=False)


def _percent(matcher: SequenceMatcher, ca: list[str], cb: list[str]) -> int:
    """Turn a matcher ratio into an integer 0-100; two empty bodies count as 100."""
    if not ca and not cb:
        return 100
    return round(100 * matcher.ratio())


def body_similarity(a: list[str], b: list[str]) -> int:
    """Return the order-aware similarity of two body fingerprints, 0-100.

    Both inputs are canonicalised first, so adjacent duplicate anchors do not
    affect the score. Two empty bodies are identical (100).
    """
    ca, cb = canonical(a), canonical(b)
    return _percent(_align(ca, cb), ca, cb)


def body_delta(a: list[str], b: list[str]) -> dict:
    """Return the leaf-level change between two bodies.

    The result is ``{"similarity": int, "added": [...], "removed": [...]}``.
    ``removed`` holds the anchors of the old body *a* that the alignment does
    not match, ``added`` those of the new body *b*, each in body order. The
    similarity is the same value ``body_similarity(a, b)`` returns.
    """
    ca, cb = canonical(a), canonical(b)
    matcher = _align(ca, cb)
    added: list[str] = []
    removed: list[str] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # "replace" contributes to both sides; "equal" to neither.
        if tag in ("replace", "delete"):
            removed.extend(ca[i1:i2])
        if tag in ("replace", "insert"):
            added.extend(cb[j1:j2])
    return {
        "similarity": _percent(matcher, ca, cb),
        "added": added,
        "removed": removed,
    }


# --- ams/render.py ----------------------------------------------------------

def _leaves(items: list, cap: int = 4) -> str:
    """Join the first *cap* entries of *items* with ", ".

    When *items* is longer than *cap*, "…+N" is appended, N being the number
    of entries left out.
    """
    overflow = len(items) - cap
    suffix = "…+{0}".format(overflow) if overflow > 0 else ""
    return ", ".join(items[:cap]) + suffix
`calls` deltas are summarised by length so the - line stays readable; the full anchor lists live in the JSON.""" + """Method bodies (per owner+id), normalised. Each changed entry shows a similarity score and + the leaf-level delta (which calls appeared/vanished); a summary line gives the overall drift.""" out.append("") - out.append("{0:<16} {1}".format("METHOD BODIES", _counts(block))) + summ = block.get("summary") + head = "METHOD BODIES" + if summ: + head = "{0} (shared {1}, ~{2} changed, mean {3}% similar)".format( + "METHOD BODIES", summ["shared"], summ["changed"], summ["mean_similarity"]) + out.append("{0}".format(head)) + out.append("{0:<16} {1}".format("", _counts(block))) owner_of = lambda r: r["owner"] added = _group_by(block["added"], owner_of) removed = _group_by(block["removed"], owner_of) @@ -99,14 +111,18 @@ def _section_dispatch(out: list[str], block: dict) -> None: for it in sorted(removed.get(owner, []), key=_dispatch_name): out.append(" - {0}".format(_dispatch_name(it))) for it in sorted(changed.get(owner, []), key=_dispatch_name): - ch = change_by_id[id(it)]["changes"] + ch = change_by_id[id(it)] + body = ch.get("body", {}) + sim = body.get("similarity") bits = [] - if "impl" in ch: - bits.append("impl {0} -> {1}".format(ch["impl"][0], ch["impl"][1])) - if "calls" in ch: - a, b = ch["calls"] - bits.append("calls {0} -> {1}".format(len(a or []), len(b or []))) - out.append(" ~ {0:<22} {1}".format(_dispatch_name(it), "; ".join(bits))) + if body.get("added"): + bits.append("+[{0}]".format(_leaves(body["added"]))) + if body.get("removed"): + bits.append("-[{0}]".format(_leaves(body["removed"]))) + if not bits and "impl" in ch["changes"]: + bits.append("impl {0} -> {1}".format(*ch["changes"]["impl"])) + label = "{0} {1}%".format(_dispatch_name(it), sim) if sim is not None else _dispatch_name(it) + out.append(" ~ {0:<26} {1}".format(label, " ".join(bits))) _EMPTY = {"added": [], "removed": [], "changed": []} diff --git a/tests/test_normalize.py 
"""Body-level normalisation: similarity score + leaf delta, and its wiring into the diff."""

from __future__ import annotations

from ams.normalize import body_delta, body_similarity, canonical


def test_canonical_collapses_consecutive_dups():
    """Adjacent repeats collapse to one anchor; a later non-adjacent repeat survives."""
    collapsed = canonical(["a", "a", "b", "b", "a"])
    assert collapsed == ["a", "b", "a"]
    assert canonical([]) == []


def test_body_similarity_bounds():
    """Equal bodies score 100, disjoint bodies 0, one inserted leaf lands in between."""
    assert body_similarity([], []) == 100
    assert body_similarity(["a", "b"], ["a", "b"]) == 100
    assert body_similarity(["a", "b", "c"], ["x", "y", "z"]) == 0
    # A single leaf inserted into an otherwise shared core: high, but below 100.
    score = body_similarity(["a", "b", "c"], ["a", "b", "X", "c"])
    assert score >= 70
    assert score < 100


def test_body_similarity_ignores_load_twice_artifact():
    """A doubled trailing anchor is removed by canonical(), so the bodies compare equal."""
    doubled = ["getAnimo", "vtbl+0x3c", "vtbl+0x3c"]
    single = ["getAnimo", "vtbl+0x3c"]
    assert body_similarity(doubled, single) == 100


def test_body_delta_added_removed():
    """Replacing one leaf reports it as removed, its substitute as added."""
    delta = body_delta(["a", "b", "c"], ["a", "X", "c"])
    assert delta["removed"] == ["b"]
    assert delta["added"] == ["X"]
    assert delta["similarity"] < 100


def test_dispatch_diff_carries_body_and_summary():
    """compute_diff()'s method_dispatch block exposes `summary` and a per-change `body`."""
    from ams.diff import compute_diff
    from ams.snapshot import Snapshot

    empty_axes = ("types", "events", "fields", "struct_layout",
                  "method_inheritance", "field_inheritance")

    def make_snapshot(show_calls, load_calls):
        raw = {"binary": {}}
        for axis in empty_axes:
            raw[axis] = []
        raw["methods"] = [
            {"owner": "CMC_Animo", "name": "SHOW", "id": 1},
            {"owner": "CMC_Animo", "name": "LOAD", "id": 7},
        ]
        raw["method_dispatch"] = [
            {"owner": "CMC_Animo", "id": 1, "impl": None, "calls": show_calls},
            {"owner": "CMC_Animo", "id": 7, "impl": None, "calls": load_calls},
        ]
        return Snapshot(raw)

    # SHOW is identical on both sides; LOAD swaps "b" for "X".
    before = make_snapshot(["getAnimo", "vtbl+0xa0"], ["a", "b", "c"])
    after = make_snapshot(["getAnimo", "vtbl+0xa0"], ["a", "X", "c"])
    block = compute_diff(before, after)["method_dispatch"]

    summary = block["summary"]
    assert summary["shared"] == 2
    assert summary["identical"] == 1
    assert summary["changed"] == 1
    # Mean of 100 (SHOW) and ~67 (LOAD).
    assert 75 <= summary["mean_similarity"] <= 92

    changed = block["changed"]
    assert len(changed) == 1
    only = changed[0]
    assert only["item"]["name"] == "LOAD"
    assert only["body"]["added"] == ["X"]
    assert only["body"]["removed"] == ["b"]