Method dispatch axis: map id -> body via Runner::run switch

Recovers how a script method id maps to its implementation, the foundation for
body-level normalisation. Each CMC_*_Runner::run is a switch(id) (vtable slot 17);
every case is the method body — inline (MSVC6) or a tail-call to a separate
show()/load() (MSVC8). The extractor parses the jump table at the disassembly
level (Ghidra's decompiler jump-table recovery silently dropped the big runners),
fingerprints each case by its ordered CALL anchors (Class::method / vtbl+0xNN),
and expands thin wrappers one level so MSVC8 lines up with MSVC6.

Validated on the golden pair: Animo SHOW..RESUME (id 1-4) yield identical leaves
(getAnimo + vtbl+0xa0/0xa4/0x4c/0x50) across both compilers. Coverage 30/32
runners; Piklib 475 / BlooMoo 619 dispatch rows.

- extract_engine_surface.py: extract_method_dispatch (schema_version -> 4)
- snapshots regenerated with the method_dispatch axis
- ams: Snapshot.method_dispatch; diff axis keyed (owner,id) on [impl,calls] with
  method-name join; render METHOD BODIES section; cli --only dispatch; owner filter
- UI: "Ciała metod" diff axis + browse tab
- tests: body-change unit + cross-compiler vtbl assertion -> 29/29

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Patryk Gensch
2026-05-31 13:15:58 +02:00
parent 38be932abc
commit 27399a52b1
10 changed files with 15843 additions and 13 deletions

View File

@@ -314,6 +314,222 @@ def extract_methods(program):
return methods, inheritance
# Captures the hex displacement of a `[reg + 0xNN]` memory operand in an instruction's text.
# `_call_anchor` uses it to turn an indirect CALL into a "vtbl+0xNN" anchor. An operand with
# no displacement (`[reg]`) or a negative one does not match.
_VTBL_OFF = re.compile(r"\[\w+ \+ (0x[0-9a-fA-F]+)\]")
# Same pattern under its own name: `_walk_calls` applies it to MOV instructions to remember
# which displacement was last loaded into a destination (the `MOV reg,[base+0xNN]` / `CALL reg`
# idiom).
_MEM_OFF = re.compile(r"\[\w+ \+ (0x[0-9a-fA-F]+)\]")
def _is_generic_name(name):
"""A compiler-assigned placeholder, not a real symbol."""
return (not name) or name.startswith("FUN_") or name.startswith("thunk_") or name.startswith("LAB_")
def _qualified(f):
if f is None:
return None
ns = f.getParentNamespace()
nm = f.getName()
return (ns.getName() + "::" + nm) if (ns is not None and ns.getName() != "Global") else nm
def _call_anchor(program, instr):
    """Fingerprint one CALL inside a switch case in a compiler-tolerant way.

    * Direct call (`call_target` finds a function): the qualified name of that function.
      On MSVC8 the symbol sits on the ILT *stub* while the body is an unnamed FUN_, so the
      stub's own name is preferred; the thunk-resolved body is only used when the direct
      name is a placeholder and the resolved one is not.
    * Indirect call: "vtbl+0xNN" taken from the `[reg + 0xNN]` displacement in the
      instruction text, which abstracts away the register holding `this`.

    Returns None when neither form applies.
    """
    target = call_target(program, instr)
    if target is None:
        # No direct target: fall back to the displacement of the memory operand, if any.
        hit = _VTBL_OFF.search(instr.toString())
        if hit is None:
            return None
        return "vtbl+" + hit.group(1)

    if _is_generic_name(target.getName()):
        # Placeholder name on the direct target - see whether the resolved body is named.
        body = resolve_thunk(target)
        if body is not None and not _is_generic_name(body.getName()):
            return _qualified(body)
    return _qualified(target)
def _walk_calls(program, start_addr, stops, limit=80):
    """Collect the CALLs of the straight-line code that starts at `start_addr`.

    Returns `(anchors, funcs)`:
      * `anchors` - ordered CALL fingerprints from `_call_anchor`. When that yields nothing
        for a CALL, the text of the call's first operand is looked up among the destinations
        of earlier `MOV dst,[base + 0xNN]` instructions; this recovers the
        `MOV reg,[base+0xNN]` / `CALL reg` virtual-call idiom (MSVC8) as `vtbl+0xNN`, so it
        matches the `CALL [reg+0xNN]` form (MSVC6).
      * `funcs` - `(anchor, entry point)` for each CALL whose target resolves to a function;
        the caller uses it to detect a thin wrapper case that just forwards to a submethod.

    The walk ends when an instruction other than the first one sits on a `stops` address,
    after processing a terminal or unconditional-jump instruction, after `limit`
    instructions, or when there is no following instruction.
    """
    anchors = []
    funcs = []
    loaded = {}  # MOV destination text -> hex displacement most recently loaded into it
    cur = program.getListing().getInstructionAt(start_addr)
    visited = 0
    while cur is not None and visited < limit:
        # The starting instruction is itself a case target, so only later ones can stop us.
        if visited and cur.getAddress() in stops:
            break
        visited += 1

        mnemonic = cur.getMnemonicString()
        if mnemonic == "CALL":
            anchor = _call_anchor(program, cur)
            if anchor is None:
                # CALL reg -> use the offset last loaded into that register
                operand = cur.getDefaultOperandRepresentation(0)
                if operand in loaded:
                    anchor = "vtbl+" + loaded[operand]
            if anchor is not None:
                anchors.append(anchor)
            callee = call_target(program, cur)
            if callee is not None:
                entry_func = resolve_thunk(callee)
                if entry_func is not None:
                    funcs.append((anchor, entry_func.getEntryPoint()))
        elif mnemonic == "MOV" and cur.getNumOperands() >= 2:
            dest = cur.getDefaultOperandRepresentation(0)
            hit = _MEM_OFF.search(cur.toString())
            if hit is None:
                # Destination overwritten by something without a displacement: forget it.
                loaded.pop(dest, None)
            else:
                loaded[dest] = hit.group(1)

        flow = cur.getFlowType()
        if flow.isTerminal() or (flow.isJump() and not flow.isConditional()):
            break
        cur = cur.getNext()
    return anchors, funcs
# Matches the `[idx*0x4 + 0xTABLE]` operand of a jump-table JMP as rendered in instruction text:
# group 1 is the index register, group 2 the hex address of the table.
_SWITCH_JMP = re.compile(r"\[(\w+)\*0x4 \+ (0x[0-9a-fA-F]+)\]")
_LEA_DISP = re.compile(r"\[\w+ \+ (-?0x[0-9a-fA-F]+)\]")
def _lea_disp(instr):
"""Signed displacement of a `LEA reg,[base + disp]`, parsed from text when getScalar misses."""
m = _LEA_DISP.search(instr.toString())
return int(m.group(1), 16) if m is not None else None
def _parse_switch(program, func):
    """Recover the dense jump-table switch of a `run` function from its disassembly.

    Working on instructions rather than decompiler output keeps this independent of the
    decompiler, so it survives the big inline-heavy runners. Both MSVC6 and MSVC8 emit the
    same shape:

        LEA idx,[reg - base] ; CMP idx, range ; JA default ; JMP [idx*4 + TABLE]

    Returns `{"table", "base", "count"}` or None when no such JMP is found.
    `id = table_index + base`; `count = range + 1` (None if no CMP on the index register
    was seen before the JMP).
    """
    body_iter = program.getListing().getInstructions(func.getBody(), True)
    code = []
    while body_iter.hasNext():
        code.append(body_iter.next())

    # Pass 1: the first computed JMP whose operand looks like [idx*4 + TABLE].
    idx_reg = table = jmp_addr = None
    for ins in code:
        if ins.getMnemonicString() != "JMP" or not ins.getFlowType().isComputed():
            continue
        hit = _SWITCH_JMP.search(ins.toString())
        if hit is None:
            continue
        idx_reg = hit.group(1)
        space = program.getAddressFactory().getDefaultAddressSpace()
        table = space.getAddress(int(hit.group(2), 16))
        jmp_addr = ins.getAddress()
        break
    if table is None:
        return None

    # Pass 2: instructions ahead of the JMP whose first operand is the index register tell
    # us the id bias (LEA/SUB/ADD/DEC) and the case range (CMP). The last one of each wins.
    base = 0
    count = None
    for ins in code:
        if ins.getAddress().equals(jmp_addr):
            break
        if ins.getNumOperands() == 0:
            continue
        if ins.getDefaultOperandRepresentation(0) != idx_reg:
            continue
        op = ins.getMnemonicString()
        imm = ins.getScalar(1)
        if op == "DEC":
            base = 1
        elif op == "LEA":  # LEA idx,[reg - k] -> id = index + k
            disp = int(imm.getValue()) if imm is not None else _lea_disp(ins)
            if disp is not None:
                base = -disp
        elif imm is None:
            # The remaining forms all need an immediate second operand.
            continue
        elif op == "CMP":
            count = int(imm.getValue()) + 1
        elif op == "SUB":
            base = int(imm.getValue())
        elif op == "ADD":
            base = -int(imm.getValue())
    return {"table": table, "base": base, "count": count}
def extract_method_dispatch(program):
    """For each CMC_*_Runner::run, recover how method ids map to their implementation.

    `run(int id, ...)` is a `switch(id)` (vtable slot 17, overridden per runner) whose every
    `case id:` is the method body - either a tail-call to a named submethod (BlooMoo/MSVC8
    keeps show()/load()/... as separate functions) or inline code whose leaves are virtual
    calls on the wrapped object (Piklib/MSVC6). Each case is fingerprinted by its ordered
    CALL anchors, so a later pass can diff method *bodies* by (owner, id). Join names via
    `methods`.

    Only functions named `run` whose parent namespace name ends in `_Runner` are considered.
    Returns the concatenated dispatch rows of all such runners; a runner that raises is
    reported on stdout and skipped.
    """
    rows = []
    functions = program.getFunctionManager().getFunctions(True)
    while functions.hasNext():
        fn = functions.next()
        if fn.getName() != "run":
            continue
        parent = fn.getParentNamespace()
        runner = "?" if parent is None else parent.getName()
        if not runner.endswith("_Runner"):
            continue
        try:
            owner = _owner_from_runner(runner)
            rows.extend(_dispatch_from_run(program, fn, owner, runner))
        except Exception as e:  # one malformed runner shouldn't sink the whole axis
            print("[!] method_dispatch %s: %s" % (runner, e))
    return rows
def _dispatch_from_run(program, run_func, owner, runner):
    """Build one dispatch row per jump-table entry of a runner's `run` switch.

    `run_func` is followed through `resolve_thunk` first; if that yields nothing the function
    itself is parsed. The jump table located by `_parse_switch` is read as 32-bit entries,
    and each target is walked with `_walk_calls` to fingerprint the case.

    Returns a list of dicts with keys `owner`, `runner`, `id` (table index + switch base),
    `case_addr`, `impl` (anchor of the forwarded-to submethod, or None when the body is
    inline), `impl_addr` and `calls` (ordered CALL anchors of the body). Returns `[]` when no
    switch is found or its case count is missing or outside 1..4096.
    """
    # resolve_thunk may come back empty (its other callers guard for that too); parse the
    # function we were handed rather than failing on None.
    resolved = resolve_thunk(run_func)
    if resolved is not None:
        run_func = resolved
    sw = _parse_switch(program, run_func)
    if sw is None:
        return []
    count = sw["count"]
    if count is None or count < 1 or count > 4096:
        return []

    mem = program.getMemory()
    space = program.getAddressFactory().getDefaultAddressSpace()
    targets = []
    for i in range(count):
        try:
            val = mem.getInt(sw["table"].add(i * 4)) & 0xffffffff
        except Exception as e:
            # Keep the entries read so far (best effort), but say that the table was cut
            # short so a partial row set is not mistaken for a complete one.
            print("[!] method_dispatch %s: jump table truncated at entry %d/%d: %s" % (
                runner, i, count, e))
            break
        targets.append(space.getAddress(val))

    # A case block ends where any other case begins.
    stops = set(targets)
    rows = []
    for index, case_addr in enumerate(targets):
        anchors, funcs = _walk_calls(program, case_addr, stops)
        # A thin wrapper case forwards to one submethod: the real body (and its leaf anchors)
        # live in that function. Expanding one level makes MSVC8 (separate show()/load()) line
        # up with MSVC6 (inline), so `calls` is a compiler-tolerant body fingerprint.
        if len(anchors) == 1 and len(funcs) == 1 and funcs[0][0] == anchors[0]:
            impl, impl_entry = funcs[0]
            impl_addr = "0x%x" % impl_entry.getOffset()
            calls, _ = _walk_calls(program, impl_entry, set())
        else:
            impl = None
            impl_addr = "0x%x" % case_addr.getOffset()  # body is inline in the case block
            calls = anchors
        rows.append({
            "owner": owner, "runner": runner, "id": index + sw["base"],
            "case_addr": "0x%x" % case_addr.getOffset(),
            "impl": impl, "impl_addr": impl_addr, "calls": calls,
        })
    return rows
def extract_events(program):
"""Per CMC_*::getBehavioursList, collect the ordered event-name literals (ONINIT, ONDONE, ...).
@@ -583,9 +799,10 @@ def run():
events = extract_events(program)
fields = extract_script_fields(program)
struct_layout, field_inheritance = extract_struct_layout(program)
method_dispatch = extract_method_dispatch(program)
snapshot = {
"schema_version": 3,
"schema_version": 4,
"binary": {
"name": program.getName(),
"sha256": sha256_of(program),
@@ -600,6 +817,7 @@ def run():
"fields": fields,
"field_inheritance": field_inheritance,
"struct_layout": struct_layout,
"method_dispatch": method_dispatch,
}
args = getScriptArgs()
@@ -610,9 +828,9 @@ def run():
finally:
fh.close()
print("[+] %s [%s/%s]: %d types, %d methods, %d events, %d fields (%d layout) -> %s" % (
print("[+] %s [%s/%s]: %d types, %d methods, %d events, %d fields (%d layout, %d dispatch) -> %s" % (
program.getName(), engine, compiler, len(types), len(methods),
len(events), len(fields), len(struct_layout), out_path))
len(events), len(fields), len(struct_layout), len(method_dispatch), out_path))
run()