Harden dispatch switch parsing (id-gap odds and ends)

Resolves the method-id gaps surfaced by the dispatch axis, all real switch-shape
edge cases rather than numbering bugs:

- default holes: ids the runner doesn't implement route to the `JA default` block
  (tail-call to base CMC_Runner::run); capture that target and drop those cases
  (was emitting false Sound 5/6, Scene 10-15, Array 26-31)
- sign-extension: high-base switches (CMC_NetPeer id 257+) encode the base as
  `LEA/ADD idx, 0xFFFFFEFF` (-257); _s32 sign-extends on both the scalar and the
  text path (Ghidra prints big displacements unsigned, small ones signed)
- two-level (byte-indexed) switches: sparse runners (Image) use
  `MOVZX r,byte[i+byteTable]` (MSVC8) / `MOV rl,byte[i+byteTable]` (MSVC6) then
  `JMP [r*4+ptrTable]`; decode target = ptrTable[byteTable[i]], taking base/count
  from the byte-table's index register (differs from the JMP index reg on MSVC6)
- _executable() guard + id clamp: never emit a non-code "case"

Result: Piklib 500 rows / BlooMoo 561, garbage 0, dispatch<->methods consistent.
The lone genuinely nameless method is CMC_Animo id 14 (a bool getter that prepareMthHashSet
does not register) - a real engine property, correctly absent from the methods axis.

FUN_ ctor names are not recoverable (no symbols/mangled strings/RTTI in the binary
for FILTER/MOVIE/VECTOR/PATH/FIFO/LIFO/STATICFILTER); cpp_class=None stays.

Snapshots regenerated; 34/34 tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Patryk Gensch
2026-05-31 16:56:24 +02:00
parent b0d3d22445
commit be733cf8b7
3 changed files with 773 additions and 728 deletions

View File

@@ -323,6 +323,13 @@ def _is_generic_name(name):
return (not name) or name.startswith("FUN_") or name.startswith("thunk_") or name.startswith("LAB_")
def _executable(program, addr):
    """Report whether `addr` falls inside an executable memory block.

    Used as a sanity gate: if a switch shape is not modelled, or a jump table is over-read,
    the recovered "case" target may not be code at all, and such a target must never be
    emitted as a method.

    Returns False when no memory block contains `addr`; otherwise returns the block's
    execute flag.
    """
    block = program.getMemory().getBlock(addr)
    if block is None:
        # Address is not mapped by any memory block -> certainly not code.
        return False
    return block.isExecute()
def _qualified(f):
if f is None:
return None
@@ -402,6 +409,9 @@ def _walk_calls(program, start_addr, stops, limit=80):
# The computed-jump operand of a jump-table switch: `[<idxreg>*0x4 + 0x<tableaddr>]`.
# Group 1 is the index register name, group 2 the pointer-table address as a hex literal.
_SWITCH_JMP = re.compile(r"\[(\w+)\*0x4 \+ (0x[0-9a-fA-F]+)\]")
# A `[<reg> + <disp>]` memory operand; group 1 is the displacement as a hex literal with an
# optional leading minus sign (parsed base-16 by _lea_disp).
_LEA_DISP = re.compile(r"\[\w+ \+ (-?0x[0-9a-fA-F]+)\]")
# A two-level switch's byte-index-table load: `byte ptr [<boundsreg> + <tableaddr>]`. The 5+ hex
# digits distinguish a table address (0x100xxxxx) from a small struct-field offset.
_BYTE_TABLE = re.compile(r"byte ptr \[(\w+) \+ (0x[0-9a-fA-F]{5,})\]")
def _lea_disp(instr):
@@ -410,6 +420,14 @@ def _lea_disp(instr):
return int(m.group(1), 16) if m is not None else None
def _s32(v):
"""Interpret a scalar as 32-bit signed. Ghidra's getScalar() hands back the raw unsigned
immediate, so e.g. `ADD idx, 0xFFFFFEFF` (really -257, a switch starting at id 257) must be
sign-extended or the recovered ids overflow to nonsense."""
v = int(v) & 0xffffffff
return v - 0x100000000 if v >= 0x80000000 else v
def _parse_switch(program, func):
"""Recover the dense jump-table switch of a `run` function at the disassembly level
(decompiler-independent, so it survives the big inline-heavy runners). Both MSVC6 and
@@ -417,48 +435,72 @@ def _parse_switch(program, func):
LEA idx,[reg - base] ; CMP idx, range ; JA default ; JMP [idx*4 + TABLE]
Returns {table, base, count} or None. `id = table_index + base`; `count = range + 1`."""
Returns {table, base, count, default} or None. `id = table_index + base`; `count = range + 1`.
`default` is the out-of-range target of the `JA default` bounds check - the jump table also
routes its *holes* (ids the runner doesn't implement) there, so cases pointing at it are not
real methods and must be dropped."""
listing = program.getListing()
space = program.getAddressFactory().getDefaultAddressSpace()
instrs = []
it = listing.getInstructions(func.getBody(), True)
while it.hasNext():
instrs.append(it.next())
idx_reg = table = jmp_addr = None
idx_reg = table = jmp_addr = default = None
for instr in instrs:
if instr.getMnemonicString() == "JMP" and instr.getFlowType().isComputed():
ft = instr.getFlowType()
if ft.isJump() and ft.isConditional(): # the `JA default` bounds check (last one wins)
flows = instr.getFlows()
if len(flows) > 0:
default = flows[0]
if instr.getMnemonicString() == "JMP" and ft.isComputed():
m = _SWITCH_JMP.search(instr.toString())
if m is not None:
idx_reg = m.group(1)
space = program.getAddressFactory().getDefaultAddressSpace()
table = space.getAddress(int(m.group(2), 16))
jmp_addr = instr.getAddress()
break
if table is None:
return None
# Two-level switch: the index is itself looked up in a byte table - `MOVZX r, byte[i+bt]`
# (MSVC8) or `XOR r,r; MOV rl, byte[i+bt]` (MSVC6). Targets are ptrTable[byteTable[i]]. The
# *bounds* register (the `i` indexing the byte table) is what LEA/CMP constrain - which on
# MSVC6 differs from the JMP's index register - so recover it from the byte-table load.
byte_table = None
bounds_reg = idx_reg
for instr in instrs:
if instr.getAddress().equals(jmp_addr):
break
mbt = _BYTE_TABLE.search(instr.toString())
if mbt is not None:
bounds_reg = mbt.group(1)
byte_table = space.getAddress(int(mbt.group(2), 16))
break
base = 0
count = None
for instr in instrs:
if instr.getAddress().equals(jmp_addr):
break
if instr.getNumOperands() == 0 or instr.getDefaultOperandRepresentation(0) != idx_reg:
if instr.getNumOperands() == 0 or instr.getDefaultOperandRepresentation(0) != bounds_reg:
continue
mn = instr.getMnemonicString()
s = instr.getScalar(1)
if mn == "CMP" and s is not None:
count = int(s.getValue()) + 1
count = _s32(s.getValue()) + 1
elif mn == "LEA": # LEA idx,[reg - k] -> id = index + k
disp = int(s.getValue()) if s is not None else _lea_disp(instr)
if disp is not None:
base = -disp
raw = s.getValue() if s is not None else _lea_disp(instr)
if raw is not None: # _s32 also fixes the text path: Ghidra prints a
base = -_s32(raw) # big displacement unsigned ("0xfffffeff" = -257)
elif mn == "SUB" and s is not None:
base = int(s.getValue())
base = _s32(s.getValue())
elif mn == "ADD" and s is not None:
base = -int(s.getValue())
base = -_s32(s.getValue())
elif mn == "DEC":
base = 1
return {"table": table, "base": base, "count": count}
return {"table": table, "base": base, "count": count, "default": default,
"byte_table": byte_table}
def extract_method_dispatch(program):
@@ -498,17 +540,29 @@ def _dispatch_from_run(program, run_func, owner, runner):
mem = program.getMemory()
space = program.getAddressFactory().getDefaultAddressSpace()
ptr = sw["table"]
byte_table = sw.get("byte_table")
targets = []
for i in range(count):
try:
val = mem.getInt(sw["table"].add(i * 4)) & 0xffffffff
# Two-level switch: index the pointer table through the byte index table.
slot = (mem.getByte(byte_table.add(i)) & 0xff) if byte_table is not None else i
val = mem.getInt(ptr.add(slot * 4)) & 0xffffffff
except Exception:
break
targets.append(space.getAddress(val))
stops = set(targets)
stops = set(t for t in targets if _executable(program, t))
default = sw.get("default")
rows = []
for i in range(len(targets)):
mid = i + sw["base"]
if mid < 0 or mid > 0xffff:
continue # nonsensical id -> the switch base wasn't recovered cleanly; don't emit garbage
if not _executable(program, targets[i]):
continue # target isn't code (unsupported switch shape / over-read) -> skip, never emit garbage
if default is not None and targets[i].equals(default):
continue # a switch hole (unimplemented id) routed to the base-runner default
anchors, funcs = _walk_calls(program, targets[i], stops)
# A thin wrapper case forwards to one submethod: the real body (and its leaf anchors)
# live in that function. Expanding one level makes MSVC8 (separate show()/load()) line up
@@ -523,7 +577,7 @@ def _dispatch_from_run(program, run_func, owner, runner):
impl_addr = "0x%x" % targets[i].getOffset() # body is inline in the case block
calls = anchors
rows.append({
"owner": owner, "runner": runner, "id": i + sw["base"],
"owner": owner, "runner": runner, "id": mid,
"case_addr": "0x%x" % targets[i].getOffset(),
"impl": impl, "impl_addr": impl_addr, "calls": calls,
})