Harden dispatch switch parsing (id-gap drobiazgi)
Resolves the method-id gaps surfaced by the dispatch axis, all real switch-shape edge cases rather than numbering bugs:

- default holes: ids the runner doesn't implement route to the `JA default` block (tail-call to base CMC_Runner::run); capture that target and drop those cases (was emitting false Sound 5/6, Scene 10-15, Array 26-31)
- sign-extension: high-base switches (CMC_NetPeer id 257+) encode the base as `LEA/ADD idx, 0xFFFFFEFF` (-257); _s32 sign-extends on both the scalar and the text path (Ghidra prints big displacements unsigned, small ones signed)
- two-level (byte-indexed) switches: sparse runners (Image) use `MOVZX r,byte[i+byteTable]` (MSVC8) / `MOV rl,byte[i+byteTable]` (MSVC6) then `JMP [r*4+ptrTable]`; decode target = ptrTable[byteTable[i]], taking base/count from the byte-table's index register (differs from the JMP index reg on MSVC6)
- _executable() guard + id clamp: never emit a non-code "case"

Result: Piklib 500 rows / BlooMoo 561, garbage 0, dispatch<->methods consistent. The lone genuinely-nameless method is CMC_Animo id 14 (a bool getter prepareMthHashSet doesn't register) - a real engine property, correctly absent from the methods axis.

FUN_ ctor names are not recoverable (no symbols/mangled strings/RTTI in the binary for FILTER/MOVIE/VECTOR/PATH/FIFO/LIFO/STATICFILTER); cpp_class=None stays.

Snapshots regenerated; 34/34 tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -323,6 +323,13 @@ def _is_generic_name(name):
|
||||
return (not name) or name.startswith("FUN_") or name.startswith("thunk_") or name.startswith("LAB_")
|
||||
|
||||
|
||||
def _executable(program, addr):
|
||||
"""True if `addr` lives in an executable memory block - a sanity gate so a switch shape we
|
||||
don't model (or a table over-read) can never emit a non-code 'case' as a method."""
|
||||
blk = program.getMemory().getBlock(addr)
|
||||
return blk is not None and blk.isExecute()
|
||||
|
||||
|
||||
def _qualified(f):
|
||||
if f is None:
|
||||
return None
|
||||
@@ -402,6 +409,9 @@ def _walk_calls(program, start_addr, stops, limit=80):
|
||||
|
||||
_SWITCH_JMP = re.compile(r"\[(\w+)\*0x4 \+ (0x[0-9a-fA-F]+)\]")
|
||||
_LEA_DISP = re.compile(r"\[\w+ \+ (-?0x[0-9a-fA-F]+)\]")
|
||||
# A two-level switch's byte-index-table load: `byte ptr [<boundsreg> + <tableaddr>]`. The 5+ hex
|
||||
# digits distinguish a table address (0x100xxxxx) from a small struct-field offset.
|
||||
_BYTE_TABLE = re.compile(r"byte ptr \[(\w+) \+ (0x[0-9a-fA-F]{5,})\]")
|
||||
|
||||
|
||||
def _lea_disp(instr):
|
||||
@@ -410,6 +420,14 @@ def _lea_disp(instr):
|
||||
return int(m.group(1), 16) if m is not None else None
|
||||
|
||||
|
||||
def _s32(v):
|
||||
"""Interpret a scalar as 32-bit signed. Ghidra's getScalar() hands back the raw unsigned
|
||||
immediate, so e.g. `ADD idx, 0xFFFFFEFF` (really -257, a switch starting at id 257) must be
|
||||
sign-extended or the recovered ids overflow to nonsense."""
|
||||
v = int(v) & 0xffffffff
|
||||
return v - 0x100000000 if v >= 0x80000000 else v
|
||||
|
||||
|
||||
def _parse_switch(program, func):
|
||||
"""Recover the dense jump-table switch of a `run` function at the disassembly level
|
||||
(decompiler-independent, so it survives the big inline-heavy runners). Both MSVC6 and
|
||||
@@ -417,48 +435,72 @@ def _parse_switch(program, func):
|
||||
|
||||
LEA idx,[reg - base] ; CMP idx, range ; JA default ; JMP [idx*4 + TABLE]
|
||||
|
||||
Returns {table, base, count} or None. `id = table_index + base`; `count = range + 1`."""
|
||||
Returns {table, base, count, default} or None. `id = table_index + base`; `count = range + 1`.
|
||||
`default` is the out-of-range target of the `JA default` bounds check - the jump table also
|
||||
routes its *holes* (ids the runner doesn't implement) there, so cases pointing at it are not
|
||||
real methods and must be dropped."""
|
||||
listing = program.getListing()
|
||||
space = program.getAddressFactory().getDefaultAddressSpace()
|
||||
instrs = []
|
||||
it = listing.getInstructions(func.getBody(), True)
|
||||
while it.hasNext():
|
||||
instrs.append(it.next())
|
||||
|
||||
idx_reg = table = jmp_addr = None
|
||||
idx_reg = table = jmp_addr = default = None
|
||||
for instr in instrs:
|
||||
if instr.getMnemonicString() == "JMP" and instr.getFlowType().isComputed():
|
||||
ft = instr.getFlowType()
|
||||
if ft.isJump() and ft.isConditional(): # the `JA default` bounds check (last one wins)
|
||||
flows = instr.getFlows()
|
||||
if len(flows) > 0:
|
||||
default = flows[0]
|
||||
if instr.getMnemonicString() == "JMP" and ft.isComputed():
|
||||
m = _SWITCH_JMP.search(instr.toString())
|
||||
if m is not None:
|
||||
idx_reg = m.group(1)
|
||||
space = program.getAddressFactory().getDefaultAddressSpace()
|
||||
table = space.getAddress(int(m.group(2), 16))
|
||||
jmp_addr = instr.getAddress()
|
||||
break
|
||||
if table is None:
|
||||
return None
|
||||
|
||||
# Two-level switch: the index is itself looked up in a byte table - `MOVZX r, byte[i+bt]`
|
||||
# (MSVC8) or `XOR r,r; MOV rl, byte[i+bt]` (MSVC6). Targets are ptrTable[byteTable[i]]. The
|
||||
# *bounds* register (the `i` indexing the byte table) is what LEA/CMP constrain - which on
|
||||
# MSVC6 differs from the JMP's index register - so recover it from the byte-table load.
|
||||
byte_table = None
|
||||
bounds_reg = idx_reg
|
||||
for instr in instrs:
|
||||
if instr.getAddress().equals(jmp_addr):
|
||||
break
|
||||
mbt = _BYTE_TABLE.search(instr.toString())
|
||||
if mbt is not None:
|
||||
bounds_reg = mbt.group(1)
|
||||
byte_table = space.getAddress(int(mbt.group(2), 16))
|
||||
break
|
||||
|
||||
base = 0
|
||||
count = None
|
||||
for instr in instrs:
|
||||
if instr.getAddress().equals(jmp_addr):
|
||||
break
|
||||
if instr.getNumOperands() == 0 or instr.getDefaultOperandRepresentation(0) != idx_reg:
|
||||
if instr.getNumOperands() == 0 or instr.getDefaultOperandRepresentation(0) != bounds_reg:
|
||||
continue
|
||||
mn = instr.getMnemonicString()
|
||||
s = instr.getScalar(1)
|
||||
if mn == "CMP" and s is not None:
|
||||
count = int(s.getValue()) + 1
|
||||
count = _s32(s.getValue()) + 1
|
||||
elif mn == "LEA": # LEA idx,[reg - k] -> id = index + k
|
||||
disp = int(s.getValue()) if s is not None else _lea_disp(instr)
|
||||
if disp is not None:
|
||||
base = -disp
|
||||
raw = s.getValue() if s is not None else _lea_disp(instr)
|
||||
if raw is not None: # _s32 also fixes the text path: Ghidra prints a
|
||||
base = -_s32(raw) # big displacement unsigned ("0xfffffeff" = -257)
|
||||
elif mn == "SUB" and s is not None:
|
||||
base = int(s.getValue())
|
||||
base = _s32(s.getValue())
|
||||
elif mn == "ADD" and s is not None:
|
||||
base = -int(s.getValue())
|
||||
base = -_s32(s.getValue())
|
||||
elif mn == "DEC":
|
||||
base = 1
|
||||
return {"table": table, "base": base, "count": count}
|
||||
return {"table": table, "base": base, "count": count, "default": default,
|
||||
"byte_table": byte_table}
|
||||
|
||||
|
||||
def extract_method_dispatch(program):
|
||||
@@ -498,17 +540,29 @@ def _dispatch_from_run(program, run_func, owner, runner):
|
||||
|
||||
mem = program.getMemory()
|
||||
space = program.getAddressFactory().getDefaultAddressSpace()
|
||||
ptr = sw["table"]
|
||||
byte_table = sw.get("byte_table")
|
||||
targets = []
|
||||
for i in range(count):
|
||||
try:
|
||||
val = mem.getInt(sw["table"].add(i * 4)) & 0xffffffff
|
||||
# Two-level switch: index the pointer table through the byte index table.
|
||||
slot = (mem.getByte(byte_table.add(i)) & 0xff) if byte_table is not None else i
|
||||
val = mem.getInt(ptr.add(slot * 4)) & 0xffffffff
|
||||
except Exception:
|
||||
break
|
||||
targets.append(space.getAddress(val))
|
||||
|
||||
stops = set(targets)
|
||||
stops = set(t for t in targets if _executable(program, t))
|
||||
default = sw.get("default")
|
||||
rows = []
|
||||
for i in range(len(targets)):
|
||||
mid = i + sw["base"]
|
||||
if mid < 0 or mid > 0xffff:
|
||||
continue # nonsensical id -> the switch base wasn't recovered cleanly; don't emit garbage
|
||||
if not _executable(program, targets[i]):
|
||||
continue # target isn't code (unsupported switch shape / over-read) -> skip, never emit garbage
|
||||
if default is not None and targets[i].equals(default):
|
||||
continue # a switch hole (unimplemented id) routed to the base-runner default
|
||||
anchors, funcs = _walk_calls(program, targets[i], stops)
|
||||
# A thin wrapper case forwards to one submethod: the real body (and its leaf anchors)
|
||||
# live in that function. Expanding one level makes MSVC8 (separate show()/load()) line up
|
||||
@@ -523,7 +577,7 @@ def _dispatch_from_run(program, run_func, owner, runner):
|
||||
impl_addr = "0x%x" % targets[i].getOffset() # body is inline in the case block
|
||||
calls = anchors
|
||||
rows.append({
|
||||
"owner": owner, "runner": runner, "id": i + sw["base"],
|
||||
"owner": owner, "runner": runner, "id": mid,
|
||||
"case_addr": "0x%x" % targets[i].getOffset(),
|
||||
"impl": impl, "impl_addr": impl_addr, "calls": calls,
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user