worker: optional PyGhidra back-end for Ghidra 11.4+/12.x (no Jython)

The .py extractor runs fine under PyGhidra in the GUI; the gap is that `analyzeHeadless`
does not initialise PyGhidra. Add an env-gated CPython path so modern Ghidra also works headless:

- ghidra.run_extractor_pyghidra(): runs the same GhidraScript via pyghidra.run_script
  (boots Ghidra in-process, imports+analyses, getScriptArgs()=[out_path]); run_extractor
  dispatches to it when AMS_USE_PYGHIDRA is set. No script changes needed.
- worker image installs pyghidra + sets GHIDRA_INSTALL_DIR; compose exposes
  AMS_USE_PYGHIDRA (default off). Jython path stays the default and untouched.
- README documents both variants (Jython <=11.3.x vs PyGhidra 11.4+/12.x).
- test: AMS_USE_PYGHIDRA routes to the PyGhidra back-end (clear error if pkg missing).

35/35 tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Patryk Gensch
2026-05-31 18:03:04 +02:00
parent aa65beb7c1
commit ba9db82a4c
5 changed files with 85 additions and 14 deletions

View File

@@ -1,13 +1,20 @@
"""Drive Ghidra's `analyzeHeadless` to run the engine-surface extractor on a DLL.
"""Drive Ghidra to run the engine-surface extractor on a DLL.
This is the heavy worker step: it imports the binary into a throwaway Ghidra
project, auto-analyses it, then runs `ghidra_scripts/extract_engine_surface.py`
as a post-script that writes the snapshot JSON to a path we pick.
to write the snapshot JSON to a path we pick.
Ghidra isn't a Python package, so it must be located on disk. Resolution order:
1. $GHIDRA_HEADLESS — full path to the analyzeHeadless launcher
2. $GHIDRA_HOME/support/analyzeHeadless
3. `analyzeHeadless` on PATH
Two back-ends, picked by the `AMS_USE_PYGHIDRA` env var:
* default — `analyzeHeadless` runs the script as a post-script via Ghidra's bundled
**Jython**. Works on Ghidra <= 11.3.x; on 11.4+/12.x Jython is gone and the script
silently doesn't run ("Ghidra was not started with PyGhidra").
* `AMS_USE_PYGHIDRA=1` — run the same script through **PyGhidra** (CPython) via
`pyghidra.run_script`, so modern Ghidra (11.4+/12.x) works. Needs `pip install pyghidra`
and Ghidra's dir in `$GHIDRA_INSTALL_DIR` (falls back to `$GHIDRA_HOME`).
analyzeHeadless resolution order: $GHIDRA_HEADLESS, $GHIDRA_HOME/support/analyzeHeadless,
then `analyzeHeadless` on PATH.
"""
from __future__ import annotations
@@ -58,6 +65,9 @@ def run_extractor(
Raises GhidraNotFound if no launcher is configured, GhidraRunError on failure or if
the script produced no output."""
if os.environ.get("AMS_USE_PYGHIDRA"):
return run_extractor_pyghidra(dll_path, out_path, script_dir=script_dir)
headless = headless or find_headless()
if not headless:
raise GhidraNotFound(
@@ -91,3 +101,38 @@ def run_extractor(
raise GhidraRunError(
"extractor produced no snapshot at {0}\n--- headless tail ---\n{1}".format(out_path, tail))
return out_path
def run_extractor_pyghidra(dll_path: str, out_path: str, *, script_dir: str | None = None) -> str:
    """Run the extractor through PyGhidra (CPython) instead of analyzeHeadless/Jython.

    `pyghidra.run_script` boots Ghidra in-process, imports + auto-analyses the binary, and
    executes our GhidraScript with `getScriptArgs() == [out_path]` - the same script, just
    under CPython, so it works on Ghidra 11.4+/12.x where Jython is gone.

    Args:
        dll_path: binary handed to `pyghidra.run_script` for import and analysis.
        out_path: where the script writes the snapshot; made absolute here and its parent
            directory is created if missing.
        script_dir: directory containing the extractor script. Falls back to
            `$AMS_GHIDRA_SCRIPTS`, then to `_SCRIPT_DIR`.

    Returns:
        The absolute snapshot path.

    Raises:
        GhidraNotFound: the `pyghidra` package cannot be imported.
        GhidraRunError: `pyghidra.run_script` raised, or no snapshot file exists afterwards.
    """
    # Fall back to $GHIDRA_HOME only when it actually names something. A blind
    # setdefault(..., "") would leave an empty GHIDRA_INSTALL_DIR in the process
    # environment, which then hides the "not configured" state from later readers.
    if not os.environ.get("GHIDRA_INSTALL_DIR"):
        ghidra_home = os.environ.get("GHIDRA_HOME")
        if ghidra_home:
            os.environ["GHIDRA_INSTALL_DIR"] = ghidra_home

    # Imported lazily so the module stays importable when pyghidra isn't installed.
    try:
        import pyghidra
    except ImportError as err:
        raise GhidraNotFound(
            "AMS_USE_PYGHIDRA is set but the 'pyghidra' package isn't installed (pip install pyghidra)"
        ) from err

    script_dir = script_dir or os.environ.get("AMS_GHIDRA_SCRIPTS") or str(_SCRIPT_DIR)
    script_path = os.path.join(script_dir, _SCRIPT_NAME)
    out_path = os.path.abspath(out_path)
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    # Throwaway project directory; removed whether or not the run succeeds.
    proj_dir = tempfile.mkdtemp(prefix="ams_pyghidra_")
    try:
        pyghidra.run_script(
            dll_path,
            script_path,
            project_location=proj_dir,
            project_name="ams_" + uuid.uuid4().hex[:8],
            script_args=[out_path],
            analyze=True,
            verbose=False,
        )
    except Exception as err:  # jpype/Ghidra errors aren't a tidy hierarchy
        # Keep the original exception as __cause__ so the Java/JPype traceback survives.
        raise GhidraRunError("pyghidra.run_script failed: {0}".format(err)) from err
    finally:
        shutil.rmtree(proj_dir, ignore_errors=True)

    # NOTE(review): a snapshot already present at out_path before this call also satisfies
    # this check - confirm callers always pass a fresh path.
    if not os.path.isfile(out_path):
        raise GhidraRunError("extractor produced no snapshot at {0} (PyGhidra path)".format(out_path))
    return out_path