# Aidem-Media-DLL-Analysis/ams/acquire/ghidra.py

"""Drive Ghidra to run the engine-surface extractor on a DLL.

This is the heavy worker step: it imports the binary into a throwaway Ghidra
project, auto-analyses it, then runs `ghidra_scripts/extract_engine_surface.py`
to write the snapshot JSON to a path we pick.

Two back-ends, picked by the `AMS_USE_PYGHIDRA` env var:

* default — `analyzeHeadless` runs the script as a post-script via Ghidra's bundled
  **Jython**. Works on Ghidra <= 11.3.x; on 11.4+/12.x Jython is gone and the script
  silently doesn't run ("Ghidra was not started with PyGhidra").
* `AMS_USE_PYGHIDRA=1` — run the same script through **PyGhidra** (CPython) via
  `pyghidra.run_script`, so modern Ghidra (11.4+/12.x) works. Needs `pip install pyghidra`
  and Ghidra's dir in `$GHIDRA_INSTALL_DIR` (falls back to `$GHIDRA_HOME`).

analyzeHeadless resolution order: $GHIDRA_HEADLESS, $GHIDRA_HOME/support/analyzeHeadless,
then `analyzeHeadless` on PATH.
"""

from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
import uuid
from pathlib import Path

# File name of the Ghidra script that is run against the imported binary.
_SCRIPT_NAME = "extract_engine_surface.py"
# Default script directory: `ghidra_scripts/` under the repo root, which is
# parents[2] of this file (ams/acquire/ghidra.py -> ams/acquire -> ams -> repo root).
_SCRIPT_DIR = Path(__file__).resolve().parents[2] / "ghidra_scripts"


class GhidraNotFound(RuntimeError):
    """Raised when no usable Ghidra back-end (launcher or PyGhidra package) is available."""


class GhidraRunError(RuntimeError):
    """Raised when a Ghidra run fails, times out, or leaves no snapshot behind."""


def find_headless() -> str | None:
    """Locate the analyzeHeadless launcher, or None if Ghidra isn't configured."""
    env = os.environ.get("GHIDRA_HEADLESS")
    if env and os.path.isfile(env):
        return env
    home = os.environ.get("GHIDRA_HOME")
    if home:
        for name in ("analyzeHeadless", "analyzeHeadless.bat"):
            cand = os.path.join(home, "support", name)
            if os.path.isfile(cand):
                return cand
    return shutil.which("analyzeHeadless")


def run_extractor(
    dll_path: str,
    out_path: str,
    *,
    headless: str | None = None,
    script_dir: str | None = None,
    timeout: int = 1800,
) -> str:
    """Headless-analyse `dll_path` and write the snapshot to `out_path`; returns `out_path`.

    When `$AMS_USE_PYGHIDRA` is set (non-empty) the work is delegated to
    `run_extractor_pyghidra`; `headless` and `timeout` are not forwarded on that path.

    Args:
        dll_path: binary to import into the throwaway Ghidra project.
        out_path: where the extractor script should write the snapshot; made absolute,
            and its parent directory is created if needed. Any file already at this
            path is removed before the run so that success is judged on fresh output.
        headless: explicit analyzeHeadless launcher; defaults to `find_headless()`.
        script_dir: directory holding the extractor script; defaults to
            `$AMS_GHIDRA_SCRIPTS`, then the repo's `ghidra_scripts/`.
        timeout: seconds to wait for analyzeHeadless before giving up.

    Raises:
        GhidraNotFound: no launcher is configured.
        GhidraRunError: launch failure, timeout, an un-removable stale snapshot, or the
            script produced no output.
    """
    if os.environ.get("AMS_USE_PYGHIDRA"):
        return run_extractor_pyghidra(dll_path, out_path, script_dir=script_dir)

    headless = headless or find_headless()
    if not headless:
        raise GhidraNotFound(
            "analyzeHeadless not found — set $GHIDRA_HEADLESS or $GHIDRA_HOME (Ghidra's install dir)")

    script_dir = script_dir or os.environ.get("AMS_GHIDRA_SCRIPTS") or str(_SCRIPT_DIR)
    out_path = os.path.abspath(out_path)
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    # Success is detected purely by `out_path` existing afterwards, so a snapshot left
    # over from an earlier run would mask a run in which the script never executed.
    try:
        os.remove(out_path)
    except FileNotFoundError:
        pass
    except OSError as e:
        raise GhidraRunError(
            "cannot replace existing snapshot at {0}: {1}".format(out_path, e)) from e

    proj_dir = tempfile.mkdtemp(prefix="ams_ghidra_")
    proj_name = "ams_" + uuid.uuid4().hex[:8]
    cmd = [
        headless, proj_dir, proj_name,
        "-import", dll_path,
        "-scriptPath", script_dir,
        "-postScript", _SCRIPT_NAME, out_path,
        "-deleteProject",
    ]
    try:
        # stderr is folded into stdout so the failure tail below shows both streams.
        proc = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, timeout=timeout)
    except subprocess.TimeoutExpired as e:
        raise GhidraRunError("analyzeHeadless timed out after {0}s".format(timeout)) from e
    except OSError as e:
        raise GhidraRunError("failed to launch analyzeHeadless: {0}".format(e)) from e
    finally:
        # The project is disposable; clean it up even if -deleteProject never ran.
        shutil.rmtree(proj_dir, ignore_errors=True)

    # The exit status is not consulted: the snapshot file is the success signal.
    if not os.path.isfile(out_path):
        tail = proc.stdout.decode("utf-8", "replace")[-2000:] if proc.stdout else ""
        raise GhidraRunError(
            "extractor produced no snapshot at {0}\n--- headless tail ---\n{1}".format(out_path, tail))
    return out_path


def run_extractor_pyghidra(dll_path: str, out_path: str, *, script_dir: str | None = None) -> str:
    """Run the extractor through PyGhidra (CPython) instead of analyzeHeadless/Jython.

    `pyghidra.run_script` boots Ghidra in-process, imports + auto-analyses the binary, and
    executes our GhidraScript with `getScriptArgs() == [out_path]` - the same script, just under
    CPython, so it works on Ghidra 11.4+/12.x where Jython is gone.

    Args:
        dll_path: binary to import and analyse.
        out_path: where the extractor script should write the snapshot; made absolute,
            and its parent directory is created if needed. Any file already at this
            path is removed before the run so that success is judged on fresh output.
        script_dir: directory holding the extractor script; defaults to
            `$AMS_GHIDRA_SCRIPTS`, then the repo's `ghidra_scripts/`.

    Returns:
        The absolute `out_path`.

    Raises:
        GhidraNotFound: the `pyghidra` package is not importable.
        GhidraRunError: `pyghidra.run_script` raised, a stale snapshot could not be
            removed, or the script produced no output.
    """
    # Fall back to $GHIDRA_HOME only when it actually has a value, and also when
    # $GHIDRA_INSTALL_DIR is present but empty. Never export an empty install dir.
    if not os.environ.get("GHIDRA_INSTALL_DIR"):
        ghidra_home = os.environ.get("GHIDRA_HOME")
        if ghidra_home:
            os.environ["GHIDRA_INSTALL_DIR"] = ghidra_home
    try:
        import pyghidra
    except ImportError as e:
        raise GhidraNotFound(
            "AMS_USE_PYGHIDRA is set but the 'pyghidra' package isn't installed (pip install pyghidra)"
        ) from e

    script_dir = script_dir or os.environ.get("AMS_GHIDRA_SCRIPTS") or str(_SCRIPT_DIR)
    script_path = os.path.join(script_dir, _SCRIPT_NAME)
    out_path = os.path.abspath(out_path)
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    # Success is detected purely by `out_path` existing afterwards, so a snapshot left
    # over from an earlier run would mask a run in which the script wrote nothing.
    try:
        os.remove(out_path)
    except FileNotFoundError:
        pass
    except OSError as e:
        raise GhidraRunError(
            "cannot replace existing snapshot at {0}: {1}".format(out_path, e)) from e

    proj_dir = tempfile.mkdtemp(prefix="ams_pyghidra_")
    try:
        pyghidra.run_script(
            dll_path, script_path,
            project_location=proj_dir, project_name="ams_" + uuid.uuid4().hex[:8],
            script_args=[out_path], analyze=True, verbose=False,
        )
    except Exception as e:  # jpype/Ghidra errors aren't a tidy hierarchy
        raise GhidraRunError("pyghidra.run_script failed: {0}".format(e)) from e
    finally:
        # The project is disposable; always clean it up.
        shutil.rmtree(proj_dir, ignore_errors=True)

    if not os.path.isfile(out_path):
        raise GhidraRunError("extractor produced no snapshot at {0} (PyGhidra path)".format(out_path))
    return out_path