diff --git a/README.md b/README.md index 19746ff..7197b79 100644 --- a/README.md +++ b/README.md @@ -65,20 +65,28 @@ Worker (`docker/worker.Dockerfile`, `eclipse-temurin:21-jdk`) pobiera Ghidrę i nadpisz URL realnym wydaniem z [releases NSA](https://github.com/NationalSecurityAgency/ghidra/releases) (nazwa pliku: `ghidra__PUBLIC_.zip`). -> **Musi to być Ghidra ≤ 11.3.x.** Ekstraktor to skrypt **Pythona (`.py`)**, który Ghidra w trybie -> headless uruchamia przez wbudowanego **Jythona**. Ghidra **11.4+ / 12.x usunęły Jythona** — tam -> `.py` headless wymaga **PyGhidry** (CPython), której ten obraz nie inicjalizuje, i dostaniesz -> `Ghidra was not started with PyGhidra. Python is not available` (analiza przejdzie, ale post-skrypt -> nie wyemituje snapshotu). Domyślny `GHIDRA_URL` celuje w 11.2.1 (z Jythonem). Chcesz zostać na 12.x? -> Trzeba doinstalować `pyghidra` i odpalać headless przez PyGhidrę — sam skrypt jest CPython-kompatybilny, -> więc zadziała, gdy interpreter wstanie (patrz dokumentacja PyGhidra w danej wersji Ghidry). +> **Domyślnie wymaga Ghidry ≤ 11.3.x.** Ekstraktor to skrypt **Pythona (`.py`)**, który Ghidra +> w headless uruchamia przez wbudowanego **Jythona**. Ghidra **11.4+ / 12.x usunęły Jythona** — tam +> `.py` headless przez `analyzeHeadless` nie ruszy (`Ghidra was not started with PyGhidra...`): +> analiza przejdzie, ale post-skrypt nie wyemituje snapshotu. Domyślny `GHIDRA_URL` celuje w 11.2.1. +**Wariant Jython (domyślny, ≤ 11.3.x):** ```bash docker compose build worker \ --build-arg GHIDRA_URL=https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.2.1_build/ghidra_11.2.1_PUBLIC_20241105.zip docker compose up ``` +**Wariant PyGhidra (Ghidra 11.4+ / 12.x):** obraz workera ma już `pyghidra`; ten sam skrypt leci +przez CPython (`pyghidra.run_script`, bez zmian w kodzie). 
Zbuduj z nowszą Ghidrą i włącz przełącznik: +```bash +docker compose build worker \ + --build-arg GHIDRA_URL=https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_11.4.2_build/ghidra_11.4.2_PUBLIC_20250826.zip +AMS_USE_PYGHIDRA=1 docker compose up +``` +Pod spodem worker woła `ams.acquire.ghidra.run_extractor_pyghidra` (uruchamia Ghidrę in-process, +importuje + analizuje binarkę, odpala nasz GhidraScript z `getScriptArgs()=[out_path]`). + ### 4. Ekstrakcja ręcznie w GUI Ghidry (alternatywa, bez Dockera) *Script Manager → Manage Script Directories* → wskaż `ghidra_scripts/`, otwórz program (DLL), diff --git a/ams/acquire/ghidra.py b/ams/acquire/ghidra.py index ece2eb6..e62ecca 100644 --- a/ams/acquire/ghidra.py +++ b/ams/acquire/ghidra.py @@ -1,13 +1,20 @@ -"""Drive Ghidra's `analyzeHeadless` to run the engine-surface extractor on a DLL. +"""Drive Ghidra to run the engine-surface extractor on a DLL. This is the heavy worker step: it imports the binary into a throwaway Ghidra project, auto-analyses it, then runs `ghidra_scripts/extract_engine_surface.py` -as a post-script that writes the snapshot JSON to a path we pick. +to write the snapshot JSON to a path we pick. -Ghidra isn't a Python package, so it must be located on disk. Resolution order: - 1. $GHIDRA_HEADLESS — full path to the analyzeHeadless launcher - 2. $GHIDRA_HOME/support/analyzeHeadless - 3. `analyzeHeadless` on PATH +Two back-ends, picked by the `AMS_USE_PYGHIDRA` env var: + +* default — `analyzeHeadless` runs the script as a post-script via Ghidra's bundled + **Jython**. Works on Ghidra <= 11.3.x; on 11.4+/12.x Jython is gone and the script + silently doesn't run ("Ghidra was not started with PyGhidra"). +* `AMS_USE_PYGHIDRA=1` — run the same script through **PyGhidra** (CPython) via + `pyghidra.run_script`, so modern Ghidra (11.4+/12.x) works. Needs `pip install pyghidra` + and Ghidra's dir in `$GHIDRA_INSTALL_DIR` (falls back to `$GHIDRA_HOME`). 
+ +analyzeHeadless resolution order: $GHIDRA_HEADLESS, $GHIDRA_HOME/support/analyzeHeadless, +then `analyzeHeadless` on PATH. """ from __future__ import annotations @@ -58,6 +65,9 @@ def run_extractor( Raises GhidraNotFound if no launcher is configured, GhidraRunError on failure or if the script produced no output.""" + if os.environ.get("AMS_USE_PYGHIDRA"): + return run_extractor_pyghidra(dll_path, out_path, script_dir=script_dir) + headless = headless or find_headless() if not headless: raise GhidraNotFound( @@ -91,3 +101,51 @@ def run_extractor( raise GhidraRunError( "extractor produced no snapshot at {0}\n--- headless tail ---\n{1}".format(out_path, tail)) return out_path
+
+
+def run_extractor_pyghidra(dll_path: str, out_path: str, *, script_dir: str | None = None) -> str:
+    """Run the extractor through PyGhidra (CPython) instead of analyzeHeadless/Jython.
+
+    `pyghidra.run_script` boots Ghidra in-process, imports + auto-analyses the binary, and
+    executes our GhidraScript with `getScriptArgs() == [out_path]` - the same script, just under
+    CPython, so it works on Ghidra 11.4+/12.x where Jython is gone.
+
+    `dll_path` is the binary to import; `out_path` is where the script writes the snapshot
+    (made absolute, parent directories created). `script_dir` overrides where the script is
+    looked up; otherwise $AMS_GHIDRA_SCRIPTS, then the module's `_SCRIPT_DIR`, is used.
+
+    Returns the absolute `out_path`. Raises GhidraNotFound if the `pyghidra` package can't be
+    imported, GhidraRunError if `pyghidra.run_script` raises or no snapshot file appears."""
+    # Reuse $GHIDRA_HOME when $GHIDRA_INSTALL_DIR is unset or empty, but never export an empty
+    # value: a blank $GHIDRA_INSTALL_DIR in the process environment hides the real problem.
+    if not os.environ.get("GHIDRA_INSTALL_DIR") and os.environ.get("GHIDRA_HOME"):
+        os.environ["GHIDRA_INSTALL_DIR"] = os.environ["GHIDRA_HOME"]
+    try:
+        import pyghidra
+    except ImportError as err:
+        raise GhidraNotFound(
+            "AMS_USE_PYGHIDRA is set but the 'pyghidra' package isn't installed (pip install pyghidra)"
+        ) from err
+
+    script_dir = script_dir or os.environ.get("AMS_GHIDRA_SCRIPTS") or str(_SCRIPT_DIR)
+    script_path = os.path.join(script_dir, _SCRIPT_NAME)
+    out_path = os.path.abspath(out_path)
+    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
+
+    # Throwaway Ghidra project; removed in `finally` whether the run succeeds or not.
+    proj_dir = tempfile.mkdtemp(prefix="ams_pyghidra_")
+    try:
+        pyghidra.run_script(
+            dll_path, script_path,
+            project_location=proj_dir, project_name="ams_" + uuid.uuid4().hex[:8],
+            script_args=[out_path], analyze=True, verbose=False,
+        )
+    except Exception as err:  # jpype/Ghidra errors aren't a tidy hierarchy
+        raise GhidraRunError("pyghidra.run_script failed: {0}".format(err)) from err
+    finally:
+        shutil.rmtree(proj_dir, ignore_errors=True)
+
+    # run_script returning is not proof the script wrote anything; check for the file itself.
+    if not os.path.isfile(out_path):
+        raise GhidraRunError("extractor produced no snapshot at {0} (PyGhidra path)".format(out_path))
+    return out_path
diff --git a/docker-compose.yml b/docker-compose.yml index 2780fe4..37272ba 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,6 +59,9 @@ services: DATABASE_URL: postgresql+psycopg://ams:ams@db:5432/ams REDIS_URL: redis://redis:6379/0 AMS_UPLOAD_DIR: /data/uploads + # Set to 1 (e.g. `AMS_USE_PYGHIDRA=1 docker compose up`) to run the extractor through + # PyGhidra instead of Jython - needed for Ghidra 11.4+/12.x (build worker with that GHIDRA_URL). + AMS_USE_PYGHIDRA: ${AMS_USE_PYGHIDRA:-} volumes: - uploads:/data/uploads depends_on: diff --git a/docker/worker.Dockerfile b/docker/worker.Dockerfile index 422928e..865e272 100644 --- a/docker/worker.Dockerfile +++ b/docker/worker.Dockerfile @@ -29,6 +29,7 @@ RUN wget -q "$GHIDRA_URL" -O /tmp/ghidra.zip \ RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel ENV GHIDRA_HOME=/opt/ghidra +ENV GHIDRA_INSTALL_DIR=/opt/ghidra ENV AMS_GHIDRA_SCRIPTS=/app/ghidra_scripts ENV AMS_UPLOAD_DIR=/data/uploads @@ -38,7 +39,9 @@ COPY ams ./ams COPY ghidra_scripts ./ghidra_scripts COPY snapshots ./snapshots -RUN pip3 install --no-cache-dir ".[api,acquire,worker]" +# pyghidra enables the CPython back-end (set AMS_USE_PYGHIDRA=1) for Ghidra 11.4+/12.x, which +# dropped Jython. Harmless when unused; the default Jython path doesn't import it. +RUN pip3 install --no-cache-dir ".[api,acquire,worker]" pyghidra # Drain the 'acquire' queue. Shell form so $REDIS_URL expands at runtime.
CMD rq worker --url "${REDIS_URL:-redis://redis:6379/0}" acquire diff --git a/tests/test_acquire.py b/tests/test_acquire.py index 60f7a45..6dacad6 100644 --- a/tests/test_acquire.py +++ b/tests/test_acquire.py @@ -136,6 +136,24 @@ def test_acquire_zip_no_sink(tmp_path, golden_snapshot): assert r.imported_id is None and r.sink == "none"
+def test_pyghidra_dispatch_without_dep(tmp_path, monkeypatch):
+    """With AMS_USE_PYGHIDRA set, `run_extractor` takes the PyGhidra route; when the package
+    is not importable it must raise GhidraNotFound with a message naming `pyghidra`."""
+    from importlib.util import find_spec
+
+    from ams.acquire import ghidra
+
+    pyghidra_available = find_spec("pyghidra") is not None
+    if pyghidra_available:
+        pytest.skip("pyghidra is installed; this exercises the missing-dependency path")
+
+    dll = str(tmp_path / "x.dll")
+    snapshot = str(tmp_path / "out.json")
+    monkeypatch.setenv("AMS_USE_PYGHIDRA", "1")
+    with pytest.raises(ghidra.GhidraNotFound, match="pyghidra"):
+        ghidra.run_extractor(dll, snapshot)
+
+
 def test_acquire_loose_dll_into_db(tmp_path, golden_snapshot): from ams.api.db import configure