Containerise: Postgres + Redis/RQ + API + Ghidra worker

Brings up the documented target architecture as a docker-compose stack — a modular monolith with the Ghidra step split into its own async worker.

- worker/: RQ queue (lazy redis import) + run_acquisition task (Job status queued→started→finished/failed, drives ams.acquire with sink=db)
- Job model + JobOut schema; Snapshot.data is JSONB on Postgres
- POST/GET /jobs: stream an upload to a shared volume, enqueue, poll status
- docker/api.Dockerfile (slim) + docker/worker.Dockerfile (JDK21 + Ghidra fetched at build, overridable via GHIDRA_URL) + docker-compose.yml
- ghidra.py: AMS_GHIDRA_SCRIPTS override for in-container script path
- pyproject: [worker] extra (rq/redis/psycopg), python-multipart in [api]
- tests: 4 new (task success/failure + endpoint enqueue/503) -> 22/22

Verified: API image builds, container serves /health + /ui + /jobs; compose config validates. Worker image (downloads ~1 GB Ghidra) not built here.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 12:24:47 +02:00
parent 6797ad5ddb
commit f4aa7caaa9
15 changed files with 511 additions and 3 deletions
--- a/ams/worker/__init__.py
+++ b/ams/worker/__init__.py
@@ -0,0 +1 @@
+"""Async acquisition worker: an RQ queue feeding a Ghidra-headless container."""
--- a/ams/worker/queue.py
+++ b/ams/worker/queue.py
@@ -0,0 +1,44 @@
+"""RQ queue handle. Redis/RQ are imported lazily so the API (and the test suite)
+can import this module without those packages installed — they're only needed when
+something is actually enqueued or a worker runs."""
+
+from __future__ import annotations
+
+import os
+
+QUEUE_NAME = "acquire"  # name of the RQ queue that get_queue() binds to
+DEFAULT_REDIS_URL = "redis://localhost:6379/0"  # used by redis_url() when $REDIS_URL is unset
+
+TASK_PATH = "ams.worker.tasks.run_acquisition"  # importable by the RQ worker process
+
+
+def redis_url() -> str:
+    return os.getenv("REDIS_URL", DEFAULT_REDIS_URL)  # $REDIS_URL if set, else the default
+
+
+def get_queue():
+    """Return the RQ Queue named QUEUE_NAME, connected to the Redis at redis_url()."""
+    from redis import Redis  # deferred: a missing package fails here, not at module import
+    from rq import Queue
+    connection = Redis.from_url(redis_url())
+    return Queue(QUEUE_NAME, connection=connection)
+
+
+def enqueue_acquisition(source_path: str, game_name: str | None, job_id: int) -> str:
+    """Queue one run of TASK_PATH for `source_path` and return the RQ job id.
+
+    `job_id` is the DB Job row the task reports progress on; the current $DATABASE_URL
+    (None if unset) is forwarded in the task kwargs. $AMS_JOB_TIMEOUT (seconds, default
+    3600) bounds the run, and the RQ result is kept for 86400 s (one day)."""
+    queue = get_queue()
+    task_kwargs = {
+        "job_id": job_id,
+        "source_path": source_path,
+        "game_name": game_name,
+        "database_url": os.environ.get("DATABASE_URL"),
+    }
+    timeout_s = int(os.environ.get("AMS_JOB_TIMEOUT", "3600"))
+    queued = queue.enqueue(
+        TASK_PATH, kwargs=task_kwargs, job_timeout=timeout_s, result_ttl=86400
+    )
+    return queued.id
--- a/ams/worker/tasks.py
+++ b/ams/worker/tasks.py
@@ -0,0 +1,55 @@
+"""The RQ task body. Runs inside the Ghidra-equipped worker container.
+
+It re-points SQLAlchemy at the shared DATABASE_URL (the worker is a separate process
+from the API), drives the acquisition pipeline, and walks the Job row through its
+status transitions so the API/UI can poll progress."""
+
+from __future__ import annotations
+
+import os
+import traceback
+
+from ..acquire import acquire
+from ..api.db import configure, get_session, init_db
+from ..api.models import Job
+
+
+def _set(job_id: int, **fields):
+    """Apply `fields` to Job #`job_id` in a dedicated session that is always closed."""
+    session = get_session()
+    try:
+        row = session.get(Job, job_id)
+        # An unknown job_id is not an error here: nothing is patched or committed.
+        if row is not None:
+            for attr, value in fields.items():
+                setattr(row, attr, value)
+            session.commit()
+        return row  # None if no such Job; NOTE(review): handed back after its session closes
+    finally:
+        session.close()
+
+
+def run_acquisition(
+    job_id: int,
+    source_path: str,
+    game_name: str | None = None,
+    database_url: str | None = None,
+) -> dict:
+    """Acquire `source_path` into the catalog, updating Job #`job_id` as it goes.
+
+    The Job goes "started", then "finished" (snapshot_id, dll_name set) or "failed"
+    (error = "<ExcType>: <message>"). Returns {"snapshot_id", "engine", "dll"}; any
+    exception from `acquire` is re-raised after recording so RQ marks the job failed."""
+    configure(database_url)  # the worker is a separate process: bind it to the shared DB
+    init_db()
+    _set(job_id, status="started", error=None)
+    try:
+        result = acquire(source_path, game_name, sink="db")
+    except Exception as exc:  # record then re-raise so RQ sees the failure too
+        traceback.print_exc()  # before the DB write, so it is logged even if that write fails
+        _set(job_id, status="failed", error=f"{type(exc).__name__}: {exc}")
+        raise
+
+    _set(job_id, status="finished", snapshot_id=result.imported_id,
+         dll_name=os.path.basename(result.dll), error=None)
+    return {"snapshot_id": result.imported_id, "engine": result.engine, "dll": result.dll}
				`@@ -0,0 +1 @@`
				`"""Async acquisition worker: an RQ queue feeding a Ghidra-headless container."""`