Add ISO/ZIP acquisition pipeline (ams.acquire worker)

Closes the chain from a game file to a catalog entry: unpack an ISO/ZIP,
content-identify the engine DLL (CMC_ObjectsContainer marker in RTTI, so a
renamed file is still found), hash it (sha256 + md5 + optional ssdeep via
ppdeep), run Ghidra headless with the extractor, enrich and import the snapshot.

- unpack.py: bsdtar (ISO9660 + ZIP) with a pure-Python zipfile fallback
- identify.py: content-based engine-DLL picker + hashing
- ghidra.py: analyzeHeadless launcher discovery + post-script run
- pipeline.py: orchestration with injectable extract_fn; sink db|http|none
- cli.py: python -m ams.acquire (incl. --identify-only dry run)
- tests: 7 new (forged PE markers + stubbed extractor) -> 18/18

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Patryk Gensch
2026-05-31 12:11:56 +02:00
parent 4542763936
commit 6797ad5ddb
10 changed files with 774 additions and 0 deletions

139
ams/acquire/identify.py Normal file
View File

@@ -0,0 +1,139 @@
"""Find the engine DLL inside an unpacked game tree and hash it.
Identification is *content-based* first — we scan the file for marker strings that
only a Piklib/BlooMoo engine carries (the factory class name shows up inside the
MSVC RTTI/mangled symbols) — and fall back to filename hints. So a renamed DLL is
still picked correctly, before the expensive Ghidra pass ever runs.
"""
from __future__ import annotations
import hashlib
import os
from dataclasses import dataclass
# ASCII substrings that survive inside the PE: the factory class name is the
# decisive "this is an engine DLL" signal (it appears within mangled RTTI names),
# the namespace/product tags only disambiguate which engine.
# All three markers are matched as raw, case-sensitive byte substrings by
# scan_markers().
_MARK_FACTORY = b"CMC_ObjectsContainer"
_MARK_BLOOMOO = b"BlooMoo"
_MARK_PIKLIB = b"Piklib"
# Lowercase substrings looked for in the lowercased file basename (see _score()).
_FILENAME_HINTS = ("piklib", "bloomoo")
# Extensions (compared against the lowercased file name) that make a file worth
# scoring while walking a directory tree (see find_engine_dlls()).
_PE_EXT = (".dll", ".exe")
# Default upper bound, in bytes, on how much of a file scan_markers() reads.
_SCAN_LIMIT = 64 * 1024 * 1024 # plenty for these engine DLLs, bounds pathological files
@dataclass
class Candidate:
    """One file that looks like an engine DLL, with the evidence for it.

    Candidates are ranked by `score`; a larger value means stronger evidence.
    """

    # Path of the scored file.
    path: str
    # Sum of the weights of every signal that matched.
    score: int
    # "BlooMoo" | "Piklib" | None (factory present, product unclear)
    engine: str | None
    # Comma-separated labels of the signals that contributed to `score`.
    reason: str
@dataclass
class FileHashes:
    """Digests and byte size computed for a single file."""

    # Hex digest of the SHA-256 of the file contents.
    sha256: str
    # Hex digest of the MD5 of the file contents.
    md5: str
    # Number of bytes hashed.
    size: int
    # ssdeep-style, when ppdeep/ssdeep is installed; else None
    fuzzy: str | None
def is_pe(path: str) -> bool:
    """Report whether `path` holds a Windows PE image.

    The file must start with the "MZ" magic, and the 4-byte little-endian
    offset stored at 0x3C must point at the 4-byte PE signature. Any OSError
    while opening or reading the file yields False rather than propagating.
    """
    try:
        with open(path, "rb") as stream:
            # One read covers both the magic and the offset field at 0x3C..0x3F.
            dos_header = stream.read(0x40)
            if not dos_header.startswith(b"MZ"):
                return False
            pe_offset_field = dos_header[0x3C:0x40]
            if len(pe_offset_field) != 4:
                # File ends before the offset field is complete.
                return False
            stream.seek(int.from_bytes(pe_offset_field, "little"))
            signature = stream.read(4)
    except OSError:
        return False
    return signature == b"PE\x00\x00"
def scan_markers(path: str, limit: int = _SCAN_LIMIT) -> tuple[bool, str | None]:
    """Look for engine markers in the first `limit` bytes of `path`.

    Returns `(has_factory, engine_guess)`: whether the factory class name was
    seen, and "BlooMoo", "Piklib" or None depending on which product tag was
    seen (BlooMoo wins when both are present). An unreadable file yields
    `(False, None)`.
    """
    try:
        with open(path, "rb") as stream:
            data = stream.read(limit)
    except OSError:
        return (False, None)

    if _MARK_BLOOMOO in data:
        product = "BlooMoo"
    elif _MARK_PIKLIB in data:
        product = "Piklib"
    else:
        product = None
    return (_MARK_FACTORY in data, product)
def _score(path: str) -> Candidate | None:
    """Rate one file as an engine DLL.

    Returns None when the file is not a PE image or when no signal matched;
    otherwise a Candidate whose score is the sum of the matched signal weights
    and whose reason lists the matched signals in a fixed order.
    """
    if not is_pe(path):
        return None

    factory_found, product = scan_markers(path)
    basename = os.path.basename(path).lower()
    name_hinted = any(hint in basename for hint in _FILENAME_HINTS)

    # (matched?, weight, label) — the factory marker dominates, the rest refine.
    signals = (
        (factory_found, 100, "factory-marker"),
        (name_hinted, 10, "filename-hint"),
        (bool(product), 5, "engine=" + (product or "")),
    )
    matched = [(weight, label) for hit, weight, label in signals if hit]
    if not matched:
        return None  # a PE with no engine signal at all — not a candidate

    return Candidate(
        path=path,
        score=sum(weight for weight, _label in matched),
        engine=product,
        reason=",".join(label for _weight, label in matched),
    )
def find_engine_dlls(root: str) -> list[Candidate]:
    """Walk `root` and return engine-DLL candidates, strongest first.

    If `root` is a regular file it is scored directly, whatever its extension.
    Otherwise the tree under `root` is walked and only files whose name ends
    in `.dll`/`.exe` (case-insensitive) are scored; the actual decision is
    content-based (see `_score`). A `root` that does not exist yields an
    empty list.

    Candidates with equal scores are returned in a deterministic order:
    top-down through the tree, with directory and file names sorted, so the
    result does not depend on the order the filesystem lists entries in.
    """
    if os.path.isfile(root):
        targets = [root]
    else:
        targets = []
        for dirpath, dirs, files in os.walk(root):
            # Sorting in place fixes the order os.walk descends in; without it
            # the traversal (and thus tie order) follows raw directory order.
            dirs.sort()
            targets.extend(
                os.path.join(dirpath, fn)
                for fn in sorted(files)
                if fn.lower().endswith(_PE_EXT)
            )
    out = [c for c in (_score(p) for p in targets) if c is not None]
    # list.sort is stable even with reverse=True, so ties keep traversal order.
    out.sort(key=lambda c: c.score, reverse=True)
    return out
def fuzzy_hash(path: str) -> str | None:
    """Context-triggered piecewise hash (ssdeep format) for near-duplicate detection.

    Tries ppdeep (pure-Python) first, then ssdeep, calling the backend's
    `hash_from_file(path)`. A backend that is not importable, or that fails
    while hashing this file, is skipped and the next one is tried. Returns
    None when no backend produced a hash, so the pipeline never hard-depends
    on a fuzzy-hash library.
    """
    for modname in ("ppdeep", "ssdeep"):
        try:
            mod = __import__(modname)
        except ImportError:
            continue
        try:
            return mod.hash_from_file(path)
        except Exception:
            # Deliberately best-effort: fuzzy hashing is optional, so a broken
            # or failing backend must not abort the run — fall through to the
            # next backend instead of giving up on the first failure.
            continue
    return None
def hash_file(path: str) -> FileHashes:
    """Compute sha256, md5 and byte size of `path`, plus a fuzzy hash if available.

    The file is read in 1 MiB pieces so a large DLL is never held in memory
    whole. The fuzzy hash comes from `fuzzy_hash(path)` and may be None.
    Errors from opening or reading the file propagate to the caller.
    """
    block_size = 1024 * 1024
    sha256_ctx = hashlib.sha256()
    md5_ctx = hashlib.md5()
    total = 0
    with open(path, "rb") as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:
                break
            sha256_ctx.update(piece)
            md5_ctx.update(piece)
            total += len(piece)
    return FileHashes(
        sha256=sha256_ctx.hexdigest(),
        md5=md5_ctx.hexdigest(),
        size=total,
        fuzzy=fuzzy_hash(path),
    )