Add ISO/ZIP acquisition pipeline (ams.acquire worker)
Closes the chain from a game file to a catalog entry: unpack an ISO/ZIP, content-identify the engine DLL (CMC_ObjectsContainer marker in RTTI, so a renamed file is still found), hash it (sha256 + md5 + optional ssdeep via ppdeep), run Ghidra headless with the extractor, then enrich and import the snapshot.

- unpack.py: bsdtar (ISO9660 + ZIP) with a pure-Python zipfile fallback
- identify.py: content-based engine-DLL picker + hashing
- ghidra.py: analyzeHeadless launcher discovery + post-script run
- pipeline.py: orchestration with injectable extract_fn; sink db|http|none
- cli.py: python -m ams.acquire (incl. --identify-only dry run)
- tests: 7 new (forged PE markers + stubbed extractor) -> 18/18

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
139
ams/acquire/identify.py
Normal file
139
ams/acquire/identify.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""Find the engine DLL inside an unpacked game tree and hash it.
|
||||
|
||||
Identification is *content-based* first — we scan the file for marker strings that
|
||||
only a Piklib/BlooMoo engine carries (the factory class name shows up inside the
|
||||
MSVC RTTI/mangled symbols) — and fall back to filename hints. So a renamed DLL is
|
||||
still picked correctly, before the expensive Ghidra pass ever runs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Byte patterns searched for inside a PE image. The factory class name is the
# decisive "this is an engine DLL" signal (it appears within mangled RTTI
# names); the namespace/product tags only tell the engines apart.
_MARK_FACTORY = b"CMC_ObjectsContainer"
_MARK_BLOOMOO = b"BlooMoo"
_MARK_PIKLIB = b"Piklib"

# Lower-case substrings of a file name that hint at an engine DLL.
_FILENAME_HINTS = ("piklib", "bloomoo")
# Lower-case extensions of the files inspected during a directory walk.
_PE_EXT = (".dll", ".exe")

# Maximum number of bytes read per file when scanning for markers (64 MiB):
# plenty for these engine DLLs, and it bounds pathological files.
_SCAN_LIMIT = 64 << 20
|
||||
|
||||
|
||||
@dataclass
class Candidate:
    """One file that looks like an engine DLL, together with the evidence.

    Candidates are ranked by ``score``: the larger the value, the more
    certain the identification.
    """

    # Path of the scored file.
    path: str
    # Accumulated evidence weight; higher means more certain.
    score: int
    # "BlooMoo", "Piklib", or None when the factory marker is present but the
    # product is unclear.
    engine: str | None
    # Human-readable summary of the signals that produced the score.
    reason: str
|
||||
|
||||
|
||||
@dataclass
class FileHashes:
    """Digests and size recorded for one file."""

    # Hex-encoded SHA-256 digest.
    sha256: str
    # Hex-encoded MD5 digest.
    md5: str
    # File size in bytes.
    size: int
    # ssdeep-style fuzzy hash when ppdeep/ssdeep is installed; otherwise None.
    fuzzy: str | None
|
||||
|
||||
|
||||
def is_pe(path: str) -> bool:
    """Report whether `path` is a Windows PE image.

    A file qualifies when it starts with the DOS ``MZ`` magic and the 32-bit
    little-endian offset stored at 0x3C points at a ``PE\\0\\0`` signature.
    A file that cannot be opened or read counts as "not PE".
    """
    try:
        with open(path, "rb") as image:
            magic = image.read(2)
            if magic == b"MZ":
                image.seek(0x3C)
                raw_offset = image.read(4)
                if len(raw_offset) == 4:
                    # The field at 0x3C says where the PE signature lives.
                    image.seek(int.from_bytes(raw_offset, "little"))
                    signature = image.read(4)
                    return signature == b"PE\x00\x00"
            return False
    except OSError:
        return False
|
||||
|
||||
|
||||
def scan_markers(path: str, limit: int = _SCAN_LIMIT) -> tuple[bool, str | None]:
    """Look for engine markers in the first `limit` bytes of `path`.

    Returns ``(has_factory, engine_guess)``: whether the factory class name was
    seen, and "BlooMoo", "Piklib" or None depending on which product tag is
    present (BlooMoo takes precedence when both are). A file that cannot be
    opened or read yields ``(False, None)``.
    """
    try:
        with open(path, "rb") as src:
            data = src.read(limit)
    except OSError:
        return False, None

    if _MARK_BLOOMOO in data:
        product = "BlooMoo"
    elif _MARK_PIKLIB in data:
        product = "Piklib"
    else:
        product = None
    return _MARK_FACTORY in data, product
|
||||
|
||||
|
||||
def _score(path: str) -> Candidate | None:
    """Rate one file as an engine-DLL candidate.

    Returns None for anything that is not a PE image, and for PE images that
    carry no engine signal at all.
    """
    if not is_pe(path):
        return None

    has_factory, engine = scan_markers(path)
    basename = os.path.basename(path).lower()
    name_hit = any(hint in basename for hint in _FILENAME_HINTS)

    # (signal present?, weight, tag), listed in the order tags are reported.
    signals = (
        (has_factory, 100, "factory-marker"),
        (name_hit, 10, "filename-hint"),
        (bool(engine), 5, ("engine=" + engine) if engine else ""),
    )
    hits = [(weight, tag) for present, weight, tag in signals if present]
    if not hits:
        return None  # a PE with no engine signal at all — not a candidate

    total = sum(weight for weight, _tag in hits)
    summary = ",".join(tag for _weight, tag in hits)
    return Candidate(path=path, score=total, engine=engine, reason=summary)
|
||||
|
||||
|
||||
def find_engine_dlls(root: str) -> list[Candidate]:
    """Collect engine-DLL candidates under `root`, strongest first.

    `root` may be a directory, which is walked recursively and filtered to
    files with a `.dll`/`.exe` extension (case-insensitive), or a single file,
    which is scored directly whatever its name. Either way the actual decision
    is content-based (see `_score`). A `root` that is neither yields an empty
    list, because `os.walk` ignores listing errors by default.

    Candidates with equal scores are ordered by path, so the result does not
    depend on the order in which the file system enumerates directories.
    """
    if os.path.isfile(root):
        targets = [root]
    else:
        targets = [
            os.path.join(dirpath, fn)
            for dirpath, _dirs, files in os.walk(root)
            for fn in files
            if fn.lower().endswith(_PE_EXT)
        ]

    scored = (_score(p) for p in targets)
    found = [c for c in scored if c is not None]
    # Highest score first; the path breaks ties deterministically.
    found.sort(key=lambda c: (-c.score, c.path))
    return found
|
||||
|
||||
|
||||
def fuzzy_hash(path: str) -> str | None:
    """Context-triggered piecewise hash (ssdeep format) for near-duplicate detection.

    Tries ppdeep (pure-Python) first, then ssdeep. A backend that is not
    importable, or that fails while hashing, is skipped in favour of the next
    one. Returns the value from the first backend's `hash_from_file(path)`
    that succeeds, or None when no backend produced one, so the pipeline never
    hard-depends on a fuzzy-hash library.
    """
    import importlib  # local import: only this optional-dependency probe needs it

    for modname in ("ppdeep", "ssdeep"):
        try:
            backend = importlib.import_module(modname)
        except ImportError:
            continue
        try:
            return backend.hash_from_file(path)
        except Exception:
            # Deliberately best-effort: a broken or incompatible backend must
            # not fail the pipeline, but it must not mask a working one either.
            continue
    return None
|
||||
|
||||
|
||||
def hash_file(path: str) -> FileHashes:
    """Compute sha256, md5 and byte size of `path`, plus a fuzzy hash when available.

    The file is consumed in 1 MiB chunks so big DLLs are never loaded whole.
    """
    digest_sha256 = hashlib.sha256()
    digest_md5 = hashlib.md5()
    total = 0
    with open(path, "rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                break
            digest_sha256.update(block)
            digest_md5.update(block)
            total += len(block)
    return FileHashes(
        sha256=digest_sha256.hexdigest(),
        md5=digest_md5.hexdigest(),
        size=total,
        fuzzy=fuzzy_hash(path),
    )
|
||||
Reference in New Issue
Block a user