"""Find the engine DLL inside an unpacked game tree and hash it.

Identification is *content-based* first — we scan the file for marker strings
that only a Piklib/BlooMoo engine carries (the factory class name shows up
inside the MSVC RTTI/mangled symbols) — and fall back to filename hints. So a
renamed DLL is still picked correctly, before the expensive Ghidra pass ever
runs.
"""
from __future__ import annotations

import hashlib
import os
from dataclasses import dataclass

# ASCII substrings that survive inside the PE: the factory class name is the
# decisive "this is an engine DLL" signal (it appears within mangled RTTI names),
# the namespace/product tags only disambiguate which engine.
_MARK_FACTORY = b"CMC_ObjectsContainer"
_MARK_BLOOMOO = b"BlooMoo"
_MARK_PIKLIB = b"Piklib"
_FILENAME_HINTS = ("piklib", "bloomoo")
_PE_EXT = (".dll", ".exe")
_SCAN_LIMIT = 64 * 1024 * 1024  # plenty for these engine DLLs, bounds pathological files


@dataclass
class Candidate:
    """A scored engine-DLL candidate. Higher score = more certain."""

    # Path of the file that was scored.
    path: str
    # Sum of the signal weights: 100 factory marker, 10 filename hint, 5 engine tag.
    score: int
    # "BlooMoo" | "Piklib" | None (factory present, product unclear)
    engine: str | None
    # Comma-separated list of the signals that contributed to `score`.
    reason: str


@dataclass
class FileHashes:
    """Digests and size of one file, as produced by `hash_file`."""

    sha256: str  # hex digest
    md5: str  # hex digest
    size: int  # number of bytes hashed
    fuzzy: str | None  # ssdeep-style, when ppdeep/ssdeep is installed; else None


def is_pe(path: str) -> bool:
    """True if the file is a Windows PE image (MZ stub + PE\\0\\0 header).

    Any OSError while opening or reading the file is reported as "not a PE"
    rather than raised.
    """
    try:
        with open(path, "rb") as fh:
            if fh.read(2) != b"MZ":
                return False
            # The 4-byte little-endian field at 0x3C holds the offset of the
            # PE signature.
            fh.seek(0x3C)
            raw_offset = fh.read(4)
            if len(raw_offset) < 4:
                return False  # truncated before the offset field
            fh.seek(int.from_bytes(raw_offset, "little"))
            # Seeking past EOF is harmless: the read returns b"" and the
            # comparison fails.
            return fh.read(4) == b"PE\x00\x00"
    except OSError:
        return False


def scan_markers(path: str, limit: int = _SCAN_LIMIT) -> tuple[bool, str | None]:
    """Scan up to `limit` bytes for engine markers → (has_factory, engine_guess).

    `engine_guess` is "BlooMoo" when that tag is present (it wins if both tags
    occur), otherwise "Piklib" when that tag is present, otherwise None. An
    unreadable file yields (False, None).
    """
    try:
        with open(path, "rb") as fh:
            blob = fh.read(limit)
    except OSError:
        return (False, None)

    engine: str | None
    if _MARK_BLOOMOO in blob:
        engine = "BlooMoo"
    elif _MARK_PIKLIB in blob:
        engine = "Piklib"
    else:
        engine = None
    return (_MARK_FACTORY in blob, engine)


def _score(path: str) -> Candidate | None:
    """Score a single file as an engine DLL, or None if it isn't a PE image.

    None is also returned for a PE image that carries no engine signal at all.
    """
    if not is_pe(path):
        return None

    has_factory, engine = scan_markers(path)
    score = 0
    reasons: list[str] = []

    if has_factory:
        score += 100
        reasons.append("factory-marker")

    name = os.path.basename(path).lower()
    if any(hint in name for hint in _FILENAME_HINTS):
        score += 10
        reasons.append("filename-hint")

    if engine:
        score += 5
        reasons.append("engine=" + engine)

    if score == 0:
        return None  # a PE with no engine signal at all — not a candidate
    return Candidate(path=path, score=score, engine=engine, reason=",".join(reasons))


def find_engine_dlls(root: str) -> list[Candidate]:
    """Walk `root` and return engine-DLL candidates, strongest first.

    A single file path is accepted directly (its extension is not checked).
    When walking a directory, only files with a `.dll`/`.exe` extension are
    considered, but the actual decision is content-based.

    Candidates with equal scores are ordered by path, so the result does not
    depend on the order in which the filesystem enumerates directory entries.
    """
    if os.path.isfile(root):
        targets = [root]
    else:
        targets = [
            os.path.join(dirpath, fn)
            for dirpath, _dirs, files in os.walk(root)
            for fn in files
            if fn.lower().endswith(_PE_EXT)
        ]

    scored = (_score(p) for p in targets)
    out = [c for c in scored if c is not None]
    # Highest score first; the path tie-breaker keeps the order reproducible.
    out.sort(key=lambda c: (-c.score, c.path))
    return out


def fuzzy_hash(path: str) -> str | None:
    """Context-triggered piecewise hash (ssdeep format) for near-duplicate detection.

    Uses ppdeep (pure-Python) or ssdeep if importable; returns None otherwise,
    so the pipeline never hard-depends on it. A backend that imports but fails
    to hash the file is skipped in favour of the next one instead of aborting
    the whole lookup.
    """
    for modname in ("ppdeep", "ssdeep"):
        try:
            mod = __import__(modname)
        except ImportError:
            continue
        try:
            return mod.hash_from_file(path)
        except Exception:
            # Deliberately broad: this is best-effort and third-party backends
            # may fail in arbitrary ways (I/O errors, missing attribute on a
            # shadowing module, ...). Fall through to the next backend.
            continue
    return None


def hash_file(path: str) -> FileHashes:
    """sha256 + md5 + size (+ fuzzy when available), streamed so big DLLs don't load whole.

    Raises OSError if `path` cannot be opened or read. The fuzzy hash is
    computed by `fuzzy_hash`, which re-reads the file through its backend.
    """
    sha, md5 = hashlib.sha256(), hashlib.md5()
    size = 0
    with open(path, "rb") as fh:
        # 1 MiB chunks; iter() stops at the b"" sentinel returned on EOF.
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            sha.update(chunk)
            md5.update(chunk)
            size += len(chunk)
    return FileHashes(
        sha256=sha.hexdigest(),
        md5=md5.hexdigest(),
        size=size,
        fuzzy=fuzzy_hash(path),
    )