Initial commit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-09 17:36:26 +08:00
commit 097d2ce472
25 changed files with 5944 additions and 0 deletions
--- a/tool_registry.py
+++ b/tool_registry.py
@@ -0,0 +1,615 @@
+"""Central tool registry — catalogs all available forensic tools.
+
+Tools are registered once at startup with bound image_path and offset.
+The AgentFactory uses this catalog to compose agents dynamically.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from tools import parsers
+from tools import registry as reg
+from tools import sleuthkit as tsk
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Tool result cache — keyed by "tool_name:args_hash" (see _cache_key).
+# Disk image tools are deterministic (image is read-only), so identical
+# calls always produce the same output.
+# ---------------------------------------------------------------------------
+
+_tool_result_cache: dict[str, str] = {}
+
+# Safe to cache: deterministic reads. Not list_extracted_dir — extract_file changes its output.
+CACHEABLE_TOOLS: set[str] = {
+    "partition_info", "filesystem_info", "list_directory", "find_file",
+    "search_strings", "count_deleted_files", "build_filesystem_timeline",
+    "parse_registry_key", "search_registry", "get_user_activity",
+    "read_text_file", "read_binary_preview", "search_text_file",
+    "read_text_file_section", "parse_pcap_strings",
+}
+
+
+def _cache_key(tool_name: str, kwargs: dict) -> str:
+    """Return "<tool_name>:<md5 hex>", the digest covering the key-sorted JSON dump of *kwargs*."""
+    serialized = json.dumps(kwargs, ensure_ascii=False, sort_keys=True)
+    digest = hashlib.md5(serialized.encode()).hexdigest()
+    return f"{tool_name}:{digest}"
+
+
+def _make_cached(tool_name: str, executor: Any) -> Any:
+    """Return an async wrapper that memoizes *executor* (async or sync) in the module-level result cache."""
+    import inspect  # function-scope import: inspect is not among the module-level imports
+
+    async def wrapper(**kwargs) -> str:
+        key = _cache_key(tool_name, kwargs)
+        if key in _tool_result_cache:
+            logger.debug("Cache hit: %s(%s)", tool_name, kwargs)
+            return _tool_result_cache[key]
+        result = executor(**kwargs)
+        result = await result if inspect.isawaitable(result) else result  # sync executors are allowed
+        if not result.startswith(("Error", "[Command failed")):  # failures are never cached
+            _tool_result_cache[key] = result
+        return result
+
+    return wrapper
+
+
+def get_cache_stats() -> dict[str, int]:
+    """Report how many results the tool cache currently holds (for diagnostics)."""
+    return dict(entries=len(_tool_result_cache))
+
+# Category auto-detection patterns (filename → category)
+_REGISTRY_HIVE_NAMES = {"system", "software", "sam", "ntuser.dat", "security", "default"}
+
+ASSET_CATEGORIES = [
+    "registry_hive", "chat_log", "prefetch", "network_capture",
+    "config_file", "address_book", "recycle_bin", "executable",
+    "text_log", "other",
+]
+
+
+def _auto_categorize(filename: str) -> str:
+    """Infer the asset category (one of ASSET_CATEGORIES) from a bare filename.
+
+    Matching is case-insensitive. Specific names and extensions are tested
+    before the chat keywords, so mirc.ini is "config_file" and mirc.exe is
+    "executable"; only .log/.lst and unrecognized files can become "chat_log".
+    """
+    name_lower = filename.lower()
+    ext = os.path.splitext(name_lower)[1]
+    is_chat = any(kw in name_lower for kw in ("irc", "mirc", "channel", "chat"))
+
+    if name_lower in _REGISTRY_HIVE_NAMES:  # whole lower-cased name must match
+        return "registry_hive"
+    if ext == ".pf":
+        return "prefetch"
+    if ext in (".pcap", ".cap") or name_lower == "interception":
+        return "network_capture"
+    if ext == ".wab":
+        return "address_book"
+    if name_lower == "info2" or re.match(r"dc\d+\.exe", name_lower):
+        return "recycle_bin"
+    if ext in (".ini", ".csv", ".dat", ".cfg"):
+        return "config_file"
+    if ext in (".exe", ".dll", ".com"):  # before keywords: mirc.exe is a program
+        return "executable"
+    if ext in (".log", ".lst"):
+        return "chat_log" if is_chat else "text_log"
+    return "chat_log" if is_chat else "other"
+
+
+@dataclass
+class ToolDefinition:
+    """One catalog entry: a tool's name, schema and bound executor, used to compose agents."""
+
+    name: str                   # same string as the tool's key in TOOL_CATALOG
+    description: str            # free-text summary of what the tool does
+    input_schema: dict          # JSON-Schema-style object describing the keyword arguments
+    executor: Any               # async callable (or sync for some parsers)
+    module: str                 # "sleuthkit", "registry", "parsers"
+    tags: list[str] = field(default_factory=list)  # keywords such as "filesystem" or "registry"
+
+
+# Global tool catalog (tool name → ToolDefinition), cleared and repopulated by register_all_tools().
+TOOL_CATALOG: dict[str, ToolDefinition] = {}
+
+
+def _make_auto_record(tool_name: str, category: str, executor: Any, graph: Any) -> Any:
+    """Wrap *executor* (async or sync) so successful results are also added to *graph* as phenomena."""
+    import inspect  # function-scope import: inspect is not among the module-level imports
+
+    async def wrapper(**kwargs) -> str:
+        result = executor(**kwargs)
+        result = await result if inspect.isawaitable(result) else result
+        # No graph, empty output, or an "Error…"/"[…" failure marker: nothing to record.
+        if graph is None or not result or result.startswith(("Error", "[")):
+            return result
+        await graph.add_phenomenon(
+            source_agent=getattr(graph, "_current_agent", "") or "unknown",
+            category=category,
+            title=f"{tool_name}: {result.split(chr(10))[0][:80]}",  # first line, capped
+            description=result[:2000],
+            source_tool=tool_name,
+        )
+        return result
+
+    return wrapper
+
+
+def register_all_tools(
+    image_path: str,  # disk image every Sleuth Kit tool below is bound to
+    partition_offset: int,  # forwarded to the per-partition Sleuth Kit calls
+    graph: Any = None,  # optional evidence graph: receives extracted assets and auto-recorded phenomena
+    extracted_dir: str = "extracted",  # directory extract_file writes into
+) -> None:
+    """Clear and repopulate TOOL_CATALOG with every tool bound to image/offset; CACHEABLE_TOOLS get result caching."""
+    TOOL_CATALOG.clear()
+
+    # ---- Sleuth Kit tools (all take the bound image_path) ----
+
+    TOOL_CATALOG["partition_info"] = ToolDefinition(
+        name="partition_info",
+        description="Get the partition table layout of the disk image. Run this first to understand disk structure.",
+        input_schema={"type": "object", "properties": {}},
+        executor=lambda: tsk.partition_info(image_path),
+        module="sleuthkit",
+        tags=["filesystem", "disk", "partition"],
+    )
+
+    TOOL_CATALOG["filesystem_info"] = ToolDefinition(
+        name="filesystem_info",
+        description="Get detailed filesystem information (type, block size, volume name, etc.) for the selected partition.",
+        input_schema={"type": "object", "properties": {}},
+        executor=lambda: tsk.filesystem_info(image_path, partition_offset),
+        module="sleuthkit",
+        tags=["filesystem", "disk"],
+    )
+
+    TOOL_CATALOG["list_directory"] = ToolDefinition(
+        name="list_directory",
+        description="List files and directories. Without inode, lists root. Use recursive=true for all files.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "inode": {"type": "string", "description": "Inode of directory. Omit for root."},
+                "recursive": {"type": "boolean", "description": "List all files recursively."},
+            },
+        },
+        executor=lambda inode=None, recursive=False: tsk.list_directory(
+            image_path, partition_offset, inode, recursive
+        ),
+        module="sleuthkit",
+        tags=["filesystem", "directory", "listing"],
+    )
+
+    async def _extract_with_tracking(inode: str) -> str:
+        """Extract one file by inode into extracted_dir; name and category come from its real disk path."""
+        # Dedup: an inode already registered on the graph is reported, not re-extracted
+        if graph is not None:
+            existing = graph.lookup_asset_by_inode(inode)
+            if existing is not None:
+                return (
+                    f"Already extracted: {existing.local_path} "
+                    f"({existing.size_bytes} bytes, {existing.category}). "
+                    f"Disk path: {existing.original_path}"
+                )
+
+        # Resolve real disk path first; a blank or "not found" answer aborts the extraction
+        orig_path = (await tsk.find_file(image_path, inode, partition_offset)).strip()
+        if not orig_path or "not found" in orig_path.lower():
+            return f"Error: inode {inode} not found on the disk image."
+
+        # Derive local filename from real disk path (basename only, directories dropped)
+        filename = os.path.basename(orig_path)
+        local_path = os.path.join(extracted_dir, filename)
+
+        # Handle name collisions by appending the inode (dashes → underscores) before the extension
+        if os.path.exists(local_path):
+            base, ext = os.path.splitext(filename)
+            local_path = os.path.join(extracted_dir, f"{base}_{inode.replace('-', '_')}{ext}")
+            filename = os.path.basename(local_path)
+
+        # Extract; an "[icat failed" result is passed straight back to the caller
+        result = await tsk.extract_file(image_path, inode, local_path, partition_offset)
+        if result.startswith("[icat failed"):
+            return result
+
+        size = os.path.getsize(local_path) if os.path.exists(local_path) else 0
+        category = _auto_categorize(os.path.basename(orig_path))
+
+        # Register the asset on the graph, attributed to graph._current_agent (or "unknown")
+        if graph is not None:
+            agent_name = getattr(graph, "_current_agent", "") or "unknown"
+            await graph.register_asset(
+                inode=inode,
+                original_path=orig_path,
+                local_path=local_path,
+                category=category,
+                filename=filename,
+                size_bytes=size,
+                extracted_by=agent_name,
+            )
+            logger.info("Asset registered: %s (%s, %d bytes)", local_path, category, size)
+
+        return (
+            f"Extracted to {local_path} ({size} bytes, {category})\n"
+            f"Disk path: {orig_path}"
+        )
+
+    TOOL_CATALOG["extract_file"] = ToolDefinition(
+        name="extract_file",
+        description=(
+            "Extract a file from the disk image by inode number. "
+            "The filename is automatically determined from the disk path. "
+            "Checks if already extracted (returns existing path if so). "
+            "Returns the local path and the original disk path."
+        ),
+        input_schema={
+            "type": "object",
+            "properties": {
+                "inode": {"type": "string", "description": "Inode number of the file (e.g. '334-128-4' or '334')."},
+            },
+            "required": ["inode"],
+        },
+        executor=_extract_with_tracking,
+        module="sleuthkit",
+        tags=["filesystem", "extraction"],
+    )
+
+    TOOL_CATALOG["find_file"] = ToolDefinition(
+        name="find_file",
+        description="Find the file path for a given inode number.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "inode": {"type": "string", "description": "Inode number to look up."},
+            },
+            "required": ["inode"],
+        },
+        executor=lambda inode: tsk.find_file(image_path, inode, partition_offset),
+        module="sleuthkit",
+        tags=["filesystem"],
+    )
+
+    TOOL_CATALOG["search_strings"] = ToolDefinition(
+        name="search_strings",
+        description="Search for a string pattern across the entire disk image (slow on first call, fast after). Prefer search_text_file on already-extracted files when possible.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "pattern": {"type": "string", "description": "String pattern (case-insensitive grep)."},
+            },
+            "required": ["pattern"],
+        },
+        executor=lambda pattern: tsk.search_strings(image_path, pattern),
+        module="sleuthkit",
+        tags=["filesystem", "search", "strings"],
+    )
+
+    TOOL_CATALOG["count_deleted_files"] = ToolDefinition(
+        name="count_deleted_files",
+        description="List and count all deleted files. Shows total count, executables, and extension breakdown.",
+        input_schema={"type": "object", "properties": {}},
+        executor=lambda: tsk.count_deleted_files(image_path, partition_offset),
+        module="sleuthkit",
+        tags=["filesystem", "deleted", "recovery"],
+    )
+
+    TOOL_CATALOG["build_filesystem_timeline"] = ToolDefinition(
+        name="build_filesystem_timeline",
+        description="Build a MAC timeline from the filesystem (Modified/Accessed/Changed times for all files).",
+        input_schema={"type": "object", "properties": {}},
+        executor=lambda: tsk.build_timeline(image_path, partition_offset),
+        module="sleuthkit",
+        tags=["filesystem", "timeline"],
+    )
+
+    # ---- Registry tools (list_installed_software is auto-recorded; the others here are plain reads) ----
+
+    TOOL_CATALOG["parse_registry_key"] = ToolDefinition(
+        name="parse_registry_key",
+        description="Parse a registry hive file and list subkeys/values at a given path.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to extracted hive file."},
+                "key_path": {"type": "string", "description": "Registry key path to inspect."},
+            },
+            "required": ["hive_path", "key_path"],
+        },
+        executor=lambda hive_path, key_path: reg.parse_registry_key(hive_path, key_path),
+        module="registry",
+        tags=["registry", "hive"],
+    )
+
+    TOOL_CATALOG["list_installed_software"] = ToolDefinition(
+        name="list_installed_software",
+        description="List installed software from a SOFTWARE registry hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SOFTWARE hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("list_installed_software", "registry",
+                                   lambda hive_path: reg.list_installed_software(hive_path), graph),
+        module="registry",
+        tags=["registry", "software", "installed"],
+    )
+
+    TOOL_CATALOG["get_user_activity"] = ToolDefinition(
+        name="get_user_activity",
+        description="Extract user activity from NTUSER.DAT (recent docs, typed URLs, run dialog history).",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to NTUSER.DAT."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=lambda hive_path: reg.get_user_activity(hive_path),
+        module="registry",
+        tags=["registry", "user", "activity"],
+    )
+
+    TOOL_CATALOG["search_registry"] = ToolDefinition(
+        name="search_registry",
+        description="Search for a pattern in registry key names and values.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to hive file."},
+                "pattern": {"type": "string", "description": "Search pattern."},
+            },
+            "required": ["hive_path", "pattern"],
+        },
+        executor=lambda hive_path, pattern: reg.search_registry(hive_path, pattern),
+        module="registry",
+        tags=["registry", "search"],
+    )
+
+    # ---- Registry tools (auto-record: results are forensic facts) ----
+
+    TOOL_CATALOG["get_system_info"] = ToolDefinition(
+        name="get_system_info",
+        description="Extract OS version, install date, and registered owner from a SOFTWARE hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SOFTWARE hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_system_info", "registry",
+                                   lambda hive_path: reg.get_system_info(hive_path), graph),
+        module="registry",
+        tags=["registry", "system"],
+    )
+
+    TOOL_CATALOG["get_timezone_info"] = ToolDefinition(
+        name="get_timezone_info",
+        description="Extract timezone settings from a SYSTEM hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SYSTEM hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_timezone_info", "registry",
+                                   lambda hive_path: reg.get_timezone_info(hive_path), graph),
+        module="registry",
+        tags=["registry", "timezone", "system"],
+    )
+
+    TOOL_CATALOG["get_computer_name"] = ToolDefinition(
+        name="get_computer_name",
+        description="Extract computer/host name from a SYSTEM hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SYSTEM hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_computer_name", "registry",
+                                   lambda hive_path: reg.get_computer_name(hive_path), graph),
+        module="registry",
+        tags=["registry", "system", "hostname"],
+    )
+
+    TOOL_CATALOG["get_shutdown_time"] = ToolDefinition(
+        name="get_shutdown_time",
+        description="Extract last shutdown time from a SYSTEM hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SYSTEM hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_shutdown_time", "registry",
+                                   lambda hive_path: reg.get_shutdown_time(hive_path), graph),
+        module="registry",
+        tags=["registry", "system", "shutdown"],
+    )
+
+    TOOL_CATALOG["enumerate_users"] = ToolDefinition(
+        name="enumerate_users",
+        description="List all user accounts and RIDs from a SAM hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SAM hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("enumerate_users", "registry",
+                                   lambda hive_path: reg.enumerate_users(hive_path), graph),
+        module="registry",
+        tags=["registry", "user", "accounts", "sam"],
+    )
+
+    TOOL_CATALOG["get_network_interfaces"] = ToolDefinition(
+        name="get_network_interfaces",
+        description="Extract network adapter and TCP/IP config from a SYSTEM hive.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to SYSTEM hive."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_network_interfaces", "registry",
+                                   lambda hive_path: reg.get_network_interfaces(hive_path), graph),
+        module="registry",
+        tags=["registry", "network", "adapter", "ip"],
+    )
+
+    TOOL_CATALOG["get_email_config"] = ToolDefinition(
+        name="get_email_config",
+        description="Extract email account configuration (SMTP, POP3, NNTP) from NTUSER.DAT.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "hive_path": {"type": "string", "description": "Path to NTUSER.DAT."},
+            },
+            "required": ["hive_path"],
+        },
+        executor=_make_auto_record("get_email_config", "registry",
+                                   lambda hive_path: reg.get_email_config(hive_path), graph),
+        module="registry",
+        tags=["registry", "email", "account"],
+    )
+
+    # ---- Parser tools (take local file/directory paths; parse_prefetch is auto-recorded) ----
+
+    TOOL_CATALOG["parse_prefetch"] = ToolDefinition(
+        name="parse_prefetch",
+        description="Parse a Windows Prefetch (.pf) file to extract executable name, last execution time, and run count.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Path to extracted .pf file."},
+            },
+            "required": ["file_path"],
+        },
+        executor=_make_auto_record("parse_prefetch", "filesystem",
+                                   lambda file_path: parsers.parse_prefetch(file_path), graph),
+        module="parsers",
+        tags=["filesystem", "prefetch", "execution"],
+    )
+
+    TOOL_CATALOG["read_text_file"] = ToolDefinition(
+        name="read_text_file",
+        description="Read an extracted text file (configs, logs, chat logs, etc.).",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Local path to the file."},
+            },
+            "required": ["file_path"],
+        },
+        executor=lambda file_path: parsers.read_text_file(file_path),
+        module="parsers",
+        tags=["text", "read"],
+    )
+
+    TOOL_CATALOG["read_binary_preview"] = ToolDefinition(
+        name="read_binary_preview",
+        description="Preview a binary file in hex+ASCII format.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Local path to the file."},
+            },
+            "required": ["file_path"],
+        },
+        executor=lambda file_path: parsers.read_binary_preview(file_path),
+        module="parsers",
+        tags=["binary", "hex", "preview"],
+    )
+
+    TOOL_CATALOG["search_text_file"] = ToolDefinition(
+        name="search_text_file",
+        description="Search for a regex pattern in an extracted text file. Returns matching lines with line numbers.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Path to extracted file."},
+                "pattern": {"type": "string", "description": "Regex pattern."},
+            },
+            "required": ["file_path", "pattern"],
+        },
+        executor=lambda file_path, pattern: parsers.search_text_file(file_path, pattern),
+        module="parsers",
+        tags=["text", "search", "regex"],
+    )
+
+    TOOL_CATALOG["read_text_file_section"] = ToolDefinition(
+        name="read_text_file_section",
+        description="Read a section of a large text file starting at a byte offset.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Path to file."},
+                "start": {"type": "integer", "description": "Byte offset to start reading."},
+                "max_bytes": {"type": "integer", "description": "Maximum bytes to read."},
+            },
+            "required": ["file_path"],
+        },
+        executor=lambda file_path, start=0, max_bytes=8000: parsers.read_text_file_section(
+            file_path, start, max_bytes
+        ),
+        module="parsers",
+        tags=["text", "read", "section"],
+    )
+
+    TOOL_CATALOG["list_extracted_dir"] = ToolDefinition(
+        name="list_extracted_dir",
+        description="List files in an extracted directory with sizes.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "dir_path": {"type": "string", "description": "Directory path."},
+            },
+            "required": ["dir_path"],
+        },
+        executor=lambda dir_path: parsers.list_extracted_dir(dir_path),
+        module="parsers",
+        tags=["filesystem", "listing", "extracted"],
+    )
+
+    TOOL_CATALOG["parse_pcap_strings"] = ToolDefinition(
+        name="parse_pcap_strings",
+        description="Extract HTTP headers, hosts, User-Agent, cookies, and URLs from a PCAP/capture file.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "file_path": {"type": "string", "description": "Path to PCAP file."},
+            },
+            "required": ["file_path"],
+        },
+        executor=lambda file_path: parsers.parse_pcap_strings(file_path),
+        module="parsers",
+        tags=["network", "pcap", "http", "capture"],
+    )
+
+    # ---- Apply result caching to deterministic read-only tools ----
+    # Must come AFTER all tools are registered. Auto-record wrapped tools
+    # (e.g. get_system_info) are NOT in CACHEABLE_TOOLS since they write
+    # to the evidence graph as a side effect.
+    _tool_result_cache.clear()  # keys omit image/offset, so entries from an earlier binding are invalid
+    for tool_name, td in TOOL_CATALOG.items():
+        if tool_name in CACHEABLE_TOOLS:
+            td.executor = _make_cached(tool_name, td.executor)