Initial commit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
BattleTag
2026-05-09 17:36:26 +08:00
commit 097d2ce472
25 changed files with 5944 additions and 0 deletions

615
tool_registry.py Normal file
View File

@@ -0,0 +1,615 @@
"""Central tool registry — catalogs all available forensic tools.
Tools are registered once at startup with bound image_path and offset.
The AgentFactory uses this catalog to compose agents dynamically.
"""
from __future__ import annotations

import hashlib
import inspect
import json
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any

from tools import parsers
from tools import registry as reg
from tools import sleuthkit as tsk
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Tool result cache — keyed by (tool_name, args_hash).
# Disk image tools are deterministic (image is read-only), so identical
# calls always produce the same output.
# ---------------------------------------------------------------------------
_tool_result_cache: dict[str, str] = {}
# Tools safe to cache: deterministic reads with no side effects.
# "list_extracted_dir" is deliberately NOT listed: the directory it lists
# gains files every time extract_file runs, so a cached listing would go
# stale within the same session.
CACHEABLE_TOOLS: set[str] = {
    "partition_info", "filesystem_info", "list_directory", "find_file",
    "search_strings", "count_deleted_files", "build_filesystem_timeline",
    "parse_registry_key", "search_registry", "get_user_activity",
    "read_text_file", "read_binary_preview", "search_text_file",
    "read_text_file_section", "parse_pcap_strings",
}
def _cache_key(tool_name: str, kwargs: dict) -> str:
    """Return ``"<tool_name>:<md5 hex digest>"`` for a tool call.

    The digest is taken over the JSON encoding of ``kwargs`` with sorted
    keys, so the order in which arguments were passed does not matter.
    """
    canonical_args = json.dumps(kwargs, ensure_ascii=False, sort_keys=True)
    digest = hashlib.md5(canonical_args.encode()).hexdigest()
    return ":".join((tool_name, digest))
def _make_cached(tool_name: str, executor: Any) -> Any:
    """Wrap an executor with an in-memory result cache.

    Args:
        tool_name: Name used as the cache-key prefix and in debug logs.
        executor: Callable invoked with the tool's keyword arguments. It may
            return the result directly or return an awaitable producing it.

    Returns:
        An async callable accepting the same keyword arguments as
        ``executor``. Successful string results are stored in
        ``_tool_result_cache`` and replayed on identical later calls.
    """
    # Results starting with one of these prefixes are treated as failures and
    # are never cached, so a later identical call re-runs the tool.
    failure_prefixes = ("Error", "[Command failed")

    async def wrapper(**kwargs) -> str:
        key = _cache_key(tool_name, kwargs)
        cached = _tool_result_cache.get(key)
        if cached is not None:
            logger.debug("Cache hit: %s(%s)", tool_name, kwargs)
            return cached
        result = executor(**kwargs)
        # ToolDefinition.executor may be sync or async; await only when the
        # call actually produced an awaitable.
        if inspect.isawaitable(result):
            result = await result
        # Only cache successful string results (not errors, not other types).
        if isinstance(result, str) and not result.startswith(failure_prefixes):
            _tool_result_cache[key] = result
        return result

    return wrapper
def get_cache_stats() -> dict[str, int]:
    """Report how many results the tool cache currently holds.

    Returns:
        A dict with the single key ``"entries"``.
    """
    entry_count = len(_tool_result_cache)
    return dict(entries=entry_count)
# Category auto-detection patterns (filename → category)
# Lowercased filenames that _auto_categorize() classifies as "registry_hive".
# The comparison is against the whole lowercased filename, extension included.
_REGISTRY_HIVE_NAMES = {"system", "software", "sam", "ntuser.dat", "security", "default"}
# Category labels for extracted assets; every value _auto_categorize() can
# return appears in this list.
ASSET_CATEGORIES = [
    "registry_hive", "chat_log", "prefetch", "network_capture",
    "config_file", "address_book", "recycle_bin", "executable",
    "text_log", "other",
]
def _auto_categorize(filename: str) -> str:
    """Infer an asset category from a bare filename.

    Matching is case-insensitive and rules are tried in a fixed order; the
    first rule that matches wins. Falls back to ``"other"``.
    """
    lowered = filename.lower()
    suffix = os.path.splitext(lowered)[1]
    looks_like_chat = any(kw in lowered for kw in ("irc", "mirc", "channel", "chat"))

    # Only the complete lowercased name (extension included) is compared
    # against the known hive names.
    if lowered in _REGISTRY_HIVE_NAMES:
        return "registry_hive"
    if suffix == ".pf":
        return "prefetch"
    if lowered == "interception" or suffix in {".pcap", ".cap"}:
        return "network_capture"
    if suffix == ".wab":
        return "address_book"
    if lowered == "info2" or re.match(r"dc\d+\.exe", lowered) is not None:
        return "recycle_bin"
    # Extension rules run before the keyword rule, so e.g. mirc.ini is a
    # config file rather than a chat log.
    if suffix in {".ini", ".csv", ".dat", ".cfg"}:
        return "config_file"
    if suffix in {".log", ".lst"}:
        return "chat_log" if looks_like_chat else "text_log"
    if looks_like_chat:
        return "chat_log"
    if suffix in {".exe", ".dll", ".com"}:
        return "executable"
    return "other"
@dataclass
class ToolDefinition:
    """A registered tool available for agent composition.

    Instances are created by register_all_tools() and stored in TOOL_CATALOG
    under their ``name``.
    """
    # Tool identifier; register_all_tools() uses the same string as the catalog key.
    name: str
    # Human-readable explanation of what the tool does and when to use it.
    description: str
    # JSON-schema-style dict ({"type": "object", "properties": ...}) describing
    # the keyword arguments the executor accepts.
    input_schema: dict
    executor: Any  # async callable (or sync for some parsers)
    module: str  # "sleuthkit", "registry", "parsers"
    # Free-form keyword tags (e.g. "filesystem", "registry"); presumably used
    # to select tools when composing agents — confirm against AgentFactory.
    tags: list[str] = field(default_factory=list)
# Global tool catalog, populated by register_all_tools().
# Maps tool name -> ToolDefinition. register_all_tools() clears it before
# repopulating, so each call replaces the previous bindings.
TOOL_CATALOG: dict[str, ToolDefinition] = {}
def _make_auto_record(tool_name: str, category: str, executor: Any, graph: Any) -> Any:
    """Wrap a forensic tool to auto-record its result as a phenomenon.

    Args:
        tool_name: Tool name; used in the phenomenon title and as its
            ``source_tool``.
        category: Category passed through to ``graph.add_phenomenon``.
        executor: Callable invoked with the tool's keyword arguments. It may
            return the result directly or return an awaitable producing it.
        graph: Object exposing an async ``add_phenomenon(...)`` and,
            optionally, a ``_current_agent`` attribute. ``None`` disables
            recording.

    Returns:
        An async callable with the executor's keyword arguments. It always
        returns the executor's result unchanged; recording is a side effect.
    """

    async def wrapper(**kwargs) -> str:
        result = executor(**kwargs)
        # ToolDefinition.executor may be sync or async; await only when the
        # call actually produced an awaitable.
        if inspect.isawaitable(result):
            result = await result
        # Nothing to record without a graph, for empty/non-text output, or for
        # failure output ("Error..." or bracketed messages such as
        # "[Command failed ...").
        if (
            graph is None
            or not result
            or not isinstance(result, str)
            or result.startswith(("Error", "["))
        ):
            return result
        # Auto-record: the tool produced a forensic fact
        agent = getattr(graph, "_current_agent", "") or "unknown"
        first_line = result.partition("\n")[0]
        await graph.add_phenomenon(
            source_agent=agent,
            category=category,
            title=f"{tool_name}: {first_line[:80]}",
            description=result[:2000],
            source_tool=tool_name,
        )
        return result

    return wrapper
def _string_arg_schema(arg_name: str, description: str) -> dict:
    """Build the input schema for a tool taking one required string argument."""
    return {
        "type": "object",
        "properties": {
            arg_name: {"type": "string", "description": description},
        },
        "required": [arg_name],
    }


def register_all_tools(
    image_path: str,
    partition_offset: int,
    graph: Any = None,
    extracted_dir: str = "extracted",
) -> None:
    """Populate TOOL_CATALOG with all available tools, pre-bound to image/offset.

    Clears TOOL_CATALOG and the tool result cache first, so calling this
    again replaces every previous binding.

    Args:
        image_path: Disk image path bound into every Sleuth Kit tool.
        partition_offset: Partition offset bound into the filesystem-level
            Sleuth Kit tools.
        graph: Optional evidence graph. When given, extract_file consults
            ``lookup_asset_by_inode`` / ``register_asset`` on it and the
            auto-record tools call ``add_phenomenon`` on it.
        extracted_dir: Directory that extract_file writes extracted files to.
    """
    TOOL_CATALOG.clear()

    # ---- Sleuth Kit tools ----
    TOOL_CATALOG["partition_info"] = ToolDefinition(
        name="partition_info",
        description="Get the partition table layout of the disk image. Run this first to understand disk structure.",
        input_schema={"type": "object", "properties": {}},
        executor=lambda: tsk.partition_info(image_path),
        module="sleuthkit",
        tags=["filesystem", "disk", "partition"],
    )
    TOOL_CATALOG["filesystem_info"] = ToolDefinition(
        name="filesystem_info",
        description="Get detailed filesystem information (type, block size, volume name, etc.) for the selected partition.",
        input_schema={"type": "object", "properties": {}},
        executor=lambda: tsk.filesystem_info(image_path, partition_offset),
        module="sleuthkit",
        tags=["filesystem", "disk"],
    )
    TOOL_CATALOG["list_directory"] = ToolDefinition(
        name="list_directory",
        description="List files and directories. Without inode, lists root. Use recursive=true for all files.",
        input_schema={
            "type": "object",
            "properties": {
                "inode": {"type": "string", "description": "Inode of directory. Omit for root."},
                "recursive": {"type": "boolean", "description": "List all files recursively."},
            },
        },
        executor=lambda inode=None, recursive=False: tsk.list_directory(
            image_path, partition_offset, inode, recursive
        ),
        module="sleuthkit",
        tags=["filesystem", "directory", "listing"],
    )

    async def _extract_with_tracking(inode: str) -> str:
        """Extract a file by inode. Name and category are derived from the real disk path.

        Returns a human-readable status string: the existing asset when the
        inode was already extracted, an ``"Error: ..."`` / ``"[icat failed..."``
        message on failure, or the new local path plus original disk path.
        """
        # Dedup
        if graph is not None:
            existing = graph.lookup_asset_by_inode(inode)
            if existing is not None:
                return (
                    f"Already extracted: {existing.local_path} "
                    f"({existing.size_bytes} bytes, {existing.category}). "
                    f"Disk path: {existing.original_path}"
                )
        # Resolve real disk path first
        # NOTE(review): this is a substring test, so a legitimate path that
        # contains "not found" is also rejected — confirm find_file's error format.
        orig_path = (await tsk.find_file(image_path, inode, partition_offset)).strip()
        if not orig_path or "not found" in orig_path.lower():
            return f"Error: inode {inode} not found on the disk image."
        # The inode ends up inside a local filename, so reduce it to characters
        # that cannot introduce path separators ("334-128-4" -> "334_128_4").
        safe_inode = re.sub(r"[^0-9A-Za-z]+", "_", str(inode))
        # Derive local filename from real disk path
        filename = os.path.basename(orig_path)
        if filename in ("", ".", ".."):
            # A path ending in a separator or dot entry has no usable basename
            # and would otherwise resolve to a directory, not a file.
            filename = f"inode_{safe_inode}"
        local_path = os.path.join(extracted_dir, filename)
        # Handle name collisions by appending inode
        if os.path.exists(local_path):
            base, ext = os.path.splitext(filename)
            local_path = os.path.join(extracted_dir, f"{base}_{safe_inode}{ext}")
            filename = os.path.basename(local_path)
        # Extract
        result = await tsk.extract_file(image_path, inode, local_path, partition_offset)
        if result.startswith("[icat failed"):
            return result
        if not os.path.exists(local_path):
            # Do not register an asset whose local file was never written.
            return f"Error: extraction of inode {inode} produced no file at {local_path}."
        size = os.path.getsize(local_path)
        category = _auto_categorize(os.path.basename(orig_path))
        # Register
        if graph is not None:
            agent_name = getattr(graph, "_current_agent", "") or "unknown"
            await graph.register_asset(
                inode=inode,
                original_path=orig_path,
                local_path=local_path,
                category=category,
                filename=filename,
                size_bytes=size,
                extracted_by=agent_name,
            )
            logger.info("Asset registered: %s (%s, %d bytes)", local_path, category, size)
        return (
            f"Extracted to {local_path} ({size} bytes, {category})\n"
            f"Disk path: {orig_path}"
        )

    TOOL_CATALOG["extract_file"] = ToolDefinition(
        name="extract_file",
        description=(
            "Extract a file from the disk image by inode number. "
            "The filename is automatically determined from the disk path. "
            "Checks if already extracted (returns existing path if so). "
            "Returns the local path and the original disk path."
        ),
        input_schema=_string_arg_schema(
            "inode", "Inode number of the file (e.g. '334-128-4' or '334')."
        ),
        executor=_extract_with_tracking,
        module="sleuthkit",
        tags=["filesystem", "extraction"],
    )
    TOOL_CATALOG["find_file"] = ToolDefinition(
        name="find_file",
        description="Find the file path for a given inode number.",
        input_schema=_string_arg_schema("inode", "Inode number to look up."),
        executor=lambda inode: tsk.find_file(image_path, inode, partition_offset),
        module="sleuthkit",
        tags=["filesystem"],
    )
    TOOL_CATALOG["search_strings"] = ToolDefinition(
        name="search_strings",
        description="Search for a string pattern across the entire disk image (slow on first call, fast after). Prefer search_text_file on already-extracted files when possible.",
        input_schema=_string_arg_schema("pattern", "String pattern (case-insensitive grep)."),
        executor=lambda pattern: tsk.search_strings(image_path, pattern),
        module="sleuthkit",
        tags=["filesystem", "search", "strings"],
    )
    TOOL_CATALOG["count_deleted_files"] = ToolDefinition(
        name="count_deleted_files",
        description="List and count all deleted files. Shows total count, executables, and extension breakdown.",
        input_schema={"type": "object", "properties": {}},
        executor=lambda: tsk.count_deleted_files(image_path, partition_offset),
        module="sleuthkit",
        tags=["filesystem", "deleted", "recovery"],
    )
    TOOL_CATALOG["build_filesystem_timeline"] = ToolDefinition(
        name="build_filesystem_timeline",
        description="Build a MAC timeline from the filesystem (Modified/Accessed/Changed times for all files).",
        input_schema={"type": "object", "properties": {}},
        executor=lambda: tsk.build_timeline(image_path, partition_offset),
        module="sleuthkit",
        tags=["filesystem", "timeline"],
    )

    # ---- Registry tools ----
    TOOL_CATALOG["parse_registry_key"] = ToolDefinition(
        name="parse_registry_key",
        description="Parse a registry hive file and list subkeys/values at a given path.",
        input_schema={
            "type": "object",
            "properties": {
                "hive_path": {"type": "string", "description": "Path to extracted hive file."},
                "key_path": {"type": "string", "description": "Registry key path to inspect."},
            },
            "required": ["hive_path", "key_path"],
        },
        executor=lambda hive_path, key_path: reg.parse_registry_key(hive_path, key_path),
        module="registry",
        tags=["registry", "hive"],
    )
    TOOL_CATALOG["list_installed_software"] = ToolDefinition(
        name="list_installed_software",
        description="List installed software from a SOFTWARE registry hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SOFTWARE hive."),
        executor=_make_auto_record(
            "list_installed_software", "registry",
            lambda hive_path: reg.list_installed_software(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "software", "installed"],
    )
    TOOL_CATALOG["get_user_activity"] = ToolDefinition(
        name="get_user_activity",
        description="Extract user activity from NTUSER.DAT (recent docs, typed URLs, run dialog history).",
        input_schema=_string_arg_schema("hive_path", "Path to NTUSER.DAT."),
        executor=lambda hive_path: reg.get_user_activity(hive_path),
        module="registry",
        tags=["registry", "user", "activity"],
    )
    TOOL_CATALOG["search_registry"] = ToolDefinition(
        name="search_registry",
        description="Search for a pattern in registry key names and values.",
        input_schema={
            "type": "object",
            "properties": {
                "hive_path": {"type": "string", "description": "Path to hive file."},
                "pattern": {"type": "string", "description": "Search pattern."},
            },
            "required": ["hive_path", "pattern"],
        },
        executor=lambda hive_path, pattern: reg.search_registry(hive_path, pattern),
        module="registry",
        tags=["registry", "search"],
    )

    # ---- Registry tools (auto-record: results are forensic facts) ----
    TOOL_CATALOG["get_system_info"] = ToolDefinition(
        name="get_system_info",
        description="Extract OS version, install date, and registered owner from a SOFTWARE hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SOFTWARE hive."),
        executor=_make_auto_record(
            "get_system_info", "registry",
            lambda hive_path: reg.get_system_info(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "system"],
    )
    TOOL_CATALOG["get_timezone_info"] = ToolDefinition(
        name="get_timezone_info",
        description="Extract timezone settings from a SYSTEM hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SYSTEM hive."),
        executor=_make_auto_record(
            "get_timezone_info", "registry",
            lambda hive_path: reg.get_timezone_info(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "timezone", "system"],
    )
    TOOL_CATALOG["get_computer_name"] = ToolDefinition(
        name="get_computer_name",
        description="Extract computer/host name from a SYSTEM hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SYSTEM hive."),
        executor=_make_auto_record(
            "get_computer_name", "registry",
            lambda hive_path: reg.get_computer_name(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "system", "hostname"],
    )
    TOOL_CATALOG["get_shutdown_time"] = ToolDefinition(
        name="get_shutdown_time",
        description="Extract last shutdown time from a SYSTEM hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SYSTEM hive."),
        executor=_make_auto_record(
            "get_shutdown_time", "registry",
            lambda hive_path: reg.get_shutdown_time(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "system", "shutdown"],
    )
    TOOL_CATALOG["enumerate_users"] = ToolDefinition(
        name="enumerate_users",
        description="List all user accounts and RIDs from a SAM hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SAM hive."),
        executor=_make_auto_record(
            "enumerate_users", "registry",
            lambda hive_path: reg.enumerate_users(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "user", "accounts", "sam"],
    )
    TOOL_CATALOG["get_network_interfaces"] = ToolDefinition(
        name="get_network_interfaces",
        description="Extract network adapter and TCP/IP config from a SYSTEM hive.",
        input_schema=_string_arg_schema("hive_path", "Path to SYSTEM hive."),
        executor=_make_auto_record(
            "get_network_interfaces", "registry",
            lambda hive_path: reg.get_network_interfaces(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "network", "adapter", "ip"],
    )
    TOOL_CATALOG["get_email_config"] = ToolDefinition(
        name="get_email_config",
        description="Extract email account configuration (SMTP, POP3, NNTP) from NTUSER.DAT.",
        input_schema=_string_arg_schema("hive_path", "Path to NTUSER.DAT."),
        executor=_make_auto_record(
            "get_email_config", "registry",
            lambda hive_path: reg.get_email_config(hive_path), graph,
        ),
        module="registry",
        tags=["registry", "email", "account"],
    )

    # ---- Parser tools ----
    TOOL_CATALOG["parse_prefetch"] = ToolDefinition(
        name="parse_prefetch",
        description="Parse a Windows Prefetch (.pf) file to extract executable name, last execution time, and run count.",
        input_schema=_string_arg_schema("file_path", "Path to extracted .pf file."),
        executor=_make_auto_record(
            "parse_prefetch", "filesystem",
            lambda file_path: parsers.parse_prefetch(file_path), graph,
        ),
        module="parsers",
        tags=["filesystem", "prefetch", "execution"],
    )
    TOOL_CATALOG["read_text_file"] = ToolDefinition(
        name="read_text_file",
        description="Read an extracted text file (configs, logs, chat logs, etc.).",
        input_schema=_string_arg_schema("file_path", "Local path to the file."),
        executor=lambda file_path: parsers.read_text_file(file_path),
        module="parsers",
        tags=["text", "read"],
    )
    TOOL_CATALOG["read_binary_preview"] = ToolDefinition(
        name="read_binary_preview",
        description="Preview a binary file in hex+ASCII format.",
        input_schema=_string_arg_schema("file_path", "Local path to the file."),
        executor=lambda file_path: parsers.read_binary_preview(file_path),
        module="parsers",
        tags=["binary", "hex", "preview"],
    )
    TOOL_CATALOG["search_text_file"] = ToolDefinition(
        name="search_text_file",
        description="Search for a regex pattern in an extracted text file. Returns matching lines with line numbers.",
        input_schema={
            "type": "object",
            "properties": {
                "file_path": {"type": "string", "description": "Path to extracted file."},
                "pattern": {"type": "string", "description": "Regex pattern."},
            },
            "required": ["file_path", "pattern"],
        },
        executor=lambda file_path, pattern: parsers.search_text_file(file_path, pattern),
        module="parsers",
        tags=["text", "search", "regex"],
    )
    TOOL_CATALOG["read_text_file_section"] = ToolDefinition(
        name="read_text_file_section",
        description="Read a section of a large text file starting at a byte offset.",
        input_schema={
            "type": "object",
            "properties": {
                "file_path": {"type": "string", "description": "Path to file."},
                "start": {"type": "integer", "description": "Byte offset to start reading."},
                "max_bytes": {"type": "integer", "description": "Maximum bytes to read."},
            },
            "required": ["file_path"],
        },
        executor=lambda file_path, start=0, max_bytes=8000: parsers.read_text_file_section(
            file_path, start, max_bytes
        ),
        module="parsers",
        tags=["text", "read", "section"],
    )
    TOOL_CATALOG["list_extracted_dir"] = ToolDefinition(
        name="list_extracted_dir",
        description="List files in an extracted directory with sizes.",
        input_schema=_string_arg_schema("dir_path", "Directory path."),
        executor=lambda dir_path: parsers.list_extracted_dir(dir_path),
        module="parsers",
        tags=["filesystem", "listing", "extracted"],
    )
    TOOL_CATALOG["parse_pcap_strings"] = ToolDefinition(
        name="parse_pcap_strings",
        description="Extract HTTP headers, hosts, User-Agent, cookies, and URLs from a PCAP/capture file.",
        input_schema=_string_arg_schema("file_path", "Path to PCAP file."),
        executor=lambda file_path: parsers.parse_pcap_strings(file_path),
        module="parsers",
        tags=["network", "pcap", "http", "capture"],
    )

    # ---- Apply result caching to deterministic read-only tools ----
    # Must come AFTER all tools are registered. Auto-record wrapped tools
    # (e.g. get_system_info) are NOT in CACHEABLE_TOOLS since they write
    # to the evidence graph as a side effect.
    _tool_result_cache.clear()
    for tool_name, td in TOOL_CATALOG.items():
        if tool_name in CACHEABLE_TOOLS:
            td.executor = _make_cached(tool_name, td.executor)