From 097d2ce4726b6bb4a8a7908cad060cfef1031e05 Mon Sep 17 00:00:00 2001 From: BattleTag Date: Sat, 9 May 2026 17:36:26 +0800 Subject: [PATCH] Initial commit Co-Authored-By: Claude Opus 4.7 (1M context) --- .python-version | 1 + README.md | 168 +++++++++ agent_factory.py | 280 ++++++++++++++ agents/__init__.py | 0 agents/communication.py | 33 ++ agents/filesystem.py | 34 ++ agents/hypothesis.py | 130 +++++++ agents/network.py | 34 ++ agents/registry.py | 36 ++ agents/report.py | 191 ++++++++++ agents/timeline.py | 88 +++++ base_agent.py | 448 ++++++++++++++++++++++ evidence_graph.py | 799 ++++++++++++++++++++++++++++++++++++++++ llm_client.py | 619 +++++++++++++++++++++++++++++++ log_config.py | 243 ++++++++++++ main.py | 272 ++++++++++++++ orchestrator.py | 702 +++++++++++++++++++++++++++++++++++ pyproject.toml | 23 ++ regenerate_report.py | 63 ++++ tool_registry.py | 615 +++++++++++++++++++++++++++++++ tools/__init__.py | 0 tools/parsers.py | 234 ++++++++++++ tools/registry.py | 449 ++++++++++++++++++++++ tools/sleuthkit.py | 229 ++++++++++++ uv.lock | 253 +++++++++++++ 25 files changed, 5944 insertions(+) create mode 100644 .python-version create mode 100644 README.md create mode 100644 agent_factory.py create mode 100644 agents/__init__.py create mode 100644 agents/communication.py create mode 100644 agents/filesystem.py create mode 100644 agents/hypothesis.py create mode 100644 agents/network.py create mode 100644 agents/registry.py create mode 100644 agents/report.py create mode 100644 agents/timeline.py create mode 100644 base_agent.py create mode 100644 evidence_graph.py create mode 100644 llm_client.py create mode 100644 log_config.py create mode 100644 main.py create mode 100644 orchestrator.py create mode 100644 pyproject.toml create mode 100644 regenerate_report.py create mode 100644 tool_registry.py create mode 100644 tools/__init__.py create mode 100644 tools/parsers.py create mode 100644 tools/registry.py create mode 100644 tools/sleuthkit.py create 
mode 100644 uv.lock diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..6324d40 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.14 diff --git a/README.md b/README.md new file mode 100644 index 0000000..6768bfa --- /dev/null +++ b/README.md @@ -0,0 +1,168 @@ +# MASForensics + +Multi-Agent System for Digital Forensics — 基于大语言模型的多智能体电子取证系统。 + +系统通过 6 个专业化 Agent 协同工作,对磁盘镜像进行自动化取证分析,最终生成结构化的取证报告。 + +## 架构 + +``` +main.py 入口:配置加载、恢复检测、运行管理 + │ + ├── Orchestrator 四阶段流水线调度 + │ │ + │ ├── FileSystemAgent 磁盘结构、文件系统、删除文件、Prefetch + │ ├── RegistryAgent 注册表分析(系统/用户/网络/软件) + │ ├── CommunicationAgent 邮件、IRC 聊天记录 + │ ├── NetworkAgent 浏览器历史、PCAP 抓包 + │ ├── TimelineAgent 跨类别时间线关联 + │ └── ReportAgent 综合报告生成 + │ + ├── Blackboard 共享知识库(Evidence + Lead) + └── LLMClient Claude API 调用(ReAct 模式) +``` + +Agent 之间不直接通信,通过 **Blackboard(黑板)** 共享发现(Evidence)和线索(Lead)。 + +## 调查流程 + +| 阶段 | 说明 | +|------|------| +| **Phase 1** | FileSystemAgent 勘查磁盘镜像,识别分区、目录结构、关键文件,产出初始 Lead | +| **Phase 2** | 多轮线索追踪 — Lead 按 Agent 类型分组并行派发,最多 10 轮迭代 | +| **Phase 2.5** | 覆盖率缺口分析 — 对照 config.yaml 中的 10 个调查领域,自动补漏 | +| **Phase 3** | TimelineAgent 综合所有 evidence 建立事件时间线 | +| **Phase 4** | ReportAgent 生成 Markdown 格式取证报告 | + +## 取证工具链 + +### Sleuth Kit(磁盘取证) + +通过异步子进程调用 TSK 命令行工具: + +| 工具 | 用途 | +|------|------| +| `mmls` | 分区表分析 | +| `fsstat` | 文件系统元数据 | +| `fls` | 目录列举(含已删除文件) | +| `icat` | 按 inode 提取文件 | +| `srch_strings` | 磁盘字符串搜索 | +| `fls -m` | MAC 时间线生成 | + +### regipy(注册表解析) + +直接解析 Windows 注册表 hive 二进制文件(SYSTEM、SOFTWARE、SAM、NTUSER.DAT),提取系统信息、用户账户、网络配置、已安装软件、邮件账户、关机时间等。 + +### 文件解析器 + +- **Prefetch** — 二进制解析 Windows XP .pf 文件(运行次数、最后执行时间) +- **PCAP** — 从抓包文件提取 HTTP 请求、Host、Cookie、User-Agent +- **通用文本/二进制** — 按偏移读取、正则搜索、Hex dump + +## 断连恢复与数据归档 + +系统设计了三层防护,应对长时间运行中的网络中断: + +1. **Blackboard 自动持久化** — 每次 add_evidence / add_lead 自动写盘(原子写入) +2. **Agent 级容错** — 单个 Agent 失败标记 Lead 为 failed,不影响其他 Agent,自动重试一次 +3. 
**优雅退出** — 连续 3 次 Agent 失败后保存现有成果并干净退出 + +每次运行自动创建带时间戳的归档目录: + +``` +runs/ + 2026-04-02T14-30-00/ + config.yaml 配置快照 + blackboard_state.json 实时状态(用于恢复) + evidence.json 结构化证据导出 + leads.json 线索及最终状态 + report.md 取证报告 + run_metadata.json 运行元数据(时长、统计、错误) + masforensics.log 运行日志 +``` + +中断后再次运行 `python main.py`,系统自动检测未完成的运行并提示恢复。 + +## 快速开始 + +### 环境要求 + +- Python >= 3.14 +- The Sleuth Kit(系统安装,提供 `mmls`、`fls`、`icat` 等命令) +- 磁盘镜像文件置于 `image/` 目录 + +### 安装 + +```bash +uv sync +``` + +### 配置 + +编辑 `config.yaml`,填入 LLM API 地址和密钥: + +```yaml +agent: + base_url: "https://your-api-proxy.com" + api_key: "sk-your-key" + model: "claude-sonnet-4-6" + max_tokens: 4096 +``` + +`investigation_areas` 部分定义了必须覆盖的调查领域,可按需增减。 + +### 运行 + +```bash +python main.py +``` + +报告和所有结构化数据将保存在 `runs/<timestamp>/` 目录下。 + +## 项目结构 + +``` +MASForensics/ +├── main.py 入口 +├── orchestrator.py 流水线调度 +├── evidence_graph.py 共享知识库 +├── llm_client.py LLM API 客户端 +├── base_agent.py Agent 基类 +├── config.yaml 配置文件 +├── agents/ +│ ├── filesystem.py 文件系统 Agent +│ ├── registry.py 注册表 Agent +│ ├── communication.py 通信 Agent +│ ├── network.py 网络 Agent +│ ├── timeline.py 时间线 Agent +│ └── report.py 报告 Agent +├── tools/ +│ ├── sleuthkit.py Sleuth Kit 封装 +│ ├── registry.py 注册表解析(regipy) +│ └── parsers.py 文件格式解析器 +├── image/ 磁盘镜像 +├── extracted/ 提取的文件(运行时生成) +└── runs/ 运行归档 +``` + +## 依赖 + +| 包 | 用途 | +|----|------| +| `httpx[socks]` | 异步 HTTP 客户端(支持 SOCKS 代理) | +| `pyyaml` | 配置文件解析 | +| `regipy` | Windows 注册表 hive 解析 | + +## 当前案例 + +默认配置分析 **CFReDS Hacking Case**(NIST 标准取证教学镜像): + +- 镜像:SCHARDT.001(~4.6GB,IBM 硬盘,8 个分段) +- 系统:Windows XP +- 场景:涉嫌黑客入侵的计算机取证分析 + +## 测试 + +```bash +python -m pytest tests/ -v +``` diff --git a/agent_factory.py b/agent_factory.py new file mode 100644 index 0000000..e55a680 --- /dev/null +++ b/agent_factory.py @@ -0,0 +1,280 @@ +"""Agent Factory — composes agents from tool registry and role templates. + +Provides both pre-defined agent templates (filesystem, registry, etc.)
+and LLM-driven dynamic agent composition for capability gaps. +""" + +from __future__ import annotations + +import json +import logging +from dataclasses import dataclass, field + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG, ToolDefinition + +# Agent classes with custom tools — keyed by template name +_AGENT_CLASSES: dict[str, type] = {} + + +def _load_agent_classes() -> None: + """Lazy-import custom agent classes to avoid circular imports.""" + if _AGENT_CLASSES: + return + from agents.hypothesis import HypothesisAgent + from agents.report import ReportAgent + _AGENT_CLASSES["hypothesis"] = HypothesisAgent + _AGENT_CLASSES["report"] = ReportAgent + +logger = logging.getLogger(__name__) + + +@dataclass +class RoleTemplate: + """Pre-defined agent archetype.""" + + name: str + role: str + default_tools: list[str] # tool names from TOOL_CATALOG + tags: list[str] = field(default_factory=list) + + +# Pre-defined templates matching the original 6 agents + hypothesis agent. +ROLE_TEMPLATES: dict[str, RoleTemplate] = { + "filesystem": RoleTemplate( + name="filesystem", + role=( + "File system forensic analyst. You examine disk image partition layouts, " + "directory structures, file metadata, and recover deleted files. " + "You identify suspicious files, installed programs, and user data locations. " + "You also handle Recycle Bin forensics and Prefetch execution evidence." + ), + default_tools=[ + "partition_info", "filesystem_info", "list_directory", + "extract_file", "find_file", "search_strings", + "parse_prefetch", "count_deleted_files", + "read_text_file", "search_text_file", "read_binary_preview", + ], + tags=["filesystem", "disk", "files", "deleted", "prefetch"], + ), + "registry": RoleTemplate( + name="registry", + role=( + "Windows registry forensic analyst. 
You parse registry hive files " + "(SYSTEM, SOFTWARE, SAM, NTUSER.DAT) to extract system configuration, " + "user accounts, installed software, network settings, email accounts, " + "and other Windows artifacts." + ), + default_tools=[ + "extract_file", "list_directory", + "parse_registry_key", "list_installed_software", + "get_user_activity", "search_registry", + "get_system_info", "get_timezone_info", "get_computer_name", + "get_shutdown_time", "enumerate_users", + "get_network_interfaces", "get_email_config", + ], + tags=["registry", "windows", "system", "user", "software"], + ), + "communication": RoleTemplate( + name="communication", + role=( + "Communication forensic analyst. You analyze email files (.dbx, .pst), " + "IRC/mIRC chat logs, newsgroup data, and other messaging artifacts " + "to identify communication patterns and contacts." + ), + default_tools=[ + "list_directory", "extract_file", + "read_text_file", "read_binary_preview", + "list_extracted_dir", "search_strings", + "search_text_file", "read_text_file_section", + ], + tags=["email", "chat", "irc", "messaging", "communication"], + ), + "network": RoleTemplate( + name="network", + role=( + "Network forensic analyst. You analyze browser history, cookies, " + "network captures (PCAP), wireless artifacts, and other network-related " + "evidence to reconstruct online activities." + ), + default_tools=[ + "list_directory", "extract_file", + "read_text_file", "read_binary_preview", + "list_extracted_dir", "search_strings", + "search_text_file", "read_text_file_section", + "parse_pcap_strings", + ], + tags=["network", "browser", "pcap", "http", "internet"], + ), + "timeline": RoleTemplate( + name="timeline", + role=( + "Timeline correlation analyst. You build chronological timelines " + "by combining filesystem MAC times with evidence from other agents. " + "You identify temporal patterns and correlate events across categories." 
+ ), + default_tools=[ + "build_filesystem_timeline", + ], + tags=["timeline", "correlation", "temporal"], + ), + "report": RoleTemplate( + name="report", + role=( + "Forensic report writer. You synthesize all evidence and hypotheses " + "into a comprehensive forensic analysis report with executive summary, " + "detailed findings organized by hypothesis, timeline of events, and conclusions." + ), + default_tools=[], # Report agent uses only graph query tools + tags=["report", "summary", "writing"], + ), + "hypothesis": RoleTemplate( + name="hypothesis", + role=( + "Hypothesis analyst. You review all phenomena discovered so far " + "and formulate investigative hypotheses about what happened on the system. " + "For each hypothesis, identify which existing phenomena support or contradict it." + ), + default_tools=[], # Uses only graph query + hypothesis tools + tags=["hypothesis", "analysis", "reasoning"], + ), +} + + +class AgentFactory: + """Creates agents from templates or dynamically via LLM composition.""" + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + self.llm = llm + self.graph = graph + self._cache: dict[str, BaseAgent] = {} + + def get_or_create_agent(self, agent_type: str) -> BaseAgent | None: + """Get a cached agent or create one from a template.""" + if agent_type in self._cache: + return self._cache[agent_type] + + template = ROLE_TEMPLATES.get(agent_type) + if template is None: + logger.warning("No template for agent type: %s", agent_type) + return None + + # Use custom agent class if one exists, otherwise BaseAgent + _load_agent_classes() + agent_cls = _AGENT_CLASSES.get(agent_type) + if agent_cls is not None: + agent = agent_cls(self.llm, self.graph) + else: + agent = self._instantiate_from_template(template) + self._cache[agent_type] = agent + return agent + + def _instantiate_from_template(self, template: RoleTemplate) -> BaseAgent: + """Create a BaseAgent from a role template, registering tools from the catalog.""" + agent = 
BaseAgent(self.llm, self.graph) + agent.name = template.name + agent.role = template.role + + for tool_name in template.default_tools: + td = TOOL_CATALOG.get(tool_name) + if td is None: + logger.warning("Tool '%s' not in catalog (template: %s)", tool_name, template.name) + continue + agent.register_tool(td.name, td.description, td.input_schema, td.executor) + + return agent + + async def create_specialized_agent( + self, + hypothesis_title: str, + hypothesis_desc: str, + capability_gap: str, + ) -> BaseAgent: + """Use LLM to compose a new agent for a specific investigative need. + + 1. LLM sees available tools + the capability gap + 2. LLM selects tools and writes role + strategy + 3. Factory instantiates BaseAgent with chosen tools + """ + available = "\n".join( + f"- {td.name}: {td.description} [tags: {', '.join(td.tags)}]" + for td in TOOL_CATALOG.values() + ) + + prompt = ( + f"You are designing a specialized forensic investigation agent.\n\n" + f"Hypothesis to investigate: {hypothesis_title}\n" + f"Details: {hypothesis_desc}\n" + f"Capability gap: {capability_gap}\n\n" + f"Available tools:\n{available}\n\n" + f"Select 3-8 tools from the list above that would be most useful.\n" + f"Write a role description (2-3 sentences) for this agent.\n" + f"Write an investigation strategy (3-5 numbered steps).\n\n" + f"Respond ONLY with JSON (no markdown):\n" + f'{{"agent_name": "short_name", "role": "...", "tools": ["tool1", "tool2"], "strategy": "1. ...\\n2. 
..."}}' + ) + + response = await self.llm.chat( + messages=[{"role": "user", "content": prompt}], + ) + + # Parse response — try to extract JSON + try: + config = json.loads(response) + except json.JSONDecodeError: + # Try to find JSON in the response + import re + match = re.search(r'\{.*\}', response, re.DOTALL) + if match: + config = json.loads(match.group()) + else: + logger.error("Failed to parse agent composition response: %s", response[:300]) + # Fallback: create a generic agent with all tools + return self._create_fallback_agent(capability_gap) + + agent_name = config.get("agent_name", "specialized") + role_text = config.get("role", f"Specialized agent for: {capability_gap}") + strategy = config.get("strategy", "") + tool_names = config.get("tools", []) + + # Validate tool names against catalog + valid_tools = [t for t in tool_names if t in TOOL_CATALOG] + if not valid_tools: + logger.warning("No valid tools selected by LLM, using fallback") + return self._create_fallback_agent(capability_gap) + + # Build agent + agent = BaseAgent(self.llm, self.graph) + agent.name = agent_name + agent.role = f"{role_text}\n\nInvestigation Strategy:\n{strategy}" + + for tool_name in valid_tools: + td = TOOL_CATALOG[tool_name] + agent.register_tool(td.name, td.description, td.input_schema, td.executor) + + self._cache[agent_name] = agent + logger.info( + "Dynamic agent created: '%s' with %d tools: %s", + agent_name, len(valid_tools), valid_tools, + ) + return agent + + def _create_fallback_agent(self, gap_description: str) -> BaseAgent: + """Create a generic agent with common tools as fallback.""" + agent = BaseAgent(self.llm, self.graph) + agent.name = "generic_investigator" + agent.role = f"General forensic investigator. 
Focus: {gap_description}" + + fallback_tools = [ + "list_directory", "extract_file", "read_text_file", + "read_binary_preview", "search_strings", "search_text_file", + ] + for tool_name in fallback_tools: + td = TOOL_CATALOG.get(tool_name) + if td: + agent.register_tool(td.name, td.description, td.input_schema, td.executor) + + self._cache["generic_investigator"] = agent + return agent diff --git a/agents/__init__.py b/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/agents/communication.py b/agents/communication.py new file mode 100644 index 0000000..7399e9f --- /dev/null +++ b/agents/communication.py @@ -0,0 +1,33 @@ +"""Communication Agent — analyzes email, chat logs, and messaging artifacts.""" + +from __future__ import annotations + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + + +class CommunicationAgent(BaseAgent): + name = "communication" + role = ( + "Communication forensic analyst. You analyze email files (.dbx, .pst), " + "IRC/mIRC chat logs, newsgroup data, and other messaging artifacts " + "to identify communication patterns, contacts, and content." 
+ ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_tools() + + def _register_tools(self) -> None: + tool_names = [ + "list_directory", "extract_file", + "read_text_file", "read_binary_preview", + "list_extracted_dir", "search_strings", + "search_text_file", "read_text_file_section", + ] + for name in tool_names: + td = TOOL_CATALOG.get(name) + if td: + self.register_tool(td.name, td.description, td.input_schema, td.executor) diff --git a/agents/filesystem.py b/agents/filesystem.py new file mode 100644 index 0000000..5678f22 --- /dev/null +++ b/agents/filesystem.py @@ -0,0 +1,34 @@ +"""FileSystem Agent — analyzes disk structure, files, and deleted data.""" + +from __future__ import annotations + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + + +class FileSystemAgent(BaseAgent): + name = "filesystem" + role = ( + "File system forensic analyst. You examine disk image partition layouts, " + "directory structures, file metadata, and recover deleted files. " + "You identify suspicious files, installed programs, and user data locations. " + "You also handle malware analysis, Recycle Bin forensics, and Prefetch execution evidence." 
+ ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_tools() + + def _register_tools(self) -> None: + tool_names = [ + "partition_info", "filesystem_info", "list_directory", + "extract_file", "find_file", "search_strings", + "parse_prefetch", "count_deleted_files", + "read_text_file", "search_text_file", "read_binary_preview", + ] + for name in tool_names: + td = TOOL_CATALOG.get(name) + if td: + self.register_tool(td.name, td.description, td.input_schema, td.executor) diff --git a/agents/hypothesis.py b/agents/hypothesis.py new file mode 100644 index 0000000..4e25f89 --- /dev/null +++ b/agents/hypothesis.py @@ -0,0 +1,130 @@ +"""Hypothesis Agent — analyzes phenomena and generates investigative hypotheses.""" + +from __future__ import annotations + +import json +import logging + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph, HYPOTHESIS_EDGE_WEIGHTS +from llm_client import LLMClient + +logger = logging.getLogger(__name__) + + +class HypothesisAgent(BaseAgent): + name = "hypothesis" + role = ( + "Hypothesis analyst. You review all phenomena discovered so far " + "and formulate investigative hypotheses about what happened on this system. " + "Your ultimate goal: build the most complete picture of events that occurred. " + "For each hypothesis, identify which existing phenomena support or contradict it." + ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_hypothesis_tools() + + def _register_hypothesis_tools(self) -> None: + """Register hypothesis-specific tools.""" + + valid_edge_types = list(HYPOTHESIS_EDGE_WEIGHTS.keys()) + + self.register_tool( + name="add_hypothesis", + description=( + "Create a new investigative hypothesis about what happened on the system. " + "Each hypothesis should be a specific, testable claim." 
+ ), + input_schema={ + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Short title for the hypothesis.", + }, + "description": { + "type": "string", + "description": "Detailed description of what this hypothesis claims.", + }, + }, + "required": ["title", "description"], + }, + executor=self._add_hypothesis, + ) + + self.register_tool( + name="link_phenomenon_to_hypothesis", + description=( + "Link an existing phenomenon to a hypothesis with a relationship type. " + f"Valid relationship types: {', '.join(valid_edge_types)}. " + "direct_evidence = the phenomenon IS the hypothesis. " + "supports = consistent with the hypothesis. " + "prerequisite_met = a necessary condition is satisfied. " + "consequence_observed = an expected result of the hypothesis is found. " + "contradicts = directly contradicts the hypothesis. " + "weakens = makes the hypothesis less likely." + ), + input_schema={ + "type": "object", + "properties": { + "phenomenon_id": { + "type": "string", + "description": "ID of the phenomenon (e.g. 'ph-a1b2c3d4').", + }, + "hypothesis_id": { + "type": "string", + "description": "ID of the hypothesis (e.g. 
'hyp-e5f6g7h8').", + }, + "edge_type": { + "type": "string", + "enum": valid_edge_types, + "description": "The edge_type of the relationship.", + }, + "reason": { + "type": "string", + "description": "The reason this relationship holds (1-2 sentences).", + }, + }, + "required": ["phenomenon_id", "hypothesis_id", "edge_type", "reason"], + }, + executor=self._link_phenomenon_to_hypothesis, + ) + + async def _add_hypothesis(self, title: str, description: str) -> str: + hid = await self.graph.add_hypothesis( + title=title, + description=description, + created_by=self.name, + ) + return f"Hypothesis created: {hid} — {title} (confidence: 0.50)" + + async def _link_phenomenon_to_hypothesis( + self, + phenomenon_id: str, + hypothesis_id: str, + edge_type: str = "", + reason: str = "", + # Common LLM misnaming — accept as fallbacks + relationship: str = "", + note: str = "", + ) -> str: + edge_type = edge_type or relationship + reason = reason or note + if not edge_type: + return "Error: edge_type is required." 
+ try: + new_conf = await self.graph.update_hypothesis_confidence( + hyp_id=hypothesis_id, + phenomenon_id=phenomenon_id, + edge_type=edge_type, + reason=reason, + ) + weight = HYPOTHESIS_EDGE_WEIGHTS[edge_type] + direction = "+" if weight > 0 else "" + return ( + f"Linked: {phenomenon_id} —[{edge_type}]→ {hypothesis_id} " + f"(weight: {direction}{weight}, new confidence: {new_conf:.3f})" + ) + except ValueError as e: + return f"Error linking: {e}" diff --git a/agents/network.py b/agents/network.py new file mode 100644 index 0000000..d1d4fb2 --- /dev/null +++ b/agents/network.py @@ -0,0 +1,34 @@ +"""Network Agent — analyzes browser history, network tool artifacts, and wireless evidence.""" + +from __future__ import annotations + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + + +class NetworkAgent(BaseAgent): + name = "network" + role = ( + "Network forensic analyst. You analyze browser history, cookies, " + "network captures (PCAP), wireless artifacts, and other network-related " + "evidence to reconstruct online activities and network attacks." 
+ ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_tools() + + def _register_tools(self) -> None: + tool_names = [ + "list_directory", "extract_file", + "read_text_file", "read_binary_preview", + "list_extracted_dir", "search_strings", + "search_text_file", "read_text_file_section", + "parse_pcap_strings", + ] + for name in tool_names: + td = TOOL_CATALOG.get(name) + if td: + self.register_tool(td.name, td.description, td.input_schema, td.executor) diff --git a/agents/registry.py b/agents/registry.py new file mode 100644 index 0000000..9ee62d0 --- /dev/null +++ b/agents/registry.py @@ -0,0 +1,36 @@ +"""Registry Agent — analyzes Windows registry hives.""" + +from __future__ import annotations + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + + +class RegistryAgent(BaseAgent): + name = "registry" + role = ( + "Windows registry forensic analyst. You parse registry hive files " + "(SYSTEM, SOFTWARE, SAM, NTUSER.DAT) to extract system configuration, " + "user accounts, installed software, network settings, email accounts, " + "and other Windows artifacts." 
+ ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_tools() + + def _register_tools(self) -> None: + tool_names = [ + "extract_file", "list_directory", + "parse_registry_key", "list_installed_software", + "get_user_activity", "search_registry", + "get_system_info", "get_timezone_info", "get_computer_name", + "get_shutdown_time", "enumerate_users", + "get_network_interfaces", "get_email_config", + ] + for name in tool_names: + td = TOOL_CATALOG.get(name) + if td: + self.register_tool(td.name, td.description, td.input_schema, td.executor) diff --git a/agents/report.py b/agents/report.py new file mode 100644 index 0000000..552d070 --- /dev/null +++ b/agents/report.py @@ -0,0 +1,191 @@ +"""Report Agent — generates forensic analysis reports.""" + +from __future__ import annotations + +import json +import os + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient + + +class ReportAgent(BaseAgent): + name = "report" + role = ( + "Forensic report writer. You synthesize all findings from the investigation " + "into a structured, professional forensic analysis report organized by hypotheses.\n\n" + "IMPORTANT: Only include findings that have a source_tool attribution (marked VERIFIED). " + "If evidence lacks source attribution, mark it as UNVERIFIED. " + "Do NOT invent or fabricate any data, timestamps, or findings not present in the evidence.\n\n" + "CRITICAL: You MUST call save_report to write the final report." + ) + + def __init__(self, llm: LLMClient, graph: EvidenceGraph) -> None: + super().__init__(llm, graph) + self._register_tools() + + def _build_system_prompt(self, task: str) -> str: + """Report agent gets a clean prompt — no Phase A/B/C/D workflow.""" + return ( + f"You are a forensic report writer.\n" + f"Role: {self.role}\n\n" + f"Investigation state:\n{self.graph.stats_summary()}\n\n" + f"Your task: {task}\n\n" + f"WORKFLOW:\n" + f"1. 
Call get_hypotheses_with_evidence to get all hypotheses and their linked evidence\n" + f"2. Call get_all_phenomena to get detailed findings by category\n" + f"3. Call get_entities to get people, programs, and hosts\n" + f"4. Call get_case_info for case metadata\n" + f"5. Write the complete report directly in your block\n\n" + f"RULES:\n" + f"- Write the report DIRECTLY in — do NOT use save_report tool\n" + f"- Only include findings present in the evidence graph\n" + f"- Do NOT invent timestamps, file paths, or data not in the phenomena\n" + f"- The report must be complete — do not cut off mid-section\n" + ) + + def _register_tools(self) -> None: + self.register_tool( + name="get_all_phenomena", + description="Get all phenomena across all categories with full details.", + input_schema={"type": "object", "properties": {}}, + executor=self._get_all_phenomena, + ) + + self.register_tool( + name="get_hypotheses_with_evidence", + description="Get all hypotheses with their linked phenomena (supporting and contradicting).", + input_schema={"type": "object", "properties": {}}, + executor=self._get_hypotheses_with_evidence, + ) + + self.register_tool( + name="get_case_info", + description="Get case metadata (image info, drive details, etc.).", + input_schema={"type": "object", "properties": {}}, + executor=self._get_case_info, + ) + + self.register_tool( + name="get_entities", + description="Get all entities (people, programs, hosts) and their connections.", + input_schema={"type": "object", "properties": {}}, + executor=self._get_entities, + ) + + self.register_tool( + name="save_report", + description="Save the final report to a file.", + input_schema={ + "type": "object", + "properties": { + "content": {"type": "string", "description": "Report content in Markdown."}, + "output_path": {"type": "string", "description": "File path to save the report."}, + }, + "required": ["content", "output_path"], + }, + executor=self._save_report, + ) + + self.register_tool( + 
name="verify_phenomena", + description="Check phenomena provenance — VERIFIED (has source_tool) vs UNVERIFIED.", + input_schema={"type": "object", "properties": {}}, + executor=self._verify_phenomena, + ) + + async def _get_all_phenomena(self) -> str: + phenomena = self.graph.phenomena + if not phenomena: + return "No phenomena in the evidence graph." + + categories = sorted(set(ph.category for ph in phenomena.values())) + lines = [f"=== All Phenomena ({len(phenomena)} entries) ==="] + for cat in categories: + items = [ph for ph in phenomena.values() if ph.category == cat] + lines.append(f"\n--- {cat.upper()} ({len(items)} entries) ---") + for ph in items: + verified = "VERIFIED" if ph.source_tool else "UNVERIFIED" + lines.append(f"\n[{verified}] {ph.title} ({ph.id})") + lines.append(f" Source: {ph.source_agent} | Tool: {ph.source_tool or 'N/A'}") + if ph.timestamp: + lines.append(f" Timestamp: {ph.timestamp}") + lines.append(f" {ph.description[:500]}") + return "\n".join(lines) + + async def _get_hypotheses_with_evidence(self) -> str: + if not self.graph.hypotheses: + return "No hypotheses defined." 
+ + lines = [f"=== Hypotheses ({len(self.graph.hypotheses)}) ==="] + for hyp in self.graph.hypotheses.values(): + lines.append(f"\n### {hyp.title}") + lines.append(f"Confidence: {hyp.confidence:.2f} | Status: {hyp.status}") + lines.append(f"Description: {hyp.description}") + + related = self.graph.get_related(hyp.id, direction="in") + supporting = [r for r in related if r["edge_type"] in ("direct_evidence", "supports", "prerequisite_met", "consequence_observed")] + contradicting = [r for r in related if r["edge_type"] in ("contradicts", "weakens")] + + if supporting: + lines.append(f"\nSupporting evidence ({len(supporting)}):") + for r in supporting: + lines.append(f" [{r['edge_type']}] {r['node']}") + if contradicting: + lines.append(f"\nContradicting evidence ({len(contradicting)}):") + for r in contradicting: + lines.append(f" [{r['edge_type']}] {r['node']}") + if not supporting and not contradicting: + lines.append(" (no evidence linked)") + return "\n".join(lines) + + async def _get_case_info(self) -> str: + info = self.graph.case_info + lines = ["=== Case Information ==="] + for k, v in info.items(): + lines.append(f" {k}: {v}") + lines.append(f" Image path: {self.graph.image_path}") + lines.append(f" Partition offset: {self.graph.partition_offset}") + return "\n".join(lines) + + async def _get_entities(self) -> str: + if not self.graph.entities: + return "No entities recorded." 
+ + lines = [f"=== Entities ({len(self.graph.entities)}) ==="] + for ent in self.graph.entities.values(): + lines.append(f"\n{ent.name} ({ent.entity_type})") + if ent.description: + lines.append(f" {ent.description}") + related = self.graph.get_related(ent.id, direction="in") + if related: + for r in related: + lines.append(f" ← [{r['edge_type']}] {r['node']}") + return "\n".join(lines) + + async def _verify_phenomena(self) -> str: + verified = [] + unverified = [] + for ph in self.graph.phenomena.values(): + entry = f" [{ph.category}] {ph.title} (agent: {ph.source_agent}, tool: {ph.source_tool or 'N/A'})" + if ph.source_tool: + verified.append(entry) + else: + unverified.append(entry) + + lines = ["=== Phenomena Verification Report ==="] + lines.append(f"\nVERIFIED ({len(verified)} — have source_tool):") + lines.extend(verified) + lines.append(f"\nUNVERIFIED ({len(unverified)} — no source_tool):") + lines.extend(unverified) + return "\n".join(lines) + + async def _save_report(self, content: str, output_path: str) -> str: + try: + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: # explicit UTF-8: the default is locale-dependent and can fail on non-ASCII report text + f.write(content) + return f"Report saved to {output_path} ({len(content)} chars)" + except Exception as e: + return f"Error saving report: {e}" diff --git a/agents/timeline.py b/agents/timeline.py new file mode 100644 index 0000000..0365510 --- /dev/null +++ b/agents/timeline.py @@ -0,0 +1,88 @@ +"""Timeline Agent — correlates evidence across time.""" + +from __future__ import annotations + +import json + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + + +class TimelineAgent(BaseAgent): + name = "timeline" + role = ( + "Timeline forensic analyst.
def _register_tools(self) -> None:
    """Register the tools available to the timeline agent.

    Registers, in order: the catalog's ``build_filesystem_timeline`` tool
    (only when the catalog provides it), ``get_timestamped_phenomena`` and
    ``add_temporal_edge``.
    """
    # Catalog-provided filesystem timeline builder; skipped if absent.
    catalog_tool = TOOL_CATALOG.get("build_filesystem_timeline")
    if catalog_tool:
        self.register_tool(
            catalog_tool.name,
            catalog_tool.description,
            catalog_tool.input_schema,
            catalog_tool.executor,
        )

    # Read helper: every phenomenon that carries a timestamp, oldest first.
    self.register_tool(
        name="get_timestamped_phenomena",
        description="Get all phenomena that have timestamps, sorted chronologically. Use for timeline correlation.",
        input_schema={"type": "object", "properties": {}},
        executor=self._get_timestamped_phenomena,
    )

    # Write helper: record a temporal relationship between two phenomena.
    temporal_edge_schema = {
        "type": "object",
        "properties": {
            "source_id": {"type": "string", "description": "ID of the earlier/source phenomenon."},
            "target_id": {"type": "string", "description": "ID of the later/target phenomenon."},
            "relation": {
                "type": "string",
                "enum": ["before", "after", "concurrent"],
                "description": "Temporal relationship.",
            },
        },
        "required": ["source_id", "target_id", "relation"],
    }
    self.register_tool(
        name="add_temporal_edge",
        description="Add a temporal relationship between two phenomena (before, after, or concurrent).",
        input_schema=temporal_edge_schema,
        executor=self._add_temporal_edge,
    )
+ + lines = [] + for ph in items: + lines.append(f"{ph.timestamp} | [{ph.category}] {ph.title} ({ph.id})") + lines.append(f" {ph.description[:150]}") + return "\n".join(lines) + + async def _add_temporal_edge( + self, source_id: str, target_id: str, relation: str, + ) -> str: + try: + await self.graph.add_edge( + source_id=source_id, + target_id=target_id, + edge_type="temporal", + metadata={"relation": relation}, + created_by=self.name, + ) + return f"Temporal edge added: {source_id} —[{relation}]→ {target_id}" + except ValueError as e: + return f"Error: {e}" diff --git a/base_agent.py b/base_agent.py new file mode 100644 index 0000000..a385f47 --- /dev/null +++ b/base_agent.py @@ -0,0 +1,448 @@ +"""Base class for forensic analysis agents.""" + +from __future__ import annotations + +import json +import logging +import time +from typing import Any + +from evidence_graph import EvidenceGraph +from llm_client import LLMClient + +logger = logging.getLogger(__name__) + + +def _log(msg: str, **extra) -> None: + """Emit a structured log message with extra fields.""" + logger.info(msg, extra=extra) + + +class BaseAgent: + """Base class for all forensic agents. + + Each agent has: + - A name and role description + - A set of tools it can use (registered as methods) + - Access to the shared EvidenceGraph + - An LLM client for reasoning and tool-calling + """ + + name: str = "base" + role: str = "A forensic analysis agent." 
def register_tool(
    self,
    name: str,
    description: str,
    input_schema: dict,
    executor: Any,
) -> None:
    """Make a tool available to this agent.

    Args:
        name: Tool name; registering the same name again replaces the entry.
        description: Text describing the tool.
        input_schema: Schema dict describing the tool's arguments.
        executor: Object stored as the tool's implementation under ``name``.
    """
    definition = dict(
        name=name,
        description=description,
        input_schema=input_schema,
    )
    # Definition and implementation live in parallel maps keyed by tool name.
    self._tools[name] = definition
    self._executors[name] = executor
async def run(self, task: str) -> str:
    """Execute one task through the LLM tool-calling loop.

    Marks the agent as ``running`` on the graph, registers the graph tools,
    runs the loop, and appends a truncated task/answer pair to the work log.
    On success the agent is marked ``completed`` and the final text is
    returned; on any exception the agent is marked ``failed``, the error is
    logged with its traceback, and the exception is re-raised.
    """
    agent_name = self.name
    graph = self.graph

    _log(task, event="agent_start", agent=agent_name)
    graph.agent_status[agent_name] = "running"
    graph._current_agent = agent_name

    self._register_graph_tools()

    system_prompt = self._build_system_prompt(task)
    conversation = [{"role": "user", "content": task}]

    started = time.monotonic()
    phenomena_at_start = len(graph.phenomena)

    try:
        final_text, _ = await self.llm.tool_call_loop(
            messages=conversation,
            tools=self.get_tool_definitions(),
            tool_executor=self._executors,
            system=system_prompt,
        )
        # Kept inside the try so a failure here also marks the agent failed.
        self._work_log.append(f"[Task: {task[:80]}] -> {final_text[:150]}")
    except Exception:
        graph.agent_status[agent_name] = "failed"
        logger.error("[%s] Failed during task execution", agent_name, exc_info=True)
        raise

    graph.agent_status[agent_name] = "completed"
    elapsed = time.monotonic() - started
    added = len(graph.phenomena) - phenomena_at_start
    _log(
        f"+{added} phenomena, {len(final_text)} chars",
        event="agent_done",
        agent=agent_name,
        elapsed=elapsed,
    )
    return final_text
Returns matching summaries.", + input_schema={ + "type": "object", + "properties": { + "keyword": {"type": "string", "description": "Search keyword."}, + }, + "required": ["keyword"], + }, + executor=self._search_graph, + ) + + self.register_tool( + name="get_related", + description="Get all nodes connected to a given node via edges. Returns summaries and edge types.", + input_schema={ + "type": "object", + "properties": { + "node_id": {"type": "string", "description": "Any node ID (ph-*, hyp-*, ent-*)."}, + }, + "required": ["node_id"], + }, + executor=self._get_related, + ) + + self.register_tool( + name="get_hypothesis_status", + description="Get current status and confidence of all hypotheses being investigated.", + input_schema={"type": "object", "properties": {}}, + executor=self._get_hypothesis_status, + ) + + # --- Write tools --- + + self.register_tool( + name="add_phenomenon", + description=( + "Record a forensic finding (phenomenon) on the evidence graph. " + "You MUST specify source_tool: the name of the tool call that produced this finding." + ), + input_schema={ + "type": "object", + "properties": { + "category": {"type": "string", "description": "Category of the finding."}, + "title": {"type": "string", "description": "Short title."}, + "description": {"type": "string", "description": "Detailed description. Quote exact data from tool output."}, + "raw_data": {"type": "object", "description": "Structured raw data supporting this finding."}, + "timestamp": {"type": "string", "description": "Timestamp if any. ONLY use timestamps from tool output."}, + "source_tool": {"type": "string", "description": "Name of the tool that produced this (e.g. 
'list_directory')."}, + }, + "required": ["category", "title", "description", "source_tool"], + }, + executor=self._add_phenomenon, + ) + + self.register_tool( + name="add_lead", + description="Create an investigative lead for another agent to follow up on.", + input_schema={ + "type": "object", + "properties": { + "target_agent": { + "type": "string", + "enum": ["filesystem", "registry", "communication", "network", "timeline"], + "description": "Which agent should handle this lead.", + }, + "description": {"type": "string", "description": "What should be investigated."}, + "priority": {"type": "integer", "description": "Priority 1 (highest) to 10 (lowest). Default 5."}, + }, + "required": ["target_agent", "description"], + }, + executor=self._add_lead, + ) + + self.register_tool( + name="link_to_entity", + description=( + "Link a phenomenon to a named entity (person, program, host, etc). " + "Creates the entity if it doesn't exist." + ), + input_schema={ + "type": "object", + "properties": { + "phenomenon_id": {"type": "string", "description": "Phenomenon ID to link from."}, + "entity_name": {"type": "string", "description": "Name of the entity (e.g. 'Mr. Evil', 'mIRC.exe')."}, + "entity_type": { + "type": "string", + "enum": ["person", "program", "file", "host", "ip_address"], + "description": "Type of entity.", + }, + "edge_type": { + "type": "string", + "enum": ["created_by", "executed_by", "owned_by", "targets", "associated_with", "found_on", "used_by"], + "description": "Relationship type.", + }, + }, + "required": ["phenomenon_id", "entity_name", "entity_type", "edge_type"], + }, + executor=self._link_to_entity, + ) + + # --- Asset library tools --- + + self.register_tool( + name="list_assets", + description=( + "List all files extracted from the disk image. " + "Shows filename, category, size, local path, and inode. " + "Check this before calling extract_file to avoid re-extraction." 
+ ), + input_schema={ + "type": "object", + "properties": { + "category": { + "type": "string", + "enum": [ + "registry_hive", "chat_log", "prefetch", "network_capture", + "config_file", "address_book", "recycle_bin", "executable", + "text_log", "other", + ], + "description": "Filter by category. Omit to list all.", + }, + }, + }, + executor=self._list_assets, + ) + + self.register_tool( + name="find_extracted_file", + description=( + "Find an already-extracted file by inode or filename. " + "Returns the local path so you can use it directly with " + "parse_registry_key, read_text_file, etc. without re-extracting." + ), + input_schema={ + "type": "object", + "properties": { + "inode": {"type": "string", "description": "Inode to look up."}, + "filename": {"type": "string", "description": "Filename or partial name to search."}, + }, + }, + executor=self._find_extracted_file, + ) + + # ---- Tool executors ----------------------------------------------------- + + async def _list_phenomena(self, category: str | None = None) -> str: + results = self.graph.list_phenomena(category) + if not results: + return "No phenomena recorded yet." if not category else f"No phenomena in category '{category}'." + return "\n".join(results) + + async def _get_phenomenon(self, id: str) -> str: + data = self.graph.get_phenomenon(id) + if data is None: + return f"Phenomenon not found: {id}" + return json.dumps(data, ensure_ascii=False, indent=2) + + async def _search_graph(self, keyword: str) -> str: + results = self.graph.search_graph(keyword) + if not results: + return f"No matches for '{keyword}'." + return "\n".join(results) + + async def _get_related(self, node_id: str) -> str: + results = self.graph.get_related(node_id) + if not results: + return f"No connections found for {node_id}." 
+ lines = [] + for r in results: + lines.append(f" {r['direction']} [{r['edge_type']}] → {r['node']}") + return "\n".join(lines) + + async def _get_hypothesis_status(self) -> str: + results = self.graph.get_hypothesis_status() + if not results: + return "No hypotheses defined yet." + return "\n".join(results) + + async def _add_phenomenon( + self, + category: str, + title: str, + description: str, + raw_data: dict | None = None, + timestamp: str | None = None, + source_tool: str = "", + ) -> str: + pid, merged = await self.graph.add_phenomenon( + source_agent=self.name, + category=category, + title=title, + description=description, + raw_data=raw_data, + timestamp=timestamp, + source_tool=source_tool, + ) + if merged: + return f"Phenomenon merged into existing: {pid} — {title} (corroboration boost)" + return f"Phenomenon recorded: {pid} — {title}" + + async def _add_lead( + self, + target_agent: str, + description: str, + priority: int = 5, + ) -> str: + lid = await self.graph.add_lead( + target_agent=target_agent, + description=description, + priority=priority, + ) + return f"Lead created: {lid} — [{target_agent}] {description}" + + async def _link_to_entity( + self, + phenomenon_id: str, + entity_name: str, + entity_type: str, + edge_type: str, + ) -> str: + # Validate phenomenon exists before creating entity + if not self.graph._node_exists(phenomenon_id): + return ( + f"Error: phenomenon '{phenomenon_id}' not found. " + f"Call list_phenomena first to get valid IDs." 
+ ) + eid, existing = await self.graph.add_entity(entity_name, entity_type) + await self.graph.add_edge( + source_id=phenomenon_id, + target_id=eid, + edge_type=edge_type, + created_by=self.name, + ) + status = "linked to existing" if existing else "created and linked" + return f"Entity {status}: {entity_name} ({entity_type}) ←[{edge_type}]— {phenomenon_id}" + + async def _list_assets(self, category: str | None = None) -> str: + results = self.graph.list_assets(category) + if not results: + return "No files extracted yet." if not category else f"No assets in category '{category}'." + return "\n".join(results) + + async def _find_extracted_file( + self, + inode: str | None = None, + filename: str | None = None, + ) -> str: + if inode: + asset = self.graph.lookup_asset_by_inode(inode) + if asset: + return ( + f"Found: {asset.local_path} " + f"({asset.size_bytes} bytes, {asset.category}, inode:{asset.inode})" + ) + return f"No extracted file with inode {inode}." + + if filename: + results = self.graph.query_assets(filename_pattern=filename) + if not results: + return f"No extracted files matching '{filename}'." + lines = [f"Found {len(results)} matching file(s):"] + for a in results: + lines.append(f" {a.local_path} (inode:{a.inode}, {a.size_bytes} bytes, {a.category})") + return "\n".join(lines) + + return "Provide either inode or filename to search." diff --git a/evidence_graph.py b/evidence_graph.py new file mode 100644 index 0000000..efdad6d --- /dev/null +++ b/evidence_graph.py @@ -0,0 +1,799 @@ +"""Evidence Knowledge Graph for multi-agent forensic analysis. + +Replaces the flat Blackboard with a graph-based evidence store. +Nodes: Phenomenon (observable artifacts), Hypothesis (interpretive claims), Entity (recurring objects). +Edges: typed relationships with predefined weights for hypothesis confidence computation. 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import uuid +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Predefined edge weights for Phenomenon → Hypothesis relationships. +# LLM only picks the edge type (categorical); the weight is looked up here. +# --------------------------------------------------------------------------- +HYPOTHESIS_EDGE_WEIGHTS: dict[str, float] = { + "direct_evidence": +0.25, + "supports": +0.15, + "prerequisite_met": +0.10, + "consequence_observed": +0.15, + "contradicts": -0.20, + "weakens": -0.10, +} + +# All valid edge types across the graph. +VALID_EDGE_TYPES: set[str] = { + # Phenomenon → Hypothesis + "direct_evidence", "supports", "prerequisite_met", + "consequence_observed", "contradicts", "weakens", + # Phenomenon → Phenomenon + "temporal", "causal", "input_to", "modifies", "co_located", "corroborates", + # Phenomenon → Entity + "created_by", "executed_by", "owned_by", "targets", + "associated_with", "found_on", "used_by", + # Hypothesis → Hypothesis + "refines", "conflicts", "depends_on", +} + + +# --------------------------------------------------------------------------- +# Graph node types +# --------------------------------------------------------------------------- + +def _compute_quality_score( + source_tool: str, + timestamp: str | None, + raw_data: dict, + description: str, + related_ids: list[str], +) -> float: + """Compute a quality score (0.0-1.0) based on evidence completeness.""" + score = 0.0 + if source_tool: + score += 0.25 + if timestamp is not None: + score += 0.20 + if raw_data: + score += 0.25 + if len(description) >= 50: + score += 0.15 + if related_ids: + score += 0.15 + return score + + +def _jaccard_similarity(a: str, b: str) -> float: + """Token-level Jaccard similarity between two 
@dataclass
class Phenomenon:
    """Raw observable artifact found on disk."""

    id: str  # "ph-{uuid8}"
    source_agent: str
    category: str  # filesystem, registry, email, network, timeline
    title: str
    description: str
    raw_data: dict = field(default_factory=dict)
    timestamp: str | None = None
    confidence: float = 1.0
    source_tool: str = ""
    corroborating_agents: list[str] = field(default_factory=list)
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the phenomenon as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Phenomenon:
        """Build a phenomenon from a dict whose keys are the field names."""
        return cls(**d)

    def summary(self) -> str:
        """Return a one-line summary: ID, category, title, optional timestamp, confidence."""
        when = "" if not self.timestamp else f" @ {self.timestamp}"
        head = f"[{self.id}] [{self.category}] {self.title}"
        return f"{head}{when} (conf={self.confidence:.2f})"
@dataclass
class ExtractedAsset:
    """A file extracted from the disk image and tracked in the asset library."""

    id: str  # "asset-{uuid8}"
    inode: str  # e.g. "334-128-4"
    original_path: str  # disk image path from ffind
    local_path: str  # "extracted/SYSTEM"
    category: str  # registry_hive, chat_log, prefetch, ...
    filename: str  # "SYSTEM"
    size_bytes: int
    extracted_by: str  # agent name
    extracted_at: str  # ISO timestamp

    def to_dict(self) -> dict:
        """Return the asset as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ExtractedAsset:
        """Build an asset from a dict whose keys are the field names."""
        return cls(**d)

    def summary(self) -> str:
        """Return a one-line summary with size in KB, local path and inode."""
        kilobytes = self.size_bytes / 1024
        identity = f"[{self.id}] {self.filename} ({self.category}) "
        location = f"— {kilobytes:.1f}KB @ {self.local_path} [inode:{self.inode}]"
        return identity + location
+ + Agents interact with the graph via query tools (list_phenomena, + get_phenomenon, search_graph, get_related) rather than reading + a full dump in the system prompt. + """ + + def __init__( + self, + case_info: dict | None = None, + persist_path: Path | None = None, + ) -> None: + self.case_info: dict = case_info or {} + self.image_path: str = "" + self.partition_offset: int = 0 + self.extracted_dir: str = "extracted" + + # Graph storage + self.phenomena: dict[str, Phenomenon] = {} + self.hypotheses: dict[str, Hypothesis] = {} + self.entities: dict[str, Entity] = {} + self.edges: list[Edge] = [] + + # Adjacency index for fast traversal + self._adj: dict[str, list[Edge]] = {} # node_id → outgoing edges + self._adj_rev: dict[str, list[Edge]] = {} # node_id → incoming edges + + # Lead / status management (carried over from Blackboard) + self.leads: list[Lead] = [] + self.agent_status: dict[str, str] = {} + + # Asset library — tracks all files extracted from the disk image + self.asset_library: dict[str, ExtractedAsset] = {} + self._inode_index: dict[str, str] = {} # inode → asset_id + + # Set by BaseAgent.run() before each agent execution + self._current_agent: str = "" + + self._lock = asyncio.Lock() + self._persist_path: Path | None = persist_path + + # ---- Persistence ------------------------------------------------------- + + def _auto_save(self) -> None: + """Persist full state to disk. 
Must be called inside _lock.""" + if self._persist_path is None: + return + try: + state = { + "case_info": self.case_info, + "image_path": self.image_path, + "partition_offset": self.partition_offset, + "extracted_dir": self.extracted_dir, + "phenomena": {pid: p.to_dict() for pid, p in self.phenomena.items()}, + "hypotheses": {hid: h.to_dict() for hid, h in self.hypotheses.items()}, + "entities": {eid: e.to_dict() for eid, e in self.entities.items()}, + "edges": [e.to_dict() for e in self.edges], + "leads": [l.to_dict() for l in self.leads], + "agent_status": dict(self.agent_status), + "asset_library": {aid: a.to_dict() for aid, a in self.asset_library.items()}, + "saved_at": datetime.now().isoformat(), + } + tmp = self._persist_path.with_suffix(".tmp") + tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2)) + tmp.replace(self._persist_path) + except Exception as e: + logger.error("EvidenceGraph auto-save failed: %s", e) + + def save_state(self, path: Path) -> None: + """Explicitly save state to the given path.""" + old = self._persist_path + self._persist_path = path + self._auto_save() + self._persist_path = old + + @classmethod + def load_state(cls, path: Path) -> EvidenceGraph: + """Restore an EvidenceGraph from a saved JSON state file.""" + data = json.loads(path.read_text()) + graph = cls( + case_info=data.get("case_info", {}), + persist_path=path, + ) + graph.image_path = data.get("image_path", "") + graph.partition_offset = data.get("partition_offset", 0) + graph.extracted_dir = data.get("extracted_dir", "extracted") + graph.phenomena = { + pid: Phenomenon.from_dict(p) + for pid, p in data.get("phenomena", {}).items() + } + graph.hypotheses = { + hid: Hypothesis.from_dict(h) + for hid, h in data.get("hypotheses", {}).items() + } + graph.entities = { + eid: Entity.from_dict(e) + for eid, e in data.get("entities", {}).items() + } + graph.edges = [Edge.from_dict(e) for e in data.get("edges", [])] + graph.leads = [Lead.from_dict(l) for l in 
data.get("leads", [])] + graph.agent_status = data.get("agent_status", {}) + for aid, a_data in data.get("asset_library", {}).items(): + asset = ExtractedAsset.from_dict(a_data) + graph.asset_library[aid] = asset + graph._inode_index[asset.inode] = aid + graph._rebuild_adjacency() + logger.info( + "EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, " + "%d edges, %d assets", + len(graph.phenomena), len(graph.hypotheses), + len(graph.entities), len(graph.edges), len(graph.asset_library), + ) + return graph + + def _rebuild_adjacency(self) -> None: + """Rebuild adjacency index from edges list.""" + self._adj.clear() + self._adj_rev.clear() + for edge in self.edges: + self._adj.setdefault(edge.source_id, []).append(edge) + self._adj_rev.setdefault(edge.target_id, []).append(edge) + + # ---- Node helpers ------------------------------------------------------- + + def _node_exists(self, node_id: str) -> bool: + if node_id.startswith("ph-"): + return node_id in self.phenomena + if node_id.startswith("hyp-"): + return node_id in self.hypotheses + if node_id.startswith("ent-"): + return node_id in self.entities + return False + + def get_node(self, node_id: str) -> Phenomenon | Hypothesis | Entity | None: + if node_id.startswith("ph-"): + return self.phenomena.get(node_id) + if node_id.startswith("hyp-"): + return self.hypotheses.get(node_id) + if node_id.startswith("ent-"): + return self.entities.get(node_id) + return None + + # ---- Similarity merging (Phenomenon only) -------------------------------- + + def _find_similar_phenomenon( + self, title: str, description: str, category: str, + ) -> Phenomenon | None: + best_match: Phenomenon | None = None + best_score = 0.0 + for ph in self.phenomena.values(): + if ph.category != category: + continue + title_sim = _jaccard_similarity(ph.title, title) + if title_sim <= 0.6: + continue + desc_sim = _jaccard_similarity(ph.description[:200], description[:200]) + if desc_sim <= 0.4: + continue + combined = title_sim 
async def add_phenomenon(
    self,
    source_agent: str,
    category: str,
    title: str,
    description: str,
    raw_data: dict | None = None,
    timestamp: str | None = None,
    source_tool: str = "",
) -> tuple[str, bool]:
    """Add a phenomenon. Returns (id, was_merged).

    If a sufficiently similar phenomenon already exists, the new report is
    merged into it: raw-data keys the existing record lacks are copied
    over, and the existing record's confidence is raised by 0.15 (capped at
    1.0) when the report comes from an agent that has not reported this
    phenomenon before. A repeat report from the original author or from an
    agent that already corroborated it gives no further boost.

    Otherwise a new phenomenon is created; its confidence is auto-computed
    from evidence completeness (source_tool, timestamp, raw_data,
    description length).
    """
    async with self._lock:
        similar = self._find_similar_phenomenon(title, description, category)
        if similar is not None:
            # Corroboration means independent confirmation: an agent
            # re-recording its own finding must not inflate confidence.
            is_new_witness = (
                source_agent != similar.source_agent
                and source_agent not in similar.corroborating_agents
            )
            if is_new_witness:
                similar.confidence = min(1.0, similar.confidence + 0.15)
                similar.corroborating_agents.append(source_agent)
            # Existing keys win; only missing raw-data keys are filled in.
            for key, value in (raw_data or {}).items():
                similar.raw_data.setdefault(key, value)
            self._auto_save()
            return similar.id, True

        pid = f"ph-{uuid.uuid4().hex[:8]}"
        confidence = _compute_quality_score(
            source_tool, timestamp, raw_data or {},
            description, [],
        )
        self.phenomena[pid] = Phenomenon(
            id=pid,
            source_agent=source_agent,
            category=category,
            title=title,
            description=description,
            raw_data=raw_data or {},
            timestamp=timestamp,
            confidence=confidence,
            source_tool=source_tool,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return pid, False
async def add_entity(
    self,
    name: str,
    entity_type: str,
    description: str = "",
) -> tuple[str, bool]:
    """Add an entity, reusing an existing one with the same name and type.

    Returns:
        ``(entity_id, was_existing)``. When an entity with an identical
        ``(name, entity_type)`` pair already exists, its ID is returned and
        nothing is written; otherwise a new entity is stored and persisted.
    """
    async with self._lock:
        wanted = (name, entity_type)
        known_id = next(
            (
                candidate.id
                for candidate in self.entities.values()
                if (candidate.name, candidate.entity_type) == wanted
            ),
            None,
        )
        if known_id is not None:
            return known_id, True

        new_id = f"ent-{uuid.uuid4().hex[:8]}"
        self.entities[new_id] = Entity(
            id=new_id,
            name=name,
            entity_type=entity_type,
            description=description,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return new_id, False
async def update_hypothesis_confidence(
    self,
    hyp_id: str,
    phenomenon_id: str,
    edge_type: str,
    reason: str = "",
) -> float:
    """Update hypothesis confidence based on a phenomenon linkage.

    The edge_type must be one of HYPOTHESIS_EDGE_WEIGHTS keys.
    Weight is looked up from the predefined table, NOT judged by LLM.

    Positive weights move confidence toward 1.0 by ``weight * (1 - conf)``;
    negative weights move it toward 0.0 by ``weight * conf``. The status
    follows the new confidence: ``supported`` at >= 0.8, ``refuted`` at
    <= 0.2, and a hypothesis that was supported/refuted but has moved back
    between the thresholds returns to ``active``. Other statuses are left
    untouched. The change is appended to the hypothesis's confidence log
    and a matching phenomenon -> hypothesis edge is added to the graph.

    Returns:
        The new confidence value.

    Raises:
        ValueError: If the edge type is not a hypothesis edge type, or the
            phenomenon or hypothesis does not exist.
    """
    if edge_type not in HYPOTHESIS_EDGE_WEIGHTS:
        raise ValueError(
            f"Invalid hypothesis edge type: {edge_type}. "
            f"Must be one of: {list(HYPOTHESIS_EDGE_WEIGHTS.keys())}"
        )

    async with self._lock:
        if not self._node_exists(phenomenon_id):
            raise ValueError(f"Phenomenon not found: {phenomenon_id}")
        hyp = self.hypotheses.get(hyp_id)
        if hyp is None:
            raise ValueError(f"Hypothesis not found: {hyp_id}")

        weight = HYPOTHESIS_EDGE_WEIGHTS[edge_type]
        old_conf = hyp.confidence

        # Scale by the remaining headroom so repeated evidence has
        # diminishing effect and the value stays inside [0, 1].
        if weight > 0:
            delta = weight * (1 - old_conf)
        else:
            delta = weight * old_conf

        new_conf = max(0.0, min(1.0, old_conf + delta))
        hyp.confidence = new_conf

        if new_conf >= 0.8:
            hyp.status = "supported"
        elif new_conf <= 0.2:
            hyp.status = "refuted"
        elif hyp.status in ("supported", "refuted"):
            # Later evidence pulled confidence back across a threshold;
            # the earlier verdict would otherwise contradict the number.
            hyp.status = "active"

        now = datetime.now().isoformat()
        hyp.confidence_log.append({
            "timestamp": now,
            "phenomenon_id": phenomenon_id,
            "edge_type": edge_type,
            "weight": weight,
            "old_confidence": round(old_conf, 4),
            "new_confidence": round(new_conf, 4),
            "reason": reason,
        })

        # Also create the edge in the graph. This is done inline because
        # the lock is already held here.
        edge = Edge(
            id=f"edge-{uuid.uuid4().hex[:8]}",
            source_id=phenomenon_id,
            target_id=hyp_id,
            edge_type=edge_type,
            metadata={"reason": reason},
            created_by="hypothesis_engine",
            created_at=datetime.now().isoformat(),
        )
        self.edges.append(edge)
        self._adj.setdefault(phenomenon_id, []).append(edge)
        self._adj_rev.setdefault(hyp_id, []).append(edge)

        self._auto_save()
        return new_conf
async def register_asset(
    self,
    inode: str,
    original_path: str,
    local_path: str,
    category: str,
    filename: str,
    size_bytes: int,
    extracted_by: str,
) -> tuple[str, bool]:
    """Record an extracted file in the asset library, one entry per inode.

    Returns:
        ``(asset_id, already_existed)``. When the inode is already indexed
        the stored id is returned and nothing is written or saved.
    """
    async with self._lock:
        try:
            return self._inode_index[inode], True
        except KeyError:
            pass  # first time this inode is seen: create the record below

        asset_id = f"asset-{uuid.uuid4().hex[:8]}"
        self.asset_library[asset_id] = ExtractedAsset(
            id=asset_id,
            inode=inode,
            original_path=original_path,
            local_path=local_path,
            category=category,
            filename=filename,
            size_bytes=size_bytes,
            extracted_by=extracted_by,
            extracted_at=datetime.now().isoformat(),
        )
        self._inode_index[inode] = asset_id
        self._auto_save()
        return asset_id, False
def search_graph(self, keyword: str) -> list[str]:
    """Case-insensitive substring search over every node in the graph.

    Phenomena and hypotheses are matched on title or description, entities
    on name or description. Summaries come back grouped in that order:
    phenomena first, then hypotheses, then entities.
    """
    needle = keyword.lower()

    def _hits(nodes, label_attr: str) -> list[str]:
        # One pass over a node collection, keeping insertion order.
        return [
            node.summary()
            for node in nodes
            if needle in getattr(node, label_attr).lower()
            or needle in node.description.lower()
        ]

    return (
        _hits(self.phenomena.values(), "title")
        + _hits(self.hypotheses.values(), "title")
        + _hits(self.entities.values(), "name")
    )
def hypotheses_converged(self) -> bool:
    """Report whether every hypothesis has left the ``"active"`` state.

    An empty hypothesis set counts as converged.
    """
    return not any(hyp.status == "active" for hyp in self.hypotheses.values())
class LLMAPIError(Exception):
    """Signals that the LLM API could not be reached within the retry budget.

    Attributes:
        attempts: number of attempts made before giving up.
    """

    def __init__(self, message: str, attempts: int) -> None:
        self.attempts = attempts
        super().__init__(message)
{t['name']}\n{t['description']}\nParameters:\n{param_block}\n") + + lines.append( + "## How to use tools\n" + "To call a tool, output a JSON block wrapped in XML tags like this:\n" + f"{TOOL_CALL_TAG}\n" + '{"name": "tool_name", "arguments": {"param1": "value1"}}\n' + f"{TOOL_CALL_END}\n\n" + "You can call multiple tools in sequence. After each tool call, you will receive the result in:\n" + f"{TOOL_RESULT_TAG}\n...result...\n{TOOL_RESULT_END}\n\n" + "When you have finished your analysis and have a final answer, wrap it in:\n" + f"{ANSWER_TAG}\nyour final answer here\n{ANSWER_END}\n\n" + "Think step by step. Call tools to gather evidence before drawing conclusions.\n" + "You MUST call at least one tool before giving your final answer." + ) + return "\n".join(lines) + + +def _extract_tool_calls(text: str) -> list[dict]: + """Extract tool call JSON blocks from model output.""" + pattern = re.compile( + re.escape(TOOL_CALL_TAG) + r"\s*(.*?)\s*" + re.escape(TOOL_CALL_END), + re.DOTALL, + ) + calls = [] + for match in pattern.finditer(text): + raw = match.group(1).strip() + try: + parsed = json.loads(raw) + calls.append(parsed) + except json.JSONDecodeError: + logger.warning("Failed to parse tool call JSON: %s", raw[:200]) + return calls + + +def _extract_answer(text: str) -> str | None: + """Extract the final answer from model output.""" + pattern = re.compile( + re.escape(ANSWER_TAG) + r"\s*(.*?)\s*" + re.escape(ANSWER_END), + re.DOTALL, + ) + match = pattern.search(text) + if match: + return match.group(1).strip() + return None + + +def _truncate_tool_result(result_text: str, max_chars: int = 3000) -> str: + """Truncate a tool result if it exceeds max_chars.""" + if len(result_text) <= max_chars: + return result_text + return result_text[: max_chars - 200] + f"\n... [truncated, {len(result_text)} total chars]" + + +# Tools that only read and never mutate state — safe to run concurrently. 
+READ_ONLY_TOOLS: set[str] = { + # Graph queries + "list_phenomena", "get_phenomenon", "search_graph", "get_related", + "get_hypothesis_status", "list_assets", "find_extracted_file", + # Sleuth Kit reads + "partition_info", "filesystem_info", "list_directory", "find_file", + "search_strings", "count_deleted_files", "build_filesystem_timeline", + # Registry reads (without auto-record wrappers) + "parse_registry_key", "search_registry", "get_user_activity", + # Parser reads + "read_text_file", "read_binary_preview", "search_text_file", + "read_text_file_section", "list_extracted_dir", "parse_pcap_strings", +} + + +def _fix_tool_args(tool_name: str, tool_args: dict, tools: list[dict]) -> dict: + """Try to fix misnamed tool arguments from LLM hallucination. + + The LLM sometimes confuses parameter names across tools (e.g. passing + `key_path` to search_registry which expects `pattern`). This function + maps unknown kwargs to missing expected params by position/best-effort. + """ + # Find the schema for this tool + schema = None + for t in tools: + if t.get("name") == tool_name: + schema = t.get("input_schema", {}) + break + if schema is None: + return tool_args + + props = schema.get("properties", {}) + required = set(schema.get("required", [])) + + unknown = [k for k in tool_args if k not in props] + if not unknown: + return tool_args # all args are valid, nothing to fix + + # Build the fixed args: start with valid args + fixed = {k: v for k, v in tool_args.items() if k in props} + + # Find which expected params are still missing + missing = [p for p in (required or props.keys()) if p not in fixed] + + # Try to map unknown args to missing params, in order + unknown_values = [(k, tool_args[k]) for k in unknown] + + for wrong_name, value in unknown_values: + if not missing: + break + # Pick the best match from missing params + best = missing.pop(0) + logger.warning( + "Auto-fixing tool arg: %s(%s=...) 
-> %s(%s=...)", + tool_name, wrong_name, tool_name, best, + ) + fixed[best] = value + + return fixed + + +def _emit_tool_call_summary(tool_calls: list[dict], elapsed: float) -> None: + """Emit a folded tool-call summary line for the terminal formatter. + + Instead of logging each tool call individually, we group by name: + "list_directory x27, extract_file x3, read_text_file x3" + """ + counts = Counter(tc.get("name", "?") for tc in tool_calls) + parts = [] + for name, count in counts.most_common(): + if count > 1: + parts.append(f"{name} x{count}") + else: + parts.append(name) + summary = ", ".join(parts) + logger.info(summary, extra={"event": "tool_calls", "elapsed": elapsed}) + + +@dataclass +class _ToolBatch: + """A batch of tool calls with the same read/write classification.""" + is_read_only: bool + calls: list[dict] = field(default_factory=list) + + +def _partition_tool_calls( + tool_calls: list[dict], + read_only: set[str] | None = None, +) -> list[_ToolBatch]: + """Partition tool calls into batches: consecutive read-only tools are + grouped together (will run in parallel), write tools are isolated.""" + if read_only is None: + read_only = READ_ONLY_TOOLS + batches: list[_ToolBatch] = [] + for tc in tool_calls: + is_ro = tc.get("name", "") in read_only + if batches and batches[-1].is_read_only and is_ro: + batches[-1].calls.append(tc) + else: + batches.append(_ToolBatch(is_read_only=is_ro, calls=[tc])) + return batches + + +# --------------------------------------------------------------------------- +# Context compression — keeps the message list from growing unboundedly. +# --------------------------------------------------------------------------- + +# Stage A: Progressive tool result decay thresholds. +# Messages are counted in (assistant, user) pairs from the END of the list. +# "Round" = one pair of (assistant tool-calling msg, user tool-result msg). 
+_DECAY_TIERS: list[tuple[int, int]] = [ + # (rounds_ago_threshold, max_chars_for_tool_results) + (5, 3000), # recent 5 rounds: keep full (3000 chars per tool result) + (15, 500), # 5-15 rounds ago: aggressive truncation + (999, 100), # older than 15 rounds: minimal stub +] + + +def _apply_progressive_decay(messages: list[dict]) -> list[dict]: + """Truncate tool results in older messages to save context space. + + Operates in-place-style on a copy. Only touches user messages that + contain blocks (these are the tool-result messages + generated by tool_call_loop). + """ + # Count rounds from the end. A "round" is a (assistant, user) pair. + # messages alternate: [user, assistant, user, assistant, user, ...] + # The initial user message is index 0, then pairs start at index 1. + total = len(messages) + if total <= 10: # not enough messages to bother + return messages + + result = [] + # Count tool-result user messages from the end + tool_result_indices = [ + i for i, m in enumerate(messages) + if m["role"] == "user" and TOOL_RESULT_TAG in m.get("content", "") + ] + + # Build a set of indices that need decay, mapped to their max_chars + decay_map: dict[int, int] = {} + n_tool_msgs = len(tool_result_indices) + for rank, idx in enumerate(reversed(tool_result_indices)): + rounds_ago = rank # 0 = most recent, 1 = second most recent, ... + for threshold, max_chars in _DECAY_TIERS: + if rounds_ago < threshold: + decay_map[idx] = max_chars + break + + for i, msg in enumerate(messages): + if i in decay_map: + max_chars = decay_map[i] + content = msg["content"] + if len(content) > max_chars + 200: + # Truncate but preserve the tool_result tags structure + truncated = content[:max_chars] + # Count how many tool results are in this message + n_results = content.count(TOOL_RESULT_TAG) + truncated += ( + f"\n... 
[context compressed: {len(content)} -> {max_chars} chars, " + f"{n_results} tool result(s)]" + ) + result.append({"role": msg["role"], "content": truncated}) + else: + result.append(msg) + else: + result.append(msg) + return result + + +# Stage B: LLM-powered message folding. +# When messages exceed this count, fold the oldest ones into a summary. +_FOLD_THRESHOLD = 24 # trigger folding when messages exceed this count +_FOLD_KEEP_RECENT = 10 # always keep the most recent N messages intact +_FOLD_SUMMARY_SYSTEM = ( + "You are a concise summarizer for an ongoing forensic investigation conversation. " + "Summarize the following early conversation between a forensic analysis agent and its " + "tool execution results. Preserve:\n" + "- Key findings and evidence discovered (file paths, inode numbers, timestamps)\n" + "- Tools that were called and their important results\n" + "- Decisions made and current investigation direction\n" + "Keep the summary under 800 words. Use bullet points." +) + + +class LLMClient: + """Calls Claude Messages API through a third-party proxy using raw httpx. + + Uses prompt-based tool calling (ReAct pattern) since the proxy does not + support Claude's native tool_use format. 
async def chat(
    self,
    messages: list[dict],
    system: str | None = None,
    max_retries: int = 5,
) -> str:
    """Send a streaming chat request and return the assembled text response.

    Uses SSE streaming to keep the connection alive and avoid gateway
    timeouts (504/524) on long-running completions.

    Args:
        messages: Conversation in Messages-API format.
        system: Optional system prompt.
        max_retries: Total number of attempts. Failed attempts wait
            ``10 * 2 ** attempt`` seconds before the next one.

    Returns:
        The concatenated ``text_delta`` fragments of the response.

    Raises:
        LLMAPIError: when the last attempt fails with an HTTP error status,
            a streamed ``error`` event, a timeout, a network error, a
            remote protocol error or a proxy error.
    """
    payload: dict[str, Any] = {
        "model": self.model,
        "max_tokens": self.max_tokens,
        "messages": messages,
        "stream": True,
    }
    if system:
        payload["system"] = system

    # Whole exception families rather than single classes: a dropped
    # stream surfaces as ReadError and a slow connect as ConnectTimeout,
    # and both must go through the retry path like the cases listed before.
    retryable = (
        httpx.HTTPStatusError,
        httpx.TimeoutException,
        httpx.NetworkError,
        httpx.RemoteProtocolError,
        httpx.ProxyError,
    )

    for attempt in range(max_retries):
        logger.debug("LLM request (stream): %d messages (attempt %d)", len(messages), attempt + 1)
        text_parts: list[str] = []
        try:
            async with self._client.stream(
                "POST", "/v1/messages", json=payload,
            ) as resp:
                # Check for HTTP errors before consuming stream
                if resp.status_code >= 400:
                    body = await resp.aread()
                    # Include a bounded excerpt of the error body so the
                    # failure reason shows up in logs and in LLMAPIError.
                    detail = body.decode("utf-8", errors="replace")[:500]
                    raise httpx.HTTPStatusError(
                        f"Server error '{resp.status_code}' for url '{resp.url}': {detail}",
                        request=resp.request,
                        response=resp,
                    )

                # Parse SSE events
                async for line in resp.aiter_lines():
                    if not line.startswith("data: "):
                        continue
                    data_str = line[6:]  # strip "data: " prefix
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        event = json.loads(data_str)
                    except json.JSONDecodeError:
                        continue  # ignore malformed frames

                    event_type = event.get("type", "")
                    if event_type == "content_block_delta":
                        delta = event.get("delta", {})
                        if delta.get("type") == "text_delta":
                            text_parts.append(delta.get("text", ""))
                    elif event_type == "message_stop":
                        break
                    elif event_type == "error":
                        err_msg = event.get("error", {}).get("message", "Unknown streaming error")
                        raise httpx.HTTPStatusError(
                            err_msg, request=resp.request, response=resp,
                        )

            text = "".join(text_parts)
            logger.debug("LLM response (stream): %d chars", len(text))
            return text

        except retryable as e:
            if attempt < max_retries - 1:
                wait = 2 ** attempt * 10
                logger.warning("Request failed (%s), retrying in %ds...", e, wait)
                await asyncio.sleep(wait)
            else:
                raise LLMAPIError(
                    f"LLM API unreachable after {max_retries} attempts: {e}",
                    attempts=max_retries,
                ) from e

    # Only reachable when max_retries <= 0 (no attempt was made).
    return ""
self, + messages: list[dict], + tools: list[dict], + tool_executor: dict[str, Any], + system: str | None = None, + max_iterations: int = 40, + ) -> tuple[str, list[dict]]: + """Run a ReAct-style tool-calling loop. + + The model outputs blocks which we parse and execute, + feeding results back as blocks until the model + outputs an block. + + Returns: + (final_text, all_messages) + """ + # Build system prompt with tool definitions + tools_prompt = _build_tools_prompt(tools) + full_system = f"{system}\n\n{tools_prompt}" if system else tools_prompt + + messages = list(messages) # don't mutate caller's list + _folded = False # Track whether we've already folded once this loop + + for i in range(max_iterations): + # ── Context compression before each API call ────────────── + # Stage A: progressively decay old tool results + messages = _apply_progressive_decay(messages) + + # Stage B: fold oldest messages into LLM summary if too long + if not _folded and len(messages) > _FOLD_THRESHOLD: + messages = await self._fold_old_messages(messages, full_system) + _folded = True + elif _folded and len(messages) > _FOLD_THRESHOLD + _FOLD_KEEP_RECENT: + # Allow a second fold if messages grew back significantly + messages = await self._fold_old_messages(messages, full_system) + + text = await self.chat(messages, system=full_system) + + # Check for final answer + answer = _extract_answer(text) + if answer is not None: + messages.append({"role": "assistant", "content": text}) + return answer, messages + + # Check for tool calls + tool_calls = _extract_tool_calls(text) + + if not tool_calls: + # No tool calls and no answer tag — treat entire text as answer + messages.append({"role": "assistant", "content": text}) + return text, messages + + # Execute tool calls — read-only tools run in parallel + messages.append({"role": "assistant", "content": text}) + + result_parts = [] + batches = _partition_tool_calls(tool_calls) + t_batch_start = time.monotonic() + + for batch in batches: + if 
async def _execute_single_tool(
    self, tc: dict, tool_executor: dict[str, Any],
    tools: list[dict] | None = None,
) -> str:
    """Execute a single tool call and return the formatted result.

    Every failure is turned into an error text for the model rather than an
    exception: a malformed call, an unknown tool, or a tool that raises.

    Args:
        tc: Parsed tool call carrying ``name`` and ``arguments``.
        tool_executor: Maps tool names to async callables.
        tools: Tool definitions; when given, misnamed arguments are
            repaired with ``_fix_tool_args`` before the call.

    Returns:
        The result (or error text) prefixed with ``[tool_name]`` and wrapped
        in ``TOOL_RESULT_TAG`` / ``TOOL_RESULT_END``.
    """
    if not isinstance(tc, dict):
        tool_name = ""
        result_text = (
            "Error: malformed tool call "
            f"(expected a JSON object, got {type(tc).__name__})"
        )
    else:
        tool_name = tc.get("name", "")
        tool_args = tc.get("arguments")
        if tool_args is None:  # a missing or null "arguments" means no arguments
            tool_args = {}

        if not isinstance(tool_args, dict):
            # Checked here because ** unpacking below requires a mapping
            # and would otherwise raise outside the try block.
            result_text = (
                f"Error: arguments for {tool_name} must be a JSON object, "
                f"got {type(tool_args).__name__}"
            )
        else:
            if tools:
                tool_args = _fix_tool_args(tool_name, tool_args, tools)

            logger.info("Calling tool: %s(%s)", tool_name, json.dumps(tool_args, ensure_ascii=False))

            executor = tool_executor.get(tool_name)
            if executor is None:
                result_text = f"Error: unknown tool '{tool_name}'"
            else:
                try:
                    result_text = await executor(**tool_args)
                except Exception as e:
                    # Deliberate boundary: one failing tool must not abort
                    # the whole tool-calling loop.
                    logger.error("Tool %s failed: %s", tool_name, e)
                    result_text = f"Error executing {tool_name}: {e}"

    # Tools are expected to return text; coerce anything else so the
    # truncation and formatting below cannot raise.
    if not isinstance(result_text, str):
        result_text = str(result_text)

    return (
        f"{TOOL_RESULT_TAG}\n"
        f"[{tool_name}] {_truncate_tool_result(result_text)}\n"
        f"{TOOL_RESULT_END}"
    )
tc.get("arguments", {}) + if tools: + tool_args = _fix_tool_args(tool_name, tool_args, tools) + logger.info("Calling tool (parallel): %s(%s)", tool_name, json.dumps(tool_args, ensure_ascii=False)) + executor = tool_executor.get(tool_name) + if executor is None: + result_text = f"Error: unknown tool '{tool_name}'" + else: + try: + result_text = await executor(**tool_args) + except Exception as e: + logger.error("Tool %s failed: %s", tool_name, e) + result_text = f"Error executing {tool_name}: {e}" + return ( + f"{TOOL_RESULT_TAG}\n" + f"[{tool_name}] {_truncate_tool_result(result_text)}\n" + f"{TOOL_RESULT_END}" + ) + + results = await asyncio.gather(*[_run_one(tc) for tc in calls]) + return list(results) + + async def _fold_old_messages( + self, messages: list[dict], system: str, + ) -> list[dict]: + """Fold old messages into an LLM-generated summary (Stage B). + + Keeps the most recent _FOLD_KEEP_RECENT messages intact and + replaces earlier ones with a single summary message. + """ + n_to_fold = len(messages) - _FOLD_KEEP_RECENT + if n_to_fold <= 2: + return messages + + old_messages = messages[:n_to_fold] + recent_messages = messages[n_to_fold:] + + # Build a text dump of old messages for summarization + old_text_parts = [] + for msg in old_messages: + role = msg["role"] + content = msg.get("content", "") + # Truncate each message for the summary prompt to avoid overload + if len(content) > 1000: + content = content[:1000] + "..." + old_text_parts.append(f"[{role}]: {content}") + old_text = "\n\n".join(old_text_parts) + + # Cap total size sent to summarizer + if len(old_text) > 15000: + old_text = old_text[:15000] + "\n\n... 
[further messages omitted for brevity]" + + logger.info( + "Context folding: summarizing %d old messages (%d chars) into summary", + n_to_fold, len(old_text), + ) + + try: + summary = await self.chat( + messages=[{"role": "user", "content": old_text}], + system=_FOLD_SUMMARY_SYSTEM, + ) + except Exception as e: + logger.warning("Context folding failed: %s — keeping original messages", e) + return messages + + # Replace old messages with a single summary + summary_message = { + "role": "user", + "content": ( + f"[Context summary — the following summarizes {n_to_fold} earlier " + f"messages in this conversation]\n\n{summary}" + ), + } + return [summary_message] + recent_messages diff --git a/log_config.py b/log_config.py new file mode 100644 index 0000000..58caf92 --- /dev/null +++ b/log_config.py @@ -0,0 +1,243 @@ +"""Logging configuration — colored terminal output + detailed log file. + +Terminal: compact, colored, hierarchical display with tool-call folding. +File: full-detail timestamped log for post-run analysis. 
class TerminalFormatter(logging.Formatter):
    """Colored, compact formatter for terminal output.

    Structured records are recognised through attributes passed via
    ``extra``:
      - ``event``   : selects the layout (``phase``, ``agent_start``,
                      ``agent_done``, ``tool_calls``, ``dispatch``,
                      ``progress``, ``hypothesis``)
      - ``agent``   : agent name, used for coloring
      - ``elapsed`` : elapsed seconds, shown as a dimmed suffix

    Records without a recognised ``event`` are rendered by severity:
    ERROR and above first, then WARNING, then a dimmed default line.
    """

    @staticmethod
    def _elapsed_suffix(record: logging.LogRecord, lead: str = "") -> str:
        """Return the grey ``(elapsed)`` suffix, or "" when no time is set."""
        elapsed = getattr(record, "elapsed", 0)
        if not elapsed:
            return ""
        return f"{lead}{_C.GREY}({_format_elapsed(elapsed)}){_C.RESET}"

    def format(self, record: logging.LogRecord) -> str:
        """Render ``record`` as a single colored terminal line."""
        ts = time.strftime("%H:%M:%S", time.localtime(record.created))
        ts_str = f"{_C.GREY}[{ts}]{_C.RESET}"

        event = getattr(record, "event", None)
        msg = record.getMessage()

        # ── Phase banner ──────────────────────────────────────────
        if event == "phase":
            return (
                f"\n{ts_str} {_C.BOLD}{_C.WHITE}══ {msg} "
                f"{'═' * max(1, 52 - len(msg))}{_C.RESET}"
            )

        # ── Agent start ───────────────────────────────────────────
        if event == "agent_start":
            agent = getattr(record, "agent", "?")
            color = _agent_color(agent)
            return f"{ts_str} {color}▸ {agent:<14}{_C.RESET} {msg}"

        # ── Agent done ────────────────────────────────────────────
        if event == "agent_done":
            agent = getattr(record, "agent", "?")
            color = _agent_color(agent)
            return f"{ts_str} {color}└─{_C.RESET} {msg} {self._elapsed_suffix(record)}"

        # ── Tool calls (folded) ───────────────────────────────────
        if event == "tool_calls":
            return f"{ts_str} {_C.DIM}├─ {msg}{_C.RESET} {self._elapsed_suffix(record)}"

        # ── Lead dispatch ─────────────────────────────────────────
        if event == "dispatch":
            return f"{ts_str} {_C.BLUE}{msg}{_C.RESET}"

        # ── Evidence progress ─────────────────────────────────────
        if event == "progress":
            return f"{ts_str} {_C.GREEN}{msg}{_C.RESET}{self._elapsed_suffix(record, lead=' ')}"

        # ── Hypothesis update ─────────────────────────────────────
        if event == "hypothesis":
            return f"{ts_str} {_C.MAGENTA}{msg}{_C.RESET}"

        # ── Errors ────────────────────────────────────────────────
        # Must be tested before warnings: every ERROR record also
        # satisfies ">= WARNING" and would otherwise be shown as WARN.
        if record.levelno >= logging.ERROR:
            return f"{ts_str} {_C.B_RED}ERROR{_C.RESET} {msg}"

        # ── Warnings ──────────────────────────────────────────────
        if record.levelno >= logging.WARNING:
            return f"{ts_str} {_C.B_YELLOW}WARN{_C.RESET} {msg}"

        # ── Default ───────────────────────────────────────────────
        return f"{ts_str} {_C.DIM}{msg}{_C.RESET}"
def setup_logging(run_dir: Path, verbose: bool = False) -> None:
    """Configure logging with colored terminal + detailed file output.

    Replaces all handlers on the root logger with a stderr handler
    (``TerminalFormatter``) and a file handler writing
    ``run_dir / "masforensics.log"`` (``FileFormatter``).

    Args:
        run_dir: Directory for the log file.
        verbose: If True, show all messages in terminal (no filtering).
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)

    # Remove any existing handlers (e.g., from basicConfig). Close them as
    # well so a repeated call does not leak the previous log file handle.
    for old_handler in list(root.handlers):
        root.removeHandler(old_handler)
        old_handler.close()

    # ── Terminal handler ──────────────────────────────────────
    term_handler = logging.StreamHandler(sys.stderr)
    term_handler.setLevel(logging.DEBUG if verbose else logging.INFO)
    term_handler.setFormatter(TerminalFormatter())
    if not verbose:
        term_handler.addFilter(TerminalFilter())
    root.addHandler(term_handler)

    # ── File handler (full detail) ────────────────────────────
    # Log messages contain non-ASCII text (arrows, dashes, box-drawing
    # characters), so the encoding is pinned instead of relying on the
    # platform locale, which may be unable to encode them.
    file_handler = logging.FileHandler(run_dir / "masforensics.log", encoding="utf-8")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(FileFormatter())
    root.addHandler(file_handler)

    # Suppress overly chatty third-party loggers even in the file
    logging.getLogger("httpcore").setLevel(logging.WARNING)
def _run_mmls(image_path: str) -> list[dict]:
    """Run ``mmls`` on *image_path* and return the parsed partition list.

    Args:
        image_path: Path of the disk image handed to ``mmls``.

    Returns:
        The partitions parsed from the ``mmls`` output, or an empty list when
        ``mmls`` exits with a non-zero status or does not finish within the
        30-second timeout.

    Raises:
        SystemExit: With code 1 when the ``mmls`` executable is not found.
    """
    try:
        result = subprocess.run(
            ["mmls", image_path],
            capture_output=True, text=True, timeout=30,
        )
    except FileNotFoundError:
        print("Error: mmls not found. Is The Sleuth Kit installed?")
        sys.exit(1)
    except subprocess.TimeoutExpired:
        # A hung mmls is treated like an undetectable partition table
        # instead of crashing the interactive selection with a traceback.
        print("Warning: mmls timed out after 30 seconds.")
        return []
    if result.returncode != 0:
        return []
    return _parse_mmls(result.stdout)
+ """ + # --- Image folder selection --- + if image_dir is None: + raw = input("Image folder path: ").strip() + if not raw: + print("No path provided.") + sys.exit(1) + image_dir = Path(raw).expanduser().resolve() + + if not image_dir.is_dir(): + print(f"Error: {image_dir} is not a directory.") + sys.exit(1) + + # --- Image file selection --- + images = _discover_images(image_dir) + if not images: + print(f"No disk images found in {image_dir}/") + print("Supported formats: " + ", ".join(_IMAGE_GLOBS)) + sys.exit(1) + + if len(images) == 1: + image_path = images[0] + print(f"Found image: {image_path}") + else: + print("Available disk images:") + for i, img in enumerate(images, 1): + size_mb = img.stat().st_size / (1024 * 1024) + print(f" [{i}] {img.name} ({size_mb:.0f} MB)") + while True: + choice = input(f"Select image [1-{len(images)}]: ").strip() + if choice.isdigit() and 1 <= int(choice) <= len(images): + image_path = images[int(choice) - 1] + break + print("Invalid choice.") + + # --- Partition detection --- + print(f"Detecting partitions ({image_path}) ...") + partitions = _run_mmls(str(image_path)) + + if not partitions: + print("Warning: mmls could not detect partitions. 
async def async_main() -> None:
    """Run one forensic analysis, optionally resuming an incomplete run.

    Loads ``config.yaml``, offers to resume the most recent incomplete run,
    sets up logging and the LLM client, builds or reloads the evidence graph,
    registers the tools and runs the orchestrator. The LLM client is always
    closed on exit.
    """
    config = load_config()
    agent_cfg = config["agent"]

    # Check for resumable run
    resumable = find_resumable_run()
    resume_phase = 1
    run_dir: Path | None = None
    graph: EvidenceGraph | None = None

    if resumable:
        print(f"Found incomplete run: {resumable.parent.name}")
        try:
            data = json.loads(resumable.read_text())
            ph_count = len(data.get("phenomena", {}))
            hyp_count = len(data.get("hypotheses", {}))
            print(f" ({ph_count} phenomena, {hyp_count} hypotheses)")
        except (OSError, ValueError, AttributeError, TypeError):
            # The preview is best-effort only; an unreadable or oddly shaped
            # state file must not prevent the resume prompt.
            pass
        print("Resume? [y/N] ", end="", flush=True)
        choice = input().strip().lower()
        if choice == "y":
            run_dir = resumable.parent
            graph = EvidenceGraph.load_state(resumable)
            # Any saved hypothesis means Phase 2 already ran. Testing for a
            # non-"active" status instead would send a run whose hypotheses
            # are all still active back to Phase 2 and regenerate them.
            if graph.hypotheses:
                resume_phase = 3  # hypotheses exist, resume investigation
            elif graph.phenomena:
                resume_phase = 2  # have phenomena, generate hypotheses
            print(f"Resuming from Phase {resume_phase}...")

    if run_dir is None:
        run_dir = setup_run_dir()

    # Setup logging — colored terminal + detailed log file
    setup_logging(run_dir)

    # Save config snapshot
    shutil.copy2("config.yaml", run_dir / "config.yaml")

    # Initialize LLM client
    llm = LLMClient(
        base_url=agent_cfg["base_url"],
        api_key=agent_cfg["api_key"],
        model=agent_cfg["model"],
        max_tokens=agent_cfg.get("max_tokens", 4096),
        proxy=agent_cfg.get("proxy", "auto"),
    )

    # Initialize evidence graph
    if graph is None:
        # CLI arg takes priority, otherwise interactive prompt
        cli_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else None
        image_path, partition_offset = select_image_interactive(cli_dir)
        graph = EvidenceGraph(
            case_info=config.get("cfreds_hacking_case", {}),
            persist_path=run_dir / "graph_state.json",
        )
        graph.image_path = image_path
        graph.partition_offset = partition_offset
        graph.extracted_dir = str(run_dir / "extracted")
    else:
        graph._persist_path = run_dir / "graph_state.json"

    # Register all tools with bound image path
    register_all_tools(graph.image_path, graph.partition_offset, graph, graph.extracted_dir)

    # Create agent factory
    factory = AgentFactory(llm, graph)

    # Run orchestrator
    orchestrator = Orchestrator(llm, graph, factory, config=config, run_dir=run_dir)
    try:
        report = await orchestrator.run(resume_phase=resume_phase)
        print("\n" + "=" * 60)
        print("FORENSIC ANALYSIS COMPLETE")
        print("=" * 60)
        print(f"Results archived to: {run_dir}")
        print(report)
    except AnalysisAborted:
        print("\n" + "=" * 60)
        print("ANALYSIS ABORTED — too many consecutive failures")
        print("=" * 60)
        print(f"Partial results saved to: {run_dir}")
        print("Run again to resume from saved state.")
    except (KeyboardInterrupt, asyncio.CancelledError):
        # Under asyncio.run() the first Ctrl-C cancels the main task, so the
        # coroutine sees CancelledError rather than KeyboardInterrupt.
        # Catching both lets the message below actually be printed.
        print("\nInterrupted. State saved.")
        print(f"Partial results in: {run_dir}")
    finally:
        await llm.close()
+AGENT_ALIASES: dict[str, str] = { + "malware": "filesystem", + "deleted_files": "filesystem", + "file_recovery": "filesystem", + "recycle_bin": "filesystem", + "chat_email": "communication", + "email": "communication", + "chat": "communication", + "irc": "communication", + "messaging": "communication", + "browser_internet": "network", + "browser": "network", + "internet": "network", + "web": "network", + "wireless": "network", + "pcap": "network", + "password_analysis": "registry", + "user_activity": "registry", + "event_logs": "registry", + "system_config": "registry", +} + + +class AnalysisAborted(Exception): + """Raised when too many failures trigger a graceful shutdown.""" + pass + + +class Orchestrator: + """Coordinates the hypothesis-driven multi-agent forensic analysis. + + Flow (5 phases): + 1. FileSystemAgent initial survey → Phenomenon nodes + 2. Hypothesis generation (manual or auto via HypothesisAgent) + 3. Hypothesis-directed investigation (iterative) + 4. Timeline construction + 5. Report generation + """ + + _LEAD_GEN_SYSTEM = ( + "You are the lead investigator coordinating a multi-agent digital forensics analysis. " + "Your job is to identify gaps in the evidence and generate specific, actionable " + "investigation tasks for field agents (filesystem, registry, communication, network). " + "Each task should be concrete enough for an agent to execute immediately — " + "specify file paths, registry keys, or artifact types to examine. " + "Avoid vague tasks like 'investigate further'. " + "Prioritize tasks that would most strongly confirm OR refute the hypothesis." + ) + + _JUDGE_SYSTEM = ( + "You are the lead investigator evaluating new forensic evidence against hypotheses. " + "Judge each phenomenon's relationship to the hypothesis based ONLY on the evidence described. " + "Use 'direct_evidence' only when the phenomenon IS the hypothesized activity. " + "Use 'supports' when it's consistent but not conclusive. 
" + "Use 'contradicts' when it actively disproves the hypothesis. " + "Omit phenomena that are unrelated. Be conservative — only link genuinely relevant evidence." + ) + + def __init__( + self, + llm: LLMClient, + graph: EvidenceGraph, + factory: AgentFactory, + config: dict | None = None, + run_dir: Path | None = None, + ) -> None: + self.llm = llm + self.graph = graph + self.factory = factory + self.config = config or {} + self.run_dir = run_dir + self._semaphore = asyncio.Semaphore(3) + self._failure_count = 0 + self._max_failures = 3 + self._start_time = datetime.now() + + def _resolve_agent_type(self, agent_type: str) -> str: + return AGENT_ALIASES.get(agent_type, agent_type) + + # ---- Lead dispatch (with fault tolerance) -------------------------------- + + async def _dispatch_leads_parallel(self, pending: list) -> None: + """Dispatch leads grouped by agent type, running different types concurrently.""" + by_agent: dict[str, list] = {} + for lead in pending: + resolved = self._resolve_agent_type(lead.target_agent) + by_agent.setdefault(resolved, []).append(lead) + + agents_str = ", ".join(f"{at} x{len(leads)}" for at, leads in by_agent.items()) + _log(f"{len(pending)} leads dispatched -> {agents_str}", event="dispatch") + + async def run_agent_leads(agent_type: str, leads: list) -> None: + for lead in leads: + resolved = self._resolve_agent_type(lead.target_agent) + agent = self.factory.get_or_create_agent(resolved) + if agent is None: + logger.warning("Skipping lead %s: no agent for '%s'", lead.id, lead.target_agent) + await self.graph.mark_lead_completed(lead.id) + continue + lead.status = "assigned" + try: + async with self._semaphore: + # Build hypothesis context if this lead is linked to one + hyp_line = "" + if lead.hypothesis_id: + hyp = self.graph.hypotheses.get(lead.hypothesis_id) + if hyp: + hyp_line = ( + f"Hypothesis under investigation: {hyp.title} " + f"(confidence: {hyp.confidence:.2f})\n" + ) + await agent.run( + f"Investigate this lead: 
async def _retry_failed_leads(self) -> None:
    """Give every failed lead one more attempt.

    A lead qualifies when its status is ``"failed"`` and its context carries
    no truthy ``"retry"`` flag. Qualifying leads are reset to ``"pending"``,
    flagged so they are never retried again, and dispatched together.
    """
    retryable = []
    for candidate in self.graph.leads:
        if candidate.status != "failed":
            continue
        if candidate.context.get("retry"):
            continue  # already retried once
        retryable.append(candidate)

    if retryable:
        logger.info("--- Retrying %d failed leads ---", len(retryable))
        for candidate in retryable:
            candidate.status = "pending"
            candidate.context["retry"] = True
        await self._dispatch_leads_parallel(retryable)
async def _generate_hypothesis_leads(self) -> None:
    """Ask the LLM which evidence is still missing and queue leads for it.

    All active hypotheses are sent in a single prompt. Each task in the JSON
    answer becomes a lead; a ``hypothesis_id`` that is not an active
    hypothesis is logged and the lead is stored without a hypothesis link.
    If the call or the parsing fails, the per-hypothesis fallback runs.
    """
    active = [h for h in self.graph.hypotheses.values() if h.status == "active"]
    if not active:
        return

    def _extract_tasks(text: str) -> list:
        """Return the first JSON array of objects embedded in *text*.

        Decoding starts at each ``[`` in turn, so brackets inside JSON
        strings (e.g. ``"check the [Run] key"``) and surrounding prose or
        markdown fences do not truncate the array, which a non-greedy
        ``\\[.*?\\]`` regex would do.
        """
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, list) and all(isinstance(t, dict) for t in candidate):
                return candidate
            start = text.find("[", start + 1)
        raise ValueError("no JSON array of task objects in LLM response")

    # Build a combined prompt with ALL active hypotheses
    hyp_blocks = []
    hyp_by_id = {h.id: h for h in active}
    for hyp in active:
        related = self.graph.get_related(hyp.id, direction="in")
        existing = "\n".join(
            f" - {r['node']} [{r['edge_type']}]" for r in related
        ) or " (none yet)"
        hyp_blocks.append(
            f"Hypothesis [{hyp.id}]: {hyp.title}\n"
            f" Description: {hyp.description}\n"
            f" Current confidence: {hyp.confidence:.2f}\n"
            f" Existing evidence:\n{existing}"
        )

    prompt = (
        "Active hypotheses under investigation:\n\n"
        + "\n\n".join(hyp_blocks)
        + "\n\n"
        + "For EACH hypothesis, suggest 1-3 specific, actionable investigation tasks.\n"
        + "For each task, specify which agent type should handle it: "
        + "filesystem, registry, communication, network.\n"
        + "Prioritize tasks that would most strongly confirm OR refute the hypothesis.\n"
        + "Avoid vague tasks like 'investigate further'.\n\n"
        + "Respond ONLY with JSON (no markdown):\n"
        + '[{"hypothesis_id": "hyp-xxx", "agent": "agent_type", "task": "what to investigate", "priority": 1-10}]'
    )

    try:
        response = await self.llm.chat(
            messages=[{"role": "user", "content": prompt}],
            system=self._LEAD_GEN_SYSTEM,
        )
        tasks = _extract_tasks(response)

        for task in tasks:
            hyp_id = task.get("hypothesis_id", "")
            if hyp_id not in hyp_by_id:
                # Unknown id: keep the lead but drop the hypothesis link.
                logger.warning("Unknown hypothesis_id in lead gen response: %s", hyp_id)
                hyp_id = None
            await self.graph.add_lead(
                target_agent=task.get("agent", "filesystem"),
                description=task.get("task", ""),
                priority=task.get("priority", 5),
                hypothesis_id=hyp_id,
            )
    except Exception as e:
        logger.warning("Batched lead generation failed: %s — falling back to per-hypothesis", e)
        await self._generate_hypothesis_leads_fallback(active)
async def _judge_new_phenomena(self) -> None:
    """Judge new phenomena against active hypotheses — all in one LLM call.

    Phenomena that are not yet the source of any edge into a hypothesis are
    sent to the LLM together with every active hypothesis. Each returned
    judgment that names a known hypothesis, a known phenomenon and a valid
    edge type is applied through ``update_hypothesis_confidence``; other
    judgments are ignored. If the call or the parsing fails, the
    per-hypothesis fallback runs.
    """
    active = [h for h in self.graph.hypotheses.values() if h.status == "active"]
    if not active:
        return

    def _extract_judgments(text: str) -> list:
        """Return the first JSON array of objects embedded in *text*.

        Decoding starts at each ``[`` in turn, so a ``]`` inside a JSON
        string (for example in a ``reason``) cannot truncate the array,
        which a non-greedy ``\\[.*?\\]`` regex would do.
        """
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, list) and all(isinstance(j, dict) for j in candidate):
                return candidate
            start = text.find("[", start + 1)
        raise ValueError("no JSON array of judgment objects in LLM response")

    # Find phenomena not yet linked to any hypothesis
    linked_ph_ids = set()
    for edge in self.graph.edges:
        if edge.target_id.startswith("hyp-") and edge.source_id.startswith("ph-"):
            linked_ph_ids.add(edge.source_id)

    unlinked = [ph for ph in self.graph.phenomena.values() if ph.id not in linked_ph_ids]
    if not unlinked:
        return

    valid_types = list(HYPOTHESIS_EDGE_WEIGHTS.keys())

    hyp_section = "\n".join(
        f" [{h.id}] {h.title}: {h.description}" for h in active
    )
    ph_section = "\n".join(f" - {ph.summary()}" for ph in unlinked)

    prompt = (
        f"Hypotheses under investigation:\n{hyp_section}\n\n"
        f"New phenomena not yet linked to any hypothesis:\n{ph_section}\n\n"
        f"For each phenomenon, judge whether it is related to ANY of the hypotheses above.\n"
        f"A phenomenon may be linked to MULTIPLE hypotheses if relevant.\n"
        f"Valid relationship types: {', '.join(valid_types)}\n"
        f"If a phenomenon is NOT related to a hypothesis, omit that pairing.\n\n"
        f"Respond ONLY with JSON (no markdown):\n"
        f'[{{"hypothesis_id": "hyp-xxx", "phenomenon_id": "ph-xxx", "edge_type": "supports|contradicts|...", "reason": "brief explanation"}}]'
    )

    try:
        response = await self.llm.chat(
            messages=[{"role": "user", "content": prompt}],
            system=self._JUDGE_SYSTEM,
        )
        judgments = _extract_judgments(response)

        for j in judgments:
            hyp_id = j.get("hypothesis_id", "")
            ph_id = j.get("phenomenon_id", "")
            edge_type = j.get("edge_type", "")
            reason = j.get("reason", "")
            # Apply only judgments that reference real nodes and a known
            # edge type; anything else is silently dropped.
            if (
                hyp_id in self.graph.hypotheses
                and ph_id in self.graph.phenomena
                and edge_type in HYPOTHESIS_EDGE_WEIGHTS
            ):
                await self.graph.update_hypothesis_confidence(
                    hyp_id=hyp_id,
                    phenomenon_id=ph_id,
                    edge_type=edge_type,
                    reason=reason,
                )
                logger.info(
                    "Judged: %s —[%s]→ %s (%s)",
                    ph_id, edge_type, hyp_id, reason[:60],
                )
    except Exception as e:
        logger.warning("Batched judging failed: %s — falling back to per-hypothesis", e)
        await self._judge_new_phenomena_fallback(active, unlinked, valid_types)
edge_type in HYPOTHESIS_EDGE_WEIGHTS: + await self.graph.update_hypothesis_confidence( + hyp_id=hyp.id, + phenomenon_id=ph_id, + edge_type=edge_type, + reason=reason, + ) + logger.info( + "Judged: %s —[%s]→ %s (%s)", + ph_id, edge_type, hyp.id, reason[:60], + ) + except Exception as e: + logger.warning("Failed to judge phenomena for hypothesis %s: %s", hyp.id, e) + + # ---- Gap analysis (coverage check) --------------------------------------- + + _AREA_KEYWORDS: dict[str, list[str]] = { + "system_info": ["install date", "registered owner", "product name", "windows xp", "system information"], + "user_accounts": ["user account", "enumerate", "sam hive", "administrator", "mr. evil"], + "shutdown_time": ["shutdown"], + "network_config": ["network interface", "network adapter", "ip address", "dhcp", "mac address", "network config"], + "installed_software": ["installed software", "program files", "installed program"], + "email_config": ["smtp", "pop3", "nntp", "email account", "email config"], + "chat_logs": ["irc", "mirc", "chat log", "channel"], + "network_activity": ["packet capture", "pcap", "interception", "http request", "user-agent"], + "deleted_files": ["deleted file", "recycle", "recycler"], + "execution_evidence": ["prefetch", "execution", "run count", "last execution"], + } + + # Deterministic coverage: if the canonical tool was called, the area is covered. 
def _check_coverage(self, areas: list[dict]) -> set[str]:
    """Return the names of the investigation areas that count as covered.

    An area is covered when one of its ``_AREA_KEYWORDS`` occurs in the
    lower-cased category/title/description text of any phenomenon, or when
    one of its ``_AREA_TOOLS`` is the ``source_tool`` of any phenomenon.

    Args:
        areas: Area definitions; each must provide an ``"area"`` key.

    Returns:
        The set of covered area names.
    """
    phenomena = list(self.graph.phenomena.values())

    # Layer 1 input: one lower-cased haystack built from all phenomena.
    haystack = "".join(
        f" {p.category} {p.title} {p.description} ".lower() for p in phenomena
    )
    # Layer 2 input: every tool that produced at least one phenomenon.
    tools_seen = {p.source_tool for p in phenomena if p.source_tool}

    def _is_covered(name: str) -> bool:
        if any(keyword in haystack for keyword in self._AREA_KEYWORDS.get(name, [])):
            return True
        return not tools_seen.isdisjoint(self._AREA_TOOLS.get(name, []))

    return {area["area"] for area in areas if _is_covered(area["area"])}
def _archive_run(self, report: str = "") -> None:
    """Export the evidence graph, the report and run metadata to ``run_dir``.

    Writes ``phenomena.json``, ``hypotheses.json``, ``edges.json``,
    ``entities.json``, ``leads.json``, optionally ``report.md`` and finally
    ``run_metadata.json``. Does nothing when no run directory is set. Any
    failure is logged rather than raised.

    Args:
        report: Report text; ``report.md`` is written only when non-empty.
    """
    if self.run_dir is None:
        return

    def _dump(filename: str, payload) -> None:
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is pinned instead of depending on the platform locale.
        (self.run_dir / filename).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    try:
        graph = self.graph

        # Graph exports
        _dump("phenomena.json", {pid: ph.to_dict() for pid, ph in graph.phenomena.items()})
        _dump("hypotheses.json", {hid: h.to_dict() for hid, h in graph.hypotheses.items()})
        _dump("edges.json", [e.to_dict() for e in graph.edges])
        _dump("entities.json", {eid: e.to_dict() for eid, e in graph.entities.items()})
        _dump("leads.json", [lead.to_dict() for lead in graph.leads])

        # Written before the metadata so that run_metadata.json is the last
        # artifact to appear and never exists without the report.
        if report:
            (self.run_dir / "report.md").write_text(report, encoding="utf-8")

        # Run metadata
        end_time = datetime.now()
        metadata = {
            "start_time": self._start_time.isoformat(),
            "end_time": end_time.isoformat(),
            "duration_seconds": (end_time - self._start_time).total_seconds(),
            "phenomena_count": len(graph.phenomena),
            "hypotheses_count": len(graph.hypotheses),
            "entities_count": len(graph.entities),
            "edges_count": len(graph.edges),
            "leads_total": len(graph.leads),
            "leads_completed": sum(1 for lead in graph.leads if lead.status == "completed"),
            "leads_failed": sum(1 for lead in graph.leads if lead.status == "failed"),
            "agent_status": dict(graph.agent_status),
            "hypotheses_summary": {
                hid: {"title": h.title, "confidence": h.confidence, "status": h.status}
                for hid, h in graph.hypotheses.items()
            },
        }
        _dump("run_metadata.json", metadata)

        logger.info("Run archived to %s", self.run_dir)
    except Exception as e:
        logger.error("Failed to archive run: %s", e)
+ ) + new_ph = len(self.graph.phenomena) - ph_before + new_leads = sum(1 for l in self.graph.leads if l.status == "pending") + _log(f"+{new_ph} phenomena, +{new_leads} leads", event="progress", elapsed=time.monotonic() - t0) + + # Phase 2: Hypothesis generation + if resume_phase <= 2: + _log("Phase 2: Hypothesis Generation", event="phase") + t0 = time.monotonic() + manual_hypotheses = self.config.get("hypotheses", []) + if manual_hypotheses: + await self._generate_hypotheses_manual(manual_hypotheses) + if self.graph.phenomena: + await self._judge_new_phenomena() + else: + await self._generate_hypotheses_auto() + + for h in self.graph.hypotheses.values(): + _log(f" {h.summary()}", event="hypothesis") + _log( + f"+{len(self.graph.hypotheses)} hypotheses generated", + event="progress", elapsed=time.monotonic() - t0, + ) + + # Phase 3: Hypothesis-directed investigation (iterative) + if resume_phase <= 3: + max_rounds = self.config.get("max_investigation_rounds", 5) + for round_num in range(max_rounds): + _log(f"Phase 3: Investigation Round {round_num}", event="phase") + t0 = time.monotonic() + + if self.graph.hypotheses_converged(): + _log("All hypotheses converged — stopping", event="progress") + break + + await self._generate_hypothesis_leads() + + pending = await self.graph.get_pending_leads() + if not pending: + _log("No pending leads — round complete", event="progress") + break + + await self._dispatch_leads_parallel(pending) + await self._judge_new_phenomena() + + # Show hypothesis status update + for h in self.graph.hypotheses.values(): + _log(f" {h.summary()}", event="hypothesis") + _log(_progress_summary(self.graph), event="progress", elapsed=time.monotonic() - t0) + + # Retry failed leads + await self._retry_failed_leads() + + # Gap analysis + _log("Phase 3: Gap Analysis", event="phase") + await self._run_gap_analysis() + + self.graph.mark_remaining_inconclusive() + + # Phase 4: Timeline construction + if resume_phase <= 4: + _log("Phase 4: Timeline 
Construction", event="phase") + timeline_agent = self.factory.get_or_create_agent("timeline") + if timeline_agent: + await timeline_agent.run( + "Build a detailed chronological timeline from the evidence graph. " + "Use the build_filesystem_timeline tool for MAC times, then correlate " + "with phenomena timestamps. Use temporal edges to connect events." + ) + + # Phase 5: Report generation + _log("Phase 5: Report Generation", event="phase") + t0 = time.monotonic() + report_agent = self.factory.get_or_create_agent("report") + if report_agent: + report = await report_agent.run( + "Generate a comprehensive forensic report. Structure it as follows:\n" + "1. Executive Summary\n" + "2. Hypotheses and Evidence (for each hypothesis: title, confidence, " + "supporting/contradicting phenomena)\n" + "3. Detailed Findings by Category\n" + "4. Timeline of Events\n" + "5. Entity Summary (key people, programs, hosts involved)\n" + "6. Conclusions and Recommendations" + ) + + image_stem = Path(self.graph.image_path).stem + report_name = f"{image_stem}_forensic_report.md" + report_path = (self.run_dir / report_name) if self.run_dir else Path(report_name) + try: + report_path.write_text(report) + _log(f"Report saved: {report_path}", event="progress", elapsed=time.monotonic() - t0) + except Exception as e: + logger.error("Failed to save report: %s", e) + + except AnalysisAborted: + logger.critical("Analysis aborted due to repeated failures. 
async def main() -> None:
    """Regenerate the forensic report from a saved graph state.

    Loads ``graph_state.json`` from the run directory given as the first
    command-line argument, re-registers the tools, runs only the report agent
    and writes ``<image stem>_forensic_report.md`` into the run directory.
    This is the same naming scheme the orchestrator uses, replacing the
    previous hard-coded image name.

    Prints a message and returns without doing any LLM work when the run
    directory has no ``graph_state.json`` or no report agent is available.
    """
    # NOTE(review): the fallback directory is a hard-coded development run and
    # is kept only for backward compatibility; pass the run directory explicitly.
    run_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("runs/2026-04-02T15-11-25")
    state_path = run_dir / "graph_state.json"

    if not state_path.exists():
        print(f"No graph_state.json in {run_dir}")
        return

    # Context manager so the config file handle is closed deterministically.
    with open("config.yaml", encoding="utf-8") as config_file:
        config = yaml.safe_load(config_file)
    agent_cfg = config["agent"]

    # Load graph
    graph = EvidenceGraph.load_state(state_path)
    print(f"Loaded: {graph.stats_summary()}")

    # LLM client with larger max_tokens for report
    llm = LLMClient(
        base_url=agent_cfg["base_url"],
        api_key=agent_cfg["api_key"],
        model=agent_cfg["model"],
        max_tokens=16384,
    )

    try:
        register_all_tools(graph.image_path, graph.partition_offset, graph)
        factory = AgentFactory(llm, graph)

        # Run only the report agent. The orchestrator treats a falsy agent as
        # "not available", so do the same here instead of failing on None.
        report_agent = factory.get_or_create_agent("report")
        if not report_agent:
            print("Report agent is not available; no report generated")
            return

        report = await report_agent.run(
            "Generate a comprehensive forensic report. Structure it as follows:\n"
            "1. Executive Summary\n"
            "2. Hypotheses and Evidence (for each hypothesis: title, confidence, "
            "supporting/contradicting phenomena)\n"
            "3. Detailed Findings by Category\n"
            "4. Timeline of Events\n"
            "5. Entity Summary (key people, programs, hosts involved)\n"
            "6. Conclusions and Recommendations"
        )

        image_stem = Path(graph.image_path).stem
        report_path = run_dir / f"{image_stem}_forensic_report.md"
        report_path.write_text(report, encoding="utf-8")
        print(f"\nReport saved to {report_path} ({len(report)} chars)")
    finally:
        # Always release the HTTP client, even when report generation fails.
        await llm.close()


if __name__ == "__main__":
    asyncio.run(main())
# ---------------------------------------------------------------------------

_tool_result_cache: dict[str, str] = {}

# Tools safe to cache: deterministic reads with no side effects.
# ``list_extracted_dir`` is deliberately NOT listed: the extraction directory
# grows while agents extract files, so a cached listing would go stale.
CACHEABLE_TOOLS: set[str] = {
    "partition_info", "filesystem_info", "list_directory", "find_file",
    "search_strings", "count_deleted_files", "build_filesystem_timeline",
    "parse_registry_key", "search_registry", "get_user_activity",
    "read_text_file", "read_binary_preview", "search_text_file",
    "read_text_file_section", "parse_pcap_strings",
}

# Result prefixes that mark a failed call. The parser and registry tools report
# failures as "[Error ...]", which the previous "Error" check did not catch, so
# a failed read of a not-yet-extracted file was cached and returned forever.
_NON_CACHEABLE_PREFIXES: tuple[str, ...] = ("Error", "[Error", "[Command failed")


def _cache_key(tool_name: str, kwargs: dict) -> str:
    """Build a deterministic cache key from tool name + arguments.

    The arguments are serialised as JSON with sorted keys, so keyword order
    does not affect the key.
    """
    args_str = json.dumps(kwargs, sort_keys=True, ensure_ascii=False)
    # MD5 is used purely as a fingerprint; flagging that keeps it usable on
    # FIPS-restricted Python builds and yields the same digest.
    args_hash = hashlib.md5(args_str.encode(), usedforsecurity=False).hexdigest()
    return f"{tool_name}:{args_hash}"


def _make_cached(tool_name: str, executor: Any) -> Any:
    """Wrap an executor with an in-memory result cache.

    Args:
        tool_name: Name used as the cache-key namespace.
        executor: Callable returning an awaitable that resolves to the tool's
            string result.

    Returns:
        An async callable with the same keyword interface. Successful results
        are stored in the module-level cache; results that look like errors
        are returned but never cached, so a later retry can succeed.
    """

    async def wrapper(**kwargs) -> str:
        key = _cache_key(tool_name, kwargs)
        cached = _tool_result_cache.get(key)
        if cached is not None:
            logger.debug("Cache hit: %s(%s)", tool_name, kwargs)
            return cached
        result = await executor(**kwargs)
        # Only cache successful results (not errors)
        if not result.startswith(_NON_CACHEABLE_PREFIXES):
            _tool_result_cache[key] = result
        return result

    return wrapper


def get_cache_stats() -> dict[str, int]:
    """Return cache statistics for diagnostics."""
    return {"entries": len(_tool_result_cache)}


# Category auto-detection patterns (filename → category)
_REGISTRY_HIVE_NAMES = {"system", "software", "sam", "ntuser.dat", "security", "default"}

# Substrings that mark a file as IRC/chat related.
_CHAT_KEYWORDS = ("irc", "mirc", "channel", "chat")

ASSET_CATEGORIES = [
    "registry_hive", "chat_log", "prefetch", "network_capture",
    "config_file", "address_book", "recycle_bin", "executable",
    "text_log", "other",
]


def _auto_categorize(filename: str) -> str:
    """Infer asset category from filename.

    The checks are ordered: the first matching rule wins. Matching is
    case-insensitive.
    """
    name_lower = filename.lower()
    ext = os.path.splitext(name_lower)[1]
    is_chat_name = any(kw in name_lower for kw in _CHAT_KEYWORDS)

    # Only the complete lower-cased file name is compared with the hive names;
    # "system.ini" must stay a config file rather than a registry hive.
    if name_lower in _REGISTRY_HIVE_NAMES:
        return "registry_hive"
    if ext == ".pf":
        return "prefetch"
    if ext in (".pcap", ".cap") or name_lower == "interception":
        return "network_capture"
    if ext == ".wab":
        return "address_book"
    if name_lower == "info2" or re.match(r"dc\d+\.exe", name_lower):
        return "recycle_bin"
    # Extension-based checks before keyword-based (e.g. mirc.ini → config, not chat)
    if ext in (".ini", ".csv", ".dat", ".cfg"):
        return "config_file"
    if ext in (".log", ".lst"):
        return "chat_log" if is_chat_name else "text_log"
    if is_chat_name:
        return "chat_log"
    if ext in (".exe", ".dll", ".com"):
        return "executable"
    return "other"


@dataclass
class ToolDefinition:
    """A registered tool available for agent composition."""

    name: str
    description: str
    input_schema: dict
    executor: Any  # async callable (or sync for some parsers)
    module: str  # "sleuthkit", "registry", "parsers"
    tags: list[str] = field(default_factory=list)


# Global tool catalog, populated by register_all_tools().
def _make_auto_record(tool_name: str, category: str, executor: Any, graph: Any) -> Any:
    """Return an async wrapper that stores a tool's output as a phenomenon.

    The wrapper awaits ``executor(**kwargs)`` and always returns its result
    unchanged. When a graph is present and the result is non-empty and does
    not start with ``"Error"`` or ``"["``, the result is also recorded through
    ``graph.add_phenomenon``. The phenomenon is titled with the tool name plus
    the first 80 characters of the first output line, and described by the
    first 2000 characters of the output.
    """

    async def recording_wrapper(**kwargs) -> str:
        output = await executor(**kwargs)

        # Record only real findings: a graph must exist and the output must be
        # non-empty and not an error/diagnostic message.
        recordable = (
            graph is not None
            and bool(output)
            and not output.startswith(("Error", "["))
        )
        if not recordable:
            return output

        # Attribute the fact to whichever agent the graph says is running.
        source_agent = getattr(graph, "_current_agent", "") or "unknown"
        first_line = output.split("\n")[0]
        await graph.add_phenomenon(
            source_agent=source_agent,
            category=category,
            title=f"{tool_name}: {first_line[:80]}",
            description=output[:2000],
            source_tool=tool_name,
        )
        return output

    return recording_wrapper
Use recursive=true for all files.", + input_schema={ + "type": "object", + "properties": { + "inode": {"type": "string", "description": "Inode of directory. Omit for root."}, + "recursive": {"type": "boolean", "description": "List all files recursively."}, + }, + }, + executor=lambda inode=None, recursive=False: tsk.list_directory( + image_path, partition_offset, inode, recursive + ), + module="sleuthkit", + tags=["filesystem", "directory", "listing"], + ) + + async def _extract_with_tracking(inode: str) -> str: + """Extract a file by inode. Name and category are derived from the real disk path.""" + # Dedup + if graph is not None: + existing = graph.lookup_asset_by_inode(inode) + if existing is not None: + return ( + f"Already extracted: {existing.local_path} " + f"({existing.size_bytes} bytes, {existing.category}). " + f"Disk path: {existing.original_path}" + ) + + # Resolve real disk path first + orig_path = (await tsk.find_file(image_path, inode, partition_offset)).strip() + if not orig_path or "not found" in orig_path.lower(): + return f"Error: inode {inode} not found on the disk image." 
+ + # Derive local filename from real disk path + filename = os.path.basename(orig_path) + local_path = os.path.join(extracted_dir, filename) + + # Handle name collisions by appending inode + if os.path.exists(local_path): + base, ext = os.path.splitext(filename) + local_path = os.path.join(extracted_dir, f"{base}_{inode.replace('-', '_')}{ext}") + filename = os.path.basename(local_path) + + # Extract + result = await tsk.extract_file(image_path, inode, local_path, partition_offset) + if result.startswith("[icat failed"): + return result + + size = os.path.getsize(local_path) if os.path.exists(local_path) else 0 + category = _auto_categorize(os.path.basename(orig_path)) + + # Register + if graph is not None: + agent_name = getattr(graph, "_current_agent", "") or "unknown" + await graph.register_asset( + inode=inode, + original_path=orig_path, + local_path=local_path, + category=category, + filename=filename, + size_bytes=size, + extracted_by=agent_name, + ) + logger.info("Asset registered: %s (%s, %d bytes)", local_path, category, size) + + return ( + f"Extracted to {local_path} ({size} bytes, {category})\n" + f"Disk path: {orig_path}" + ) + + TOOL_CATALOG["extract_file"] = ToolDefinition( + name="extract_file", + description=( + "Extract a file from the disk image by inode number. " + "The filename is automatically determined from the disk path. " + "Checks if already extracted (returns existing path if so). " + "Returns the local path and the original disk path." + ), + input_schema={ + "type": "object", + "properties": { + "inode": {"type": "string", "description": "Inode number of the file (e.g. 
'334-128-4' or '334')."}, + }, + "required": ["inode"], + }, + executor=_extract_with_tracking, + module="sleuthkit", + tags=["filesystem", "extraction"], + ) + + TOOL_CATALOG["find_file"] = ToolDefinition( + name="find_file", + description="Find the file path for a given inode number.", + input_schema={ + "type": "object", + "properties": { + "inode": {"type": "string", "description": "Inode number to look up."}, + }, + "required": ["inode"], + }, + executor=lambda inode: tsk.find_file(image_path, inode, partition_offset), + module="sleuthkit", + tags=["filesystem"], + ) + + TOOL_CATALOG["search_strings"] = ToolDefinition( + name="search_strings", + description="Search for a string pattern across the entire disk image (slow on first call, fast after). Prefer search_text_file on already-extracted files when possible.", + input_schema={ + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "String pattern (case-insensitive grep)."}, + }, + "required": ["pattern"], + }, + executor=lambda pattern: tsk.search_strings(image_path, pattern), + module="sleuthkit", + tags=["filesystem", "search", "strings"], + ) + + TOOL_CATALOG["count_deleted_files"] = ToolDefinition( + name="count_deleted_files", + description="List and count all deleted files. 
Shows total count, executables, and extension breakdown.", + input_schema={"type": "object", "properties": {}}, + executor=lambda: tsk.count_deleted_files(image_path, partition_offset), + module="sleuthkit", + tags=["filesystem", "deleted", "recovery"], + ) + + TOOL_CATALOG["build_filesystem_timeline"] = ToolDefinition( + name="build_filesystem_timeline", + description="Build a MAC timeline from the filesystem (Modified/Accessed/Changed times for all files).", + input_schema={"type": "object", "properties": {}}, + executor=lambda: tsk.build_timeline(image_path, partition_offset), + module="sleuthkit", + tags=["filesystem", "timeline"], + ) + + # ---- Registry tools ---- + + TOOL_CATALOG["parse_registry_key"] = ToolDefinition( + name="parse_registry_key", + description="Parse a registry hive file and list subkeys/values at a given path.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to extracted hive file."}, + "key_path": {"type": "string", "description": "Registry key path to inspect."}, + }, + "required": ["hive_path", "key_path"], + }, + executor=lambda hive_path, key_path: reg.parse_registry_key(hive_path, key_path), + module="registry", + tags=["registry", "hive"], + ) + + TOOL_CATALOG["list_installed_software"] = ToolDefinition( + name="list_installed_software", + description="List installed software from a SOFTWARE registry hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SOFTWARE hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("list_installed_software", "registry", + lambda hive_path: reg.list_installed_software(hive_path), graph), + module="registry", + tags=["registry", "software", "installed"], + ) + + TOOL_CATALOG["get_user_activity"] = ToolDefinition( + name="get_user_activity", + description="Extract user activity from NTUSER.DAT (recent docs, typed URLs, run dialog history).", + 
input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to NTUSER.DAT."}, + }, + "required": ["hive_path"], + }, + executor=lambda hive_path: reg.get_user_activity(hive_path), + module="registry", + tags=["registry", "user", "activity"], + ) + + TOOL_CATALOG["search_registry"] = ToolDefinition( + name="search_registry", + description="Search for a pattern in registry key names and values.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to hive file."}, + "pattern": {"type": "string", "description": "Search pattern."}, + }, + "required": ["hive_path", "pattern"], + }, + executor=lambda hive_path, pattern: reg.search_registry(hive_path, pattern), + module="registry", + tags=["registry", "search"], + ) + + # ---- Registry tools (auto-record: results are forensic facts) ---- + + TOOL_CATALOG["get_system_info"] = ToolDefinition( + name="get_system_info", + description="Extract OS version, install date, and registered owner from a SOFTWARE hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SOFTWARE hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_system_info", "registry", + lambda hive_path: reg.get_system_info(hive_path), graph), + module="registry", + tags=["registry", "system"], + ) + + TOOL_CATALOG["get_timezone_info"] = ToolDefinition( + name="get_timezone_info", + description="Extract timezone settings from a SYSTEM hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SYSTEM hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_timezone_info", "registry", + lambda hive_path: reg.get_timezone_info(hive_path), graph), + module="registry", + tags=["registry", "timezone", "system"], + ) + + TOOL_CATALOG["get_computer_name"] = ToolDefinition( + 
name="get_computer_name", + description="Extract computer/host name from a SYSTEM hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SYSTEM hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_computer_name", "registry", + lambda hive_path: reg.get_computer_name(hive_path), graph), + module="registry", + tags=["registry", "system", "hostname"], + ) + + TOOL_CATALOG["get_shutdown_time"] = ToolDefinition( + name="get_shutdown_time", + description="Extract last shutdown time from a SYSTEM hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SYSTEM hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_shutdown_time", "registry", + lambda hive_path: reg.get_shutdown_time(hive_path), graph), + module="registry", + tags=["registry", "system", "shutdown"], + ) + + TOOL_CATALOG["enumerate_users"] = ToolDefinition( + name="enumerate_users", + description="List all user accounts and RIDs from a SAM hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SAM hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("enumerate_users", "registry", + lambda hive_path: reg.enumerate_users(hive_path), graph), + module="registry", + tags=["registry", "user", "accounts", "sam"], + ) + + TOOL_CATALOG["get_network_interfaces"] = ToolDefinition( + name="get_network_interfaces", + description="Extract network adapter and TCP/IP config from a SYSTEM hive.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to SYSTEM hive."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_network_interfaces", "registry", + lambda hive_path: reg.get_network_interfaces(hive_path), graph), + module="registry", + tags=["registry", "network", "adapter", 
"ip"], + ) + + TOOL_CATALOG["get_email_config"] = ToolDefinition( + name="get_email_config", + description="Extract email account configuration (SMTP, POP3, NNTP) from NTUSER.DAT.", + input_schema={ + "type": "object", + "properties": { + "hive_path": {"type": "string", "description": "Path to NTUSER.DAT."}, + }, + "required": ["hive_path"], + }, + executor=_make_auto_record("get_email_config", "registry", + lambda hive_path: reg.get_email_config(hive_path), graph), + module="registry", + tags=["registry", "email", "account"], + ) + + # ---- Parser tools ---- + + TOOL_CATALOG["parse_prefetch"] = ToolDefinition( + name="parse_prefetch", + description="Parse a Windows Prefetch (.pf) file to extract executable name, last execution time, and run count.", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Path to extracted .pf file."}, + }, + "required": ["file_path"], + }, + executor=_make_auto_record("parse_prefetch", "filesystem", + lambda file_path: parsers.parse_prefetch(file_path), graph), + module="parsers", + tags=["filesystem", "prefetch", "execution"], + ) + + TOOL_CATALOG["read_text_file"] = ToolDefinition( + name="read_text_file", + description="Read an extracted text file (configs, logs, chat logs, etc.).", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Local path to the file."}, + }, + "required": ["file_path"], + }, + executor=lambda file_path: parsers.read_text_file(file_path), + module="parsers", + tags=["text", "read"], + ) + + TOOL_CATALOG["read_binary_preview"] = ToolDefinition( + name="read_binary_preview", + description="Preview a binary file in hex+ASCII format.", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Local path to the file."}, + }, + "required": ["file_path"], + }, + executor=lambda file_path: parsers.read_binary_preview(file_path), + module="parsers", + tags=["binary", 
"hex", "preview"], + ) + + TOOL_CATALOG["search_text_file"] = ToolDefinition( + name="search_text_file", + description="Search for a regex pattern in an extracted text file. Returns matching lines with line numbers.", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Path to extracted file."}, + "pattern": {"type": "string", "description": "Regex pattern."}, + }, + "required": ["file_path", "pattern"], + }, + executor=lambda file_path, pattern: parsers.search_text_file(file_path, pattern), + module="parsers", + tags=["text", "search", "regex"], + ) + + TOOL_CATALOG["read_text_file_section"] = ToolDefinition( + name="read_text_file_section", + description="Read a section of a large text file starting at a byte offset.", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Path to file."}, + "start": {"type": "integer", "description": "Byte offset to start reading."}, + "max_bytes": {"type": "integer", "description": "Maximum bytes to read."}, + }, + "required": ["file_path"], + }, + executor=lambda file_path, start=0, max_bytes=8000: parsers.read_text_file_section( + file_path, start, max_bytes + ), + module="parsers", + tags=["text", "read", "section"], + ) + + TOOL_CATALOG["list_extracted_dir"] = ToolDefinition( + name="list_extracted_dir", + description="List files in an extracted directory with sizes.", + input_schema={ + "type": "object", + "properties": { + "dir_path": {"type": "string", "description": "Directory path."}, + }, + "required": ["dir_path"], + }, + executor=lambda dir_path: parsers.list_extracted_dir(dir_path), + module="parsers", + tags=["filesystem", "listing", "extracted"], + ) + + TOOL_CATALOG["parse_pcap_strings"] = ToolDefinition( + name="parse_pcap_strings", + description="Extract HTTP headers, hosts, User-Agent, cookies, and URLs from a PCAP/capture file.", + input_schema={ + "type": "object", + "properties": { + "file_path": {"type": 
"string", "description": "Path to PCAP file."}, + }, + "required": ["file_path"], + }, + executor=lambda file_path: parsers.parse_pcap_strings(file_path), + module="parsers", + tags=["network", "pcap", "http", "capture"], + ) + + # ---- Apply result caching to deterministic read-only tools ---- + # Must come AFTER all tools are registered. Auto-record wrapped tools + # (e.g. get_system_info) are NOT in CACHEABLE_TOOLS since they write + # to the evidence graph as a side effect. + _tool_result_cache.clear() + for tool_name, td in TOOL_CATALOG.items(): + if tool_name in CACHEABLE_TOOLS: + td.executor = _make_cached(tool_name, td.executor) diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/parsers.py b/tools/parsers.py new file mode 100644 index 0000000..186613a --- /dev/null +++ b/tools/parsers.py @@ -0,0 +1,234 @@ +"""Parsers for various forensic artifact formats.""" + +from __future__ import annotations + +import asyncio +import logging +import os +import re +import struct +from datetime import datetime, timedelta, timezone + +logger = logging.getLogger(__name__) + + +async def read_text_file(file_path: str, max_bytes: int = 8000) -> str: + """Read a text file, with size limit.""" + try: + with open(file_path, "r", errors="replace") as f: + content = f.read(max_bytes) + size = os.path.getsize(file_path) + if size > max_bytes: + content += f"\n\n[Truncated: file is {size} bytes, showing first {max_bytes}]" + return content + except Exception as e: + return f"[Error reading {file_path}: {e}]" + + +async def read_binary_preview(file_path: str, max_bytes: int = 2000) -> str: + """Read a binary file and show hex + ASCII preview.""" + try: + with open(file_path, "rb") as f: + data = f.read(max_bytes) + + lines = [] + for i in range(0, len(data), 16): + chunk = data[i:i + 16] + hex_part = " ".join(f"{b:02x}" for b in chunk) + ascii_part = "".join(chr(b) if 32 <= b < 127 else "." 
async def read_text_file_section(file_path: str, start: int = 0, max_bytes: int = 8000) -> str:
    """Read a section of a text file starting at byte offset `start`.

    The file is read in binary mode so that ``start`` and ``max_bytes`` really
    are byte quantities and the reported offsets and remaining count are exact.
    Seeking a text-mode file to an arbitrary offset is not supported by the I/O
    layer, and re-encoding decoded text does not give back the original byte
    count. The bytes are decoded as UTF-8 with undecodable sequences replaced,
    and line endings are normalised to ``\\n`` as text-mode reading did.

    Args:
        file_path: Local path of the file to read.
        start: Byte offset to start at; negative values are treated as 0.
        max_bytes: Maximum number of bytes to read.

    Returns:
        A header line with file size and byte range, the decoded section and,
        when more data follows, a note with the number of bytes remaining.
        On failure an ``[Error reading ...]`` message is returned, not raised.
    """
    try:
        size = os.path.getsize(file_path)
        offset = max(start, 0)
        with open(file_path, "rb") as f:
            f.seek(offset)
            raw = f.read(max_bytes)
        end = offset + len(raw)

        content = raw.decode("utf-8", errors="replace")
        content = content.replace("\r\n", "\n").replace("\r", "\n")

        header = f"[File: {file_path}, {size} bytes, showing offset {offset}–{end}]"
        remaining = size - end
        if remaining > 0:
            content += f"\n\n[{remaining} bytes remaining after this section]"
        return header + "\n" + content
    except Exception as e:
        # Tool boundary: failures are reported to the calling agent as text.
        return f"[Error reading {file_path}: {e}]"


async def search_text_file(file_path: str, pattern: str, max_matches: int = 50) -> str:
    """Search for a pattern in an extracted text file. Returns matching lines with line numbers.

    ``pattern`` is compiled as a case-insensitive regular expression; when it
    is not a valid regex it is searched for as a literal string instead.

    Args:
        file_path: Local path of the file to search.
        pattern: Regex (or literal fallback) to look for in each line.
        max_matches: Maximum number of matching lines to return.

    Returns:
        A header with the number of matches listed, followed by the matching
        lines (each cut to 200 characters). A truncation marker is appended
        only when at least one further match exists beyond ``max_matches``; it
        is not counted as a match. On failure an ``[Error searching ...]``
        message is returned, not raised.
    """
    try:
        size = os.path.getsize(file_path)
        try:
            compiled = re.compile(pattern, re.IGNORECASE)
        except re.error:
            compiled = re.compile(re.escape(pattern), re.IGNORECASE)

        matches = []
        truncated = False
        with open(file_path, "r", errors="replace") as f:
            for lineno, line in enumerate(f, 1):
                if not compiled.search(line):
                    continue
                if len(matches) >= max_matches:
                    # A further match exists, so the result really is truncated.
                    truncated = True
                    break
                matches.append(f" {lineno}: {line.rstrip()[:200]}")

        header = f"Search '{pattern}' in {file_path} ({size} bytes): {len(matches)} matches"
        if not matches and not truncated:
            return header + "\n (no matches)"
        body = list(matches)
        if truncated:
            body.append(f" [Truncated: more than {max_matches} matches]")
        return header + "\n" + "\n".join(body)
    except Exception as e:
        # Tool boundary: failures are reported to the calling agent as text.
        return f"[Error searching {file_path}: {e}]"
+ """ + try: + proc = await asyncio.create_subprocess_exec( + "srch_strings", "-a", "-n", "8", file_path, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + all_strings = stdout.decode("utf-8", errors="replace").splitlines() + + hosts = set() + user_agents = set() + urls = [] + cookies = [] + http_methods = [] + other_interesting = [] + + for line in all_strings: + stripped = line.strip() + if stripped.startswith("Host: "): + hosts.add(stripped[6:]) + elif stripped.startswith("User-Agent: "): + user_agents.add(stripped[12:]) + elif stripped.startswith("Cookie: "): + cookies.append(stripped[:200]) + elif re.match(r"^(GET|POST|PUT|DELETE|HEAD) /", stripped): + urls.append(stripped[:200]) + elif stripped.startswith("HTTP/"): + http_methods.append(stripped[:200]) + elif any(kw in stripped.lower() for kw in ("password", "login", "username", "email", "set-cookie")): + other_interesting.append(stripped[:200]) + + size = os.path.getsize(file_path) + lines = [f"=== PCAP String Analysis: {file_path} ({size} bytes) ==="] + lines.append(f"Total printable strings (>=8 chars): {len(all_strings)}") + + lines.append(f"\nUnique Hosts ({len(hosts)}):") + for h in sorted(hosts): + lines.append(f" {h}") + + lines.append(f"\nUser-Agent strings ({len(user_agents)}):") + for ua in sorted(user_agents): + lines.append(f" {ua}") + + lines.append(f"\nHTTP Requests ({len(urls)}):") + for u in urls[:30]: + lines.append(f" {u}") + if len(urls) > 30: + lines.append(f" ... 
({len(urls) - 30} more)") + + lines.append(f"\nHTTP Responses ({len(http_methods)}):") + for m in http_methods[:20]: + lines.append(f" {m}") + + if cookies: + lines.append(f"\nCookies ({len(cookies)}):") + for c in cookies[:20]: + lines.append(f" {c}") + + if other_interesting: + lines.append(f"\nOther interesting strings ({len(other_interesting)}):") + for o in other_interesting[:30]: + lines.append(f" {o}") + + return "\n".join(lines) + except Exception as e: + return f"[Error parsing PCAP strings: {e}]" + + +async def parse_prefetch(file_path: str) -> str: + """Parse a Windows XP Prefetch (.pf) file to extract execution info. + + Returns: executable name, last execution time, and run count. + """ + try: + with open(file_path, "rb") as f: + data = f.read() + + if len(data) < 0x94: + return f"[Error: file too small for Prefetch format ({len(data)} bytes)]" + + version = struct.unpack_from(" 0: + if name_end % 2 == 1: + name_end += 1 + filename = raw_name[:name_end].decode("utf-16-le") + else: + filename = raw_name.decode("utf-16-le", errors="replace").rstrip("\x00") + + # Last execution time: FILETIME at offset 0x78 (Windows XP, version 17) + ft = struct.unpack_from(" 0: + epoch = datetime(1601, 1, 1, tzinfo=timezone.utc) + last_run = epoch + timedelta(microseconds=ft // 10) + last_run_str = last_run.strftime("%Y-%m-%d %H:%M:%S UTC") + else: + last_run_str = "(not available)" + + # Run count at offset 0x90 + run_count = struct.unpack_from(" str: + """List files in an extracted directory.""" + try: + entries = [] + for root, dirs, files in os.walk(dir_path): + for f in files: + full = os.path.join(root, f) + rel = os.path.relpath(full, dir_path) + size = os.path.getsize(full) + entries.append(f" {rel} ({size} bytes)") + if len(entries) > 200: + entries.append(f" ... 
(truncated)") + break + + return f"Directory: {dir_path}\nFiles ({len(entries)}):\n" + "\n".join(entries) + except Exception as e: + return f"[Error listing {dir_path}: {e}]" diff --git a/tools/registry.py b/tools/registry.py new file mode 100644 index 0000000..6dc04be --- /dev/null +++ b/tools/registry.py @@ -0,0 +1,449 @@ +"""Windows registry parsing tools.""" + +from __future__ import annotations + +import logging +import struct +from datetime import datetime, timedelta, timezone + +logger = logging.getLogger(__name__) + +# Suppress noisy regipy warnings (hive-type identification + binary encoding fallbacks) +logging.getLogger("regipy.registry").setLevel(logging.WARNING) +logging.getLogger("regipy.utils").setLevel(logging.ERROR) + + +async def parse_registry_key(hive_path: str, key_path: str = "") -> str: + """Parse a registry hive and list subkeys/values at the given path. + + Uses regipy for pure-Python registry parsing. + """ + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed. Run: uv add regipy]" + + try: + reg = RegistryHive(hive_path) + if key_path: + key = reg.get_key(key_path) + else: + key = reg.root_key() + + lines = [f"Key: {key.path}", f"Timestamp: {key.header.last_modified}", ""] + + # Subkeys + subkeys = list(key.iter_subkeys()) + if subkeys: + lines.append(f"Subkeys ({len(subkeys)}):") + for sk in subkeys[:50]: + lines.append(f" {sk.name}") + if len(subkeys) > 50: + lines.append(f" ... ({len(subkeys) - 50} more)") + lines.append("") + + # Values + values = list(key.iter_values()) + if values: + lines.append(f"Values ({len(values)}):") + for v in values[:30]: + val_data = str(v.value) + if len(val_data) > 200: + val_data = val_data[:200] + "..." 
+ lines.append(f" {v.name} ({v.value_type}) = {val_data}") + + return "\n".join(lines) + except Exception as e: + return f"[Error parsing registry: {e}]" + + +async def list_installed_software(hive_path: str) -> str: + """List installed software from a SOFTWARE registry hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + + try: + reg = RegistryHive(hive_path) + uninstall_path = "\\Microsoft\\Windows\\CurrentVersion\\Uninstall" + key = reg.get_key(uninstall_path) + + programs = [] + for sk in key.iter_subkeys(): + name = sk.name + display_name = None + for v in sk.iter_values(): + if v.name == "DisplayName": + display_name = v.value + break + programs.append(display_name or name) + + lines = [f"Installed Software ({len(programs)} entries):", ""] + for p in sorted(programs): + lines.append(f" - {p}") + return "\n".join(lines) + except Exception as e: + return f"[Error listing software: {e}]" + + +async def get_user_activity(hive_path: str) -> str: + """Extract user activity indicators from NTUSER.DAT.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + + try: + reg = RegistryHive(hive_path) + lines = ["=== User Activity from NTUSER.DAT ===", ""] + + # Recent documents + try: + key = reg.get_key("\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\RecentDocs") + lines.append("Recent Documents:") + for v in key.iter_values(): + if v.name != "MRUListEx": + lines.append(f" {v.name}") + lines.append("") + except Exception: + lines.append("Recent Documents: [not found]") + + # Run MRU (commands typed in Run dialog) + try: + key = reg.get_key("\\Software\\Microsoft\\Windows\\CurrentVersion\\Explorer\\RunMRU") + lines.append("Run Dialog MRU:") + for v in key.iter_values(): + if v.name not in ("MRUList",): + lines.append(f" {v.name}: {v.value}") + lines.append("") + except Exception: + lines.append("Run MRU: [not found]") + + # Typed 
URLs + try: + key = reg.get_key("\\Software\\Microsoft\\Internet Explorer\\TypedURLs") + lines.append("Typed URLs:") + for v in key.iter_values(): + lines.append(f" {v.value}") + lines.append("") + except Exception: + lines.append("Typed URLs: [not found]") + + return "\n".join(lines) + except Exception as e: + return f"[Error analyzing user activity: {e}]" + + +def _filetime_to_datetime(ft: int) -> str: + """Convert a Windows FILETIME (100-nanosecond intervals since 1601-01-01) to ISO string.""" + if ft <= 0: + return "(not set)" + try: + epoch = datetime(1601, 1, 1, tzinfo=timezone.utc) + dt = epoch + timedelta(microseconds=ft // 10) + return dt.strftime("%Y-%m-%d %H:%M:%S UTC") + except (ValueError, OverflowError): + return f"(invalid FILETIME: {ft})" + + +async def get_system_info(software_hive_path: str) -> str: + """Extract OS version, install date, registered owner from SOFTWARE hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(software_hive_path) + key = reg.get_key("\\Microsoft\\Windows NT\\CurrentVersion") + data = {} + for v in key.iter_values(): + data[v.name] = v.value + + lines = ["=== System Information (SOFTWARE hive) ==="] + lines.append(f"Product Name: {data.get('ProductName', 'N/A')}") + lines.append(f"Current Version: {data.get('CurrentVersion', 'N/A')}") + lines.append(f"Build Number: {data.get('CurrentBuildNumber', 'N/A')}") + lines.append(f"CSD Version (Service Pack): {data.get('CSDVersion', 'None')}") + lines.append(f"Registered Owner: {data.get('RegisteredOwner', 'N/A')}") + lines.append(f"Registered Organization: {data.get('RegisteredOrganization', 'N/A')}") + lines.append(f"Product ID: {data.get('ProductId', 'N/A')}") + lines.append(f"System Root: {data.get('SystemRoot', 'N/A')}") + + install_epoch = data.get("InstallDate") + if install_epoch and isinstance(install_epoch, int): + install_dt = datetime.fromtimestamp(install_epoch, 
tz=timezone.utc) + lines.append(f"Install Date: {install_dt.strftime('%Y-%m-%d %H:%M:%S UTC')} (epoch: {install_epoch})") + else: + lines.append(f"Install Date: {install_epoch}") + + return "\n".join(lines) + except Exception as e: + return f"[Error: {e}]" + + +async def get_timezone_info(system_hive_path: str) -> str: + """Extract timezone settings from SYSTEM hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(system_hive_path) + key = reg.get_key("\\ControlSet001\\Control\\TimeZoneInformation") + data = {} + for v in key.iter_values(): + data[v.name] = v.value + + lines = ["=== Timezone Information (SYSTEM hive) ==="] + lines.append(f"Standard Name: {data.get('StandardName', 'N/A')}") + lines.append(f"Daylight Name: {data.get('DaylightName', 'N/A')}") + bias = data.get("Bias", "N/A") + if isinstance(bias, int): + hours = bias // 60 + lines.append(f"Bias: {bias} minutes (UTC{-hours:+d}:00)") + else: + lines.append(f"Bias: {bias}") + lines.append(f"Active Time Bias: {data.get('ActiveTimeBias', 'N/A')}") + return "\n".join(lines) + except Exception as e: + return f"[Error: {e}]" + + +async def get_computer_name(system_hive_path: str) -> str: + """Extract computer name from SYSTEM hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(system_hive_path) + lines = ["=== Computer Name (SYSTEM hive) ==="] + + for path_label, path in [ + ("ComputerName", "\\ControlSet001\\Control\\ComputerName\\ComputerName"), + ("ActiveComputerName", "\\ControlSet001\\Control\\ComputerName\\ActiveComputerName"), + ]: + try: + key = reg.get_key(path) + for v in key.iter_values(): + if v.name == "ComputerName": + lines.append(f"{path_label}: {v.value}") + except Exception: + pass + + # Also try Tcpip hostname + try: + key = reg.get_key("\\ControlSet001\\Services\\Tcpip\\Parameters") + for v in 
key.iter_values(): + if v.name in ("Hostname", "Domain", "NV Hostname"): + lines.append(f"TCP/IP {v.name}: {v.value}") + except Exception: + pass + + return "\n".join(lines) if len(lines) > 1 else "Computer name not found in SYSTEM hive." + except Exception as e: + return f"[Error: {e}]" + + +async def get_shutdown_time(system_hive_path: str) -> str: + """Extract last shutdown time from SYSTEM hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(system_hive_path) + lines = ["=== Shutdown Time (SYSTEM hive) ==="] + + try: + key = reg.get_key("\\ControlSet001\\Control\\Windows") + for v in key.iter_values(): + if v.name == "ShutdownTime": + raw = v.value + if isinstance(raw, bytes) and len(raw) >= 8: + ft = struct.unpack(" str: + """Enumerate all user accounts from SAM hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(sam_hive_path) + key = reg.get_key("\\SAM\\Domains\\Account\\Users\\Names") + + accounts = [] + for sk in key.iter_subkeys(): + accounts.append(sk.name) + + lines = [f"=== User Accounts (SAM hive) — {len(accounts)} total ==="] + for acct in accounts: + lines.append(f" - {acct}") + + # Try to get RIDs from the Users key + try: + users_key = reg.get_key("\\SAM\\Domains\\Account\\Users") + rid_entries = [] + for sk in users_key.iter_subkeys(): + if sk.name != "Names" and sk.name.startswith("0"): + rid = int(sk.name, 16) + rid_entries.append(f" RID {rid} (0x{sk.name})") + if rid_entries: + lines.append("\nUser RIDs:") + lines.extend(rid_entries) + except Exception: + pass + + return "\n".join(lines) + except Exception as e: + return f"[Error: {e}]" + + +async def get_network_interfaces(system_hive_path: str) -> str: + """Extract network adapter and TCP/IP configuration from SYSTEM hive.""" + try: + from regipy.registry import RegistryHive + except ImportError: + 
return "[Error: regipy not installed]" + try: + reg = RegistryHive(system_hive_path) + lines = ["=== Network Interfaces (SYSTEM hive) ==="] + + # Try TCP/IP interfaces + try: + key = reg.get_key("\\ControlSet001\\Services\\Tcpip\\Parameters\\Interfaces") + for sk in key.iter_subkeys(): + lines.append(f"\nInterface: {sk.name}") + for v in sk.iter_values(): + if v.name in ( + "IPAddress", "SubnetMask", "DefaultGateway", + "DhcpIPAddress", "DhcpSubnetMask", "DhcpDefaultGateway", + "DhcpServer", "NameServer", "Domain", "EnableDHCP", + ): + lines.append(f" {v.name} = {v.value}") + except Exception as e: + lines.append(f"TCP/IP Interfaces: {e}") + + # Try network adapter class + adapter_class = "\\ControlSet001\\Control\\Class\\{4D36E972-E325-11CE-BFC1-08002bE10318}" + try: + key = reg.get_key(adapter_class) + lines.append("\nNetwork Adapters:") + for sk in key.iter_subkeys(): + if sk.name == "Properties": + continue + desc = None + for v in sk.iter_values(): + if v.name == "DriverDesc": + desc = v.value + if desc: + lines.append(f" [{sk.name}] {desc}") + except Exception as e: + lines.append(f"Network Adapters: {e}") + + # Try NetworkCards + try: + key = reg.get_key("\\ControlSet001\\Control\\NetworkCards") + for sk in key.iter_subkeys(): + for v in sk.iter_values(): + if v.name == "Description": + lines.append(f" NetworkCard {sk.name}: {v.value}") + except Exception: + pass + + return "\n".join(lines) if len(lines) > 1 else "No network interface data found in SYSTEM hive." 
+ except Exception as e: + return f"[Error: {e}]" + + +async def get_email_config(ntuser_hive_path: str) -> str: + """Extract email account configuration (SMTP, POP3, NNTP) from NTUSER.DAT.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + try: + reg = RegistryHive(ntuser_hive_path) + lines = ["=== Email Account Configuration (NTUSER.DAT) ==="] + + try: + key = reg.get_key("\\Software\\Microsoft\\Internet Account Manager\\Accounts") + for sk in key.iter_subkeys(): + lines.append(f"\n--- Account: {sk.name} ---") + for v in sk.iter_values(): + # Skip binary password hash fields (but keep "Prompt for Password" flags) + if "Password" in v.name and "Prompt" not in v.name: + lines.append(f" {v.name} = [present, redacted]") + else: + lines.append(f" {v.name} = {v.value}") + except Exception as e: + lines.append(f"Internet Account Manager: {e}") + + return "\n".join(lines) + except Exception as e: + return f"[Error: {e}]" + + +async def search_registry(hive_path: str, pattern: str) -> str: + """Search for a pattern in registry key names and values.""" + try: + from regipy.registry import RegistryHive + except ImportError: + return "[Error: regipy not installed]" + + try: + reg = RegistryHive(hive_path) + pattern_lower = pattern.lower() + matches = [] + + for entry in reg.recurse_subkeys(as_json=True): + path = entry.path or "" + if pattern_lower in path.lower(): + matches.append(f"KEY: {path}") + if hasattr(entry, "values") and entry.values: + for v in entry.values: + name = v.get("name", "") + value = str(v.get("value", "")) + if pattern_lower in name.lower() or pattern_lower in value.lower(): + matches.append(f" {path}\\{name} = {value[:200]}") + + if len(matches) >= 50: + matches.append(f"[Truncated: more than 50 matches for '{pattern}']") + break + + if not matches: + return f"No registry entries matching '{pattern}' found." 
+ return "\n".join(matches) + except Exception as e: + return f"[Error searching registry: {e}]" diff --git a/tools/sleuthkit.py b/tools/sleuthkit.py new file mode 100644 index 0000000..f8d8650 --- /dev/null +++ b/tools/sleuthkit.py @@ -0,0 +1,229 @@ +"""Wrappers around The Sleuth Kit CLI tools for forensic disk image analysis.""" + +from __future__ import annotations + +import asyncio +import logging +import os +import tempfile + +logger = logging.getLogger(__name__) + +# Cache for srch_strings dump: keyed by image_path -> dump file path. +# srch_strings scans the entire image regardless of partition, so offset is irrelevant. +_strings_cache: dict[str, str] = {} + +# Max output bytes to return to the LLM to avoid context overflow +MAX_OUTPUT = 8000 + + +async def _run(cmd: list[str], max_output: int = MAX_OUTPUT) -> str: + """Run a command asynchronously and return stdout.""" + logger.debug("Running: %s", " ".join(cmd)) + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + output = stdout.decode("utf-8", errors="replace") + + if proc.returncode != 0: + err = stderr.decode("utf-8", errors="replace") + return f"[Command failed (rc={proc.returncode})]\n{err}\n{output}" + + if len(output) > max_output: + truncated = output[:max_output] + return truncated + f"\n\n[Output truncated: {len(output)} bytes total, showing first {max_output}]" + return output + + +async def partition_info(image_path: str) -> str: + """Get partition table layout using mmls.""" + return await _run(["mmls", image_path]) + + +async def filesystem_info(image_path: str, offset: int = 0) -> str: + """Get filesystem details using fsstat.""" + cmd = ["fsstat", "-o", str(offset), image_path] + return await _run(cmd) + + +async def list_directory( + image_path: str, + offset: int = 0, + inode: str | None = None, + recursive: bool = False, +) -> str: + """List directory contents using 
fls.""" + cmd = ["fls", "-o", str(offset)] + if recursive: + cmd.append("-r") + cmd.append(image_path) + if inode: + cmd.append(inode) + return await _run(cmd, max_output=16000) + + +async def extract_file( + image_path: str, + inode: str, + output_path: str, + offset: int = 0, +) -> str: + """Extract a file from the image using icat. + + Streams icat stdout directly to the output file to avoid loading + large files entirely into memory. + """ + import os + + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + + cmd = ["icat", "-o", str(offset), image_path, inode] + with open(output_path, "wb") as out_f: + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=out_f, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + + if proc.returncode != 0: + err = stderr.decode("utf-8", errors="replace") + # Clean up empty/partial file on failure + if os.path.exists(output_path): + os.unlink(output_path) + return f"[icat failed (rc={proc.returncode})]: {err}" + + size = os.path.getsize(output_path) + return f"Extracted {size} bytes to {output_path}" + + +async def find_file(image_path: str, inode: str, offset: int = 0) -> str: + """Find the filename for an inode using ffind.""" + cmd = ["ffind", "-o", str(offset), image_path, inode] + return await _run(cmd) + + +async def _ensure_strings_dump(image_path: str) -> str: + """Run srch_strings once and cache the output to a temp file. + + Returns the path to the cached dump file. Subsequent calls with the + same image_path reuse the existing file. srch_strings scans the entire + raw image — partition offset is irrelevant. + """ + cached = _strings_cache.get(image_path) + if cached and os.path.exists(cached): + return cached + + logger.info("Building strings dump for %s — this is a one-time cost", image_path) + import shlex + + # Write srch_strings output directly to a temp file to avoid holding + # the entire dump in memory. 
+ fd, dump_path = tempfile.mkstemp(prefix="strings_dump_", suffix=".txt") + os.close(fd) + + # -a = scan entire file, -t d = print decimal byte offset of each string + cmd_str = ( + f"srch_strings -a -t d {shlex.quote(image_path)} " + f"> {shlex.quote(dump_path)}" + ) + proc = await asyncio.create_subprocess_shell( + cmd_str, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + _, stderr = await proc.communicate() + + if proc.returncode != 0: + err = stderr.decode("utf-8", errors="replace") + logger.error("srch_strings failed (rc=%d): %s", proc.returncode, err) + # Fall back: don't cache, let search_strings do a direct pipe + os.unlink(dump_path) + return "" + + size_mb = os.path.getsize(dump_path) / (1024 * 1024) + logger.info("Strings dump ready: %s (%.1f MB)", dump_path, size_mb) + _strings_cache[image_path] = dump_path + return dump_path + + +async def search_strings( + image_path: str, + pattern: str, +) -> str: + """Search for string patterns in the image. + + On first call, builds a strings dump (one-time full scan). + Subsequent calls grep the cached dump — orders of magnitude faster. + """ + import shlex + + dump_path = await _ensure_strings_dump(image_path) + + if dump_path: + # Fast path: grep the cached dump file + cmd_str = ( + f"grep -i {shlex.quote(pattern)} {shlex.quote(dump_path)} | head -100" + ) + else: + # Fallback: direct pipe (cache build failed) + cmd_str = ( + f"srch_strings -a {shlex.quote(image_path)} " + f"| grep -i {shlex.quote(pattern)} | head -100" + ) + + proc = await asyncio.create_subprocess_shell( + cmd_str, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + output = stdout.decode("utf-8", errors="replace") + if not output.strip(): + return f"No strings matching '{pattern}' found." + return output[:16000] + + +async def count_deleted_files(image_path: str, offset: int = 0) -> str: + """List and count deleted files using fls -rd. 
Returns total count and extension breakdown.""" + cmd = ["fls", "-rd", "-o", str(offset), image_path] + output = await _run(cmd, max_output=64000) + + lines = output.strip().splitlines() + ext_counts: dict[str, int] = {} + exe_files = [] + total = 0 + for line in lines: + if not line.strip(): + continue + total += 1 + # Extract filename from fls output like "r/r * 1234: filename.ext" + parts = line.split(":", 1) + if len(parts) > 1: + fname = parts[1].strip() + ext = fname.rsplit(".", 1)[-1].lower() if "." in fname else "(no ext)" + ext_counts[ext] = ext_counts.get(ext, 0) + 1 + if ext in ("exe", "dll", "com", "bat", "cmd", "scr", "pif"): + exe_files.append(fname) + + result = [f"=== Deleted Files Summary ===", f"Total deleted entries: {total}"] + result.append(f"\nExecutable files ({len(exe_files)}):") + for e in exe_files[:50]: + result.append(f" {e}") + if len(exe_files) > 50: + result.append(f" ... ({len(exe_files) - 50} more)") + + result.append(f"\nExtension breakdown:") + for ext, count in sorted(ext_counts.items(), key=lambda x: -x[1])[:30]: + result.append(f" .{ext}: {count}") + + return "\n".join(result) + + +async def build_timeline(image_path: str, offset: int = 0) -> str: + """Build a MAC timeline using fls -m.""" + cmd = ["fls", "-m", "/", "-o", str(offset), "-r", image_path] + return await _run(cmd, max_output=32000) diff --git a/uv.lock b/uv.lock new file mode 100644 index 0000000..f74462f --- /dev/null +++ b/uv.lock @@ -0,0 +1,253 @@ +version = 1 +revision = 3 +requires-python = ">=3.14" + +[[package]] +name = "anyio" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622, upload-time = "2026-03-24T12:59:09.671Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, +] + +[[package]] +name = "certifi" +version = "2026.2.25" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/af/2d/7bf41579a8986e348fa033a31cdd0e4121114f6bce2457e8876010b092dd/certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7", size = 155029, upload-time = "2026-02-25T02:54:17.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "construct" +version = "2.10.70" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/77/8c84b98eca70d245a2a956452f21d57930d22ab88cbeed9290ca630cf03f/construct-2.10.70.tar.gz", hash = 
"sha256:4d2472f9684731e58cc9c56c463be63baa1447d674e0d66aeb5627b22f512c29", size = 86337, upload-time = "2023-11-29T08:44:49.545Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fb/08b3f4bf05da99aba8ffea52a558758def16e8516bc75ca94ff73587e7d3/construct-2.10.70-py3-none-any.whl", hash = "sha256:c80be81ef595a1a821ec69dc16099550ed22197615f4320b57cc9ce2a672cb30", size = 63020, upload-time = "2023-11-29T08:44:46.876Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + +[package.optional-dependencies] +socks = [ + { name = "socksio" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "inflection" +version = "0.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/7e/691d061b7329bc8d54edbf0ec22fbfb2afe61facb681f9aaa9bff7a27d04/inflection-0.5.1.tar.gz", hash = "sha256:1a29730d366e996aaacffb2f1f1cb9593dc38e2ddd30c91250c6dde09ea9b417", size = 15091, upload-time = "2020-08-22T08:16:29.139Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/59/91/aa6bde563e0085a02a435aa99b49ef75b0a4b062635e606dab23ce18d720/inflection-0.5.1-py2.py3-none-any.whl", hash = "sha256:f38b2b640938a4f35ade69ac3d053042959b62a0f1076a5bbaa1b9526605a8a2", size = 9454, upload-time = "2020-08-22T08:16:27.816Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "masforensics" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "httpx", extra = ["socks"] }, + { name = "pyyaml" }, + { name = "regipy" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, +] + +[package.metadata] +requires-dist = [ + { name = "httpx", extras = ["socks"], specifier = ">=0.28.1" }, + { name = "pyyaml" }, + { name = "regipy", specifier = ">=6.2.1" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "pytest-asyncio", specifier = ">=1.3.0" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry 
= "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, +] + +[[package]] +name = "pytz" +version = "2026.1.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/db/b8721d71d945e6a8ac63c0fc900b2067181dbb50805958d4d4661cf7d277/pytz-2026.1.post1.tar.gz", hash = "sha256:3378dde6a0c3d26719182142c56e60c7f9af7e968076f31aae569d72a0358ee1", size = 321088, upload-time = "2026-03-03T07:47:50.683Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/99/781fe0c827be2742bcc775efefccb3b048a3a9c6ce9aec0cbf4a101677e5/pytz-2026.1.post1-py2.py3-none-any.whl", hash = "sha256:f2fd16142fda348286a75e1a524be810bb05d444e5a081f37f7affc635035f7a", size = 510489, upload-time = "2026-03-03T07:47:49.167Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = 
"2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = 
"2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = 
"2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + +[[package]] +name = "regipy" +version = "6.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "construct" }, + { name = "inflection" }, + { name = "pytz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/80/dd0a588b1762c9e1016f96ae59e3c984269cbcc94ca7b63a3d097bb96416/regipy-6.2.1.tar.gz", hash = "sha256:4e09623cdeb23ba4ad9bd73a0f107c9c60aab2fe9a5dea0ba48c71af1e070dfd", size = 101711, upload-time = "2026-01-22T15:26:06.905Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/eb/db13ab9b8d54e04f42b6619acca417ee37b07eb141a54884d13d20d7459e/regipy-6.2.1-py3-none-any.whl", hash = "sha256:b03110e5c4e12385e1ba53c032ccd120c6dcde1b71afb8c3b7aa4717a5a24e43", size = 
134861, upload-time = "2026-01-22T15:26:05.653Z" }, +] + +[[package]] +name = "socksio" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/5c/48a7d9495be3d1c651198fd99dbb6ce190e2274d0f28b9051307bdec6b85/socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac", size = 19055, upload-time = "2020-04-17T15:50:34.664Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/37/c3/6eeb6034408dac0fa653d126c9204ade96b819c936e136c5e8a6897eee9c/socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3", size = 12763, upload-time = "2020-04-17T15:50:31.878Z" }, +]