Initial commit: ER-TP-DGP research prototype
Event-Reified Temporal Provenance Dual-Granularity Prompting for LLM-based APT detection on DARPA provenance datasets. Includes phase 0-14 method spec, IR/graph/metapath/trimming/prompt modules, scripts for THEIA candidate universe, landmark CSG construction, hybrid prompting, and LLM inference. Excludes data/, reports/, and local LLM config from version control.
This commit is contained in:
236
tests/test_community_to_subgraph.py
Normal file
236
tests/test_community_to_subgraph.py
Normal file
@@ -0,0 +1,236 @@
|
||||
"""Tests for community_to_subgraph (Phase 14 → v0.1 fine-grained bridge)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from er_tp_dgp.community_to_subgraph import build_community_subgraphs
|
||||
from er_tp_dgp.landmark import build_landmark_graph, compute_landmark_communities
|
||||
|
||||
|
||||
PREFIX = "com.bbn.tc.schema.avro.cdm18."
|
||||
|
||||
|
||||
def _wrap(record_type, payload):
|
||||
return {"datum": {PREFIX + record_type: payload}}
|
||||
|
||||
|
||||
def _make_synthetic_jsonl(path: Path) -> None:
    """Write the synthetic mini-attack fixture to *path*, one JSON record per line.

    This is the same mini-attack used by test_landmark.py, so the bridge is
    validated against landmark output that other tests already trust.
    Records are serialized with ``sort_keys=True`` and the file ends with a
    trailing newline.
    """

    def uuid_ref(uuid):
        # Events point at subjects/objects through a prefixed "UUID" key.
        return {PREFIX + "UUID": uuid}

    def subject(uuid, exe_path, cmd_line):
        body = {
            "uuid": uuid,
            "type": "SUBJECT_PROCESS",
            "hostId": "host-1",
            "properties": {"map": {"path": exe_path}},
            "cmdLine": {"string": cmd_line},
        }
        return _wrap("Subject", body)

    def netflow(uuid, remote_address, remote_port, local_port):
        body = {
            "uuid": uuid,
            "remoteAddress": remote_address,
            "remotePort": remote_port,
            "localAddress": "10.0.0.10",
            "localPort": local_port,
        }
        return _wrap("NetFlowObject", body)

    def file_object(uuid, file_path):
        body = {
            "uuid": uuid,
            "baseObject": {"properties": {"map": {"path": file_path}}},
        }
        return _wrap("FileObject", body)

    def event(uuid, event_type, second, actor_uuid, object_uuid):
        body = {
            "uuid": uuid,
            "type": event_type,
            # Events are spaced exactly one second apart.
            "timestampNanos": second * 1_000_000_000,
            "subject": uuid_ref(actor_uuid),
            "predicateObject": uuid_ref(object_uuid),
        }
        return _wrap("Event", body)

    records = [
        subject("subj-attacker", "/tmp/dropper", "/tmp/dropper --foo"),
        subject("subj-child", "/tmp/payload", "/tmp/payload --beacon"),
        subject("subj-sshd", "/usr/sbin/sshd", "/usr/sbin/sshd -D"),
        netflow("flow-incoming", "192.168.1.5", 4444, 5555),
        netflow("flow-c2", "8.8.4.4", 443, 50001),
        file_object("file-payload", "/tmp/payload"),
        file_object("file-sshd-cfg", "/etc/ssh/sshd_config"),
        event("evt-recv", "EVENT_RECVFROM", 1, "subj-attacker", "flow-incoming"),
        event("evt-write", "EVENT_WRITE", 2, "subj-attacker", "file-payload"),
        event("evt-fork", "EVENT_FORK", 3, "subj-attacker", "subj-child"),
        event("evt-exec", "EVENT_EXECUTE", 4, "subj-child", "file-payload"),
        event("evt-c2", "EVENT_CONNECT", 5, "subj-child", "flow-c2"),
        event("evt-sshd-read", "EVENT_READ", 6, "subj-sshd", "file-sshd-cfg"),
    ]

    serialized = [json.dumps(record, sort_keys=True) for record in records]
    path.write_text("\n".join(serialized) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
class CommunitySubgraphBridgeTests(unittest.TestCase):
    """Exercise ``build_community_subgraphs`` on the synthetic mini-attack fixture."""

    def _build_fixture_communities(self):
        """Write the fixture and return ``(theia_path, communities)``.

        The temporary directory is removed via ``addCleanup`` so the file is
        still on disk when ``build_community_subgraphs`` is called later in
        the test, and is deleted even if an assertion fails.
        """
        tmp = TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        theia = Path(tmp.name) / "synthetic.json"
        _make_synthetic_jsonl(theia)
        landmarks, edges, _ = build_landmark_graph([theia])
        communities = compute_landmark_communities(landmarks, edges)
        # Report an empty result as a readable failure rather than letting
        # ``communities[0]`` raise IndexError in the calling test.
        self.assertTrue(communities, "fixture produced no landmark communities")
        return theia, communities

    def test_bridge_materializes_v01_subgraph_for_attack_community(self):
        theia, communities = self._build_fixture_communities()
        self.assertEqual(len(communities), 1)
        community = communities[0]

        subgraphs = build_community_subgraphs([community], [theia])

        self.assertIn(community.community_id, subgraphs)
        sub = subgraphs[community.community_id]

        # Bridge must produce a non-empty fine-grained subgraph.
        self.assertGreater(len(sub.events), 0)
        self.assertGreater(len(sub.entities), 0)

        # The attacker and child/payload names from the fixture must be in.
        entity_names = {e.stable_name for e in sub.entities}
        self.assertTrue(
            any("dropper" in n for n in entity_names),
            f"missing attacker subject; got {sorted(entity_names)}",
        )
        self.assertTrue(
            any("payload" in n for n in entity_names),
            f"missing child/payload entity; got {sorted(entity_names)}",
        )
        # The benign sshd subject must NOT be in the community subgraph.
        self.assertFalse(
            any("sshd" in n for n in entity_names),
            f"sshd leaked into attack community; got {sorted(entity_names)}",
        )

        # Every landmark event id should resolve in the subgraph (so
        # downstream evidence_path_ids referencing landmarks are valid).
        event_raw_ids = {e.raw_event_id for e in sub.events}
        for lm_id in community.landmark_event_ids:
            # subTest reports each missing landmark instead of only the first.
            with self.subTest(landmark=lm_id):
                self.assertIn(
                    lm_id,
                    event_raw_ids,
                    f"landmark {lm_id} missing from materialized subgraph",
                )

    def test_subgraph_to_provenance_graph_round_trip(self):
        theia, communities = self._build_fixture_communities()
        subgraphs = build_community_subgraphs(communities, [theia])

        sub = subgraphs[communities[0].community_id]
        graph = sub.to_graph()
        # Both edge views must be present (this is the "Event-Reified" core).
        self.assertGreater(len(graph.event_view_edges), 0)
        self.assertGreater(len(graph.causal_view_edges), 0)
        # Every event must have its actor entity resolved.
        for event in graph.events.values():
            self.assertIn(event.actor_entity_id, graph.entities)

    def test_truncation_flag_set_when_event_cap_hit(self):
        theia, communities = self._build_fixture_communities()
        subgraphs = build_community_subgraphs(
            communities, [theia], max_events_per_community=2
        )
        sub = subgraphs[communities[0].community_id]
        # The fixture has 5 events for the attack community subjects → cap=2 must truncate.
        self.assertTrue(sub.truncated)
        self.assertGreater(sub.raw_event_count_total, len(sub.events))
|
||||
|
||||
|
||||
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
174
tests/test_hybrid_prompt.py
Normal file
174
tests/test_hybrid_prompt.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Test the hybrid (community + v0.1 fine-grained) prompt builder."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from er_tp_dgp.community_to_subgraph import build_community_subgraphs
|
||||
from er_tp_dgp.hybrid_prompt import (
|
||||
HybridCommunityPromptBuilder,
|
||||
HybridPromptSwitches,
|
||||
)
|
||||
from er_tp_dgp.landmark import build_landmark_graph, compute_landmark_communities
|
||||
|
||||
|
||||
PREFIX = "com.bbn.tc.schema.avro.cdm18."
|
||||
|
||||
|
||||
def _wrap(record_type, payload):
|
||||
return {"datum": {PREFIX + record_type: payload}}
|
||||
|
||||
|
||||
def _make_synthetic_jsonl(path: Path) -> None:
    """Write the attack-only synthetic fixture to *path* as JSON lines.

    The stream holds two subjects, two network flows, one file object and
    five events with timestamps one second apart. Records are serialized
    with ``sort_keys=True`` and the file ends with a trailing newline.
    """
    # (uuid, executable path, command line)
    subject_rows = [
        ("subj-attacker", "/tmp/dropper", "/tmp/dropper --foo"),
        ("subj-child", "/tmp/payload", "/tmp/payload --beacon"),
    ]
    # (uuid, remote address, remote port, local port)
    flow_rows = [
        ("flow-incoming", "192.168.1.5", 4444, 5555),
        ("flow-c2", "8.8.4.4", 443, 50001),
    ]
    # (uuid, event type, acting subject, target object), in time order
    event_rows = [
        ("evt-recv", "EVENT_RECVFROM", "subj-attacker", "flow-incoming"),
        ("evt-write", "EVENT_WRITE", "subj-attacker", "file-payload"),
        ("evt-fork", "EVENT_FORK", "subj-attacker", "subj-child"),
        ("evt-exec", "EVENT_EXECUTE", "subj-child", "file-payload"),
        ("evt-c2", "EVENT_CONNECT", "subj-child", "flow-c2"),
    ]
    uuid_key = PREFIX + "UUID"

    records = [
        _wrap(
            "Subject",
            {
                "uuid": uuid,
                "type": "SUBJECT_PROCESS",
                "hostId": "host-1",
                "properties": {"map": {"path": exe_path}},
                "cmdLine": {"string": cmd_line},
            },
        )
        for uuid, exe_path, cmd_line in subject_rows
    ]
    records.extend(
        _wrap(
            "NetFlowObject",
            {
                "uuid": uuid,
                "remoteAddress": remote_address,
                "remotePort": remote_port,
                "localAddress": "10.0.0.10",
                "localPort": local_port,
            },
        )
        for uuid, remote_address, remote_port, local_port in flow_rows
    )
    records.append(
        _wrap(
            "FileObject",
            {
                "uuid": "file-payload",
                "baseObject": {"properties": {"map": {"path": "/tmp/payload"}}},
            },
        )
    )
    records.extend(
        _wrap(
            "Event",
            {
                "uuid": uuid,
                "type": event_type,
                # The n-th event occurs at n seconds, expressed in nanoseconds.
                "timestampNanos": second * 1_000_000_000,
                "subject": {uuid_key: actor},
                "predicateObject": {uuid_key: target},
            },
        )
        for second, (uuid, event_type, actor, target) in enumerate(event_rows, start=1)
    )

    lines = [json.dumps(record, sort_keys=True) for record in records]
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
class HybridPromptTests(unittest.TestCase):
    """Exercise ``HybridCommunityPromptBuilder`` on the synthetic fixture."""

    def setUp(self):
        self._tmp = TemporaryDirectory()
        # Register cleanup immediately: unittest skips tearDown() when
        # setUp() raises, but cleanups registered before the failure still
        # run, so the directory is removed even if graph construction fails.
        self.addCleanup(self._tmp.cleanup)
        self.tmp = Path(self._tmp.name)
        theia = self.tmp / "synthetic.json"
        _make_synthetic_jsonl(theia)
        self.theia = theia
        self.landmarks, self.edges, _ = build_landmark_graph([theia])
        self.communities = compute_landmark_communities(self.landmarks, self.edges)
        self.subgraphs = build_community_subgraphs(self.communities, [theia])

    def _build_bundle(self):
        """Build the prompt bundle for the first community with both
        summarization switches turned off."""
        builder = HybridCommunityPromptBuilder(
            landmarks_by_id={lm.event_id: lm for lm in self.landmarks},
            edges_by_id={e.edge_id: e for e in self.edges},
            switches=HybridPromptSwitches(
                use_text_summarization=False,
                use_path_summarization_llm=False,
            ),
        )
        # Surface an empty community list as a clear failure, not IndexError.
        self.assertTrue(self.communities, "fixture produced no landmark communities")
        community = self.communities[0]
        sub = self.subgraphs[community.community_id]
        return builder.build(community, sub)

    def test_build_returns_layered_prompt(self):
        bundle = self._build_bundle()

        # Prompt must include all three layers.
        self.assertIn("community_overview", bundle.prompt_text)
        self.assertIn("landmark_skeleton", bundle.prompt_text)
        self.assertIn("metapath_blocks", bundle.prompt_text)
        self.assertIn("Yes or No", bundle.prompt_text)

        # Metadata sanity.
        self.assertEqual(bundle.metadata["method"], "ER-TP-DGP-Hybrid")
        self.assertGreaterEqual(bundle.metadata["num_landmarks_in_prompt"], 1)
        self.assertGreaterEqual(bundle.metadata["subgraph_events_count"], 1)

        # Evidence path ids and selected landmark ids are exposed as tuples.
        # Only the container type is checked here, not the ids themselves.
        self.assertIsInstance(bundle.evidence_path_ids, tuple)
        self.assertIsInstance(bundle.selected_landmark_ids, tuple)
        # Landmark skeleton survived selection.
        self.assertGreater(len(bundle.selected_landmark_ids), 0)

    def test_no_ground_truth_in_prompt(self):
        bundle = self._build_bundle()
        prompt_lower = bundle.prompt_text.lower()
        for forbidden in ("atom_id", "ground_truth", "ground truth", "label_source", '"label":'):
            # subTest reports every leaked token, not just the first one hit.
            with self.subTest(forbidden=forbidden):
                self.assertNotIn(forbidden, prompt_lower)
|
||||
|
||||
|
||||
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
324
tests/test_landmark.py
Normal file
324
tests/test_landmark.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""Tests for the Landmark-Bridged Causal Story Graph."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from er_tp_dgp.landmark import (
|
||||
StreamingLandmarkGraphBuilder,
|
||||
build_landmark_graph,
|
||||
compute_landmark_communities,
|
||||
read_communities_jsonl,
|
||||
read_edges_jsonl,
|
||||
read_landmarks_jsonl,
|
||||
write_communities_jsonl,
|
||||
write_edges_jsonl,
|
||||
write_landmarks_jsonl,
|
||||
)
|
||||
from er_tp_dgp.landmark_prompt import (
|
||||
CommunityPromptSwitches,
|
||||
LandmarkCommunityPromptBuilder,
|
||||
)
|
||||
|
||||
|
||||
PREFIX = "com.bbn.tc.schema.avro.cdm18."
|
||||
|
||||
|
||||
def _wrap(record_type, payload):
|
||||
return {"datum": {PREFIX + record_type: payload}}
|
||||
|
||||
|
||||
def _make_synthetic_jsonl(path: Path) -> None:
    """Write the synthetic mini-attack to *path* as JSON lines.

    A process recv's, writes /tmp/payload, execs it, and the child connects
    to an external IP. A benign sshd also does a routine file read that
    should not produce a meaningful community. Records are serialized with
    ``sort_keys=True`` and the file ends with a trailing newline.
    """

    def ref(uuid):
        return {PREFIX + "UUID": uuid}

    def process(uuid, exe_path, cmd_line):
        return _wrap(
            "Subject",
            {
                "uuid": uuid,
                "type": "SUBJECT_PROCESS",
                "hostId": "host-1",
                "properties": {"map": {"path": exe_path}},
                "cmdLine": {"string": cmd_line},
            },
        )

    def flow(uuid, remote_address, remote_port, local_port):
        return _wrap(
            "NetFlowObject",
            {
                "uuid": uuid,
                "remoteAddress": remote_address,
                "remotePort": remote_port,
                "localAddress": "10.0.0.10",
                "localPort": local_port,
            },
        )

    def file_at(uuid, file_path):
        return _wrap(
            "FileObject",
            {
                "uuid": uuid,
                "baseObject": {"properties": {"map": {"path": file_path}}},
            },
        )

    def step(order, uuid, event_type, actor, target):
        # ``order`` doubles as the timestamp in seconds (stored as nanoseconds).
        return _wrap(
            "Event",
            {
                "uuid": uuid,
                "type": event_type,
                "timestampNanos": order * 1_000_000_000,
                "subject": ref(actor),
                "predicateObject": ref(target),
            },
        )

    entities = [
        process("subj-attacker", "/tmp/dropper", "/tmp/dropper --foo"),
        process("subj-child", "/tmp/payload", "/tmp/payload --beacon"),
        process("subj-sshd", "/usr/sbin/sshd", "/usr/sbin/sshd -D"),
        flow("flow-incoming", "192.168.1.5", 4444, 5555),
        flow("flow-c2", "8.8.4.4", 443, 50001),
        file_at("file-payload", "/tmp/payload"),
        file_at("file-sshd-cfg", "/etc/ssh/sshd_config"),
    ]
    timeline = [
        # 1) attacker recv from incoming flow
        step(1, "evt-recv", "EVENT_RECVFROM", "subj-attacker", "flow-incoming"),
        # 2) attacker writes /tmp/payload
        step(2, "evt-write", "EVENT_WRITE", "subj-attacker", "file-payload"),
        # 3) attacker forks child
        step(3, "evt-fork", "EVENT_FORK", "subj-attacker", "subj-child"),
        # 4) child execs the payload
        step(4, "evt-exec", "EVENT_EXECUTE", "subj-child", "file-payload"),
        # 5) child connects to external C2
        step(5, "evt-c2", "EVENT_CONNECT", "subj-child", "flow-c2"),
        # 6) sshd reads a config file (benign, NO landmark)
        step(6, "evt-sshd-read", "EVENT_READ", "subj-sshd", "file-sshd-cfg"),
    ]

    records = entities + timeline
    payload_text = "\n".join(json.dumps(record, sort_keys=True) for record in records)
    path.write_text(payload_text + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
class LandmarkGraphTests(unittest.TestCase):
    """Landmark graph, community, serialization and prompt checks on the
    synthetic mini-attack fixture."""

    def _write_fixture(self):
        """Write the fixture into a fresh temp dir and return its path.

        The directory is removed via ``addCleanup`` when the test ends,
        whether it passes or fails.
        """
        tmp = TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        theia = Path(tmp.name) / "synthetic.json"
        _make_synthetic_jsonl(theia)
        return theia

    @staticmethod
    def _reachable_event_ids(edges, start_event_id):
        """Return every event id reachable from *start_event_id* by following
        edges from ``src_event_id`` to ``dst_event_id`` (start included)."""
        successors = {}
        for edge in edges:
            successors.setdefault(edge.src_event_id, set()).add(edge.dst_event_id)
        reached = {start_event_id}
        pending = [start_event_id]
        while pending:
            current = pending.pop()
            for nxt in successors.get(current, ()):
                if nxt not in reached:
                    reached.add(nxt)
                    pending.append(nxt)
        return reached

    def test_streaming_builds_attack_story(self):
        theia = self._write_fixture()

        landmarks, edges, stats = build_landmark_graph([theia])

        # Landmark counts: suspicious_actor_path on attacker (evt-recv is its
        # first event), external_flow on evt-c2, write_then_execute and
        # process_creation on evt-exec, suspicious_object_path on the file
        # write/exec, recv_then_write on evt-write, process_creation on
        # evt-fork. evt-sshd-read must NOT be a landmark.
        ids = {lm.event_id for lm in landmarks}
        self.assertIn("evt-recv", ids)  # suspicious_actor_path
        self.assertIn("evt-write", ids)  # recv_then_write + suspicious_object_path
        self.assertIn("evt-fork", ids)  # process_creation
        self.assertIn("evt-exec", ids)  # write_then_execute + process_creation
        self.assertIn("evt-c2", ids)  # external_flow
        self.assertNotIn("evt-sshd-read", ids)

        # Every landmark must carry at least one class label.
        for lm in landmarks:
            self.assertTrue(lm.landmark_classes, f"missing classes on {lm.event_id}")

        # Edges should connect the attack story chronologically.
        self.assertGreater(len(edges), 0)
        for edge in edges:
            self.assertGreater(edge.delta_nanos, 0)
            self.assertEqual(edge.host_id, "host-1")

        # In a healthy CSG over this fixture, there must be at least one path
        # from evt-recv to evt-c2 (attack timeline).
        seen = self._reachable_event_ids(edges, "evt-recv")
        self.assertIn(
            "evt-c2",
            seen,
            f"attack story should propagate to evt-c2 via causal bridges, reached={sorted(seen)}",
        )

        # Stats sanity.
        self.assertEqual(stats.landmarks, len(landmarks))
        self.assertEqual(stats.edges, len(edges))
        self.assertGreater(stats.events_seen, 0)

    def test_communities_yield_one_attack_subgraph(self):
        theia = self._write_fixture()
        landmarks, edges, _ = build_landmark_graph([theia])

        communities = compute_landmark_communities(landmarks, edges)
        self.assertEqual(len(communities), 1)
        community = communities[0]
        self.assertGreaterEqual(len(community.landmark_event_ids), 4)
        self.assertIn("subj-attacker", community.subjects)
        self.assertIn("subj-child", community.subjects)
        self.assertNotIn("subj-sshd", community.subjects)
        self.assertGreater(community.span_seconds, 0)
        self.assertIn("write_then_execute", community.landmark_class_counts)

    def test_jsonl_roundtrip(self):
        theia = self._write_fixture()
        landmarks, edges, _ = build_landmark_graph([theia])
        communities = compute_landmark_communities(landmarks, edges)
        # Checked up front so the index-0 comparison below cannot IndexError.
        self.assertTrue(communities, "fixture produced no landmark communities")

        # Outputs go next to the fixture so they share its cleanup.
        out_dir = theia.parent
        lm_path = out_dir / "landmarks.jsonl"
        edge_path = out_dir / "edges.jsonl"
        com_path = out_dir / "communities.jsonl"
        write_landmarks_jsonl(landmarks, lm_path)
        write_edges_jsonl(edges, edge_path)
        write_communities_jsonl(communities, com_path)

        self.assertEqual(len(read_landmarks_jsonl(lm_path)), len(landmarks))
        self.assertEqual(len(read_edges_jsonl(edge_path)), len(edges))
        roundtrip_communities = read_communities_jsonl(com_path)
        self.assertEqual(len(roundtrip_communities), len(communities))
        self.assertEqual(
            roundtrip_communities[0].community_id, communities[0].community_id
        )

    def test_no_ground_truth_in_construction(self):
        """The community prompt must not mention ground-truth vocabulary.

        The graph and communities are built from the synthetic fixture,
        which contains no label or atom_id fields. The prompt generated for
        the resulting community is then scanned for label-related tokens,
        and must still reference the attack event ``evt-c2`` and ask a
        yes/no question.
        """
        theia = self._write_fixture()
        landmarks, edges, _ = build_landmark_graph([theia])
        communities = compute_landmark_communities(landmarks, edges)
        self.assertTrue(communities, "fixture produced no landmark communities")

        # Build a community-level prompt and verify it never mentions
        # "atom_id" / "ground_truth" / "label".
        landmarks_by_id = {lm.event_id: lm for lm in landmarks}
        edges_by_id = {edge.edge_id: edge for edge in edges}
        builder = LandmarkCommunityPromptBuilder(
            landmarks_by_id=landmarks_by_id,
            edges_by_id=edges_by_id,
            switches=CommunityPromptSwitches(max_landmarks_in_prompt=20),
        )
        bundle = builder.build(communities[0])
        prompt_lower = bundle.prompt_text.lower()
        for forbidden in ("atom_id", "ground_truth", "ground truth", "label_source", "label="):
            # subTest reports every leaked token, not just the first one hit.
            with self.subTest(forbidden=forbidden):
                self.assertNotIn(forbidden, prompt_lower)
        self.assertIn("evt-c2", bundle.prompt_text)
        self.assertIn("yes or no", prompt_lower)

    def test_streaming_progress_callback(self):
        # Imported locally, as in the original test, so these names are only
        # needed when this test runs.
        import contextlib
        from io import StringIO

        from er_tp_dgp.theia import iter_theia_records

        theia = self._write_fixture()
        builder = StreamingLandmarkGraphBuilder()
        buf = StringIO()
        # progress_every=5 is smaller than the 13-record fixture, so progress
        # output is expected to be exercised; stdout is captured so the test
        # stays quiet.
        with contextlib.redirect_stdout(buf):
            builder.feed_iterable(iter_theia_records([theia]), progress_every=5)
        text = buf.getvalue()
        # Either at least one progress line, or the stream was shorter than
        # the threshold — both are acceptable, but the print path must not
        # explode.
        if "[progress]" in text:
            self.assertIn("records=", text)
|
||||
|
||||
|
||||
# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
1365
tests/test_pipeline.py
Normal file
1365
tests/test_pipeline.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user