Initial commit: code, paper, small artifacts
This commit is contained in:
173
tests/common/test_data_contract.py
Normal file
173
tests/common/test_data_contract.py
Normal file
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pytest
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
||||
from common.data_contract import BENIGN_TOKEN, CANONICAL_FLOW_FEATURE_NAMES, FLOW_COUNT_FEATURE_NAMES, FLOW_COUNT_IDX, FLOW_CONTINUOUS_IDX, FLOW_D, IDLE_THRESHOLD_MS, PACKET_BINARY_CHANNEL_IDX, PACKET_CONTINUOUS_CHANNEL_IDX, PACKET_D, PACKET_FEATURE_NAMES, UNKNOWN_LABEL_TOKEN, apply_mixed_dequant, canonical_5tuple, compute_flow_features_from_packets, fit_packet_stats, normalize_label, zscore
|
||||
|
||||
def test_packet_schema_invariants():
    """Check the per-packet schema constants agree with each other.

    Asserts that there are 9 packet channels, that every channel has a
    unique name, and that the continuous and binary channel index sets
    partition ``range(PACKET_D)``.
    """
    assert PACKET_D == 9

    names = PACKET_FEATURE_NAMES
    assert len(names) == PACKET_D
    assert len(set(names)) == PACKET_D  # no duplicated channel names

    continuous_channels = set(PACKET_CONTINUOUS_CHANNEL_IDX)
    binary_channels = set(PACKET_BINARY_CHANNEL_IDX)
    # No channel is in both groups, and together they cover every index.
    assert continuous_channels.isdisjoint(binary_channels)
    assert continuous_channels.union(binary_channels) == set(range(PACKET_D))
|
||||
|
||||
def test_flow_schema_invariants():
    """Check the per-flow schema constants agree with each other.

    Asserts that there are 20 uniquely named flow features, that the
    count and continuous index sets partition ``range(FLOW_D)``, and that
    every count feature name is one of the canonical flow feature names.
    """
    assert FLOW_D == 20

    canonical = CANONICAL_FLOW_FEATURE_NAMES
    assert len(canonical) == FLOW_D
    assert len(set(canonical)) == FLOW_D  # no duplicated feature names

    count_idx = set(FLOW_COUNT_IDX)
    continuous_idx = set(FLOW_CONTINUOUS_IDX)
    assert count_idx.isdisjoint(continuous_idx)
    assert count_idx.union(continuous_idx) == set(range(FLOW_D))

    # Collect offenders so a failure names every missing feature at once.
    missing = [name for name in FLOW_COUNT_FEATURE_NAMES if name not in canonical]
    assert not missing
|
||||
|
||||
@pytest.mark.parametrize(
    ('raw', 'expected'),
    [
        # Benign spellings (case / surrounding whitespace / 'normal') share one token.
        ('BENIGN', BENIGN_TOKEN),
        ('Benign', BENIGN_TOKEN),
        ('benign', BENIGN_TOKEN),
        ('normal', BENIGN_TOKEN),
        (' BENIGN ', BENIGN_TOKEN),
        # A non-benign label is expected back unchanged.
        ('DrDoS_DNS', 'DrDoS_DNS'),
        # Empty or whitespace-only labels map to the unknown token.
        ('', UNKNOWN_LABEL_TOKEN),
        (' ', UNKNOWN_LABEL_TOKEN),
    ],
)
def test_normalize_label(raw, expected):
    """``normalize_label`` maps each raw label to its expected token."""
    normalized = normalize_label(raw)
    assert normalized == expected
|
||||
|
||||
def test_canonical_5tuple_direction_agnostic():
    """Swapping source and destination endpoints yields the same key."""
    client = ('10.0.0.1', 1234)
    server = ('10.0.0.2', 80)
    forward = canonical_5tuple(*client, *server, 6)
    reverse = canonical_5tuple(*server, *client, 6)
    assert forward == reverse
|
||||
|
||||
def test_canonical_5tuple_distinct_flows():
    """Flows that differ only in the second IP address get different keys."""
    to_host_2 = canonical_5tuple('10.0.0.1', 1234, '10.0.0.2', 80, 6)
    to_host_3 = canonical_5tuple('10.0.0.1', 1234, '10.0.0.3', 80, 6)
    assert to_host_2 != to_host_3
|
||||
|
||||
def test_canonical_5tuple_string_inputs():
    """String ports/protocol produce the same key as their integer forms.

    The integer call also has its endpoints swapped, so this additionally
    relies on the key being direction-agnostic.
    """
    from_strings = canonical_5tuple('10.0.0.1', '1234', '10.0.0.2', '80', '6')
    from_ints_reversed = canonical_5tuple('10.0.0.2', 80, '10.0.0.1', 1234, 6)
    assert from_strings == from_ints_reversed
|
||||
|
||||
def _make_packets(n_flows: int, T: int, n_real: int, seed: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Build a random ``(tokens, lens)`` packet fixture.

    Args:
        n_flows: Number of flows (first axis of ``tokens``).
        T: Padded sequence length (second axis of ``tokens``).
        n_real: Number of leading positions per flow that receive random
            values; positions from ``n_real`` onward stay zero.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        ``tokens`` as a float32 array of shape ``(n_flows, T, PACKET_D)`` and
        ``lens`` as an int32 array of shape ``(n_flows,)`` filled with
        ``n_real``.
    """
    rng = np.random.default_rng(seed)
    real_shape = (n_flows, n_real)

    # (channel, sampler) pairs. Order matters: draws are consumed from the
    # generator in this sequence, which fixes the values for a given seed.
    # Channels 4-6 are intentionally left at zero.
    channel_samplers = (
        (0, lambda: rng.uniform(0, 5, real_shape)),
        (1, lambda: rng.uniform(0, 3, real_shape)),
        (2, lambda: rng.integers(0, 2, real_shape)),
        (3, lambda: rng.integers(0, 2, real_shape)),
        (7, lambda: rng.integers(0, 2, real_shape)),
        (8, lambda: rng.uniform(0, 4, real_shape)),
    )

    tokens = np.zeros((n_flows, T, PACKET_D), dtype=np.float32)
    for channel, draw in channel_samplers:
        tokens[:, :n_real, channel] = draw()

    lens = np.full(n_flows, n_real, dtype=np.int32)
    return tokens, lens
|
||||
|
||||
def test_fit_packet_stats_ignores_padding():
    """Statistics fitted by ``fit_packet_stats`` are not skewed by padding.

    Positions past the 4 real packets are overwritten with 999.0; the
    fitted mean and std must still stay below 50 in every channel.
    """
    n_real = 4
    tokens, lens = _make_packets(8, 16, n_real, seed=1)
    tokens[:, n_real:, :] = 999.0  # poison the padded tail

    mean, std = fit_packet_stats(tokens, lens)

    bound = 50
    assert np.all(np.abs(mean) < bound)
    assert np.all(std < bound)
|
||||
|
||||
def test_zscore_basic():
    """``zscore`` subtracts the mean and divides by the std per column."""
    samples = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
    mu = np.array([2.0, 3.0], dtype=np.float32)
    sigma = np.array([1.0, 1.0], dtype=np.float32)

    expected = [[-1.0, -1.0], [1.0, 1.0]]
    assert np.allclose(zscore(samples, mu, sigma), expected)
|
||||
|
||||
def test_zscore_handles_zero_std():
    """``zscore`` returns only finite values when the std is all zeros."""
    constant_rows = np.full((3, 2), 1.0, dtype=np.float32)
    mu = np.full(2, 1.0, dtype=np.float32)
    zero_sigma = np.full(2, 0.0, dtype=np.float32)

    result = zscore(constant_rows, mu, zero_sigma)

    # A naive division would give NaN/inf here.
    assert np.all(np.isfinite(result))
|
||||
|
||||
def test_apply_mixed_dequant_zeros_padding():
    """Output of ``apply_mixed_dequant`` is exactly zero past each flow's length."""
    n_real = 3
    tokens, lens = _make_packets(4, 8, n_real, seed=2)
    mean, std = fit_packet_stats(tokens, lens)

    out = apply_mixed_dequant(tokens, lens, mean, std, split_tag='train', seed=0)

    padded_tail = out[:, n_real:, :]
    assert not np.any(padded_tail != 0.0)
|
||||
|
||||
def test_apply_mixed_dequant_stable_under_same_seed():
    """Two calls with identical inputs, split tag and seed give matching output."""
    tokens, lens = _make_packets(4, 8, 3, seed=3)
    mean, std = fit_packet_stats(tokens, lens)

    first, second = (
        apply_mixed_dequant(tokens, lens, mean, std, split_tag='train', seed=42)
        for _ in range(2)
    )

    assert np.allclose(first, second)
|
||||
|
||||
def test_apply_mixed_dequant_split_noise_differs():
    """With the same seed, 'train' and 'val' outputs differ on the binary channels."""
    tokens, lens = _make_packets(4, 8, 3, seed=4)
    mean, std = fit_packet_stats(tokens, lens)

    by_split = {
        tag: apply_mixed_dequant(tokens, lens, mean, std, split_tag=tag, seed=42)
        for tag in ('train', 'val')
    }

    # Compare only the binary channels.
    binary_channels = list(PACKET_BINARY_CHANNEL_IDX)
    train_binary = by_split['train'][..., binary_channels]
    val_binary = by_split['val'][..., binary_channels]
    assert not np.allclose(train_binary, val_binary)
|
||||
|
||||
def test_flow_features_shape():
    """Flow features are a float32 array with one ``FLOW_D``-wide row per flow."""
    n_flows = 5
    tokens, lens = _make_packets(n_flows, 16, 8, seed=5)

    feats = compute_flow_features_from_packets(tokens, lens)

    expected_shape = (n_flows, FLOW_D)
    assert feats.shape == expected_shape
    assert feats.dtype == np.float32
|
||||
|
||||
def test_flow_features_rejects_wrong_shape():
    """A token array with a last axis of 7 raises ``ValueError``."""
    wrong_width = 7
    malformed_tokens = np.zeros((3, 4, wrong_width), dtype=np.float32)
    lens = np.full(3, 2, dtype=np.int32)

    with pytest.raises(ValueError):
        compute_flow_features_from_packets(malformed_tokens, lens)
|
||||
|
||||
def test_flow_features_zero_for_empty_flow():
    """A zero-length flow gets an all-zero feature row; a populated one does not."""
    lens = np.array([0, 3], dtype=np.int32)
    tokens = np.zeros((2, 4, PACKET_D), dtype=np.float32)
    tokens[1, :3, 0] = 2.0  # give the second flow some non-zero content

    feats = compute_flow_features_from_packets(tokens, lens)

    empty_row, populated_row = feats[0], feats[1]
    assert np.all(empty_row == 0.0)
    assert np.any(populated_row != 0.0)
|
||||
|
||||
def test_flow_features_ack_syn_counts():
    """``ack_cnt`` / ``syn_cnt`` count flags over the first ``lens`` packets only.

    Per this test's expectations, channel 7 feeds ``ack_cnt`` and channel 3
    feeds ``syn_cnt``. Flags set at positions 3 and 4 lie beyond the flow
    length of 3 and must not be counted.
    """
    n_real = 3
    tokens = np.zeros((1, 5, PACKET_D), dtype=np.float32)

    # Real packets: channel 7 set on all three, channel 3 on the first only.
    tokens[0, :n_real, 7] = 1.0
    tokens[0, 0, 3] = 1.0
    # Padding: set both flags so any leak would inflate the counts.
    tokens[0, n_real:, 7] = 1.0
    tokens[0, n_real:, 3] = 1.0

    lens = np.array([n_real], dtype=np.int32)
    feats = compute_flow_features_from_packets(tokens, lens)

    expected_counts = {'ack_cnt': 3.0, 'syn_cnt': 1.0}
    for feature_name, expected in expected_counts.items():
        column = CANONICAL_FLOW_FEATURE_NAMES.index(feature_name)
        assert feats[0, column] == pytest.approx(expected)
|
||||
|
||||
def test_flow_features_fwd_bwd_counts():
    """Alternating 0/1 values in channel 2 over four packets give 2 fwd and 2 bwd."""
    direction_flags = np.array([0, 1, 0, 1])
    n_real = len(direction_flags)

    tokens = np.zeros((1, 6, PACKET_D), dtype=np.float32)
    tokens[0, :n_real, 2] = direction_flags
    lens = np.array([n_real], dtype=np.int32)

    feats = compute_flow_features_from_packets(tokens, lens)

    fwd_column = CANONICAL_FLOW_FEATURE_NAMES.index('fwd_count')
    bwd_column = CANONICAL_FLOW_FEATURE_NAMES.index('bwd_count')
    assert feats[0, fwd_column] == 2.0
    assert feats[0, bwd_column] == 2.0
|
||||
|
||||
def test_flow_features_active_vs_idle():
    """``active_mean`` and ``idle_mean`` are split by ``idle_threshold_ms``.

    Channel 1 is filled with ``log1p`` of 0, 100, 5000 and 200 (presumably
    inter-arrival times in ms; confirm against the data contract). With a
    threshold of 1000 the test expects ``active_mean == log1p(150)``, the
    mean of the 100 and 200 entries, and ``idle_mean == log1p(5000)``.
    """
    raw_gaps = [0.0, 100.0, 5000.0, 200.0]

    tokens = np.zeros((1, 4, PACKET_D), dtype=np.float32)
    tokens[0, :, 1] = np.log1p(raw_gaps)
    lens = np.array([4], dtype=np.int32)

    feats = compute_flow_features_from_packets(tokens, lens, idle_threshold_ms=1000.0)

    active_column = CANONICAL_FLOW_FEATURE_NAMES.index('active_mean')
    idle_column = CANONICAL_FLOW_FEATURE_NAMES.index('idle_mean')
    assert feats[0, active_column] == pytest.approx(np.log1p(150.0), rel=0.0001)
    assert feats[0, idle_column] == pytest.approx(np.log1p(5000.0), rel=0.0001)
|
||||
|
||||
def test_flow_features_padding_does_not_leak():
    """Flow features are unchanged when the padded tail is filled with 9999.0."""
    T = 12
    n_real = 5
    clean_tokens, lens = _make_packets(3, T, n_real, seed=6)
    baseline = compute_flow_features_from_packets(clean_tokens, lens)

    # Same real packets, but every position past the flow length is poisoned.
    poisoned_tokens = clean_tokens.copy()
    poisoned_tokens[:, n_real:, :] = 9999.0
    with_poison = compute_flow_features_from_packets(poisoned_tokens, lens)

    assert np.allclose(baseline, with_poison, atol=0.0001)
|
||||
|
||||
def test_flow_features_single_packet_graceful():
    """A one-packet flow gives finite features and an ``iat_mean`` of zero."""
    lens = np.array([1], dtype=np.int32)
    tokens = np.zeros((1, 4, PACKET_D), dtype=np.float32)
    tokens[0, 0, 0] = 3.0
    tokens[0, 0, 2] = 0  # explicit, although the array is already zero here

    feats = compute_flow_features_from_packets(tokens, lens)

    assert np.all(np.isfinite(feats))
    iat_mean_column = CANONICAL_FLOW_FEATURE_NAMES.index('iat_mean')
    assert feats[0, iat_mean_column] == 0.0
|
||||
Reference in New Issue
Block a user