Initial commit: code, paper, small artifacts
This commit is contained in:
1
common/__init__.py
Normal file
1
common/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
pass
|
||||
136
common/data_contract.py
Normal file
136
common/data_contract.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
from typing import Sequence
|
||||
import numpy as np
|
||||
# Channel layout of a single packet token (last axis of a packet-token array).
PACKET_FEATURE_NAMES: tuple[str, ...] = (
    'log_size',
    'log_dt_ms',
    'direction',
    'tcp_syn',
    'tcp_fin',
    'tcp_rst',
    'tcp_psh',
    'tcp_ack',
    'log_win',
)
PACKET_D: int = len(PACKET_FEATURE_NAMES)

# Channels that apply_mixed_dequant z-scores: log_size, log_dt_ms, log_win.
PACKET_CONTINUOUS_CHANNEL_IDX: tuple[int, ...] = (0, 1, 8)
# Channels that apply_mixed_dequant perturbs with uniform noise: direction
# and the five TCP flag channels.
PACKET_BINARY_CHANNEL_IDX: tuple[int, ...] = (2, 3, 4, 5, 6, 7)

# Unprefixed aliases of the two index tuples above.
CONTINUOUS_CHANNEL_IDX = PACKET_CONTINUOUS_CHANNEL_IDX
BINARY_CHANNEL_IDX = PACKET_BINARY_CHANNEL_IDX

# Column order of the matrix returned by compute_flow_features_from_packets.
CANONICAL_FLOW_FEATURE_NAMES: tuple[str, ...] = (
    'log_duration',
    'log_n_pkts',
    'fwd_count',
    'bwd_count',
    'pkt_size_mean',
    'pkt_size_std',
    'pkt_size_max',
    'fwd_size_mean',
    'bwd_size_mean',
    'bwd_size_std',
    'iat_mean',
    'fwd_iat_max',
    'bwd_iat_max',
    'bwd_iat_std',
    'active_mean',
    'idle_mean',
    'log_pkts_per_s',
    'log_total_bytes',
    'ack_cnt',
    'syn_cnt',
)
FLOW_D: int = len(CANONICAL_FLOW_FEATURE_NAMES)

# Flow features that are raw counts; every other flow feature is treated as
# continuous.
FLOW_COUNT_FEATURE_NAMES: tuple[str, ...] = (
    'fwd_count',
    'bwd_count',
    'ack_cnt',
    'syn_cnt',
)
FLOW_COUNT_IDX: tuple[int, ...] = tuple(
    i for i in range(FLOW_D)
    if CANONICAL_FLOW_FEATURE_NAMES[i] in FLOW_COUNT_FEATURE_NAMES
)
FLOW_CONTINUOUS_IDX: tuple[int, ...] = tuple(
    i for i in range(FLOW_D) if i not in FLOW_COUNT_IDX
)

# Default inter-arrival gap (milliseconds) above which
# compute_flow_features_from_packets counts a gap as idle rather than active.
IDLE_THRESHOLD_MS: float = 1000.0

# Label spellings that normalize_label collapses onto BENIGN_TOKEN.
BENIGN_ALIASES: tuple[str, ...] = (
    'BENIGN',
    'Benign',
    'benign',
    'normal',
    'NORMAL',
    'Normal',
)
BENIGN_TOKEN: str = 'normal'
# Returned by normalize_label when the raw label is empty.
UNKNOWN_LABEL_TOKEN: str = 'unlabeled'
|
||||
|
||||
def normalize_label(raw: object) -> str:
    """Map a raw label value onto the canonical label vocabulary.

    Args:
        raw: label as found in the source data (any object).

    Returns:
        UNKNOWN_LABEL_TOKEN when the label is missing (None, a float NaN, or
        a string that is empty after stripping); BENIGN_TOKEN when the
        stripped text matches one of BENIGN_ALIASES regardless of letter
        case; otherwise the stripped text unchanged.
    """
    # Missing values must not be stringified: str(None) and str(nan) would
    # otherwise come back as the literal class names 'None' and 'nan'.
    if raw is None:
        return UNKNOWN_LABEL_TOKEN
    # NaN is the only float that compares unequal to itself.
    if isinstance(raw, (float, np.floating)) and raw != raw:
        return UNKNOWN_LABEL_TOKEN
    s = str(raw).strip()
    if not s:
        return UNKNOWN_LABEL_TOKEN
    # Compare upper-cased so every alias is matched case-insensitively, not
    # only 'BENIGN'.
    if s.upper() in {alias.upper() for alias in BENIGN_ALIASES}:
        return BENIGN_TOKEN
    return s
|
||||
|
||||
def canonical_5tuple(src_ip: object, src_port: object, dst_ip: object, dst_port: object, protocol: object) -> tuple[str, int, str, int, int]:
    """Return a direction-independent key for a flow.

    Ports and protocol are parsed through ``float`` and then ``int`` (so
    values such as ``'443.0'`` are accepted) and IPs are stringified. The two
    ``(ip, port)`` endpoints are then ordered by plain tuple comparison, so
    swapping source and destination yields the same result.

    Returns:
        ``(ip_lo, port_lo, ip_hi, port_hi, protocol)`` with the smaller
        endpoint first.
    """
    src_port_num = int(float(src_port))
    dst_port_num = int(float(dst_port))
    protocol_num = int(float(protocol))
    # sorted() places the smaller endpoint first; equal endpoints are
    # interchangeable, so the result does not depend on tie order.
    low, high = sorted(((str(src_ip), src_port_num), (str(dst_ip), dst_port_num)))
    return (*low, *high, protocol_num)
|
||||
|
||||
def fit_packet_stats(packet_tokens: np.ndarray, packet_lengths: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-channel mean and standard deviation over valid packets.

    A packet at position ``t`` of flow ``i`` is valid when
    ``t < packet_lengths[i]``; all other positions are ignored.

    Args:
        packet_tokens: array indexed as [flow, position, channel].
        packet_lengths: one length per flow.

    Returns:
        ``(mean, std)``, each float32 with one entry per channel. If no
        packet is valid, both are NaN (NumPy's result for an empty
        reduction).
    """
    tokens = np.asarray(packet_tokens)
    lengths = np.asarray(packet_lengths)
    n_steps = tokens.shape[1]
    valid_mask = np.arange(n_steps)[None, :] < lengths[:, None]
    valid = tokens[valid_mask]
    # Accumulate in float64: reducing many float32 rows in float32 loses
    # precision, and these statistics are reused to normalise every packet.
    mean = valid.mean(axis=0, dtype=np.float64)
    std = valid.std(axis=0, dtype=np.float64)
    return (mean.astype(np.float32), std.astype(np.float32))
|
||||
|
||||
def zscore(x: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """Standardise ``x`` as ``(x - mean) / std`` and return float32.

    ``std`` is floored at 1e-06 so constant channels do not divide by zero.
    """
    safe_std = np.maximum(std, 1e-06)
    centered = x - mean
    standardized = centered / safe_std
    return standardized.astype(np.float32)
|
||||
|
||||
def _stable_dequant_noise(shape: Sequence[int], seed: int, salt: str) -> np.ndarray:
|
||||
salt_int = sum(((i + 1) * ord(c) for (i, c) in enumerate(salt)))
|
||||
rng = np.random.default_rng(seed + salt_int)
|
||||
return rng.uniform(-0.5, 0.5, size=tuple(shape)).astype(np.float32)
|
||||
|
||||
def apply_mixed_dequant(packet_tokens: np.ndarray, packet_lengths: np.ndarray, mean: np.ndarray, std: np.ndarray, *, split_tag: str, seed: int) -> np.ndarray:
    """Normalise packet tokens channel-wise and zero the padded positions.

    Channels in PACKET_CONTINUOUS_CHANNEL_IDX are z-scored with the matching
    entries of ``mean`` and ``std``. Channels in PACKET_BINARY_CHANNEL_IDX
    receive additive noise from ``_stable_dequant_noise``, seeded by ``seed``
    and ``split_tag``. Positions at or beyond ``packet_lengths[i]`` are
    multiplied by zero.

    Returns:
        float32 array with the same shape as ``packet_tokens``.
    """
    continuous_idx = list(PACKET_CONTINUOUS_CHANNEL_IDX)
    binary_idx = list(PACKET_BINARY_CHANNEL_IDX)
    n_steps = packet_tokens.shape[1]

    dequantized = np.zeros_like(packet_tokens, dtype=np.float32)
    dequantized[..., continuous_idx] = zscore(
        packet_tokens[..., continuous_idx], mean[continuous_idx], std[continuous_idx]
    )

    flags = packet_tokens[..., binary_idx].astype(np.float32)
    noise = _stable_dequant_noise(flags.shape, seed, split_tag)
    dequantized[..., binary_idx] = flags + noise

    valid = np.arange(n_steps)[None, :] < packet_lengths[:, None]
    masked = dequantized * valid[:, :, None]
    return masked.astype(np.float32)
|
||||
|
||||
def compute_flow_features_from_packets(packet_tokens: np.ndarray, packet_lengths: np.ndarray, *, idle_threshold_ms: float = IDLE_THRESHOLD_MS) -> np.ndarray:
    """Derive the canonical per-flow feature matrix from padded packet tokens.

    Args:
        packet_tokens: array of shape [N, T, PACKET_D]. Channels 0, 1 and 2
            are read as log_size, log_dt_ms and direction; channels 3 and 7
            are summed as the SYN and ACK counts.
        packet_lengths: array of shape [N]; each value is clipped to [0, T]
            and only that many leading packets of the flow are used.
        idle_threshold_ms: inter-arrival gaps (after the first packet)
            strictly above this value count as idle, the rest as active.

    Returns:
        float32 array of shape [N, FLOW_D] with columns ordered as
        CANONICAL_FLOW_FEATURE_NAMES. Features that cannot be computed for a
        flow (for example backward statistics without backward packets) stay
        0, and flows of length 0 are all zeros.

    Raises:
        ValueError: if either input has an unexpected shape.
    """
    if packet_tokens.ndim != 3 or packet_tokens.shape[-1] != PACKET_D:
        raise ValueError(f'packet_tokens must be [N, T, {PACKET_D}], got {packet_tokens.shape}')
    if packet_lengths.ndim != 1 or packet_lengths.shape[0] != packet_tokens.shape[0]:
        raise ValueError(f'packet_lengths must be [N] matching packet_tokens, got {packet_lengths.shape}')

    n_flows, n_steps = packet_tokens.shape[:2]
    lens = np.clip(packet_lengths.astype(np.int64), 0, n_steps)
    features = np.zeros((n_flows, FLOW_D), dtype=np.float32)
    col = {name: j for j, name in enumerate(CANONICAL_FLOW_FEATURE_NAMES)}

    log_size = packet_tokens[..., 0].astype(np.float64)
    log_dt_ms = packet_tokens[..., 1].astype(np.float64)
    direction = packet_tokens[..., 2].astype(np.float64)
    # Linear-scale sizes; the log values are clipped before inverting log1p.
    sizes = np.expm1(np.clip(log_size, 0.0, 25.0))

    for row, n in enumerate(lens.tolist()):
        if n <= 0:
            continue
        feats = features[row]  # view: writes land directly in `features`

        flow_sizes = sizes[row, :n]
        gaps_ms = np.expm1(np.clip(log_dt_ms[row, :n], 0.0, 25.0))
        # direction < 0.5 is treated as forward, everything else as backward.
        is_fwd = direction[row, :n] < 0.5
        is_bwd = ~is_fwd
        n_fwd = int(is_fwd.sum())
        n_bwd = int(is_bwd.sum())

        duration_ms = float(gaps_ms.sum())
        feats[col['log_duration']] = np.log1p(max(duration_ms, 0.0))
        feats[col['log_n_pkts']] = np.log1p(n)
        feats[col['fwd_count']] = float(n_fwd)
        feats[col['bwd_count']] = float(n_bwd)

        # Size statistics are taken on the log-scale channel.
        flow_log_size = log_size[row, :n]
        feats[col['pkt_size_mean']] = float(flow_log_size.mean())
        feats[col['pkt_size_std']] = float(flow_log_size.std()) if n > 1 else 0.0
        feats[col['pkt_size_max']] = float(flow_log_size.max())
        if n_fwd > 0:
            feats[col['fwd_size_mean']] = float(flow_log_size[is_fwd].mean())
        if n_bwd > 0:
            feats[col['bwd_size_mean']] = float(flow_log_size[is_bwd].mean())
        if n_bwd > 1:
            feats[col['bwd_size_std']] = float(flow_log_size[is_bwd].std())

        # Inter-arrival statistics skip the first packet of the flow.
        if n > 1:
            feats[col['iat_mean']] = float(log_dt_ms[row, 1:n].mean())
        if n_fwd > 1:
            fwd_log_dt = log_dt_ms[row, 1:n][is_fwd[1:]]
            if fwd_log_dt.size > 0:
                feats[col['fwd_iat_max']] = float(fwd_log_dt.max())
        if n_bwd > 1:
            bwd_log_dt = log_dt_ms[row, 1:n][is_bwd[1:]]
            if bwd_log_dt.size > 0:
                feats[col['bwd_iat_max']] = float(bwd_log_dt.max())
            if bwd_log_dt.size > 1:
                feats[col['bwd_iat_std']] = float(bwd_log_dt.std())

        if n > 1:
            later_gaps_ms = gaps_ms[1:]
            is_idle = later_gaps_ms > idle_threshold_ms
            is_active = ~is_idle
            if is_active.any():
                feats[col['active_mean']] = float(np.log1p(later_gaps_ms[is_active].mean()))
            if is_idle.any():
                feats[col['idle_mean']] = float(np.log1p(later_gaps_ms[is_idle].mean()))

        duration_s = duration_ms / 1000.0
        if duration_s > 0:
            feats[col['log_pkts_per_s']] = float(np.log1p(n / duration_s))

        total_bytes = float(flow_sizes.sum())
        feats[col['log_total_bytes']] = float(np.log1p(max(total_bytes, 0.0)))

    # Flag counts are vectorised over all flows at once.
    features[:, col['ack_cnt']] = _masked_channel_sum(packet_tokens[..., 7], lens).astype(np.float32)
    features[:, col['syn_cnt']] = _masked_channel_sum(packet_tokens[..., 3], lens).astype(np.float32)
    return features
|
||||
|
||||
def _masked_channel_sum(channel: np.ndarray, lens: np.ndarray) -> np.ndarray:
|
||||
T = channel.shape[1]
|
||||
mask = (np.arange(T)[None, :] < lens[:, None]).astype(np.float32)
|
||||
return (channel.astype(np.float32) * mask).sum(axis=1)
|
||||
# Public API of this module: constants first, then functions.
__all__ = [
    'PACKET_FEATURE_NAMES',
    'PACKET_D',
    'PACKET_CONTINUOUS_CHANNEL_IDX',
    'PACKET_BINARY_CHANNEL_IDX',
    'CONTINUOUS_CHANNEL_IDX',
    'BINARY_CHANNEL_IDX',
    'CANONICAL_FLOW_FEATURE_NAMES',
    'FLOW_D',
    'FLOW_COUNT_FEATURE_NAMES',
    'FLOW_COUNT_IDX',
    'FLOW_CONTINUOUS_IDX',
    'IDLE_THRESHOLD_MS',
    'BENIGN_ALIASES',
    'BENIGN_TOKEN',
    'UNKNOWN_LABEL_TOKEN',
    'normalize_label',
    'canonical_5tuple',
    'fit_packet_stats',
    'zscore',
    'apply_mixed_dequant',
    'compute_flow_features_from_packets',
]
|
||||
209
common/packet_store.py
Normal file
209
common/packet_store.py
Normal file
@@ -0,0 +1,209 @@
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import shutil
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Sequence
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
# Default number of flows stored per packet shard file.
DEFAULT_SHARD_SIZE = 100_000
|
||||
|
||||
def _as_index_array(indices: Sequence[int] | np.ndarray) -> np.ndarray:
|
||||
arr = np.asarray(indices, dtype=np.int64)
|
||||
if arr.ndim != 1:
|
||||
raise ValueError(f'indices must be 1-D, got shape {arr.shape}')
|
||||
return arr
|
||||
|
||||
class PacketShardWriter:
    """Incrementally write a packet shard store to a directory.

    Files produced under ``root``:

    * ``packets/shard-XXXXXX.npy``: float32 token arrays, at most
      ``shard_size`` flows each.
    * ``manifest.parquet``: ``flow_id``, ``shard_id``, ``row_in_shard`` and
      ``packet_length`` for every flow.
    * ``flows.parquet``: the caller-supplied per-flow frame with ``flow_id``
      inserted as the first column.
    * ``metadata.json``: written only by a successful ``close()``.

    The writer is a context manager. A clean exit calls ``close()``. An exit
    caused by an exception releases the open Parquet file handles but does
    not write ``metadata.json``, so the store is never marked complete after
    a failure.
    """

    def __init__(self, root: Path, *, shard_size: int = DEFAULT_SHARD_SIZE, T_full: int | None = None, D: int | None = None, overwrite: bool = False) -> None:
        """Create the store directory and an empty write buffer.

        Args:
            root: store directory. If it already exists it is deleted first
                when ``overwrite`` is true.
            shard_size: maximum number of flows per shard file.
            T_full: expected tokens per flow; taken from the first batch when
                None.
            D: expected channels per token; taken from the first batch when
                None.
            overwrite: allow replacing an existing ``root``.

        Raises:
            ValueError: if ``shard_size`` is not positive.
            FileExistsError: if ``root`` exists and ``overwrite`` is false.
        """
        self.root = Path(root)
        self.packet_dir = self.root / 'packets'
        self.shard_size = int(shard_size)
        if self.shard_size <= 0:
            raise ValueError('shard_size must be positive')
        if self.root.exists():
            if not overwrite:
                raise FileExistsError(f'{self.root} already exists')
            shutil.rmtree(self.root)
        self.packet_dir.mkdir(parents=True, exist_ok=True)
        self.T_full = T_full
        self.D = D
        self._n_flows = 0
        self._next_shard = 0
        self._pending_tokens: list[np.ndarray] = []
        self._pending_lengths: list[np.ndarray] = []
        self._pending_flows: list[pd.DataFrame] = []
        self._pending_n = 0
        self._manifest_writer: pq.ParquetWriter | None = None
        self._flows_writer: pq.ParquetWriter | None = None
        self._closed = False

    def add_batch(self, tokens: np.ndarray, lengths: np.ndarray, flows: pd.DataFrame) -> None:
        """Buffer a batch of flows, flushing a shard whenever one fills up.

        Args:
            tokens: array of shape [N, T, D]; converted to float32.
            lengths: array of shape [N]; converted to int32.
            flows: frame with N rows, aligned with ``tokens``.

        Raises:
            RuntimeError: if the writer has already been closed.
            ValueError: if shapes are inconsistent with each other or with
                the store's ``(T_full, D)``.
        """
        if self._closed:
            raise RuntimeError('cannot add_batch after close()')
        tokens = np.asarray(tokens, dtype=np.float32)
        lengths = np.asarray(lengths, dtype=np.int32)
        if tokens.ndim != 3:
            raise ValueError(f'tokens must be [N,T,D], got {tokens.shape}')
        if lengths.shape != (tokens.shape[0],):
            raise ValueError(f'lengths shape {lengths.shape} does not match N={tokens.shape[0]}')
        if len(flows) != tokens.shape[0]:
            raise ValueError(f'flows rows {len(flows)} does not match N={tokens.shape[0]}')
        if tokens.shape[0] == 0:
            return
        # The first non-empty batch fixes any dimension not given up front.
        if self.T_full is None:
            self.T_full = int(tokens.shape[1])
        if self.D is None:
            self.D = int(tokens.shape[2])
        if (tokens.shape[1], tokens.shape[2]) != (self.T_full, self.D):
            raise ValueError(f'tokens shape {tokens.shape[1:]} does not match store shape {(self.T_full, self.D)}')
        start = 0
        n = tokens.shape[0]
        while start < n:
            # A full buffer is flushed immediately, so `room` is always >= 1.
            room = self.shard_size - self._pending_n
            take = min(room, n - start)
            end = start + take
            self._pending_tokens.append(tokens[start:end])
            self._pending_lengths.append(lengths[start:end])
            self._pending_flows.append(flows.iloc[start:end].reset_index(drop=True))
            self._pending_n += take
            start = end
            if self._pending_n >= self.shard_size:
                self._flush()

    def close(self) -> None:
        """Flush buffered flows, close the Parquet files, write metadata.json.

        Calling ``close()`` on an already closed (or aborted) writer is a
        no-op. If the final flush fails, the Parquet file handles are
        released, no metadata is written and the error is re-raised.
        """
        if self._closed:
            return
        try:
            if self._pending_n:
                self._flush()
        except BaseException:
            self._abort()
            raise
        self._close_parquet_writers()
        meta = {
            'format': 'packet-shard-store-v1',
            'n_flows': int(self._n_flows),
            'T_full': int(self.T_full or 0),
            'D': int(self.D or 0),
            'dtype': 'float32',
            'shard_size': int(self.shard_size),
            'n_shards': int(self._next_shard),
            'packet_dir': 'packets',
            'shard_pattern': 'shard-{shard_id:06d}.npy',
        }
        (self.root / 'metadata.json').write_text(json.dumps(meta, indent=2) + '\n')
        self._closed = True

    def __enter__(self) -> 'PacketShardWriter':
        """Return the writer itself."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Finalize on a clean exit; release file handles on an error exit."""
        if exc_type is None:
            self.close()
            return
        try:
            self._abort()
        except Exception:
            # Cleanup is best-effort here: the exception already in flight is
            # the one the caller needs to see.
            pass

    def _close_parquet_writers(self) -> None:
        """Close both Parquet writers, attempting the second even if the first fails."""
        try:
            if self._manifest_writer is not None:
                self._manifest_writer.close()
        finally:
            if self._flows_writer is not None:
                self._flows_writer.close()

    def _abort(self) -> None:
        """Stop the writer after a failure without publishing metadata.json."""
        # Mark closed first so no further batch can be appended to a store
        # that is known to be incomplete.
        self._closed = True
        self._pending_tokens.clear()
        self._pending_lengths.clear()
        self._pending_flows.clear()
        self._pending_n = 0
        self._close_parquet_writers()

    def _flush(self) -> None:
        """Write all buffered flows as one shard plus its manifest and flow rows."""
        tokens = np.concatenate(self._pending_tokens, axis=0)
        lengths = np.concatenate(self._pending_lengths, axis=0)
        flows = pd.concat(self._pending_flows, ignore_index=True)
        n = int(tokens.shape[0])
        shard_id = self._next_shard
        rel_path = Path('packets') / f'shard-{shard_id:06d}.npy'
        np.save(self.root / rel_path, tokens, allow_pickle=False)
        # flow_id is the global, zero-based position of the flow in the store.
        flow_id = np.arange(self._n_flows, self._n_flows + n, dtype=np.uint64)
        manifest = pd.DataFrame({
            'flow_id': flow_id,
            'shard_id': np.full(n, shard_id, dtype=np.int32),
            'row_in_shard': np.arange(n, dtype=np.int32),
            'packet_length': lengths.astype(np.int32, copy=False),
        })
        # Any caller-provided flow_id is replaced by the store's own ids.
        if 'flow_id' in flows.columns:
            flows = flows.drop(columns=['flow_id'])
        flows.insert(0, 'flow_id', flow_id)
        self._write_parquet_chunk('manifest', manifest, self.root / 'manifest.parquet')
        self._write_parquet_chunk('flows', flows, self.root / 'flows.parquet')
        self._n_flows += n
        self._next_shard += 1
        self._pending_tokens.clear()
        self._pending_lengths.clear()
        self._pending_flows.clear()
        self._pending_n = 0

    def _write_parquet_chunk(self, kind: str, df: pd.DataFrame, path: Path) -> None:
        """Append ``df`` to the ``kind`` Parquet file, opening it on first use.

        Raises:
            ValueError: if ``kind`` is neither 'manifest' nor 'flows'.
        """
        writer_attr = {'manifest': '_manifest_writer', 'flows': '_flows_writer'}.get(kind)
        if writer_attr is None:
            raise ValueError(kind)
        table = pa.Table.from_pandas(df, preserve_index=False)
        writer = getattr(self, writer_attr)
        if writer is None:
            # The schema of the first chunk becomes the schema of the file.
            writer = pq.ParquetWriter(path, table.schema, compression='snappy')
            setattr(self, writer_attr, writer)
        writer.write_table(table)
|
||||
|
||||
def write_packet_store_from_arrays(*, root: Path, tokens: np.ndarray, lengths: np.ndarray, flows: pd.DataFrame, shard_size: int = DEFAULT_SHARD_SIZE, overwrite: bool = False) -> None:
    """Write in-memory arrays to a packet shard store in a single call.

    The store's ``T_full`` and ``D`` are taken from ``tokens.shape[1]`` and
    ``tokens.shape[2]``. All rows are passed to the writer as one batch, and
    the writer is finalized when the ``with`` block exits cleanly.
    """
    n_steps = int(tokens.shape[1])
    n_channels = int(tokens.shape[2])
    writer = PacketShardWriter(
        root,
        shard_size=shard_size,
        T_full=n_steps,
        D=n_channels,
        overwrite=overwrite,
    )
    with writer:
        writer.add_batch(tokens, lengths, flows)
|
||||
|
||||
@dataclass
class PacketShardStore:
    """Read access to a packet shard store directory.

    Construct instances with ``PacketShardStore.open(root)``.
    """

    # Store directory.
    root: Path
    # Parsed contents of metadata.json.
    metadata: dict
    # Contents of manifest.parquet; row i describes flow_id i.
    manifest: pd.DataFrame

    @classmethod
    def open(cls, root: Path) -> 'PacketShardStore':
        """Load metadata and manifest of the store at ``root``.

        Raises:
            FileNotFoundError: if metadata.json, manifest.parquet or
                flows.parquet is missing (the missing path is the argument).
            ValueError: if the manifest's ``flow_id`` column is not exactly
                0, 1, 2, ... in row order.
        """
        root = Path(root)
        meta_path = root / 'metadata.json'
        manifest_path = root / 'manifest.parquet'
        flows_path = root / 'flows.parquet'
        for required in (meta_path, manifest_path, flows_path):
            if not required.exists():
                raise FileNotFoundError(required)
        metadata = json.loads(meta_path.read_text())
        manifest = pd.read_parquet(manifest_path)
        # read_packets indexes the manifest positionally by flow id, so the
        # ids must coincide with the row positions.
        expected = np.arange(len(manifest), dtype=np.uint64)
        actual = manifest['flow_id'].to_numpy(dtype=np.uint64)
        if not np.array_equal(actual, expected):
            raise ValueError('manifest flow_id must be sequential and row-aligned')
        return cls(root=root, metadata=metadata, manifest=manifest)

    @property
    def n_flows(self) -> int:
        """Number of flows, as recorded in the metadata."""
        return int(self.metadata['n_flows'])

    @property
    def T_full(self) -> int:
        """Stored tokens per flow, as recorded in the metadata."""
        return int(self.metadata['T_full'])

    @property
    def D(self) -> int:
        """Channels per token, as recorded in the metadata."""
        return int(self.metadata['D'])

    def shard_path(self, shard_id: int) -> Path:
        """Return the path of the ``.npy`` file holding shard ``shard_id``."""
        return self.root / 'packets' / f'shard-{int(shard_id):06d}.npy'

    def read_flows(self, columns: list[str] | None = None) -> pd.DataFrame:
        """Read flows.parquet, optionally restricted to ``columns``."""
        return pd.read_parquet(self.root / 'flows.parquet', columns=columns)

    def read_packets(self, indices: Sequence[int] | np.ndarray, *, T: int | None = None) -> tuple[np.ndarray, np.ndarray]:
        """Load the packet tokens of the requested flows.

        Args:
            indices: 1-D flow ids; order and duplicates are preserved in the
                output.
            T: number of leading tokens to return per flow; defaults to
                ``T_full``.

        Returns:
            ``(tokens, lengths)``: ``tokens`` is float32 of shape
            [len(indices), T, D] and ``lengths`` is int32, the stored packet
            lengths capped at ``T``.

        Raises:
            IndexError: if any index is outside ``[0, n_flows)``.
            ValueError: if ``T`` is negative or greater than ``T_full``; this
                is checked for empty ``indices`` as well.
        """
        idx = _as_index_array(indices)
        t = self.T_full if T is None else int(T)
        if len(idx) and (idx.min() < 0 or idx.max() >= self.n_flows):
            raise IndexError(f'indices out of range for n_flows={self.n_flows}')
        # Validate T before the empty-input shortcut so an invalid request
        # fails the same way regardless of how many flows are asked for.
        if t < 0:
            raise ValueError(f'requested T={t} must be non-negative')
        if t > self.T_full:
            raise ValueError(f'requested T={t} > T_full={self.T_full}')
        if len(idx) == 0:
            return (np.zeros((0, t, self.D), dtype=np.float32), np.zeros((0,), dtype=np.int32))

        rows = self.manifest.iloc[idx]
        # Pull the columns out once instead of once per shard.
        shard_ids = rows['shard_id'].to_numpy()
        rows_in_shard = rows['row_in_shard'].to_numpy(dtype=np.int64)
        lengths = np.minimum(rows['packet_length'].to_numpy(dtype=np.int32), t).astype(np.int32, copy=False)

        out = np.empty((len(idx), t, self.D), dtype=np.float32)
        pos = np.arange(len(idx), dtype=np.int64)
        for shard_id in rows['shard_id'].unique():
            mask = shard_ids == shard_id
            # Memory-map the shard so only the requested rows are read.
            arr = np.load(self.shard_path(int(shard_id)), mmap_mode='r')
            out[pos[mask]] = arr[rows_in_shard[mask], :t, :]
        return (out, lengths)
|
||||
|
||||
def is_packet_store(path: Path) -> bool:
    """Report whether ``path`` holds every file ``PacketShardStore.open`` requires.

    That is metadata.json, manifest.parquet and flows.parquet; file contents
    are not inspected.
    """
    root = Path(path)
    # Same three files open() checks, so a True result never leads to a
    # FileNotFoundError from open().
    required = ('metadata.json', 'manifest.parquet', 'flows.parquet')
    return all((root / name).exists() for name in required)
|
||||
|
||||
def iter_store_roots(paths: Iterable[Path]) -> Iterable[Path]:
    """Lazily yield, as ``Path`` objects, the entries of ``paths`` that pass ``is_packet_store``."""
    yield from (Path(candidate) for candidate in paths if is_packet_store(candidate))
|
||||
Reference in New Issue
Block a user