Initial commit: code, paper, small artifacts

This commit is contained in:
2026-05-07 20:47:30 +08:00
commit fae2db8cff
322 changed files with 33159 additions and 0 deletions

1
common/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Package initializer for ``common``: the file consists of this single no-op
# statement and defines no names.
pass

136
common/data_contract.py Normal file
View File

@@ -0,0 +1,136 @@
from __future__ import annotations
from typing import Sequence
import numpy as np
# ---------------------------------------------------------------------------
# Per-packet token layout
# ---------------------------------------------------------------------------
# Channel order of a packet token; a name's position is its channel index.
PACKET_FEATURE_NAMES: tuple[str, ...] = (
    'log_size',
    'log_dt_ms',
    'direction',
    'tcp_syn',
    'tcp_fin',
    'tcp_rst',
    'tcp_psh',
    'tcp_ack',
    'log_win',
)
PACKET_D: int = len(PACKET_FEATURE_NAMES)

# Channels that apply_mixed_dequant z-scores (log_size, log_dt_ms, log_win) ...
PACKET_CONTINUOUS_CHANNEL_IDX: tuple[int, ...] = (0, 1, 8)
# ... and channels that receive uniform dequantisation noise instead
# (direction plus the five TCP flag channels).
PACKET_BINARY_CHANNEL_IDX: tuple[int, ...] = (2, 3, 4, 5, 6, 7)

# Short aliases bound to the very same tuple objects.
CONTINUOUS_CHANNEL_IDX = PACKET_CONTINUOUS_CHANNEL_IDX
BINARY_CHANNEL_IDX = PACKET_BINARY_CHANNEL_IDX

# ---------------------------------------------------------------------------
# Per-flow feature layout
# ---------------------------------------------------------------------------
# Column order of the flow feature vector built by
# compute_flow_features_from_packets.
CANONICAL_FLOW_FEATURE_NAMES: tuple[str, ...] = (
    'log_duration',
    'log_n_pkts',
    'fwd_count',
    'bwd_count',
    'pkt_size_mean',
    'pkt_size_std',
    'pkt_size_max',
    'fwd_size_mean',
    'bwd_size_mean',
    'bwd_size_std',
    'iat_mean',
    'fwd_iat_max',
    'bwd_iat_max',
    'bwd_iat_std',
    'active_mean',
    'idle_mean',
    'log_pkts_per_s',
    'log_total_bytes',
    'ack_cnt',
    'syn_cnt',
)
FLOW_D: int = len(CANONICAL_FLOW_FEATURE_NAMES)

# Flow features that are raw counts; every other feature is listed in
# FLOW_CONTINUOUS_IDX.  Both index tuples are in ascending column order.
FLOW_COUNT_FEATURE_NAMES: tuple[str, ...] = ('fwd_count', 'bwd_count', 'ack_cnt', 'syn_cnt')
FLOW_COUNT_IDX: tuple[int, ...] = tuple(
    column
    for column, feature in enumerate(CANONICAL_FLOW_FEATURE_NAMES)
    if feature in FLOW_COUNT_FEATURE_NAMES
)
FLOW_CONTINUOUS_IDX: tuple[int, ...] = tuple(
    column for column in range(FLOW_D) if column not in FLOW_COUNT_IDX
)

# Default idle threshold: compute_flow_features_from_packets treats an
# inter-arrival gap strictly greater than this many milliseconds as idle.
IDLE_THRESHOLD_MS: float = 1000.0

# ---------------------------------------------------------------------------
# Label vocabulary
# ---------------------------------------------------------------------------
# Spellings that normalize_label maps onto BENIGN_TOKEN.
BENIGN_ALIASES: tuple[str, ...] = (
    'BENIGN',
    'Benign',
    'benign',
    'normal',
    'NORMAL',
    'Normal',
)
BENIGN_TOKEN: str = 'normal'
# Returned by normalize_label for labels that are empty after stripping.
UNKNOWN_LABEL_TOKEN: str = 'unlabeled'
def normalize_label(raw: object) -> str:
    """Map a raw label value onto the canonical label vocabulary.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then:

    * an empty result becomes ``UNKNOWN_LABEL_TOKEN``;
    * any spelling listed in ``BENIGN_ALIASES``, or any casing of
      ``"benign"``, becomes ``BENIGN_TOKEN``;
    * everything else is returned as the stripped string, unchanged.
    """
    text = str(raw).strip()
    if text == '':
        return UNKNOWN_LABEL_TOKEN
    looks_benign = text.upper() == 'BENIGN' or text in BENIGN_ALIASES
    return BENIGN_TOKEN if looks_benign else text
def canonical_5tuple(src_ip: object, src_port: object, dst_ip: object, dst_port: object, protocol: object) -> tuple[str, int, str, int, int]:
    """Return a direction-independent 5-tuple key for a flow.

    Ports and protocol are parsed with ``int(float(...))`` so values such as
    ``"443.0"`` or ``6.0`` are accepted.  The two ``(ip, port)`` endpoints are
    ordered by plain tuple comparison (IP as a string, then port), so both
    directions of the same conversation produce the same key.

    Returns:
        ``(ip_a, port_a, ip_b, port_b, protocol)`` with endpoint ``a`` being
        the smaller of the two.
    """
    src_port_num = int(float(src_port))
    dst_port_num = int(float(dst_port))
    proto_num = int(float(protocol))
    # sorted() is stable, so identical endpoints keep source-first order.
    (low_ip, low_port), (high_ip, high_port) = sorted(
        ((str(src_ip), src_port_num), (str(dst_ip), dst_port_num))
    )
    return (low_ip, low_port, high_ip, high_port, proto_num)
def fit_packet_stats(packet_tokens: np.ndarray, packet_lengths: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Fit per-channel mean and standard deviation over valid packets only.

    A packet at position ``t`` of flow ``i`` is valid when
    ``t < packet_lengths[i]``; padded positions are excluded from the
    statistics.

    Args:
        packet_tokens: padded tokens, indexed ``[flow, position, ...]``.
        packet_lengths: number of valid positions per flow, shape ``[flow]``.

    Returns:
        ``(mean, std)`` as float32 arrays with the trailing (channel) shape of
        ``packet_tokens``.  When no position is valid, the identity
        normalisation ``(zeros, ones)`` is returned instead of NaNs.
    """
    n_steps = packet_tokens.shape[1]
    valid_mask = np.arange(n_steps)[None, :] < packet_lengths[:, None]
    valid = packet_tokens[valid_mask]
    if valid.shape[0] == 0:
        # mean()/std() of an empty selection are NaN (with a RuntimeWarning)
        # and would silently poison every value normalised with them.
        channel_shape = valid.shape[1:]
        return (np.zeros(channel_shape, dtype=np.float32), np.ones(channel_shape, dtype=np.float32))
    # Accumulate in float64: a float32 running sum over every valid packet
    # loses low-order bits once the total grows large.
    mean = valid.mean(axis=0, dtype=np.float64)
    std = valid.std(axis=0, dtype=np.float64)
    return (mean.astype(np.float32), std.astype(np.float32))
def zscore(x: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """Standardise ``x`` as ``(x - mean) / std`` and return float32.

    ``std`` is floored at ``1e-06`` so constant channels (zero standard
    deviation) do not cause a division by zero.
    """
    safe_std = np.maximum(std, 1e-06)
    centered = x - mean
    standardised = centered / safe_std
    return standardised.astype(np.float32)
def _stable_dequant_noise(shape: Sequence[int], seed: int, salt: str) -> np.ndarray:
salt_int = sum(((i + 1) * ord(c) for (i, c) in enumerate(salt)))
rng = np.random.default_rng(seed + salt_int)
return rng.uniform(-0.5, 0.5, size=tuple(shape)).astype(np.float32)
def apply_mixed_dequant(packet_tokens: np.ndarray, packet_lengths: np.ndarray, mean: np.ndarray, std: np.ndarray, *, split_tag: str, seed: int) -> np.ndarray:
    """Normalise packet tokens channel-wise and zero the padded positions.

    * Channels in ``PACKET_CONTINUOUS_CHANNEL_IDX`` are z-scored with the
      matching entries of ``mean`` / ``std``.
    * Channels in ``PACKET_BINARY_CHANNEL_IDX`` are cast to float32 and get
      uniform ``[-0.5, 0.5)`` noise added; the noise is a deterministic
      function of the array shape, ``seed`` and ``split_tag``.
    * Positions at or beyond ``packet_lengths[i]`` are multiplied by zero.

    Returns:
        float32 array with the same shape as ``packet_tokens``.
    """
    continuous_idx = list(PACKET_CONTINUOUS_CHANNEL_IDX)
    binary_idx = list(PACKET_BINARY_CHANNEL_IDX)

    transformed = np.zeros_like(packet_tokens, dtype=np.float32)
    transformed[..., continuous_idx] = zscore(
        packet_tokens[..., continuous_idx], mean[continuous_idx], std[continuous_idx]
    )

    raw_binary = packet_tokens[..., binary_idx].astype(np.float32)
    noise = _stable_dequant_noise(raw_binary.shape, seed, split_tag)
    transformed[..., binary_idx] = raw_binary + noise

    # Validity mask over [flow, position]; broadcast across channels below.
    n_steps = packet_tokens.shape[1]
    is_valid = np.arange(n_steps)[None, :] < packet_lengths[:, None]
    return (transformed * is_valid[:, :, None]).astype(np.float32)
def compute_flow_features_from_packets(packet_tokens: np.ndarray, packet_lengths: np.ndarray, *, idle_threshold_ms: float=IDLE_THRESHOLD_MS) -> np.ndarray:
    """Build the per-flow feature matrix from padded packet tokens.

    Reads channels 0, 1, 2, 3 and 7 of ``packet_tokens`` (``log_size``,
    ``log_dt_ms``, ``direction``, ``tcp_syn`` and ``tcp_ack`` in
    ``PACKET_FEATURE_NAMES`` order) and fills one row per flow, with columns
    ordered as ``CANONICAL_FLOW_FEATURE_NAMES``.

    Args:
        packet_tokens: array of shape ``[N, T, PACKET_D]``.
        packet_lengths: array of shape ``[N]``; values are clipped to
            ``[0, T]`` before use.
        idle_threshold_ms: inter-arrival gaps strictly greater than this
            (in the linear units recovered via ``expm1``) count as idle.

    Returns:
        float32 array of shape ``[N, FLOW_D]``.  Features whose guarding
        condition is not met stay at 0; flows with no valid packet keep an
        all-zero row.

    Raises:
        ValueError: if either input has the wrong rank or shape.

    NOTE(review): block nesting here was reconstructed from a copy of the
    file whose indentation was lost; placements that could not be derived
    from the code itself are marked below -- confirm against the repository.
    """
    if packet_tokens.ndim != 3 or packet_tokens.shape[-1] != PACKET_D:
        raise ValueError(f'packet_tokens must be [N, T, {PACKET_D}], got {packet_tokens.shape}')
    if packet_lengths.ndim != 1 or packet_lengths.shape[0] != packet_tokens.shape[0]:
        raise ValueError(f'packet_lengths must be [N] matching packet_tokens, got {packet_lengths.shape}')
    (N, T, _) = packet_tokens.shape
    lens = np.clip(packet_lengths.astype(np.int64), 0, T)
    out = np.zeros((N, FLOW_D), dtype=np.float32)
    idx_of = {name: i for (i, name) in enumerate(CANONICAL_FLOW_FEATURE_NAMES)}
    log_size = packet_tokens[..., 0].astype(np.float64)
    log_dt_ms = packet_tokens[..., 1].astype(np.float64)
    direction = packet_tokens[..., 2].astype(np.float64)
    # Linear sizes recovered with expm1 (presumably the channel is
    # log1p-encoded -- confirm with the tokeniser).  Clipping to [0, 25]
    # first bounds the result at expm1(25), about 7.2e10.
    sizes = np.expm1(np.clip(log_size, 0.0, 25.0))
    for i in range(N):
        n = int(lens[i])
        if n <= 0:
            continue
        sz = sizes[i, :n]
        dt = np.expm1(np.clip(log_dt_ms[i, :n], 0.0, 25.0))
        dir_arr = direction[i, :n]
        # direction < 0.5 is treated as forward, everything else as backward.
        fwd = dir_arr < 0.5
        bwd = ~fwd
        n_fwd = int(fwd.sum())
        n_bwd = int(bwd.sum())
        # Duration is the sum of ALL linear gaps, including position 0.
        duration_ms = float(dt.sum())
        out[i, idx_of['log_duration']] = np.log1p(max(duration_ms, 0.0))
        out[i, idx_of['log_n_pkts']] = np.log1p(n)
        out[i, idx_of['fwd_count']] = float(n_fwd)
        out[i, idx_of['bwd_count']] = float(n_bwd)
        # The size statistics below are taken over the log-domain channel
        # values (ls), not over the linear sizes in sz.
        ls = log_size[i, :n]
        out[i, idx_of['pkt_size_mean']] = float(ls.mean())
        out[i, idx_of['pkt_size_std']] = float(ls.std()) if n > 1 else 0.0
        out[i, idx_of['pkt_size_max']] = float(ls.max())
        if n_fwd > 0:
            out[i, idx_of['fwd_size_mean']] = float(ls[fwd].mean())
        if n_bwd > 0:
            out[i, idx_of['bwd_size_mean']] = float(ls[bwd].mean())
        if n_bwd > 1:
            out[i, idx_of['bwd_size_std']] = float(ls[bwd].std())
        # Inter-arrival statistics skip position 0 and, like the size
        # statistics, work on the log-domain channel values.
        if n > 1:
            ldt = log_dt_ms[i, 1:n]
            out[i, idx_of['iat_mean']] = float(ldt.mean())
        # n_fwd > 1 / n_bwd > 1 each imply n > 1, so these two blocks behave
        # the same whether or not they sit inside the ``if n > 1`` above.
        if n_fwd > 1:
            fwd_dt = log_dt_ms[i, 1:n][fwd[1:]]
            if fwd_dt.size > 0:
                out[i, idx_of['fwd_iat_max']] = float(fwd_dt.max())
        if n_bwd > 1:
            bwd_dt = log_dt_ms[i, 1:n][bwd[1:]]
            if bwd_dt.size > 0:
                out[i, idx_of['bwd_iat_max']] = float(bwd_dt.max())
            if bwd_dt.size > 1:
                out[i, idx_of['bwd_iat_std']] = float(bwd_dt.std())
        # Active / idle: split the linear gaps (again skipping position 0) at
        # idle_threshold_ms and store log1p of each group's mean.
        if n > 1:
            dt_linear = dt[1:]
            idle_mask = dt_linear > idle_threshold_ms
            active_mask = ~idle_mask
            if active_mask.any():
                out[i, idx_of['active_mean']] = float(np.log1p(dt_linear[active_mask].mean()))
            if idle_mask.any():
                out[i, idx_of['idle_mean']] = float(np.log1p(dt_linear[idle_mask].mean()))
        duration_s = duration_ms / 1000.0
        if duration_s > 0:
            out[i, idx_of['log_pkts_per_s']] = float(np.log1p(n / duration_s))
        # NOTE(review): assumed to run for every flow, i.e. NOT nested under
        # ``if duration_s > 0`` -- confirm against the repository.
        total_bytes = float(sz.sum())
        out[i, idx_of['log_total_bytes']] = float(np.log1p(max(total_bytes, 0.0)))
    # Flag counts are computed for all flows at once (assumed to sit after
    # the loop; inside it they would only be recomputed redundantly).
    out[:, idx_of['ack_cnt']] = _masked_channel_sum(packet_tokens[..., 7], lens).astype(np.float32)
    out[:, idx_of['syn_cnt']] = _masked_channel_sum(packet_tokens[..., 3], lens).astype(np.float32)
    return out
def _masked_channel_sum(channel: np.ndarray, lens: np.ndarray) -> np.ndarray:
T = channel.shape[1]
mask = (np.arange(T)[None, :] < lens[:, None]).astype(np.float32)
return (channel.astype(np.float32) * mask).sum(axis=1)
# Public API of this module: the layout constants, the label helpers and the
# packet/flow transforms.  The underscore-prefixed helpers
# (_stable_dequant_noise, _masked_channel_sum) are not listed.
__all__ = ['PACKET_FEATURE_NAMES', 'PACKET_D', 'PACKET_CONTINUOUS_CHANNEL_IDX', 'PACKET_BINARY_CHANNEL_IDX', 'CONTINUOUS_CHANNEL_IDX', 'BINARY_CHANNEL_IDX', 'CANONICAL_FLOW_FEATURE_NAMES', 'FLOW_D', 'FLOW_COUNT_FEATURE_NAMES', 'FLOW_COUNT_IDX', 'FLOW_CONTINUOUS_IDX', 'IDLE_THRESHOLD_MS', 'BENIGN_ALIASES', 'BENIGN_TOKEN', 'UNKNOWN_LABEL_TOKEN', 'normalize_label', 'canonical_5tuple', 'fit_packet_stats', 'zscore', 'apply_mixed_dequant', 'compute_flow_features_from_packets']

209
common/packet_store.py Normal file
View File

@@ -0,0 +1,209 @@
from __future__ import annotations
import json
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Sequence
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Default number of flows per packet shard: PacketShardWriter flushes a shard
# to disk once this many flows are pending.
DEFAULT_SHARD_SIZE = 100000
def _as_index_array(indices: Sequence[int] | np.ndarray) -> np.ndarray:
arr = np.asarray(indices, dtype=np.int64)
if arr.ndim != 1:
raise ValueError(f'indices must be 1-D, got shape {arr.shape}')
return arr
class PacketShardWriter:
def __init__(self, root: Path, *, shard_size: int=DEFAULT_SHARD_SIZE, T_full: int | None=None, D: int | None=None, overwrite: bool=False) -> None:
self.root = Path(root)
self.packet_dir = self.root / 'packets'
self.shard_size = int(shard_size)
if self.shard_size <= 0:
raise ValueError('shard_size must be positive')
if self.root.exists():
if not overwrite:
raise FileExistsError(f'{self.root} already exists')
shutil.rmtree(self.root)
self.packet_dir.mkdir(parents=True, exist_ok=True)
self.T_full = T_full
self.D = D
self._n_flows = 0
self._next_shard = 0
self._pending_tokens: list[np.ndarray] = []
self._pending_lengths: list[np.ndarray] = []
self._pending_flows: list[pd.DataFrame] = []
self._pending_n = 0
self._manifest_writer: pq.ParquetWriter | None = None
self._flows_writer: pq.ParquetWriter | None = None
self._closed = False
def add_batch(self, tokens: np.ndarray, lengths: np.ndarray, flows: pd.DataFrame) -> None:
if self._closed:
raise RuntimeError('cannot add_batch after close()')
tokens = np.asarray(tokens, dtype=np.float32)
lengths = np.asarray(lengths, dtype=np.int32)
if tokens.ndim != 3:
raise ValueError(f'tokens must be [N,T,D], got {tokens.shape}')
if lengths.shape != (tokens.shape[0],):
raise ValueError(f'lengths shape {lengths.shape} does not match N={tokens.shape[0]}')
if len(flows) != tokens.shape[0]:
raise ValueError(f'flows rows {len(flows)} does not match N={tokens.shape[0]}')
if tokens.shape[0] == 0:
return
if self.T_full is None:
self.T_full = int(tokens.shape[1])
if self.D is None:
self.D = int(tokens.shape[2])
if (tokens.shape[1], tokens.shape[2]) != (self.T_full, self.D):
raise ValueError(f'tokens shape {tokens.shape[1:]} does not match store shape {(self.T_full, self.D)}')
start = 0
n = tokens.shape[0]
while start < n:
room = self.shard_size - self._pending_n
take = min(room, n - start)
end = start + take
self._pending_tokens.append(tokens[start:end])
self._pending_lengths.append(lengths[start:end])
self._pending_flows.append(flows.iloc[start:end].reset_index(drop=True))
self._pending_n += take
start = end
if self._pending_n >= self.shard_size:
self._flush()
def close(self) -> None:
if self._closed:
return
if self._pending_n:
self._flush()
if self._manifest_writer is not None:
self._manifest_writer.close()
if self._flows_writer is not None:
self._flows_writer.close()
meta = {'format': 'packet-shard-store-v1', 'n_flows': int(self._n_flows), 'T_full': int(self.T_full or 0), 'D': int(self.D or 0), 'dtype': 'float32', 'shard_size': int(self.shard_size), 'n_shards': int(self._next_shard), 'packet_dir': 'packets', 'shard_pattern': 'shard-{shard_id:06d}.npy'}
(self.root / 'metadata.json').write_text(json.dumps(meta, indent=2) + '\n')
self._closed = True
def __enter__(self) -> 'PacketShardWriter':
return self
def __exit__(self, exc_type, exc, tb) -> None:
if exc_type is None:
self.close()
def _flush(self) -> None:
tokens = np.concatenate(self._pending_tokens, axis=0)
lengths = np.concatenate(self._pending_lengths, axis=0)
flows = pd.concat(self._pending_flows, ignore_index=True)
n = int(tokens.shape[0])
shard_id = self._next_shard
rel_path = Path('packets') / f'shard-{shard_id:06d}.npy'
np.save(self.root / rel_path, tokens, allow_pickle=False)
flow_id = np.arange(self._n_flows, self._n_flows + n, dtype=np.uint64)
manifest = pd.DataFrame({'flow_id': flow_id, 'shard_id': np.full(n, shard_id, dtype=np.int32), 'row_in_shard': np.arange(n, dtype=np.int32), 'packet_length': lengths.astype(np.int32, copy=False)})
if 'flow_id' in flows.columns:
flows = flows.drop(columns=['flow_id'])
flows.insert(0, 'flow_id', flow_id)
self._write_parquet_chunk('manifest', manifest, self.root / 'manifest.parquet')
self._write_parquet_chunk('flows', flows, self.root / 'flows.parquet')
self._n_flows += n
self._next_shard += 1
self._pending_tokens.clear()
self._pending_lengths.clear()
self._pending_flows.clear()
self._pending_n = 0
def _write_parquet_chunk(self, kind: str, df: pd.DataFrame, path: Path) -> None:
table = pa.Table.from_pandas(df, preserve_index=False)
if kind == 'manifest':
if self._manifest_writer is None:
self._manifest_writer = pq.ParquetWriter(path, table.schema, compression='snappy')
self._manifest_writer.write_table(table)
elif kind == 'flows':
if self._flows_writer is None:
self._flows_writer = pq.ParquetWriter(path, table.schema, compression='snappy')
self._flows_writer.write_table(table)
else:
raise ValueError(kind)
def write_packet_store_from_arrays(*, root: Path, tokens: np.ndarray, lengths: np.ndarray, flows: pd.DataFrame, shard_size: int=DEFAULT_SHARD_SIZE, overwrite: bool=False) -> None:
    """Write in-memory arrays as a complete packet store under ``root``.

    The store's ``T_full`` and ``D`` are taken from ``tokens.shape[1]`` and
    ``tokens.shape[2]``; the data is added as a single batch and the writer
    is closed on successful exit from the ``with`` block.
    """
    n_steps = int(tokens.shape[1])
    n_channels = int(tokens.shape[2])
    writer = PacketShardWriter(root, shard_size=shard_size, T_full=n_steps, D=n_channels, overwrite=overwrite)
    with writer:
        writer.add_batch(tokens, lengths, flows)
@dataclass
class PacketShardStore:
    """Read-only view of a packet store directory.

    Holds the parsed ``metadata.json`` and the full ``manifest.parquet``
    table in memory; shard arrays and ``flows.parquet`` are read on demand.
    """

    # Store directory.
    root: Path
    # Parsed metadata.json; must provide 'n_flows', 'T_full' and 'D'.
    metadata: dict
    # Manifest table: flow_id, shard_id, row_in_shard, packet_length.
    manifest: pd.DataFrame

    @classmethod
    def open(cls, root: Path) -> 'PacketShardStore':
        """Open the store at ``root``.

        Raises:
            FileNotFoundError: if ``metadata.json``, ``manifest.parquet`` or
                ``flows.parquet`` is missing.
            ValueError: if the manifest's ``flow_id`` column is not exactly
                ``0 .. len(manifest) - 1`` in row order.
        """
        root = Path(root)
        meta_path = root / 'metadata.json'
        manifest_path = root / 'manifest.parquet'
        flows_path = root / 'flows.parquet'
        for required in (meta_path, manifest_path, flows_path):
            if not required.exists():
                raise FileNotFoundError(required)
        metadata = json.loads(meta_path.read_text())
        manifest = pd.read_parquet(manifest_path)
        # read_packets addresses manifest rows by position, which is only
        # valid when flow_id equals the row number.
        expected = np.arange(len(manifest), dtype=np.uint64)
        actual = manifest['flow_id'].to_numpy(dtype=np.uint64)
        if not np.array_equal(actual, expected):
            raise ValueError('manifest flow_id must be sequential and row-aligned')
        return cls(root=root, metadata=metadata, manifest=manifest)

    @property
    def n_flows(self) -> int:
        """Number of flows, as recorded in the metadata."""
        return int(self.metadata['n_flows'])

    @property
    def T_full(self) -> int:
        """Packet positions stored per flow, as recorded in the metadata."""
        return int(self.metadata['T_full'])

    @property
    def D(self) -> int:
        """Channels per packet, as recorded in the metadata."""
        return int(self.metadata['D'])

    def shard_path(self, shard_id: int) -> Path:
        """Return the path of the ``.npy`` file holding shard ``shard_id``."""
        return self.root / 'packets' / f'shard-{int(shard_id):06d}.npy'

    def read_flows(self, columns: list[str] | None=None) -> pd.DataFrame:
        """Read ``flows.parquet``, optionally restricted to ``columns``."""
        return pd.read_parquet(self.root / 'flows.parquet', columns=columns)

    def read_packets(self, indices: Sequence[int] | np.ndarray, *, T: int | None=None) -> tuple[np.ndarray, np.ndarray]:
        """Load the packet tokens of the given flows, in the order requested.

        Args:
            indices: 1-D flow ids; repeats are allowed.
            T: keep only the first ``T`` packet positions (default: all
                ``T_full`` positions).

        Returns:
            ``(tokens, lengths)``: float32 ``[len(indices), T, D]`` tokens and
            int32 packet lengths capped at ``T``.

        Raises:
            IndexError: if an index is outside ``[0, n_flows)``.
            ValueError: if ``indices`` is not 1-D, or ``T`` is negative or
                larger than ``T_full``.  ``T`` is validated even when
                ``indices`` is empty.
        """
        idx = _as_index_array(indices)
        n_requested = len(idx)
        if n_requested and (idx.min() < 0 or idx.max() >= self.n_flows):
            raise IndexError(f'indices out of range for n_flows={self.n_flows}')
        t = self.T_full if T is None else int(T)
        if t > self.T_full:
            raise ValueError(f'requested T={t} > T_full={self.T_full}')
        if t < 0:
            raise ValueError(f'requested T={t} must be non-negative')
        if n_requested == 0:
            return (np.zeros((0, t, self.D), dtype=np.float32), np.zeros((0,), dtype=np.int32))
        rows = self.manifest.iloc[idx]
        # Extract the lookup columns once rather than once per shard.
        shard_of_row = rows['shard_id'].to_numpy()
        row_in_shard = rows['row_in_shard'].to_numpy(dtype=np.int64)
        lengths = np.minimum(rows['packet_length'].to_numpy(dtype=np.int32), t).astype(np.int32, copy=False)
        out = np.empty((n_requested, t, self.D), dtype=np.float32)
        for shard_id in rows['shard_id'].unique():
            in_shard = shard_of_row == shard_id
            # Memory-map the shard so only the requested rows are read.
            arr = np.load(self.shard_path(int(shard_id)), mmap_mode='r')
            out[in_shard] = arr[row_in_shard[in_shard], :t, :]
        return (out, lengths)
def is_packet_store(path: Path) -> bool:
    """Return True when ``path`` holds every file ``PacketShardStore.open`` requires.

    That is ``metadata.json``, ``manifest.parquet`` and ``flows.parquet``.
    Checking all three keeps this predicate in agreement with ``open``, which
    raises ``FileNotFoundError`` when any of them is missing.
    """
    root = Path(path)
    required_files = ('metadata.json', 'manifest.parquet', 'flows.parquet')
    return all((root / name).exists() for name in required_files)
def iter_store_roots(paths: Iterable[Path]) -> Iterable[Path]:
    """Lazily yield, as ``Path`` objects, the entries of ``paths`` that are packet stores.

    Entries for which ``is_packet_store`` is false are skipped; input order
    is preserved.
    """
    yield from (Path(candidate) for candidate in paths if is_packet_store(candidate))