Initial commit: code, paper, small artifacts

This commit is contained in:
2026-05-07 20:47:30 +08:00
commit fae2db8cff
322 changed files with 33159 additions and 0 deletions

136
common/data_contract.py Normal file
View File

@@ -0,0 +1,136 @@
from __future__ import annotations
from typing import Sequence
import numpy as np
# ---------------------------------------------------------------------------
# Per-packet token layout: one name per channel on the last axis of a
# packet-token array.
# ---------------------------------------------------------------------------
PACKET_FEATURE_NAMES: tuple[str, ...] = (
    'log_size',
    'log_dt_ms',
    'direction',
    'tcp_syn',
    'tcp_fin',
    'tcp_rst',
    'tcp_psh',
    'tcp_ack',
    'log_win',
)
PACKET_D: int = len(PACKET_FEATURE_NAMES)

# Channel positions (into PACKET_FEATURE_NAMES) split by value kind:
# continuous = log_size, log_dt_ms, log_win; binary = direction + TCP flags.
PACKET_CONTINUOUS_CHANNEL_IDX: tuple[int, ...] = (0, 1, 8)
PACKET_BINARY_CHANNEL_IDX: tuple[int, ...] = (2, 3, 4, 5, 6, 7)

# Unprefixed aliases for the two index tuples above.
CONTINUOUS_CHANNEL_IDX = PACKET_CONTINUOUS_CHANNEL_IDX
BINARY_CHANNEL_IDX = PACKET_BINARY_CHANNEL_IDX

# ---------------------------------------------------------------------------
# Per-flow feature layout: column order of the flow-feature matrix.
# ---------------------------------------------------------------------------
CANONICAL_FLOW_FEATURE_NAMES: tuple[str, ...] = (
    'log_duration',
    'log_n_pkts',
    'fwd_count',
    'bwd_count',
    'pkt_size_mean',
    'pkt_size_std',
    'pkt_size_max',
    'fwd_size_mean',
    'bwd_size_mean',
    'bwd_size_std',
    'iat_mean',
    'fwd_iat_max',
    'bwd_iat_max',
    'bwd_iat_std',
    'active_mean',
    'idle_mean',
    'log_pkts_per_s',
    'log_total_bytes',
    'ack_cnt',
    'syn_cnt',
)
FLOW_D: int = len(CANONICAL_FLOW_FEATURE_NAMES)

# Flow columns that hold raw counts; every other column is listed in
# FLOW_CONTINUOUS_IDX. Both index tuples follow canonical column order.
FLOW_COUNT_FEATURE_NAMES: tuple[str, ...] = (
    'fwd_count',
    'bwd_count',
    'ack_cnt',
    'syn_cnt',
)
FLOW_COUNT_IDX: tuple[int, ...] = tuple(
    position
    for position, feature in enumerate(CANONICAL_FLOW_FEATURE_NAMES)
    if feature in FLOW_COUNT_FEATURE_NAMES
)
FLOW_CONTINUOUS_IDX: tuple[int, ...] = tuple(
    position
    for position, feature in enumerate(CANONICAL_FLOW_FEATURE_NAMES)
    if feature not in FLOW_COUNT_FEATURE_NAMES
)

# Default inter-arrival gap (milliseconds, per the name) above which a gap
# is counted as idle rather than active by the flow-feature computation.
IDLE_THRESHOLD_MS: float = 1000.0

# ---------------------------------------------------------------------------
# Label vocabulary used by normalize_label.
# ---------------------------------------------------------------------------
BENIGN_ALIASES: tuple[str, ...] = (
    'BENIGN',
    'Benign',
    'benign',
    'normal',
    'NORMAL',
    'Normal',
)
BENIGN_TOKEN: str = 'normal'
UNKNOWN_LABEL_TOKEN: str = 'unlabeled'
def normalize_label(raw: object) -> str:
    """Map a raw dataset label onto the canonical label vocabulary.

    Args:
        raw: Label value as read from a dataset; any object is accepted and
            converted with ``str``.

    Returns:
        ``UNKNOWN_LABEL_TOKEN`` when the label is missing (``None``, a float
        NaN, or blank after stripping), ``BENIGN_TOKEN`` when it matches one
        of ``BENIGN_ALIASES`` in any letter case, and otherwise the
        whitespace-stripped string form of ``raw``.
    """
    # Missing values must not be stringified into bogus classes such as
    # 'None' or 'nan'; treat them like an empty label instead.
    if raw is None:
        return UNKNOWN_LABEL_TOKEN
    if isinstance(raw, float) and raw != raw:  # NaN is the only float != itself
        return UNKNOWN_LABEL_TOKEN

    text = str(raw).strip()
    if not text:
        return UNKNOWN_LABEL_TOKEN

    # Compare case-insensitively against every alias so that spellings like
    # 'BeNiGn' and 'NoRmAl' are handled the same way.
    if text in BENIGN_ALIASES or text.upper() in {alias.upper() for alias in BENIGN_ALIASES}:
        return BENIGN_TOKEN
    return text
def canonical_5tuple(
    src_ip: object,
    src_port: object,
    dst_ip: object,
    dst_port: object,
    protocol: object,
) -> tuple[str, int, str, int, int]:
    """Build a direction-independent flow key from a 5-tuple.

    Ports and protocol are parsed through ``float`` first so that values such
    as ``'443'``, ``443.0`` or ``'443.0'`` all become the integer ``443``.
    The two ``(ip, port)`` endpoints are then ordered (string comparison on
    the IP, integer comparison on the port as tie-break) so that both
    directions of a conversation produce the same key.

    Returns:
        ``(ip_lo, port_lo, ip_hi, port_hi, protocol)`` with the smaller
        endpoint first.

    Raises:
        ValueError: If a port or the protocol cannot be parsed as a number.
    """
    src_port_num = int(float(src_port))
    dst_port_num = int(float(dst_port))
    protocol_num = int(float(protocol))

    source = (str(src_ip), src_port_num)
    destination = (str(dst_ip), dst_port_num)

    # sorted() is stable, so equal endpoints keep (source, destination) order.
    first, second = sorted((source, destination))
    return (*first, *second, protocol_num)
def fit_packet_stats(packet_tokens: np.ndarray, packet_lengths: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-channel mean and standard deviation over valid packets.

    A packet at time step ``t`` of flow ``i`` is valid when
    ``t < packet_lengths[i]``; padded positions are excluded.

    Args:
        packet_tokens: Array indexed as ``[flow, step, ...]``; the step axis
            is axis 1.
        packet_lengths: One valid-packet count per flow.

    Returns:
        ``(mean, std)`` as float32 arrays, reduced over all valid packets.
        When there are no valid packets at all, identity statistics
        (mean 0, std 1) are returned so that downstream normalisation stays
        finite instead of propagating NaN.
    """
    n_steps = packet_tokens.shape[1]
    valid_mask = np.arange(n_steps)[None, :] < packet_lengths[:, None]
    valid = packet_tokens[valid_mask]

    if valid.shape[0] == 0:
        # mean()/std() of an empty selection would yield NaN plus a
        # RuntimeWarning; fall back to a no-op normalisation instead.
        stat_shape = valid.shape[1:]
        return (np.zeros(stat_shape, dtype=np.float32), np.ones(stat_shape, dtype=np.float32))

    # Accumulate in float64 so that large float32 inputs do not lose
    # precision in the reduction; the result is cast back to float32.
    mean = valid.mean(axis=0, dtype=np.float64)
    std = valid.std(axis=0, dtype=np.float64)
    return (mean.astype(np.float32), std.astype(np.float32))
def zscore(x: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """Standardise ``x`` as ``(x - mean) / std`` and return float32.

    The divisor is floored at ``1e-06`` so that zero-variance channels do not
    cause a division by zero.
    """
    safe_std = np.maximum(std, 1e-06)
    centered = x - mean
    return (centered / safe_std).astype(np.float32)
def _stable_dequant_noise(shape: Sequence[int], seed: int, salt: str) -> np.ndarray:
salt_int = sum(((i + 1) * ord(c) for (i, c) in enumerate(salt)))
rng = np.random.default_rng(seed + salt_int)
return rng.uniform(-0.5, 0.5, size=tuple(shape)).astype(np.float32)
def apply_mixed_dequant(
    packet_tokens: np.ndarray,
    packet_lengths: np.ndarray,
    mean: np.ndarray,
    std: np.ndarray,
    *,
    split_tag: str,
    seed: int,
) -> np.ndarray:
    """Normalise packet tokens channel-wise and zero out padded positions.

    Channels listed in ``PACKET_CONTINUOUS_CHANNEL_IDX`` are z-scored with the
    matching entries of ``mean`` and ``std``. Channels listed in
    ``PACKET_BINARY_CHANNEL_IDX`` are cast to float32 and perturbed with
    reproducible uniform noise from ``_stable_dequant_noise`` (seeded by
    ``seed`` and ``split_tag``). Positions at or beyond each flow's
    ``packet_lengths`` entry are multiplied by zero.

    Args:
        packet_tokens: Array indexed as ``[flow, step, channel]``.
        packet_lengths: Valid-packet count per flow.
        mean: Per-channel means, indexed by channel position.
        std: Per-channel standard deviations, indexed by channel position.
        split_tag: Salt for the noise generator.
        seed: Base seed for the noise generator.

    Returns:
        float32 array with the same shape as ``packet_tokens``.
    """
    continuous_idx = list(PACKET_CONTINUOUS_CHANNEL_IDX)
    binary_idx = list(PACKET_BINARY_CHANNEL_IDX)

    transformed = np.zeros_like(packet_tokens, dtype=np.float32)

    # Continuous channels: standardise with the fitted statistics.
    transformed[..., continuous_idx] = zscore(
        packet_tokens[..., continuous_idx],
        mean[continuous_idx],
        std[continuous_idx],
    )

    # Binary channels: add deterministic uniform jitter.
    flags = packet_tokens[..., binary_idx].astype(np.float32)
    jitter = _stable_dequant_noise(flags.shape, seed, split_tag)
    transformed[..., binary_idx] = flags + jitter

    # Blank out every step at or past the flow's valid length.
    n_steps = packet_tokens.shape[1]
    valid_mask = np.arange(n_steps)[None, :] < packet_lengths[:, None]
    return (transformed * valid_mask[:, :, None]).astype(np.float32)
def compute_flow_features_from_packets(
    packet_tokens: np.ndarray,
    packet_lengths: np.ndarray,
    *,
    idle_threshold_ms: float = IDLE_THRESHOLD_MS,
) -> np.ndarray:
    """Derive the canonical per-flow feature matrix from padded packet tokens.

    Only the first ``packet_lengths[i]`` steps of flow ``i`` (clipped to
    ``[0, T]``) are used. Channels are read by position: 0 = log size,
    1 = log inter-arrival time, 2 = direction, 3 = SYN flag, 7 = ACK flag
    (matching ``PACKET_FEATURE_NAMES``). A packet is counted as forward when
    its direction value is below 0.5, otherwise as backward.

    Size and inter-arrival statistics (``pkt_size_*``, ``*_size_*``,
    ``iat_mean``, ``*_iat_*``) are computed on the log-domain token values.
    Duration, packet rate, total bytes and the active/idle means are computed
    on values mapped back with ``expm1`` (after clipping the log value to
    ``[0, 25]``) and then stored through ``log1p``. Features whose
    precondition is not met (for example too few packets in one direction)
    stay at 0, as do all per-flow statistics of flows with no valid packets.

    Args:
        packet_tokens: Array of shape ``[N, T, PACKET_D]``.
        packet_lengths: Array of shape ``[N]`` with valid-packet counts.
        idle_threshold_ms: Inter-arrival gaps strictly greater than this
            value count as idle; all others count as active.

    Returns:
        float32 array of shape ``[N, FLOW_D]`` with columns ordered as in
        ``CANONICAL_FLOW_FEATURE_NAMES``.

    Raises:
        ValueError: If either input has an unexpected shape.
    """
    if packet_tokens.ndim != 3 or packet_tokens.shape[-1] != PACKET_D:
        raise ValueError(f'packet_tokens must be [N, T, {PACKET_D}], got {packet_tokens.shape}')
    if packet_lengths.ndim != 1 or packet_lengths.shape[0] != packet_tokens.shape[0]:
        raise ValueError(f'packet_lengths must be [N] matching packet_tokens, got {packet_lengths.shape}')

    n_flows, max_steps, _ = packet_tokens.shape
    valid_counts = np.clip(packet_lengths.astype(np.int64), 0, max_steps)
    features = np.zeros((n_flows, FLOW_D), dtype=np.float32)
    col = {name: position for position, name in enumerate(CANONICAL_FLOW_FEATURE_NAMES)}

    # Work in float64 for the per-flow statistics.
    all_log_size = packet_tokens[..., 0].astype(np.float64)
    all_log_dt = packet_tokens[..., 1].astype(np.float64)
    all_direction = packet_tokens[..., 2].astype(np.float64)
    all_bytes = np.expm1(np.clip(all_log_size, 0.0, 25.0))

    for row in range(n_flows):
        count = int(valid_counts[row])
        if count <= 0:
            continue  # empty flow: leave the row at zero

        rec = features[row]  # view; writes land in `features`
        log_sizes = all_log_size[row, :count]
        log_gaps = all_log_dt[row, :count]
        gaps_ms = np.expm1(np.clip(log_gaps, 0.0, 25.0))
        packet_bytes = all_bytes[row, :count]

        is_fwd = all_direction[row, :count] < 0.5
        is_bwd = ~is_fwd
        fwd_total = int(is_fwd.sum())
        bwd_total = int(is_bwd.sum())

        # Duration and counts.
        span_ms = float(gaps_ms.sum())
        rec[col['log_duration']] = np.log1p(max(span_ms, 0.0))
        rec[col['log_n_pkts']] = np.log1p(count)
        rec[col['fwd_count']] = float(fwd_total)
        rec[col['bwd_count']] = float(bwd_total)

        # Packet-size statistics (log domain).
        rec[col['pkt_size_mean']] = float(log_sizes.mean())
        rec[col['pkt_size_std']] = float(log_sizes.std()) if count > 1 else 0.0
        rec[col['pkt_size_max']] = float(log_sizes.max())
        if fwd_total > 0:
            rec[col['fwd_size_mean']] = float(log_sizes[is_fwd].mean())
        if bwd_total > 0:
            bwd_log_sizes = log_sizes[is_bwd]
            rec[col['bwd_size_mean']] = float(bwd_log_sizes.mean())
            if bwd_total > 1:
                rec[col['bwd_size_std']] = float(bwd_log_sizes.std())

        if count > 1:
            # Inter-arrival statistics skip the first packet of the flow.
            later_log_gaps = log_gaps[1:]
            rec[col['iat_mean']] = float(later_log_gaps.mean())

            if fwd_total > 1:
                fwd_log_gaps = later_log_gaps[is_fwd[1:]]
                if fwd_log_gaps.size > 0:
                    rec[col['fwd_iat_max']] = float(fwd_log_gaps.max())

            if bwd_total > 1:
                bwd_log_gaps = later_log_gaps[is_bwd[1:]]
                if bwd_log_gaps.size > 0:
                    rec[col['bwd_iat_max']] = float(bwd_log_gaps.max())
                    if bwd_log_gaps.size > 1:
                        rec[col['bwd_iat_std']] = float(bwd_log_gaps.std())

            # Active / idle split on the linear-scale gaps.
            later_gaps_ms = gaps_ms[1:]
            idle = later_gaps_ms > idle_threshold_ms
            active = ~idle
            if active.any():
                rec[col['active_mean']] = float(np.log1p(later_gaps_ms[active].mean()))
            if idle.any():
                rec[col['idle_mean']] = float(np.log1p(later_gaps_ms[idle].mean()))

        # Packet rate (only defined for a positive duration) and byte volume.
        span_s = span_ms / 1000.0
        if span_s > 0:
            rec[col['log_pkts_per_s']] = float(np.log1p(count / span_s))
        byte_total = float(packet_bytes.sum())
        rec[col['log_total_bytes']] = float(np.log1p(max(byte_total, 0.0)))

    # Flag counts are summed over valid steps for all flows at once.
    features[:, col['ack_cnt']] = _masked_channel_sum(packet_tokens[..., 7], valid_counts).astype(np.float32)
    features[:, col['syn_cnt']] = _masked_channel_sum(packet_tokens[..., 3], valid_counts).astype(np.float32)
    return features
def _masked_channel_sum(channel: np.ndarray, lens: np.ndarray) -> np.ndarray:
T = channel.shape[1]
mask = (np.arange(T)[None, :] < lens[:, None]).astype(np.float32)
return (channel.astype(np.float32) * mask).sum(axis=1)
# Public API of this module; underscore-prefixed helpers are intentionally
# left out.
__all__ = [
    # packet-token layout
    'PACKET_FEATURE_NAMES',
    'PACKET_D',
    'PACKET_CONTINUOUS_CHANNEL_IDX',
    'PACKET_BINARY_CHANNEL_IDX',
    'CONTINUOUS_CHANNEL_IDX',
    'BINARY_CHANNEL_IDX',
    # flow-feature layout
    'CANONICAL_FLOW_FEATURE_NAMES',
    'FLOW_D',
    'FLOW_COUNT_FEATURE_NAMES',
    'FLOW_COUNT_IDX',
    'FLOW_CONTINUOUS_IDX',
    'IDLE_THRESHOLD_MS',
    # label vocabulary
    'BENIGN_ALIASES',
    'BENIGN_TOKEN',
    'UNKNOWN_LABEL_TOKEN',
    # functions
    'normalize_label',
    'canonical_5tuple',
    'fit_packet_stats',
    'zscore',
    'apply_mixed_dequant',
    'compute_flow_features_from_packets',
]