Mixed_CFM was loading AdaLNBlock / SinusoidalTimeEmb / _sinkhorn_coupling and flow-feature helpers from Unified_CFM via importlib spec hacks. Pulled those symbols into Mixed_CFM/_layers.py (model primitives) and inlined the flow-feature loader helpers into Mixed_CFM/data.py, then deleted Unified_CFM/ entirely along with three dead aggregate shell scripts whose referenced eval entry point (artifacts/verify_2026_04_24/) was already gone. Verified: historic janus_iscxtor2016_seed42 checkpoint re-evaluated under the absorbed code reproduces all 10 phase1 AUROC scores to 6 decimals; same-seed retrain converges to within +/-0.001 on terminal_norm (residual drift is CUDA non-determinism in MultiheadAttention + Sinkhorn argmax, not the absorption). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
253 lines
13 KiB
Python
253 lines
13 KiB
Python
from __future__ import annotations
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
import numpy as np
|
|
import pandas as pd
|
|
import sys as _sys
|
|
from pathlib import Path as _Path
|
|
_sys.path.insert(0, str(_Path(__file__).resolve().parents[1]))
|
|
from common.data_contract import (
|
|
PACKET_FEATURE_NAMES,
|
|
PACKET_CONTINUOUS_CHANNEL_IDX,
|
|
PACKET_BINARY_CHANNEL_IDX,
|
|
canonical_5tuple as _canonical_key,
|
|
fit_packet_stats as _fit_packet_stats,
|
|
zscore as _zscore,
|
|
)
|
|
|
|
# Column names treated as per-flow metadata rather than model features. They are
# skipped when numeric feature columns are auto-selected from a feature table.
DEFAULT_FLOW_META_COLUMNS = {'flow_id', 'label', 'day', 'service', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'timestamp', 'start_ts', 'n_pkts'}
|
|
|
|
|
|
def _read_flow_features(path: Path, *, expected_rows: int, feature_columns: Optional[list[str]]=None) -> tuple[np.ndarray, tuple[str, ...], np.ndarray | None]:
|
|
path = Path(path)
|
|
if path.suffix == '.npz':
|
|
data = np.load(path, allow_pickle=True)
|
|
x = data['features'].astype(np.float32)
|
|
raw_names = data['feature_names'] if 'feature_names' in data.files else np.arange(x.shape[1])
|
|
names = tuple((str(v) for v in raw_names))
|
|
flow_id = data['flow_id'] if 'flow_id' in data.files else None
|
|
elif path.suffix in ('.parquet', '.pq'):
|
|
df = pd.read_parquet(path)
|
|
flow_id = df['flow_id'].to_numpy() if 'flow_id' in df.columns else None
|
|
if feature_columns:
|
|
cols = feature_columns
|
|
else:
|
|
cols = [c for c in df.columns if c not in DEFAULT_FLOW_META_COLUMNS and pd.api.types.is_numeric_dtype(df[c])]
|
|
if not cols:
|
|
raise ValueError(f'no numeric flow feature columns found in {path}')
|
|
x = df[cols].to_numpy(dtype=np.float32)
|
|
names = tuple(cols)
|
|
else:
|
|
raise ValueError(f'unsupported flow feature file: {path}')
|
|
if len(x) != expected_rows:
|
|
raise ValueError(f'flow feature row count {len(x):,} != packet row count {expected_rows:,}')
|
|
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
|
return (x, names, flow_id)
|
|
|
|
|
|
def _feature_columns_from_df(df: pd.DataFrame, requested: Optional[list[str]]) -> list[str]:
|
|
if requested:
|
|
return requested
|
|
return [c for c in df.columns if c not in DEFAULT_FLOW_META_COLUMNS and pd.api.types.is_numeric_dtype(df[c])]
|
|
|
|
|
|
def _align_flow_features_by_scan(feature_df: pd.DataFrame, packet_flows: pd.DataFrame, *, feature_columns: list[str]) -> tuple[np.ndarray, tuple[str, ...]]:
|
|
required = ['label', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol']
|
|
missing_feature = [c for c in required if c not in feature_df.columns]
|
|
missing_packet = [c for c in required if c not in packet_flows.columns]
|
|
if missing_feature or missing_packet:
|
|
raise ValueError(f'scan alignment requires label + 5-tuple metadata. missing in feature_df={missing_feature}, packet_flows={missing_packet}')
|
|
packet_keys = [(str(lbl), _canonical_key(src, sp, dst, dp, proto)) for (lbl, src, sp, dst, dp, proto) in zip(packet_flows['label'].to_numpy(), packet_flows['src_ip'].to_numpy(), packet_flows['src_port'].to_numpy(), packet_flows['dst_ip'].to_numpy(), packet_flows['dst_port'].to_numpy(), packet_flows['protocol'].to_numpy())]
|
|
labels = feature_df['label'].to_numpy()
|
|
src_ip = feature_df['src_ip'].to_numpy()
|
|
src_port = feature_df['src_port'].to_numpy()
|
|
dst_ip = feature_df['dst_ip'].to_numpy()
|
|
dst_port = feature_df['dst_port'].to_numpy()
|
|
protocol = feature_df['protocol'].to_numpy()
|
|
matched: list[int] = []
|
|
j = 0
|
|
n_csv = len(feature_df)
|
|
for (i, target) in enumerate(packet_keys):
|
|
while j < n_csv:
|
|
cand = (str(labels[j]), _canonical_key(src_ip[j], src_port[j], dst_ip[j], dst_port[j], protocol[j]))
|
|
j += 1
|
|
if cand == target:
|
|
matched.append(j - 1)
|
|
break
|
|
else:
|
|
raise ValueError(f'failed to align packet flow row {i:,}/{len(packet_keys):,}; the CSV cache may not be the same one used for packet extraction')
|
|
print(f'[data] scan-aligned CSV flow features: matched={len(matched):,} from csv_rows={n_csv:,} skipped={matched[-1] + 1 - len(matched):,}')
|
|
x = feature_df.iloc[matched][feature_columns].to_numpy(dtype=np.float32)
|
|
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
|
return (x, tuple(feature_columns))
|
|
|
|
|
|
def _read_aligned_flow_features(path: Path, packet_flows: pd.DataFrame, *, feature_columns: Optional[list[str]]=None, align: str='auto') -> tuple[np.ndarray, tuple[str, ...]]:
|
|
path = Path(path)
|
|
if align not in ('auto', 'row', 'scan'):
|
|
raise ValueError("flow_features_align must be 'auto', 'row', or 'scan'")
|
|
if path.suffix == '.npz':
|
|
(x, names, flow_id) = _read_flow_features(path, expected_rows=len(packet_flows), feature_columns=feature_columns)
|
|
packet_id = packet_flows['flow_id'].to_numpy() if 'flow_id' in packet_flows else None
|
|
if flow_id is not None and packet_id is not None and (not np.array_equal(flow_id, packet_id)):
|
|
raise ValueError('NPZ flow_id does not align with Packet_CFM flows')
|
|
return (x, names)
|
|
if path.suffix not in ('.parquet', '.pq'):
|
|
raise ValueError(f'unsupported flow feature file: {path}')
|
|
feature_df = pd.read_parquet(path)
|
|
cols = _feature_columns_from_df(feature_df, feature_columns)
|
|
if not cols:
|
|
raise ValueError(f'no numeric flow feature columns found in {path}')
|
|
packet_id = packet_flows['flow_id'].to_numpy() if 'flow_id' in packet_flows else None
|
|
if len(feature_df) == len(packet_flows):
|
|
feature_id = feature_df['flow_id'].to_numpy() if 'flow_id' in feature_df.columns else None
|
|
if feature_id is None or packet_id is None or np.array_equal(feature_id, packet_id):
|
|
x = feature_df[cols].to_numpy(dtype=np.float32)
|
|
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
|
return (x, tuple(cols))
|
|
if align == 'row':
|
|
raise ValueError("flow_id mismatch with flow_features_align='row'")
|
|
if align == 'row':
|
|
raise ValueError(f'row alignment requested but feature rows={len(feature_df):,} packet rows={len(packet_flows):,}')
|
|
return _align_flow_features_by_scan(feature_df, packet_flows, feature_columns=cols)
|
|
|
|
|
|
def _preprocess_flow(
    train: np.ndarray,
    val: np.ndarray,
    attack: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Standardize the three flow-feature splits with statistics from ``train``.

    The per-column mean and standard deviation are computed over axis 0 of
    ``train`` (cast to float32) and passed to ``_zscore`` for every split.

    Returns:
        ``(train_z, val_z, attack_z, mean, std)``.
    """
    center = train.mean(axis=0).astype(np.float32)
    scale = train.std(axis=0).astype(np.float32)
    # Same statistics for every split so val/attack are scored on train's scale.
    train_z = _zscore(train, center, scale)
    val_z = _zscore(val, center, scale)
    attack_z = _zscore(attack, center, scale)
    return (train_z, val_z, attack_z, center, scale)
|
|
|
|
@dataclass
class MixedData:
    """Train / validation / attack splits produced by ``load_mixed_data``.

    Each modality is stored three times, once per split. ``cont_*`` and
    ``flow_*`` statistics are the values used to standardize the continuous
    packet channels and the flow features respectively.
    """

    # Continuous packet channels (standardized, padding zeroed by the loader).
    train_cont: np.ndarray
    val_cont: np.ndarray
    attack_cont: np.ndarray

    # Binary packet channels (stored as int8 by the loader).
    train_disc: np.ndarray
    val_disc: np.ndarray
    attack_disc: np.ndarray

    # Per-flow feature vectors (standardized).
    train_flow: np.ndarray
    val_flow: np.ndarray
    attack_flow: np.ndarray

    # Packet-sequence lengths per flow.
    train_len: np.ndarray
    val_len: np.ndarray
    attack_len: np.ndarray

    # Label of every flow in the attack split.
    attack_labels: np.ndarray

    # Standardization statistics.
    cont_mean: np.ndarray
    cont_std: np.ndarray
    flow_mean: np.ndarray
    flow_std: np.ndarray

    # Feature naming.
    flow_feature_names: tuple[str, ...]
    packet_feature_names: tuple[str, ...] = PACKET_FEATURE_NAMES

    @property
    def T(self) -> int:
        """Size of axis 1 of ``train_cont`` (sequence length)."""
        cont_shape = self.train_cont.shape
        return int(cont_shape[1])

    @property
    def n_cont(self) -> int:
        """Size of axis 2 of ``train_cont`` (continuous channel count)."""
        cont_shape = self.train_cont.shape
        return int(cont_shape[2])

    @property
    def n_disc(self) -> int:
        """Size of axis 2 of ``train_disc`` (binary channel count)."""
        disc_shape = self.train_disc.shape
        return int(disc_shape[2])

    @property
    def flow_dim(self) -> int:
        """Size of axis 1 of ``train_flow`` (flow feature count)."""
        flow_shape = self.train_flow.shape
        return int(flow_shape[1])
|
|
|
|
def _zscore_cont(
    train_x: np.ndarray,
    val_x: np.ndarray,
    attack_x: np.ndarray,
    train_l: np.ndarray,
    val_l: np.ndarray,
    attack_l: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Standardize continuous packet channels and zero out padded steps.

    Statistics come from ``_fit_packet_stats(train_x, train_l)`` and are
    applied to all three splits via ``_zscore``. Every position at or beyond a
    flow's length is then multiplied by zero.

    Returns:
        ``(train_z, val_z, attack_z, mean, std)`` with float32 split arrays.
    """
    (mean, std) = _fit_packet_stats(train_x, train_l)

    def _scale_and_mask(tokens: np.ndarray, lengths: np.ndarray) -> np.ndarray:
        scaled = _zscore(tokens, mean, std)
        steps = np.arange(tokens.shape[1])
        # True where the step index is inside the flow's real length.
        valid = steps[None, :] < lengths[:, None]
        return (scaled * valid[:, :, None]).astype(np.float32)

    train_z = _scale_and_mask(train_x, train_l)
    val_z = _scale_and_mask(val_x, val_l)
    attack_z = _scale_and_mask(attack_x, attack_l)
    return (train_z, val_z, attack_z, mean, std)
|
|
|
|
def load_mixed_data(
    *,
    packets_npz: Path | None = None,
    source_store: Path | None = None,
    flows_parquet: Path,
    flow_features_path: Path,
    flow_feature_columns: Optional[list[str]] = None,
    flow_features_align: str = 'auto',
    T: int = 64,
    split_seed: int = 42,
    train_ratio: float = 0.8,
    benign_label: str = 'normal',
    min_len: int = 2,
    attack_cap: int | None = None,
    val_cap: int | None = None,
) -> MixedData:
    """Load packets and flow features, split them, and standardize them.

    Packet tokens come either from an in-memory ``packets_npz`` archive or
    lazily from a ``PacketShardStore`` at ``source_store``; exactly one must be
    given. Flows shorter than ``min_len`` packets are dropped. Flows labelled
    ``benign_label`` are shuffled with ``split_seed`` and split into train and
    validation by ``train_ratio``; all other flows form the attack split.

    Args:
        packets_npz: Archive with ``packet_tokens``, ``packet_lengths`` and
            optionally ``flow_id``.
        source_store: Directory opened with ``PacketShardStore.open``.
        flows_parquet: Flow table with ``flow_id``, ``label`` and the 5-tuple.
        flow_features_path: Flow feature file (``.npz`` / ``.parquet`` / ``.pq``).
        flow_feature_columns: Optional explicit feature columns.
        flow_features_align: ``'auto'``, ``'row'`` or ``'scan'``.
        T: Number of packet steps kept per flow.
        split_seed: Seed for the benign shuffle and the optional caps.
        train_ratio: Fraction of benign flows placed in train.
        benign_label: Label value that marks benign flows.
        min_len: Minimum packet count for a flow to be kept.
        attack_cap: Optional maximum size of the attack split.
        val_cap: Optional maximum size of the validation split.

    Returns:
        A populated ``MixedData``.

    Raises:
        ValueError: Both or neither packet source given, ``T`` larger than the
            stored sequence length, or packet data not row-aligned with the
            flow table.
    """
    if (packets_npz is None) == (source_store is None):
        raise ValueError('pass exactly one of packets_npz or source_store')
    flows_parquet = Path(flows_parquet)
    print(f'[data] flows={flows_parquet} packets={(packets_npz if packets_npz else source_store)}')

    flow_cols = ['flow_id', 'label', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol']
    flows = pd.read_parquet(flows_parquet, columns=flow_cols)
    labels_full = flows['label'].to_numpy().astype(str)
    flow_id = flows['flow_id'].to_numpy()

    tokens_full: np.ndarray | None = None
    store = None
    if packets_npz is not None:
        # Read everything while the archive is open so its handle is released.
        with np.load(Path(packets_npz)) as pz:
            tokens_full = pz['packet_tokens'].astype(np.float32)
            lens_full = pz['packet_lengths'].astype(np.int32)
            npz_flow_id = pz['flow_id'] if 'flow_id' in pz.files else None
        if T > tokens_full.shape[1]:
            raise ValueError(f'requested T={T} > stored {tokens_full.shape[1]}')
        tokens_full = tokens_full[:, :T].copy()
        lens_full = np.minimum(lens_full, T).astype(np.int32)
        if npz_flow_id is not None and (not np.array_equal(npz_flow_id, flow_id)):
            raise ValueError('packets_npz / flows_parquet not row-aligned')
    else:
        from common.packet_store import PacketShardStore

        store = PacketShardStore.open(Path(source_store))
        store_id = store.read_flows(columns=['flow_id'])['flow_id'].to_numpy()
        if not np.array_equal(store_id, flow_id):
            raise ValueError('source_store / flows_parquet not row-aligned')
        lens_full = np.minimum(store.manifest['packet_length'].to_numpy(dtype=np.int32), T)

    # Without this check a row-count mismatch (e.g. an archive with no flow_id)
    # would only show up later as an opaque boolean-index error.
    n_flows = len(flows)
    if len(lens_full) != n_flows or (tokens_full is not None and len(tokens_full) != n_flows):
        raise ValueError(f'packet rows ({len(lens_full):,}) != flows_parquet rows ({n_flows:,})')

    (flow_features, flow_names) = _read_aligned_flow_features(
        Path(flow_features_path),
        flows,
        feature_columns=flow_feature_columns,
        align=flow_features_align,
    )

    # Drop flows that are too short; global_idx maps kept rows back to the
    # original row numbers, which the shard store needs for lazy reads.
    keep = lens_full >= min_len
    labels = labels_full[keep]
    flow_features = flow_features[keep]
    lens = lens_full[keep]
    global_idx = np.flatnonzero(keep).astype(np.int64)
    materialized = tokens_full[keep] if tokens_full is not None else None
    print(f'[data] kept {keep.sum():,} of {len(keep):,} (min_len={min_len})')

    benign = np.where(labels == benign_label)[0]
    attack = np.where(labels != benign_label)[0]
    # The order of RNG calls below (shuffle, val cap, attack cap) determines
    # the split for a given seed; keep it stable.
    rng = np.random.default_rng(split_seed)
    rng.shuffle(benign)
    n_train = int(len(benign) * train_ratio)
    train_local = benign[:n_train]
    val_local = benign[n_train:]
    if val_cap is not None and len(val_local) > val_cap:
        val_local = np.sort(rng.choice(val_local, size=val_cap, replace=False))
    if attack_cap is not None and len(attack) > attack_cap:
        attack = np.sort(rng.choice(attack, size=attack_cap, replace=False))
    print(f'[data] train={len(train_local):,} val={len(val_local):,} attack={len(attack):,}')

    def _materialize(idx_local: np.ndarray) -> np.ndarray:
        # Fetch packet tokens for rows indexed relative to the kept flows.
        if materialized is not None:
            return materialized[idx_local].astype(np.float32, copy=False)
        assert store is not None
        g = global_idx[idx_local]
        (tok, _) = store.read_packets(g.astype(np.int64), T=T)
        return tok.astype(np.float32, copy=False)

    tr_p = _materialize(train_local)
    va_p = _materialize(val_local)
    at_p = _materialize(attack)
    tr_l = lens[train_local]
    va_l = lens[val_local]
    at_l = lens[attack]
    tr_f = flow_features[train_local]
    va_f = flow_features[val_local]
    at_f = flow_features[attack]

    # Split the token's last axis into continuous and binary channels.
    cont_idx = list(PACKET_CONTINUOUS_CHANNEL_IDX)
    disc_idx = list(PACKET_BINARY_CHANNEL_IDX)
    tr_cont = tr_p[..., cont_idx]
    va_cont = va_p[..., cont_idx]
    at_cont = at_p[..., cont_idx]
    tr_disc = tr_p[..., disc_idx].astype(np.int8)
    va_disc = va_p[..., disc_idx].astype(np.int8)
    at_disc = at_p[..., disc_idx].astype(np.int8)

    (tr_cont, va_cont, at_cont, c_mean, c_std) = _zscore_cont(tr_cont, va_cont, at_cont, tr_l, va_l, at_l)
    (tr_flow, va_flow, at_flow, f_mean, f_std) = _preprocess_flow(tr_f, va_f, at_f)

    return MixedData(
        train_cont=tr_cont,
        val_cont=va_cont,
        attack_cont=at_cont,
        train_disc=tr_disc,
        val_disc=va_disc,
        attack_disc=at_disc,
        train_flow=tr_flow,
        val_flow=va_flow,
        attack_flow=at_flow,
        train_len=tr_l,
        val_len=va_l,
        attack_len=at_l,
        attack_labels=labels[attack],
        cont_mean=c_mean,
        cont_std=c_std,
        flow_mean=f_mean,
        flow_std=f_std,
        flow_feature_names=tuple(flow_names),
    )
|
|
|
|
def subsample_train(
    data: MixedData,
    n: int,
    seed: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return a seeded random subset of the training split.

    When ``n`` is non-positive or at least the number of training rows, the
    full training arrays are returned unchanged. Otherwise ``n`` distinct rows
    are drawn with ``np.random.default_rng(seed)`` and returned in ascending
    row order.

    Returns:
        ``(train_flow, train_cont, train_disc, train_len)``, subsampled with
        the same row indices.
    """
    total = len(data.train_cont)
    if 0 < n < total:
        picker = np.random.default_rng(seed)
        rows = picker.choice(total, n, replace=False)
        rows.sort()  # keep the original row order inside the subset
        return (
            data.train_flow[rows],
            data.train_cont[rows],
            data.train_disc[rows],
            data.train_len[rows],
        )
    return (data.train_flow, data.train_cont, data.train_disc, data.train_len)
|