Mixed_CFM: absorb Unified_CFM primitives; remove Unified_CFM
Mixed_CFM was loading AdaLNBlock / SinusoidalTimeEmb / _sinkhorn_coupling and flow-feature helpers from Unified_CFM via importlib spec hacks. Pulled those symbols into Mixed_CFM/_layers.py (model primitives) and inlined the flow-feature loader helpers into Mixed_CFM/data.py, then deleted Unified_CFM/ entirely along with three dead aggregate shell scripts whose referenced eval entry point (artifacts/verify_2026_04_24/) was already gone. Verified: historic janus_iscxtor2016_seed42 checkpoint re-evaluated under the absorbed code reproduces all 10 phase1 AUROC scores to 6 decimals; same-seed retrain converges to within +/-0.001 on terminal_norm (residual drift is CUDA non-determinism in MultiheadAttention + Sinkhorn argmax, not the absorption). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
59
Mixed_CFM/_layers.py
Normal file
59
Mixed_CFM/_layers.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
@torch.no_grad()
def _sinkhorn_coupling(C: torch.Tensor, reg: float=0.05, n_iter: int=20) -> torch.Tensor:
    """Return, for each row of a cost matrix, the column favoured by a Sinkhorn plan.

    The kernel ``exp(-C / reg)`` is rescaled in the log domain by alternating
    column and row updates for ``n_iter`` rounds; the per-row argmax of the
    resulting log-plan is returned. Gradients are disabled for the whole call.

    Args:
        C: 2-D cost matrix of shape ``(rows, cols)``; cast to float32 here.
        reg: Entropic regularisation (temperature dividing the cost).
        n_iter: Number of rounds; each round updates the column scaling and
            then the row scaling.

    Returns:
        Integer tensor of shape ``(rows,)`` holding one column index per row.
        This is a row-wise argmax, so the assignment need not be one-to-one.
    """
    C = C.float()
    log_k = -C / reg
    n_rows, n_cols = C.shape[0], C.shape[1]
    log_u = torch.zeros(n_rows, device=C.device)
    # The column potential is broadcast along dim 1, so it must be sized by
    # the column count; this keeps rectangular inputs valid even when
    # n_iter == 0. For square inputs the result is unchanged.
    log_v = torch.zeros(n_cols, device=C.device)
    for _ in range(n_iter):
        log_v = -torch.logsumexp(log_k + log_u.unsqueeze(1), dim=0)
        log_u = -torch.logsumexp(log_k + log_v.unsqueeze(0), dim=1)
    log_p = log_u.unsqueeze(1) + log_k + log_v.unsqueeze(0)
    return log_p.argmax(dim=1)
|
||||
|
||||
|
||||
class SinusoidalTimeEmb(nn.Module):
    """Parameter-free sinusoidal embedding of a batch of scalar values.

    For a 1-D input of shape ``(B,)`` the output has shape ``(B, dim)``: the
    first ``dim // 2`` channels are sines and the remaining ones cosines of
    the input multiplied by geometrically spaced frequencies from 1 down to
    1/10000.
    """

    def __init__(self, dim: int) -> None:
        """Store the embedding width.

        Raises:
            ValueError: If ``dim`` is odd (sin and cos halves must match).
        """
        super().__init__()
        if dim % 2:
            raise ValueError('time embedding dimension must be even')
        self.dim = dim

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Embed ``t``; frequencies are built on ``t``'s device and dtype."""
        n_freq = self.dim // 2
        # max(..., 1) avoids a zero divisor when there is a single frequency.
        span = max(n_freq - 1, 1)
        index = torch.arange(n_freq, device=t.device, dtype=t.dtype)
        freqs = torch.exp(-math.log(10000) * index / span)
        phase = t[:, None] * freqs[None, :]
        return torch.cat([phase.sin(), phase.cos()], dim=-1)
|
||||
|
||||
|
||||
class AdaLNBlock(nn.Module):
    """Self-attention + MLP block modulated by a conditioning vector.

    A linear projection of ``cond`` yields six ``d_model``-wide vectors: a
    scale and shift applied after each (non-affine) LayerNorm, and a gate on
    each residual branch. The projection is zero-initialised, so a freshly
    constructed block returns its input unchanged.
    """

    def __init__(self, d_model: int, n_heads: int, mlp_ratio: float, cond_dim: int) -> None:
        """Build the submodules.

        Args:
            d_model: Token feature width.
            n_heads: Number of attention heads.
            mlp_ratio: Hidden MLP width as a multiple of ``d_model``
                (truncated to an int).
            cond_dim: Width of the conditioning vector.
        """
        super().__init__()
        # Attribute names and creation order are deliberate: names fix the
        # state_dict keys, and order fixes how a seeded RNG is consumed
        # while the parameterised layers initialise.
        self.norm1 = nn.LayerNorm(d_model, elementwise_affine=False)
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(d_model, elementwise_affine=False)
        mlp_width = int(d_model * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, mlp_width),
            nn.GELU(),
            nn.Linear(mlp_width, d_model),
        )
        self.cond_proj = nn.Linear(cond_dim, 6 * d_model)
        # Zero init => all scales, shifts and gates start at 0.
        for param in (self.cond_proj.weight, self.cond_proj.bias):
            nn.init.zeros_(param)

    @staticmethod
    def _modulate(x: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
        """Apply ``x * (1 + gamma) + beta`` with gamma/beta broadcast over axis 1."""
        scale = 1.0 + gamma[:, None, :]
        return x * scale + beta[:, None, :]

    def forward(self, x: torch.Tensor, cond: torch.Tensor, key_padding_mask: torch.Tensor | None, attn_mask: torch.Tensor | None=None) -> torch.Tensor:
        """Run the block.

        Args:
            x: Token tensor; the attention layer is batch-first.
            cond: Conditioning tensor whose last axis has size ``cond_dim``.
            key_padding_mask: Forwarded to the attention layer (may be None).
            attn_mask: Forwarded to the attention layer (may be None).

        Returns:
            Tensor with the same shape as ``x``.
        """
        modulation = self.cond_proj(cond)
        scale_a, shift_a, gate_a, scale_m, shift_m, gate_m = torch.chunk(modulation, 6, dim=-1)

        attn_in = self._modulate(self.norm1(x), scale_a, shift_a)
        attn_out = self.attn(
            attn_in,
            attn_in,
            attn_in,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            need_weights=False,
        )[0]
        x = x + gate_a[:, None, :] * attn_out

        mlp_in = self._modulate(self.norm2(x), scale_m, shift_m)
        return x + gate_m[:, None, :] * self.mlp(mlp_in)
|
||||
@@ -7,19 +7,116 @@ import pandas as pd
|
||||
import sys as _sys
|
||||
from pathlib import Path as _Path
|
||||
_sys.path.insert(0, str(_Path(__file__).resolve().parents[1]))
|
||||
from common.data_contract import PACKET_FEATURE_NAMES, PACKET_CONTINUOUS_CHANNEL_IDX, PACKET_BINARY_CHANNEL_IDX, fit_packet_stats as _fit_packet_stats, zscore as _zscore
|
||||
import importlib.util as _ilu
|
||||
_UDATA_NAME = 'unified_cfm_data'
|
||||
if _UDATA_NAME not in _sys.modules:
|
||||
_udata_spec = _ilu.spec_from_file_location(_UDATA_NAME, _Path(__file__).resolve().parents[1] / 'Unified_CFM' / 'data.py')
|
||||
_udata = _ilu.module_from_spec(_udata_spec)
|
||||
_sys.modules[_UDATA_NAME] = _udata
|
||||
_udata_spec.loader.exec_module(_udata)
|
||||
else:
|
||||
_udata = _sys.modules[_UDATA_NAME]
|
||||
DEFAULT_FLOW_META_COLUMNS = _udata.DEFAULT_FLOW_META_COLUMNS
|
||||
_read_aligned_flow_features = _udata._read_aligned_flow_features
|
||||
_preprocess_flow = _udata._preprocess_flow
|
||||
from common.data_contract import (
|
||||
PACKET_FEATURE_NAMES,
|
||||
PACKET_CONTINUOUS_CHANNEL_IDX,
|
||||
PACKET_BINARY_CHANNEL_IDX,
|
||||
canonical_5tuple as _canonical_key,
|
||||
fit_packet_stats as _fit_packet_stats,
|
||||
zscore as _zscore,
|
||||
)
|
||||
|
||||
# Column names treated as metadata rather than features: they are excluded
# when numeric feature columns are auto-selected from a flow-feature table.
DEFAULT_FLOW_META_COLUMNS = {'flow_id', 'label', 'day', 'service', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'timestamp', 'start_ts', 'n_pkts'}
|
||||
|
||||
|
||||
def _read_flow_features(path: Path, *, expected_rows: int, feature_columns: Optional[list[str]]=None) -> tuple[np.ndarray, tuple[str, ...], np.ndarray | None]:
    """Load a row-aligned flow-feature matrix from an ``.npz`` or parquet file.

    Args:
        path: ``.npz`` archive (keys ``features`` and optionally
            ``feature_names`` / ``flow_id``) or a ``.parquet`` / ``.pq`` table.
        expected_rows: Required number of rows in the loaded matrix.
        feature_columns: Parquet only: explicit columns to use. When empty or
            None, every numeric column not in ``DEFAULT_FLOW_META_COLUMNS``
            is used. Ignored for ``.npz`` input.

    Returns:
        ``(x, names, flow_id)`` where ``x`` is float32 with NaN/+-inf replaced
        by 0, ``names`` are the feature names as strings, and ``flow_id`` is
        the id array stored in the file, or None if the file has none.

    Raises:
        ValueError: Unsupported suffix, no usable feature columns, or a row
            count different from ``expected_rows``.
    """
    path = Path(path)
    if path.suffix == '.npz':
        # NOTE(security): allow_pickle=True unpickles object arrays, which can
        # execute arbitrary code; only load archives from trusted sources.
        # The context manager closes the archive's file handle once the
        # arrays have been materialised.
        with np.load(path, allow_pickle=True) as data:
            x = data['features'].astype(np.float32)
            # Fall back to positional names when the archive stores none.
            raw_names = data['feature_names'] if 'feature_names' in data.files else np.arange(x.shape[1])
            names = tuple(str(v) for v in raw_names)
            flow_id = data['flow_id'] if 'flow_id' in data.files else None
    elif path.suffix in ('.parquet', '.pq'):
        df = pd.read_parquet(path)
        flow_id = df['flow_id'].to_numpy() if 'flow_id' in df.columns else None
        if feature_columns:
            cols = feature_columns
        else:
            cols = [c for c in df.columns if c not in DEFAULT_FLOW_META_COLUMNS and pd.api.types.is_numeric_dtype(df[c])]
        if not cols:
            raise ValueError(f'no numeric flow feature columns found in {path}')
        x = df[cols].to_numpy(dtype=np.float32)
        names = tuple(cols)
    else:
        raise ValueError(f'unsupported flow feature file: {path}')
    if len(x) != expected_rows:
        raise ValueError(f'flow feature row count {len(x):,} != packet row count {expected_rows:,}')
    # Non-finite entries are zeroed before the matrix is returned.
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    return (x, names, flow_id)
|
||||
|
||||
|
||||
def _feature_columns_from_df(df: pd.DataFrame, requested: Optional[list[str]]) -> list[str]:
    """Choose which columns of ``df`` are used as flow features.

    A non-empty ``requested`` list is returned as given (no validation
    against ``df``). Otherwise the result is every numeric column of ``df``
    whose name is not in ``DEFAULT_FLOW_META_COLUMNS``, in column order.
    """
    if not requested:
        is_numeric = pd.api.types.is_numeric_dtype
        return [
            name
            for name in df.columns
            if name not in DEFAULT_FLOW_META_COLUMNS and is_numeric(df[name])
        ]
    return requested
|
||||
|
||||
|
||||
def _align_flow_features_by_scan(feature_df: pd.DataFrame, packet_flows: pd.DataFrame, *, feature_columns: list[str]) -> tuple[np.ndarray, tuple[str, ...]]:
    """Align feature rows to packet-flow rows with one forward scan.

    Each packet flow is keyed by ``(str(label), canonical 5-tuple)``. The
    feature table is walked once, front to back; for every packet flow the
    next feature row with an equal key is taken and the rows passed over are
    skipped. Matches are therefore order-preserving and never reuse a row.

    Args:
        feature_df: Feature table holding ``label``, the 5-tuple columns and
            ``feature_columns``.
        packet_flows: Packet-flow table holding ``label`` and the 5-tuple
            columns.
        feature_columns: Columns of ``feature_df`` to return.

    Returns:
        ``(x, names)``: a float32 matrix with one row per packet flow
        (NaN/+-inf replaced by 0) and the feature names as a tuple. An empty
        ``packet_flows`` yields a ``(0, len(feature_columns))`` matrix.

    Raises:
        ValueError: Required metadata columns are missing, or some packet
            flow has no remaining matching feature row.
    """
    required = ['label', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'protocol']
    missing_feature = [c for c in required if c not in feature_df.columns]
    missing_packet = [c for c in required if c not in packet_flows.columns]
    if missing_feature or missing_packet:
        raise ValueError(f'scan alignment requires label + 5-tuple metadata. missing in feature_df={missing_feature}, packet_flows={missing_packet}')
    packet_keys = [
        (str(lbl), _canonical_key(src, sp, dst, dp, proto))
        for (lbl, src, sp, dst, dp, proto) in zip(
            packet_flows['label'].to_numpy(),
            packet_flows['src_ip'].to_numpy(),
            packet_flows['src_port'].to_numpy(),
            packet_flows['dst_ip'].to_numpy(),
            packet_flows['dst_port'].to_numpy(),
            packet_flows['protocol'].to_numpy(),
        )
    ]
    labels = feature_df['label'].to_numpy()
    src_ip = feature_df['src_ip'].to_numpy()
    src_port = feature_df['src_port'].to_numpy()
    dst_ip = feature_df['dst_ip'].to_numpy()
    dst_port = feature_df['dst_port'].to_numpy()
    protocol = feature_df['protocol'].to_numpy()
    matched: list[int] = []
    j = 0  # scan cursor into feature_df; never moves backwards
    n_csv = len(feature_df)
    for (i, target) in enumerate(packet_keys):
        while j < n_csv:
            cand = (str(labels[j]), _canonical_key(src_ip[j], src_port[j], dst_ip[j], dst_port[j], protocol[j]))
            j += 1
            if cand == target:
                matched.append(j - 1)
                break
        else:
            # while/else: the feature rows ran out without a match.
            raise ValueError(f'failed to align packet flow row {i:,}/{len(packet_keys):,}; the CSV cache may not be the same one used for packet extraction')
    # With no packet flows there is no last match to index; report 0 skipped
    # instead of failing on matched[-1].
    skipped = matched[-1] + 1 - len(matched) if matched else 0
    print(f'[data] scan-aligned CSV flow features: matched={len(matched):,} from csv_rows={n_csv:,} skipped={skipped:,}')
    x = feature_df.iloc[matched][feature_columns].to_numpy(dtype=np.float32)
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    return (x, tuple(feature_columns))
|
||||
|
||||
|
||||
def _read_aligned_flow_features(path: Path, packet_flows: pd.DataFrame, *, feature_columns: Optional[list[str]]=None, align: str='auto') -> tuple[np.ndarray, tuple[str, ...]]:
    """Load flow features and line them up with the rows of ``packet_flows``.

    ``.npz`` input must already be row-aligned: it is loaded with the packet
    row count enforced and, when both sides carry ``flow_id``, the ids must
    be equal. Parquet input is used row-for-row when the row counts agree and
    the ids are either unavailable on one side or equal; otherwise it is
    aligned by a label + 5-tuple scan, unless ``align='row'`` forbids that.

    Args:
        path: ``.npz``, ``.parquet`` or ``.pq`` feature file.
        packet_flows: Packet-flow table defining the target row order.
        feature_columns: Optional explicit feature columns (parquet only).
        align: One of ``'auto'``, ``'row'``, ``'scan'``.

    Returns:
        ``(x, names)``: float32 feature matrix and feature names.

    Raises:
        ValueError: Bad ``align`` value, unsupported suffix, no feature
            columns, id mismatch, or row alignment impossible under
            ``align='row'``.
    """
    path = Path(path)
    if align not in ('auto', 'row', 'scan'):
        raise ValueError("flow_features_align must be 'auto', 'row', or 'scan'")

    suffix = path.suffix
    if suffix == '.npz':
        (values, names, npz_ids) = _read_flow_features(path, expected_rows=len(packet_flows), feature_columns=feature_columns)
        packet_ids = packet_flows['flow_id'].to_numpy() if 'flow_id' in packet_flows else None
        ids_comparable = npz_ids is not None and packet_ids is not None
        if ids_comparable and not np.array_equal(npz_ids, packet_ids):
            raise ValueError('NPZ flow_id does not align with Packet_CFM flows')
        return (values, names)

    if suffix not in ('.parquet', '.pq'):
        raise ValueError(f'unsupported flow feature file: {path}')

    table = pd.read_parquet(path)
    selected = _feature_columns_from_df(table, feature_columns)
    if not selected:
        raise ValueError(f'no numeric flow feature columns found in {path}')

    packet_ids = packet_flows['flow_id'].to_numpy() if 'flow_id' in packet_flows else None
    same_length = len(table) == len(packet_flows)
    if same_length:
        table_ids = table['flow_id'].to_numpy() if 'flow_id' in table.columns else None
        # Rows are trusted as-is when ids cannot be compared or they agree.
        if table_ids is None or packet_ids is None or np.array_equal(table_ids, packet_ids):
            values = table[selected].to_numpy(dtype=np.float32)
            values = np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
            return (values, tuple(selected))

    if align == 'row':
        # Strict row mode: report which precondition failed.
        if same_length:
            raise ValueError("flow_id mismatch with flow_features_align='row'")
        raise ValueError(f'row alignment requested but feature rows={len(table):,} packet rows={len(packet_flows):,}')

    return _align_flow_features_by_scan(table, packet_flows, feature_columns=selected)
|
||||
|
||||
|
||||
def _preprocess_flow(train: np.ndarray, val: np.ndarray, attack: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Standardise the three splits with statistics taken from ``train`` only.

    The per-column mean and standard deviation of ``train`` (axis 0, cast to
    float32) are passed to ``_zscore`` for each split in turn.

    Returns:
        ``(train_z, val_z, attack_z, mean, std)``.
    """
    mu = train.mean(axis=0).astype(np.float32)
    sigma = train.std(axis=0).astype(np.float32)
    # NOTE(review): handling of zero-variance columns is left to _zscore,
    # which is defined outside this view; confirm it guards against sigma == 0.
    scaled = tuple(_zscore(split, mu, sigma) for split in (train, val, attack))
    return (*scaled, mu, sigma)
|
||||
|
||||
@dataclass
|
||||
class MixedData:
|
||||
|
||||
@@ -1,23 +1,14 @@
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import sys as _sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path as _Path
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import importlib.util as _ilu
|
||||
import sys as _sys
|
||||
from pathlib import Path as _Path
|
||||
_UNIFIED_NAME = 'unified_cfm_model'
|
||||
if _UNIFIED_NAME not in _sys.modules:
|
||||
_unified_spec = _ilu.spec_from_file_location(_UNIFIED_NAME, _Path(__file__).resolve().parents[1] / 'Unified_CFM' / 'model.py')
|
||||
_unified = _ilu.module_from_spec(_unified_spec)
|
||||
_sys.modules[_UNIFIED_NAME] = _unified
|
||||
_unified_spec.loader.exec_module(_unified)
|
||||
else:
|
||||
_unified = _sys.modules[_UNIFIED_NAME]
|
||||
AdaLNBlock = _unified.AdaLNBlock
|
||||
SinusoidalTimeEmb = _unified.SinusoidalTimeEmb
|
||||
_sinkhorn_coupling = _unified._sinkhorn_coupling
|
||||
|
||||
_sys.path.insert(0, str(_Path(__file__).resolve().parent))
|
||||
from _layers import AdaLNBlock, SinusoidalTimeEmb, _sinkhorn_coupling
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Reference in New Issue
Block a user