Initial commit: code, paper, small artifacts

This commit is contained in:
2026-05-07 20:47:30 +08:00
commit fae2db8cff
322 changed files with 33159 additions and 0 deletions

View File

@@ -0,0 +1,106 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root: two directory levels above the folder that contains this script.
REPO = Path(__file__).resolve().parents[2]
# Folder holding the per-run result JSONs read by _load(); main() also writes summary.md / summary.json here.
ROOT = REPO / 'artifacts/baselines/anomaly_transformer_2026_04_29'
# Protocol keys, in the order their rows appear in the generated tables.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up for every protocol (missing files are skipped).
SEEDS = (42, 43, 44)
# Per-flow score aggregators read from each run's 'overall_by_agg' entry.
AGGS = ('mean', 'max', 'median', 'p90')
# Reference AUROC per protocol as (mean, std) for the terminal_norm column; a std of None is rendered without "± std".
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Display names for the protocols, used in the markdown tables and the console recap.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Return the parsed result JSON of one (protocol, seed) run, or None when its file is absent."""
    result_file = ROOT / f'{protocol}_seed{seed}.json'
    return json.loads(result_file.read_text()) if result_file.exists() else None
def _ms(vals):
    """Return ``(mean, sample std)`` of ``vals`` after dropping None and NaN entries.

    The result is ``(nan, nan)`` when no usable value remains, and the std is
    0.0 when exactly one value remains (ddof=1 is undefined there).
    """
    kept = []
    for v in vals:
        if v is None or np.isnan(v):
            continue
        kept.append(v)
    arr = np.asarray(kept, dtype=np.float64)
    count = len(arr)
    if count == 0:
        nan = float('nan')
        return (nan, nan)
    spread = float(arr.std(ddof=1)) if count > 1 else 0.0
    return (float(arr.mean()), spread)
def _abs_auroc(v):
    """Return the orientation-free AUROC: the larger of ``v`` and ``1 - v``."""
    mirrored = 1.0 - v
    return mirrored if mirrored > v else v
def main():
    """Summarize the Anomaly-Transformer baseline runs into ``summary.md`` and ``summary.json``.

    For every protocol the per-seed result files under ROOT are loaded (missing
    seeds are skipped), AUROC is averaged per aggregator, and the aggregator
    with the highest orientation-free ("abs") AUROC mean becomes the headline
    row. Both output files are written into ROOT and a short recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_abs_aurocs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                ov = r['overall_by_agg'][agg]
                agg_aurocs[agg].append(ov['auroc'])
                agg_abs_aurocs[agg].append(_abs_auroc(ov['auroc']))
            # Per-class numbers are only collected for the 'mean' aggregator;
            # 'n' is taken from the first seed that reports the class.
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                per_class_collect[protocol][cls]['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (am, asd) = _ms(agg_abs_aurocs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'abs_auroc_mean': am, 'abs_auroc_std': asd}
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['abs_auroc_mean'])
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': agg_summary[best_agg]['auroc_mean'], 'auroc_std': agg_summary[best_agg]['auroc_std'], 'abs_auroc_mean': agg_summary[best_agg]['abs_auroc_mean'], 'abs_auroc_std': agg_summary[best_agg]['abs_auroc_std'], 'all_aggs': agg_summary})

    lines = [
        '# Anomaly-Transformer (ICLR 2022) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: ICLR 2022 Anomaly-Transformer (association-discrepancy minimax). Vendored model class from `baselines/Anomaly-Transformer/model/AnomalyTransformer.py`; training + scoring loop reimplemented to match our protocol (input shape [B, T=64, D=9] = our z-scored packet sequences, same train/val/attack splits as eval_new_scores.py).',
        'Hyperparams: d_model=128, n_heads=4, e_layers=3, batch=128, lr=1e-4, k_disc=3.0, temperature=50.0, epochs=15.',
        'Score: per-position softmax(-association_KL · T) · MSE(rec, x), then aggregated per flow (mean / max / median / p90).',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (Unified_CFM) | **AT (ours)** | abs AUROC | best agg | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        (am, asd) = (row['abs_auroc_mean'], row['abs_auroc_std'])
        if np.isnan(m):
            # No run was found for this protocol: leave it out of the headline table.
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        d_terminal = m - tn_m
        lines.append(f"| {PRETTY[p]} | {tn_str} | **{m:.4f} ± {sd:.4f}** | {am:.4f} ± {asd:.4f} | `{row['best_agg']}` | {d_terminal:+.4f} |")
    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')
    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            continue
        lines.append('| attack | n | AT AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    out = ROOT / 'summary.md'
    # Explicit UTF-8: the report contains non-ASCII characters (σ, →, ±, Δ, —)
    # that the locale's default encoding is not guaranteed to be able to encode.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} raw={row['auroc_mean']:.4f}±{row['auroc_std']:.4f} abs={row['abs_auroc_mean']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,109 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root: two directory levels above the folder that contains this script.
REPO = Path(__file__).resolve().parents[2]
# Folder holding the per-run result JSONs read by _load(); main() also writes summary.md / summary.json here.
ROOT = REPO / 'artifacts/baselines/kitsune_2026_04_29'
# Protocol keys, in the order their rows appear in the generated tables.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up for every protocol (missing files are skipped).
SEEDS = (42, 43, 44)
# Per-flow score aggregators read from each run's 'overall_by_agg' entry.
AGGS = ('mean', 'max', 'median', 'p90')
# Reference AUROC per protocol as (mean, std) for the terminal_norm column; a std of None is rendered without "± std".
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Values for the "Kitsune paper" column as (AUROC, unused); main() reads only the first element, and None leaves the cell and its delta blank.
KITSUNE_PAPER = {'iscxtor_within': (0.78, None), 'cicids_within': (0.85, None), 'cicddos_within': (None, None), 'forward_cross': (None, None), 'reverse_cross': (None, None)}
# Display names for the protocols, used in the markdown tables and the console recap.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Return the parsed result JSON of one (protocol, seed) run, or None when its file is absent."""
    result_file = ROOT / f'{protocol}_seed{seed}.json'
    return json.loads(result_file.read_text()) if result_file.exists() else None
def _ms(vals):
    """Return ``(mean, sample std)`` of ``vals`` after dropping None and NaN entries.

    The result is ``(nan, nan)`` when no usable value remains, and the std is
    0.0 when exactly one value remains (ddof=1 is undefined there).
    """
    kept = []
    for v in vals:
        if v is None or np.isnan(v):
            continue
        kept.append(v)
    arr = np.asarray(kept, dtype=np.float64)
    count = len(arr)
    if count == 0:
        nan = float('nan')
        return (nan, nan)
    spread = float(arr.std(ddof=1)) if count > 1 else 0.0
    return (float(arr.mean()), spread)
def main():
    """Summarize the Kitsune (Path B) baseline runs into ``summary.md`` and ``summary.json``.

    For every protocol the per-seed result files under ROOT are loaded (missing
    seeds are skipped), AUROC and AUPRC are averaged per aggregator, and the
    aggregator with the highest AUROC mean becomes the headline row. Both
    output files are written into ROOT and a short recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_auprcs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                ov = r['overall_by_agg'][agg]
                agg_aurocs[agg].append(ov['auroc'])
                agg_auprcs[agg].append(ov['auprc'])
            # Per-class numbers are only collected for the 'mean' aggregator;
            # 'n' is taken from the first seed that reports the class.
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                per_class_collect[protocol][cls]['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (ma, sda) = _ms(agg_auprcs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda}
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['auroc_mean'])
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': agg_summary[best_agg]['auroc_mean'], 'auroc_std': agg_summary[best_agg]['auroc_std'], 'all_aggs': agg_summary})

    lines = [
        '# Kitsune (Path B) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: KitNET ensemble autoencoder (the ML core of Kitsune).',
        "**Path B**: feeds our **z-scored 9-d packet features** directly through `KitNET.process()` for the FM+AD grace, then `KitNET.execute()` per packet during eval. **AfterImage's 100-d host/session statistics are skipped** (they require sequential pcap streams which our (B,T,9) tensor abstraction discards). This keeps data usage unified with `eval_new_scores.py`.",
        'Train: 5000 source-benign flows → ~75-320k packets (≥ FM+AD=55k grace).',
        'Score: per-flow aggregate of per-packet RMSE (mean / max / median / p90).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm | Kitsune paper (Shafir reproduction) | **Kitsune Path B (ours)** | best agg | Δ vs paper | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (kp_m, _) = KITSUNE_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | — | (no runs) | — | — | — |')
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        kp_str = f'{kp_m:.4f}' if kp_m is not None else ''
        d_terminal = m - tn_m
        d_paper = m - kp_m if kp_m is not None else None
        d_paper_str = f'{d_paper:+.4f}' if d_paper is not None else ''
        lines.append(f"| {PRETTY[p]} | {tn_str} | {kp_str} | **{m:.4f} ± {sd:.4f}** | `{row['best_agg']}` | {d_paper_str} | {d_terminal:+.4f} |")
    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')
    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Kitsune AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    out = ROOT / 'summary.md'
    # Explicit UTF-8: the report contains non-ASCII characters (σ, →, ≥, ±, Δ, —)
    # that the locale's default encoding is not guaranteed to be able to encode.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM, 'kitsune_paper': KITSUNE_PAPER}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,93 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root: two directory levels above the folder that contains this script.
REPO = Path(__file__).resolve().parents[2]
# Folder holding the per-run result JSONs read by _load(); main() also writes summary.md / summary.json here.
ROOT = REPO / 'artifacts/baselines/shafir_nf_2026_04_29'
# Protocol keys, in the order their rows appear in the generated tables.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up for every protocol (missing files are skipped).
SEEDS = (42, 43, 44)
# Reference AUROC per protocol as (mean, std) for the terminal_norm column; a std of None is rendered without "± std".
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Values for the "Shafir NF — paper" column as (AUROC, unused); main() reads only the first element.
SHAFIR_PAPER = {'iscxtor_within': (0.8731, None), 'cicids_within': (0.9303, None), 'cicddos_within': (0.93, None), 'forward_cross': (0.89, None), 'reverse_cross': (0.93, None)}
# Display names for the protocols, used in the markdown tables and the console recap.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Return the parsed result JSON of one (protocol, seed) run, or None when its file is absent."""
    result_file = ROOT / f'{protocol}_seed{seed}.json'
    return json.loads(result_file.read_text()) if result_file.exists() else None
def _ms(vals):
    """Return ``(mean, sample std)`` of ``vals`` after dropping None and NaN entries.

    The result is ``(nan, nan)`` when no usable value remains, and the std is
    0.0 when exactly one value remains (ddof=1 is undefined there).
    """
    kept = []
    for v in vals:
        if v is None or np.isnan(v):
            continue
        kept.append(v)
    arr = np.asarray(kept, dtype=np.float64)
    count = len(arr)
    if count == 0:
        nan = float('nan')
        return (nan, nan)
    spread = float(arr.std(ddof=1)) if count > 1 else 0.0
    return (float(arr.mean()), spread)
def main():
    """Summarize the Shafir NF baseline runs into ``summary.md`` and ``summary.json``.

    For every protocol the per-seed result files under ROOT are loaded (missing
    seeds are skipped) and the ``neg_log_prob`` AUROC / AUPRC plus the training
    time are averaged. Both output files are written into ROOT and a short
    recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        (aurocs, auprcs, t_train) = ([], [], [])
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            aurocs.append(r['overall']['neg_log_prob']['auroc'])
            auprcs.append(r['overall']['neg_log_prob']['auprc'])
            t_train.append(r.get('t_train_sec', 0.0))
            # 'n' is taken from the first seed that reports the class.
            for (cls, info) in r.get('per_class', {}).items():
                per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                per_class_collect[protocol][cls]['aurocs'].append(info['auroc'])
        (m, sd) = _ms(aurocs)
        (ma, sda) = _ms(auprcs)
        (tt, _) = _ms(t_train)
        rows.append({'protocol': protocol, 'n_seeds': len(aurocs), 'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda, 't_train_sec_mean': tt})

    lines = [
        '# Shafir 2026 NF Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        "Method: Shafir's official `pzflow.Flow` (single basic NF).",
        'Features: our **20-d canonical packet-derived flow features** (`common.data_contract.CANONICAL_FLOW_FEATURE_NAMES`), z-scored with the **same source training stats** that the Unified_CFM checkpoint uses.',
        'Train cap: 10,000 source-benign samples (Shafir paper protocol).',
        'Optimizer: SGD lr=1e-3, 100 epochs (Shafir paper defaults).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (ours) | Shafir NF — paper | **Shafir NF — our features** | Δ vs paper | Δ vs terminal_norm |',
        '|---|---:|---:|---:|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (sp_m, _) = SHAFIR_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | {sp_m:.4f} | (no runs yet) | — | — |')
            continue
        d_paper = m - sp_m
        d_terminal = m - tn_m
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        lines.append(f'| {PRETTY[p]} | {tn_str} | {sp_m:.4f} | **{m:.4f} ± {sd:.4f}** | {d_paper:+.4f} | {d_terminal:+.4f} |')
    lines.append('')
    lines.append('## Per-protocol stats')
    lines.append('')
    lines.append('| Protocol | n_seeds | AUPRC mean ± std | Train time (s, mean) |')
    lines.append('|---|---:|---:|---:|')
    for row in rows:
        p = row['protocol']
        (m, sd) = (row['auprc_mean'], row['auprc_std'])
        if np.isnan(m):
            continue
        lines.append(f"| {PRETTY[p]} | {row['n_seeds']} | {m:.4f} ± {sd:.4f} | {row['t_train_sec_mean']:.1f} |")
    lines.append('')
    lines.append('## Per-attack (forward + reverse)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Shafir NF AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    out = ROOT / 'summary.md'
    # Explicit UTF-8: the report contains non-ASCII characters (σ, →, ±, Δ, —)
    # that the locale's default encoding is not guaranteed to be able to encode.
    out.write_text('\n'.join(lines), encoding='utf-8')
    # Unlike the markdown, the JSON stores the per-class mean/std instead of the raw per-seed list.
    summary_json = {'rows': rows, 'per_class': {p: {cls: {'n': v['n'], **dict(zip(['mean', 'std'], _ms(v['aurocs'])))} for (cls, v) in dd.items()} for (p, dd) in per_class_collect.items()}, 'baselines': {'terminal_norm': TERMINAL_NORM, 'shafir_paper': SHAFIR_PAPER}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,267 @@
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from sklearn.metrics import average_precision_score, roc_auc_score
# Repository root: two directory levels above the folder that contains this script.
REPO = Path(__file__).resolve().parents[2]
# Put these in-repo source folders at the front of sys.path so the project-local
# imports that follow (data, packet_store, model.AnomalyTransformer) can be resolved.
# NOTE(review): which folder provides which module is not visible here — confirm.
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
sys.path.insert(0, str(REPO / 'baselines/Anomaly-Transformer'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from packet_store import PacketShardStore
from model.AnomalyTransformer import AnomalyTransformer
# Within-dataset protocols: protocol -> (run-directory template under REPO/'artifacts', sampling caps).
# main() fills in {seed} and passes caps['n_val'] / caps['n_atk'] to _load_within(); None means "no cap".
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': 20000})}
# Cross-dataset protocols: 'model_template' locates the source run (config + checkpoint) under
# REPO/'artifacts'; 'target_store' / 'target_flows' are paths relative to REPO for the evaluation
# dataset; 'n_benign' / 'n_attack' are the sample sizes drawn by _load_cross().
CROSS_DIRS = {'forward_cross': {'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'target_store': 'datasets/cicddos2019/processed/full_store', 'target_flows': 'datasets/cicddos2019/processed/flows.parquet', 'n_benign': 10000, 'n_attack': 10000}, 'reverse_cross': {'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'target_store': 'datasets/cicids2017/processed/full_store', 'target_flows': 'datasets/cicids2017/processed/flows.parquet', 'n_benign': 10000, 'n_attack': 10000}}
def _load_within(model_dir, n_val, n_atk, n_train_cap, seed):
    """Load train / benign-val / attack packet arrays for a within-dataset protocol.

    The data settings come from ``model_dir / 'config.yaml'``. After loading,
    the train split is subsampled to at most ``n_train_cap`` flows and the
    val / attack splits to ``n_val`` / ``n_atk`` flows (a cap of None disables
    the subsampling of that split). All three draws share one generator seeded
    with ``seed`` and happen in the order train, val, attack.

    Returns a dict with the keys ``train_packets``, ``train_len``,
    ``val_packets``, ``val_len``, ``atk_packets``, ``atk_len``, ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _opt_path(key):
        # Optional path settings are forwarded as None when missing or empty.
        return Path(cfg[key]) if cfg.get(key) else None

    data = load_unified_data(
        packets_npz=_opt_path('packets_npz'),
        source_store=_opt_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_opt_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        # Caps stored in the run config take precedence over the protocol defaults.
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    )
    rng = np.random.default_rng(seed)

    def _pick(total, cap):
        # Sorted random subset of ``cap`` row indices out of ``total``.
        return np.sort(rng.choice(total, size=cap, replace=False))

    train_packets = data.train_packets
    train_len = data.train_len
    if len(train_packets) > n_train_cap:
        keep = _pick(len(train_packets), n_train_cap)
        train_packets = train_packets[keep]
        train_len = train_len[keep]

    val_packets = data.val_packets
    val_len = data.val_len
    if n_val is not None and len(val_packets) > n_val:
        keep = _pick(len(val_packets), n_val)
        val_packets = val_packets[keep]
        val_len = val_len[keep]

    atk_packets = data.attack_packets
    atk_len = data.attack_len
    atk_labels = data.attack_labels
    if n_atk is not None and len(atk_packets) > n_atk:
        keep = _pick(len(atk_packets), n_atk)
        atk_packets = atk_packets[keep]
        atk_len = atk_len[keep]
        atk_labels = atk_labels[keep]

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Load source-train and target benign/attack packet arrays for a cross-dataset protocol.

    Training flows come from the source run named by ``spec['model_template']``
    (subsampled to at most ``n_train_cap`` with a generator seeded ``seed + 1000``).
    Evaluation flows are sampled from the target dataset with a generator
    seeded ``seed``: ``n_benign`` flows labelled 'normal' and roughly
    ``n_attack`` attack flows split evenly across the attack classes. Target
    packets are normalized with the checkpoint's ``packet_mean`` /
    ``packet_std`` and zeroed beyond each flow's length.

    Returns a dict with the keys ``train_packets``, ``train_len``,
    ``val_packets``, ``val_len``, ``atk_packets``, ``atk_len``, ``atk_labels``.
    """
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))

    # ---- source side: training flows of the run that produced the checkpoint
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())

    def _opt_path(key):
        # Optional path settings are forwarded as None when missing or empty.
        return Path(src_cfg[key]) if src_cfg.get(key) else None

    src_data = load_unified_data(
        packets_npz=_opt_path('packets_npz'),
        source_store=_opt_path('source_store'),
        flows_parquet=Path(src_cfg['flows_parquet']),
        flow_features_path=_opt_path('flow_features_path'),
        flow_feature_columns=src_cfg.get('flow_feature_columns'),
        flow_features_align=str(src_cfg.get('flow_features_align', 'auto')),
        T=int(src_cfg['T']),
        split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))),
        train_ratio=float(src_cfg.get('train_ratio', 0.8)),
        benign_label=str(src_cfg.get('benign_label', 'normal')),
        min_len=int(src_cfg.get('min_len', 2)),
        packet_preprocess=packet_preprocess,
        attack_cap=None,
        val_cap=None,
    )
    train_rng = np.random.default_rng(seed + 1000)
    train_packets = src_data.train_packets
    train_len = src_data.train_len
    if len(train_packets) > n_train_cap:
        keep = np.sort(train_rng.choice(len(train_packets), size=n_train_cap, replace=False))
        train_packets = train_packets[keep]
        train_len = train_len[keep]

    # ---- target side: sample benign and attack flow indices
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    n_benign = int(spec['n_benign'])
    n_attack = int(spec['n_attack'])
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()
    target_rng = np.random.default_rng(seed)
    is_benign = labels == 'normal'
    benign_idx = np.flatnonzero(is_benign)
    attack_idx = np.flatnonzero(~is_benign)
    b_sel = np.sort(target_rng.choice(benign_idx, size=n_benign, replace=False))

    attack_labels = labels[attack_idx]
    atk_classes = sorted(set(attack_labels))
    per_class = max(1, n_attack // len(atk_classes))
    picked = []
    for cls in atk_classes:
        pool = attack_idx[attack_labels == cls]
        take = min(per_class, len(pool))
        if take > 0:
            picked.append(target_rng.choice(pool, size=take, replace=False))
    a_sel = np.sort(np.concatenate(picked))
    # max(1, ...) above can overshoot when there are more classes than n_attack.
    if len(a_sel) > n_attack:
        a_sel = np.sort(target_rng.choice(a_sel, size=n_attack, replace=False))

    # ---- target side: read packets, normalize, and blank out padding
    store = PacketShardStore.open(target_store)

    def _materialize(flow_idx):
        (tokens, lengths) = store.read_packets(flow_idx, T=T)
        return (tokens.astype(np.float32), np.minimum(lengths, T).astype(np.int32))

    (b_tok, b_len) = _materialize(b_sel)
    (a_tok, a_len) = _materialize(a_sel)
    if packet_preprocess == 'mixed_dequant':
        val_packets = _apply_mixed_dequant(b_tok, b_len, packet_mean, packet_std, split_tag='val', seed=seed)
        atk_packets = _apply_mixed_dequant(a_tok, a_len, packet_mean, packet_std, split_tag='attack', seed=seed)
    else:
        val_packets = _zscore(b_tok, packet_mean, packet_std)
        atk_packets = _zscore(a_tok, packet_mean, packet_std)

    positions = np.arange(T)[None, :]

    def _zero_padding(packets, lengths):
        valid = positions < lengths[:, None]
        return (packets * valid[:, :, None]).astype(np.float32)

    val_packets = _zero_padding(val_packets, b_len)
    atk_packets = _zero_padding(atk_packets, a_len)
    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': b_len,
        'atk_packets': atk_packets,
        'atk_len': a_len,
        'atk_labels': labels[a_sel],
    }
def _kl(p, q):
    """Return sum(p * (log(p + 1e-4) - log(q + 1e-4))) over the last dimension."""
    log_ratio = torch.log(p + 0.0001) - torch.log(q + 0.0001)
    return (p * log_ratio).sum(dim=-1)
def _norm_prior(prior, win_size: int) -> torch.Tensor:
    """Divide ``prior`` by its sum over the last dimension.

    The row sums are tiled ``win_size`` times along the last axis of a 4-D
    tensor before dividing, so ``win_size`` has to match that axis.
    """
    row_sum = prior.sum(dim=-1).unsqueeze(-1)
    return prior / row_sum.repeat(1, 1, 1, win_size)
def _train(model: AnomalyTransformer, train_packets: np.ndarray, train_len: np.ndarray, *, batch_size: int, epochs: int, lr: float, k_disc: float, win_size: int, device: torch.device) -> dict:
    """Train ``model`` with the two-loss (minimax) association-discrepancy objective.

    Every step backpropagates ``rec - k_disc * series_loss`` and
    ``rec + k_disc * prior_loss`` before a single Adam update. Batches are
    shuffled with a generator seeded by the epoch index. ``train_len`` is
    accepted but not used by this function.

    Returns ``{'losses': [mean reconstruction loss per epoch], 't_train_sec': wall time}``.
    """
    adam = torch.optim.Adam(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    n_flows = len(train_packets)
    epoch_means = []
    started = time.time()
    for epoch in range(epochs):
        model.train()
        order = np.random.default_rng(epoch).permutation(n_flows)
        batch_rec = []
        for start in range(0, n_flows, batch_size):
            rows = order[start:start + batch_size]
            x = torch.from_numpy(train_packets[rows]).float().to(device)
            adam.zero_grad()
            (output, series, prior, _) = model(x)
            series_loss = 0.0
            prior_loss = 0.0
            n_layers = len(prior)
            for (u, prior_u) in enumerate(prior):
                series_u = series[u]
                norm_p = _norm_prior(prior_u, win_size)
                # Each term updates only one side: the other one is detached.
                series_loss += torch.mean(_kl(series_u, norm_p.detach())) + torch.mean(_kl(norm_p.detach(), series_u))
                prior_loss += torch.mean(_kl(norm_p, series_u.detach())) + torch.mean(_kl(series_u.detach(), norm_p))
            series_loss /= n_layers
            prior_loss /= n_layers
            rec_loss = mse(output, x)
            maximize_phase = rec_loss - k_disc * series_loss
            minimize_phase = rec_loss + k_disc * prior_loss
            # Both losses share one forward graph, so the first backward must keep it alive.
            maximize_phase.backward(retain_graph=True)
            minimize_phase.backward()
            adam.step()
            batch_rec.append(rec_loss.item())
        epoch_means.append(float(np.mean(batch_rec)))
        done = epoch + 1
        if done % 5 == 0 or done == epochs:
            print(f' [epoch {done}/{epochs}] rec_loss={epoch_means[-1]:.4f} ({time.time() - started:.1f}s elapsed)', flush=True)
    return {'losses': epoch_means, 't_train_sec': time.time() - started}
@torch.no_grad()
def _score(model: AnomalyTransformer, packets: np.ndarray, lens: np.ndarray, *, batch_size: int, win_size: int, temperature: float, device: torch.device) -> dict[str, np.ndarray]:
    """Score every flow and aggregate its per-position anomaly scores.

    The per-position score is ``softmax(-assoc * temperature) * rec``, where
    ``rec`` is the feature-averaged squared reconstruction error and ``assoc``
    is the symmetric series/prior KL summed over layers and averaged over
    dimension 1 of the KL tensor. Only the first ``lens[i]`` positions of flow
    ``i`` enter the aggregates; flows of length 0 keep a score of 0.

    Returns a dict with float32 arrays of length ``len(packets)`` under the
    keys ``'mean'``, ``'max'``, ``'median'`` and ``'p90'``.
    """
    model.eval()
    n = len(packets)
    means = np.zeros(n, dtype=np.float32)
    maxes = np.zeros(n, dtype=np.float32)
    medians = np.zeros(n, dtype=np.float32)
    p90s = np.zeros(n, dtype=np.float32)
    crit = nn.MSELoss(reduction='none')
    for s in range(0, n, batch_size):
        x = torch.from_numpy(packets[s:s + batch_size]).float().to(device)
        (output, series, prior, _) = model(x)
        rec = crit(output, x).mean(dim=-1)
        assoc = 0.0
        for u in range(len(prior)):
            norm_p = _norm_prior(prior[u], win_size)
            assoc = assoc + (_kl(series[u], norm_p) + _kl(norm_p, series[u]))
        if isinstance(assoc, torch.Tensor):
            metric = torch.softmax(-assoc.mean(dim=1) * temperature, dim=-1) * rec
        else:
            # The model returned no association maps: fall back to reconstruction error only.
            metric = rec
        # One device-to-host copy per batch instead of one synchronization per flow;
        # the lengths already live on the host, so they never need to visit the device.
        metric_np = metric.cpu().numpy()
        batch_lens = lens[s:s + batch_size]
        for i in range(metric_np.shape[0]):
            li = int(batch_lens[i])
            if li == 0:
                continue
            row = metric_np[i, :li]
            means[s + i] = row.mean()
            maxes[s + i] = row.max()
            medians[s + i] = float(np.median(row))
            p90s[s + i] = float(np.percentile(row, 90))
    return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
def _safe_metric(fn, y, s) -> float:
    """Evaluate ``fn(y, s)`` on sanitized scores and return it as a float.

    NaN scores become 0.0 and +/-inf are clipped to +/-1e12 before the call.
    A ValueError raised while computing or converting the metric yields NaN.
    """
    cleaned = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
    try:
        value = float(fn(y, cleaned))
    except ValueError:
        value = float('nan')
    return value
def _per_class(val_score, atk_score, atk_labels):
    """Compute one-vs-benign AUROC for every attack class.

    For each distinct label the benign scores (target 0) are pooled with that
    class's attack scores (target 1). Returns ``{cls: {'_n': count, 'auroc': value}}``
    with the count stored as a float.
    """
    benign_targets = np.zeros(len(val_score))
    results = {}
    for cls in sorted(set(atk_labels)):
        selected = atk_labels == cls
        cls_scores = atk_score[selected]
        targets = np.concatenate([benign_targets, np.ones(len(cls_scores))])
        scores = np.concatenate([val_score, cls_scores])
        results[cls] = {
            '_n': float(int(selected.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return results
def main():
    """Train and evaluate the Anomaly-Transformer baseline for one (protocol, seed) pair.

    Loads the data for the chosen protocol, trains a fresh model, scores the
    benign and attack flows, and writes ``<protocol>_seed<seed>.json`` (metrics
    and run metadata) plus a same-named ``.npz`` (raw per-flow scores and
    attack labels) into ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    parser.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--n-train-cap', type=int, default=10000)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--k-disc', type=float, default=3.0, help='weight on association-discrepancy KL term')
    parser.add_argument('--temperature', type=float, default=50.0)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--d-model', type=int, default=128)
    parser.add_argument('--n-heads', type=int, default=4)
    parser.add_argument('--e-layers', type=int, default=3)
    parser.add_argument('--T', type=int, default=64)
    parser.add_argument('--device', type=str, default='auto')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # 'auto' picks CUDA when available, otherwise CPU; any other value is used verbatim.
    if args.device == 'auto':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device(args.device)

    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        (template, caps) = WITHIN_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    else:
        spec = CROSS_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
    print(f'[run] anomaly_transformer protocol={args.protocol} seed={args.seed}')

    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        # Only the cross protocols read the checkpoint (for its packet normalization
        # statistics), so it is not loaded for within-dataset runs.
        # NOTE: weights_only=False unpickles arbitrary objects; only point this at
        # checkpoints produced by this project.
        ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)

    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D} device={device}')

    torch.manual_seed(args.seed)
    model = AnomalyTransformer(win_size=args.T, enc_in=D, c_out=D, d_model=args.d_model, n_heads=args.n_heads, e_layers=args.e_layers, d_ff=args.d_model, dropout=0.0, output_attention=True).to(device)
    n_params = sum((w.numel() for w in model.parameters()))
    print(f'[model] params={n_params:,}')
    train_meta = _train(model, arrays['train_packets'], arrays['train_len'], batch_size=args.batch_size, epochs=args.epochs, lr=args.lr, k_disc=args.k_disc, win_size=args.T, device=device)
    print(f"[train] {train_meta['t_train_sec']:.1f}s, final rec_loss={train_meta['losses'][-1]:.4f}")

    t0 = time.time()
    val_aggs = _score(model, arrays['val_packets'], arrays['val_len'], batch_size=args.batch_size, win_size=args.T, temperature=args.temperature, device=device)
    print(f'[score] benign in {time.time() - t0:.1f}s')
    t0 = time.time()
    atk_aggs = _score(model, arrays['atk_packets'], arrays['atk_len'], batch_size=args.batch_size, win_size=args.T, temperature=args.temperature, device=device)
    print(f'[score] attack in {time.time() - t0:.1f}s')

    aggs = ('mean', 'max', 'median', 'p90')
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    overall = {}
    per_class_by_agg = {}
    for agg in aggs:
        v = val_aggs[agg]
        a = atk_aggs[agg]
        # Benign flows are the negative class (0), attack flows the positive class (1).
        y = np.r_[np.zeros(len(v)), np.ones(len(a))]
        s = np.r_[v, a]
        overall[agg] = {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}
        per_class_by_agg[agg] = _per_class(v, a, atk_labels)

    out = {'method': 'anomaly_transformer', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train': n_train, 'n_val': n_val, 'n_atk': n_atk, 'D': int(D), 'epochs': args.epochs, 'lr': args.lr, 'k_disc': args.k_disc, 'temperature': args.temperature, 'd_model': args.d_model, 't_train_sec': round(train_meta['t_train_sec'], 2), 'loss_first_last': [train_meta['losses'][0], train_meta['losses'][-1]], 'overall_by_agg': overall, 'per_class_by_agg': per_class_by_agg}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': atk_labels}
    for agg in aggs:
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')

    best = max(overall, key=lambda k: overall[k]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    for k in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {k:<8s} AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Runs scripts/baselines/run_anomaly_transformer.py for every (protocol, seed)
# pair and mirrors all output into master.log. Pairs whose result JSON already
# exists are skipped. Overridable through the environment: PROTOCOLS, SEEDS,
# EPOCHS, BATCH, D_MODEL.
set -euo pipefail

# Work from the repository root (two levels above this script's directory).
REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/anomaly_transformer_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
# Every invocation starts with an empty master log.
: > "$LOG"

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
EPOCHS="${EPOCHS:-15}"
BATCH="${BATCH:-128}"
D_MODEL="${D_MODEL:-128}"

# Print a status line and append it to the master log.
say() {
  echo "$1" | tee -a "$LOG"
}

for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    out_json="$OUT_DIR/${protocol}_seed${seed}.json"
    if [[ -f "$out_json" ]]; then
      say "[skip] $out_json exists"
      continue
    fi
    say "=== protocol=$protocol seed=$seed epochs=$EPOCHS batch=$BATCH ==="
    ts=$(date +%s)
    uv run --no-sync python scripts/baselines/run_anomaly_transformer.py \
      --protocol "$protocol" --seed "$seed" \
      --out-dir "$OUT_DIR" \
      --epochs "$EPOCHS" --batch-size "$BATCH" --d-model "$D_MODEL" \
      2>&1 | tee -a "$LOG"
    te=$(date +%s)
    say "[done] elapsed=$((te-ts))s $out_json"
  done
done
echo "ALL DONE"

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import yaml
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from packet_store import PacketShardStore
from KitNET.KitNET import KitNET
# Within-dataset protocols: name -> (model-dir template, evaluation caps).
# The template is formatted with the seed; an ``n_atk`` of None means "no cap".
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': None},
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        {'n_val': 10000, 'n_atk': 30000},
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': 20000},
    ),
}

# Cross-dataset protocols: name -> source model template plus the target
# dataset's store / parquet locations and the benign / attack sample sizes.
CROSS_DIRS = {
    'forward_cross': {
        'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        'target_store': 'datasets/cicddos2019/processed/full_store',
        'target_flows': 'datasets/cicddos2019/processed/flows.parquet',
        'target_flow_features': 'datasets/cicddos2019/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
    'reverse_cross': {
        'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        'target_store': 'datasets/cicids2017/processed/full_store',
        'target_flows': 'datasets/cicids2017/processed/flows.parquet',
        'target_flow_features': 'datasets/cicids2017/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
}
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def _load_within(model_dir: Path, n_val, n_atk, n_train_cap, seed):
    """Load packet arrays for a within-dataset protocol and subsample them.

    The data-loading options come from ``model_dir / 'config.yaml'``. Train,
    validation and attack sets are then capped (in that order, sharing one
    RNG seeded with ``seed``) to ``n_train_cap``, ``n_val`` and ``n_atk``.
    A cap of None leaves the validation / attack set untouched.

    Returns:
        Dict with ``train_packets``, ``train_len``, ``val_packets``,
        ``val_len``, ``atk_packets``, ``atk_len`` and ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Missing or empty config entries map to None.
        raw = cfg.get(key)
        return Path(raw) if raw else None

    attack_cap = int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk
    val_cap = int(cfg['val_cap']) if cfg.get('val_cap') else n_val
    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=attack_cap,
        val_cap=val_cap,
    )
    rng = np.random.default_rng(seed)

    def _draw(population, cap):
        # Sorted, without-replacement row selection.
        return np.sort(rng.choice(population, size=cap, replace=False))

    # The draw order (train, val, attack) fixes the RNG stream; keep it.
    train_packets = data.train_packets
    train_len = data.train_len
    if len(train_packets) > n_train_cap:
        keep = _draw(len(train_packets), n_train_cap)
        train_packets = train_packets[keep]
        train_len = train_len[keep]

    val_packets = data.val_packets
    val_len = data.val_len
    atk_packets = data.attack_packets
    atk_len = data.attack_len
    atk_labels = data.attack_labels
    if n_val is not None and len(val_packets) > n_val:
        keep = _draw(len(val_packets), n_val)
        val_packets = val_packets[keep]
        val_len = val_len[keep]
    if n_atk is not None and len(atk_packets) > n_atk:
        keep = _draw(len(atk_packets), n_atk)
        atk_packets = atk_packets[keep]
        atk_len = atk_len[keep]
        atk_labels = atk_labels[keep]

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Assemble source-train and target-eval packet arrays for a cross-dataset protocol.

    Args:
        spec: One ``CROSS_DIRS`` entry; ``model_template``, ``target_store``,
            ``target_flows``, ``n_benign`` and ``n_attack`` are read here.
        ckpt: Mapping providing ``packet_mean``, ``packet_std`` and, optionally,
            ``packet_preprocess``.
        seed: Seeds target sampling (``seed``) and train subsampling
            (``seed + 1000``); also forwarded to ``_apply_mixed_dequant``.
        n_train_cap: Maximum number of source train flows kept.
        T: Packets read per target flow; flow lengths are clipped to it.

    Returns:
        Dict with ``train_packets``/``train_len``, ``val_packets``/``val_len``,
        ``atk_packets``/``atk_len`` and ``atk_labels``.
    """
    # Normalisation statistics and preprocessing mode come from the source checkpoint.
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))
    # Source (training) data is loaded with the source model's own config, uncapped.
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())
    src_data = load_unified_data(packets_npz=Path(src_cfg['packets_npz']) if src_cfg.get('packets_npz') else None, source_store=Path(src_cfg['source_store']) if src_cfg.get('source_store') else None, flows_parquet=Path(src_cfg['flows_parquet']), flow_features_path=Path(src_cfg['flow_features_path']) if src_cfg.get('flow_features_path') else None, flow_feature_columns=src_cfg.get('flow_feature_columns'), flow_features_align=str(src_cfg.get('flow_features_align', 'auto')), T=int(src_cfg['T']), split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))), train_ratio=float(src_cfg.get('train_ratio', 0.8)), benign_label=str(src_cfg.get('benign_label', 'normal')), min_len=int(src_cfg.get('min_len', 2)), packet_preprocess=packet_preprocess, attack_cap=None, val_cap=None)
    # Separate RNG (seed + 1000) for the train subsample, independent of target sampling.
    rng = np.random.default_rng(seed + 1000)
    (train_packets, train_len) = (src_data.train_packets, src_data.train_len)
    if len(train_packets) > n_train_cap:
        idx = np.sort(rng.choice(len(train_packets), size=n_train_cap, replace=False))
        (train_packets, train_len) = (train_packets[idx], train_len[idx])
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()
    rng2 = np.random.default_rng(seed)
    # The benign label is hard-coded to 'normal' for the target dataset.
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(rng2.choice(benign_idx, size=n_benign, replace=False))
    # Attack rows: an equal allotment per class, limited by each class's pool size.
    atk_classes = sorted(set(labels[attack_idx]))
    per_class = max(1, n_attack // len(atk_classes))
    chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            chunks.append(rng2.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(chunks))
    # max(1, ...) above can overshoot n_attack when there are many classes; trim back.
    if len(a_sel) > n_attack:
        a_sel = np.sort(rng2.choice(a_sel, size=n_attack, replace=False))
    store = PacketShardStore.open(target_store)
    def _materialize(idx):
        """Read the selected flows from the store; clip lengths to ``T``."""
        (tok, ll) = store.read_packets(idx, T=T)
        ll = np.minimum(ll, T).astype(np.int32)
        return (tok.astype(np.float32), ll)
    (b_tok, b_len) = _materialize(b_sel)
    (a_tok, a_len) = _materialize(a_sel)
    # Apply the same preprocessing family the source model was trained with.
    if packet_preprocess == 'mixed_dequant':
        val_packets = _apply_mixed_dequant(b_tok, b_len, packet_mean, packet_std, split_tag='val', seed=seed)
        atk_packets = _apply_mixed_dequant(a_tok, a_len, packet_mean, packet_std, split_tag='attack', seed=seed)
    else:
        val_packets = _zscore(b_tok, packet_mean, packet_std)
        atk_packets = _zscore(a_tok, packet_mean, packet_std)
    # Zero every position at or beyond each flow's length.
    msk_b = np.arange(T)[None, :] < b_len[:, None]
    msk_a = np.arange(T)[None, :] < a_len[:, None]
    val_packets = (val_packets * msk_b[:, :, None]).astype(np.float32)
    atk_packets = (atk_packets * msk_a[:, :, None]).astype(np.float32)
    return {'train_packets': train_packets, 'train_len': train_len, 'val_packets': val_packets, 'val_len': b_len, 'atk_packets': atk_packets, 'atk_len': a_len, 'atk_labels': labels[a_sel]}
def _flatten_packets(packets: np.ndarray, lens: np.ndarray) -> np.ndarray:
out_chunks = []
for i in range(len(packets)):
L = int(lens[i])
if L > 0:
out_chunks.append(packets[i, :L])
if not out_chunks:
return np.empty((0, packets.shape[-1]), dtype=np.float32)
return np.concatenate(out_chunks, axis=0).astype(np.float32)
def _train_kitnet(kit: KitNET, train_flat: np.ndarray) -> dict[str, float]:
t0 = time.time()
last_rmse = 0.0
for i in range(len(train_flat)):
last_rmse = kit.process(train_flat[i])
if (i + 1) % 50000 == 0:
print(f' [train] processed {i + 1:,}/{len(train_flat):,} last_rmse={last_rmse:.4f}', flush=True)
return {'t_train_sec': round(time.time() - t0, 2), 'n_trained_packets': len(train_flat)}
def _score_flows(kit: KitNET, packets: np.ndarray, lens: np.ndarray) -> dict[str, np.ndarray]:
N = len(packets)
means = np.zeros(N, dtype=np.float32)
maxes = np.zeros(N, dtype=np.float32)
medians = np.zeros(N, dtype=np.float32)
p90s = np.zeros(N, dtype=np.float32)
for i in range(N):
L = int(lens[i])
if L == 0:
continue
rmses = np.zeros(L, dtype=np.float32)
for t in range(L):
rmses[t] = kit.execute(packets[i, t])
means[i] = rmses.mean()
maxes[i] = rmses.max()
medians[i] = np.median(rmses)
p90s[i] = np.percentile(rmses, 90)
return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
    """Compute one AUROC per attack class against the full benign score set.

    Returns:
        ``{class_label: {'_n': <class sample count as float>, 'auroc': <float>}}``
        with classes visited in sorted order.
    """
    n_benign = len(val_score)
    report = {}
    for cls in sorted(set(atk_labels)):
        picked = atk_labels == cls
        cls_scores = atk_score[picked]
        # Benign rows are the negatives (0); this class's rows are the positives (1).
        targets = np.r_[np.zeros(n_benign), np.ones(len(cls_scores))]
        scores = np.r_[val_score, cls_scores]
        report[cls] = {
            '_n': float(int(picked.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return report
def main():
    """Train KitNET on benign packet vectors and score benign/attack flows.

    Parses the CLI, loads arrays for the chosen within- or cross-dataset
    protocol, feeds the flattened train packets to KitNET, scores validation
    and attack flows, then writes ``<protocol>_seed<seed>.json`` (metrics)
    and a matching ``.npz`` (raw per-flow scores and attack labels) into
    ``--out-dir``.

    Raises:
        ValueError: If fewer train packets are available than
            ``--fm-grace + --ad-grace``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--n-train-cap', type=int, default=2000, help='Cap source-benign train flows (each contributes ~T packets).')
    p.add_argument('--fm-grace', type=int, default=2000, help='Kitsune feature-mapper grace period (packets).')
    p.add_argument('--ad-grace', type=int, default=20000, help='Kitsune anomaly-detector grace period (packets).')
    p.add_argument('--max-ae-size', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.1)
    p.add_argument('--hidden-ratio', type=float, default=0.75)
    p.add_argument('--T', type=int, default=64)
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Resolve the reference model directory for the chosen protocol and seed.
    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        (template, caps) = WITHIN_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    else:
        spec = CROSS_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
    print(f'[run] kitsune protocol={args.protocol} seed={args.seed}')
    print(f'[run] using packet stats from {model_dir}/model.pt')
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted checkpoints.
    # The checkpoint is consumed only by the cross-protocol loader below.
    ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)
    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    # D is the size of the last axis of the train packet array.
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D}')
    train_flat = _flatten_packets(arrays['train_packets'], arrays['train_len'])
    print(f'[data] train_flat packets={len(train_flat):,} FM_grace={args.fm_grace} AD_grace={args.ad_grace}')
    # Both grace periods must be coverable by the available train packets.
    if len(train_flat) < args.fm_grace + args.ad_grace:
        raise ValueError(f'Need at least FM+AD={args.fm_grace + args.ad_grace} packets, have {len(train_flat)} (try increasing --n-train-cap).')
    kit = KitNET(n=D, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    train_meta = _train_kitnet(kit, train_flat)
    print(f'[train] {train_meta}')
    t0 = time.time()
    val_aggs = _score_flows(kit, arrays['val_packets'], arrays['val_len'])
    print(f'[score] benign in {time.time() - t0:.1f}s')
    t0 = time.time()
    atk_aggs = _score_flows(kit, arrays['atk_packets'], arrays['atk_len'])
    print(f'[score] attack in {time.time() - t0:.1f}s')
    overall = {}
    per_class_by_agg = {}
    # Evaluate every aggregator: benign flows are label 0, attack flows label 1.
    for agg in ('mean', 'max', 'median', 'p90'):
        v = val_aggs[agg]
        a = atk_aggs[agg]
        y = np.r_[np.zeros(len(v)), np.ones(len(a))]
        s = np.r_[v, a]
        overall[agg] = {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}
        per_class_by_agg[agg] = _per_class(v, a, np.asarray(arrays['atk_labels']).astype(str))
    out = {'method': 'kitsune_path_b', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train_flows': n_train, 'n_train_packets': int(len(train_flat)), 'n_val': n_val, 'n_atk': n_atk, 'D': int(D), 'fm_grace': args.fm_grace, 'ad_grace': args.ad_grace, 'max_ae_size': args.max_ae_size, 't_train_sec': train_meta['t_train_sec'], 'overall_by_agg': overall, 'per_class_by_agg': per_class_by_agg}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    # Raw scores are stored next to the JSON: b_* = benign, a_* = attack.
    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': np.asarray(arrays['atk_labels']).astype(str)}
    for agg in ('mean', 'max', 'median', 'p90'):
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')
    # Report the aggregator with the highest overall AUROC, then all of them, best first.
    best = max(overall, key=lambda k: overall[k]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    print()
    print('=== overall AUROC by aggregator ===')
    for k in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {k:<8s} AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f}")
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Sweep the Kitsune baseline over every protocol x seed combination.
# Overridable through the environment: PROTOCOLS, SEEDS, N_TRAIN_CAP.
# Combinations whose result JSON already exists are skipped.
set -euo pipefail
# Resolve the repository root (two directories above this script) and run from it,
# so the relative paths below are stable regardless of the caller's cwd.
REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"
OUT_DIR="artifacts/baselines/kitsune_2026_04_29"
mkdir -p "$OUT_DIR"
LOG="$OUT_DIR/master.log"
# Truncate the master log at the start of every invocation.
: > "$LOG"
PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
# Environment values win; otherwise fall back to the defaults.
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
N_TRAIN_CAP="${N_TRAIN_CAP:-5000}"
# $PROTOCOLS and $SEEDS are left unquoted on purpose: they are space-separated lists.
for protocol in $PROTOCOLS; do
for seed in $SEEDS; do
out_json="$OUT_DIR/${protocol}_seed${seed}.json"
# Resume support: an existing result file means this combination is done.
if [[ -f "$out_json" ]]; then
echo "[skip] $out_json exists" | tee -a "$LOG"
continue
fi
echo "=== protocol=$protocol seed=$seed n_train_cap=$N_TRAIN_CAP ===" | tee -a "$LOG"
ts=$(date +%s)
# stderr is merged into stdout and appended to the master log. Because of
# `pipefail` together with `set -e`, a failing run aborts the whole sweep.
uv run --no-sync python scripts/baselines/run_kitsune.py \
--protocol "$protocol" --seed "$seed" \
--out-dir "$OUT_DIR" \
--n-train-cap "$N_TRAIN_CAP" \
2>&1 | tee -a "$LOG"
te=$(date +%s)
echo "[done] elapsed=$((te-ts))s $out_json" | tee -a "$LOG"
done
done
# The completion marker goes to stdout only; it is not appended to the log.
echo "ALL DONE"

View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import argparse
import json
import sys
import time
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
if not hasattr(np, 'Inf'):
np.Inf = np.inf
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
from FeatureExtractor import FE
from KitNET.KitNET import KitNET
from data import load_unified_data
# Dataset name -> absolute glob pattern (as a string) locating its raw pcap files.
PCAP_GLOBS = {
    dataset: str(REPO / pattern)
    for dataset, pattern in (
        ('iscxtor', 'datasets/iscxtor2016/raw/pcap_extracted/**/*.pcap'),
        ('cicids2017', 'datasets/cicids2017/raw/pcap/*.pcap'),
        ('cicddos2019', 'datasets/cicddos2019/raw/pcap/*'),
    )
}
# Protocol name -> (model-dir template, dataset key, evaluation caps).
# The template is formatted with the seed; an ``n_atk`` of None means "no cap".
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        'iscxtor',
        {'n_val': 10000, 'n_atk': None},
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        'cicids2017',
        {'n_val': 10000, 'n_atk': 30000},
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        'cicddos2019',
        {'n_val': 10000, 'n_atk': 20000},
    ),
}
def _canonical_key(src_ip, dst_ip, src_port, dst_port, protocol) -> tuple:
a = (src_ip, src_port)
b = (dst_ip, dst_port)
if a <= b:
return (a[0], b[0], a[1], b[1], int(protocol))
return (b[0], a[0], b[1], a[1], int(protocol))
def _proto_from_kitsune(srcproto: str, dstproto: str) -> int:
if srcproto == 'icmp':
return 1
if srcproto == 'arp':
return 0
return -1
class FEWithMeta(FE):
    """``FE`` subclass that also records metadata of the most recently parsed packet.

    Every ``get_next_vector`` call that parses a row on the ``'tsv'`` path
    updates ``_last_ts``, ``_last_5tuple`` (stored as ``(srcIP, dstIP, sp, dp)``)
    and ``_last_framelen`` before the feature vector is computed.
    """
    def __init__(self, path, limit=np.inf):
        super().__init__(path, limit)
        # Metadata of the last parsed packet; stays None until a row has been read.
        self._last_ts = None
        self._last_5tuple = None
        self._last_framelen = None
    def get_next_vector(self):
        """Parse the next packet row and return its feature vector.

        Returns an empty list when the packet limit is reached, when
        ``parse_type`` is not ``'tsv'``, or when ``nstat.updateGetStats`` raises.
        """
        # Stop (closing the TSV handle if there is one) once the limit is reached.
        if self.curPacketIndx == self.limit:
            if self.parse_type == 'tsv':
                self.tsvinf.close()
            return []
        if self.parse_type == 'tsv':
            row = self.tsvin.__next__()
            # NOTE(review): the column indices below presumably follow the TSV layout
            # produced by the upstream Kitsune feature extractor — confirm.
            IPtype = np.nan
            timestamp = row[0]
            framelen = row[1]
            srcIP = ''
            dstIP = ''
            # First address pair wins; IPtype 0 vs 1 distinguishes the two column groups.
            if row[4] != '':
                (srcIP, dstIP, IPtype) = (row[4], row[5], 0)
            elif row[17] != '':
                (srcIP, dstIP, IPtype) = (row[17], row[18], 1)
            # Port strings are concatenated from two column pairs (presumably at most one is set).
            srcproto = row[6] + row[8]
            dstproto = row[7] + row[9]
            (srcMAC, dstMAC) = (row[2], row[3])
            # No port information: classify as ARP / ICMP, or fall back to the MAC columns.
            if srcproto == '':
                if row[12] != '':
                    (srcproto, dstproto) = ('arp', 'arp')
                    (srcIP, dstIP, IPtype) = (row[14], row[16], 0)
                elif row[10] != '':
                    (srcproto, dstproto, IPtype) = ('icmp', 'icmp', 0)
                elif srcIP + srcproto + dstIP + dstproto == '':
                    (srcIP, dstIP) = (row[2], row[3])
        else:
            # Only the TSV parse path is supported by this subclass.
            return []
        # Numeric ports for the metadata tuple; non-numeric protocol strings become 0.
        try:
            sp = int(srcproto) if srcproto.isdigit() else 0
            dp = int(dstproto) if dstproto.isdigit() else 0
        except Exception:
            (sp, dp) = (0, 0)
        try:
            self._last_ts = float(timestamp)
        except Exception:
            self._last_ts = np.nan
        self._last_5tuple = (srcIP, dstIP, sp, dp)
        try:
            self._last_framelen = int(framelen)
        except Exception:
            self._last_framelen = 0
        self.curPacketIndx += 1
        try:
            return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto, int(framelen), float(timestamp))
        except Exception as e:
            # A statistics failure is reported and signalled with an empty vector,
            # which callers treat the same as end-of-stream.
            print(f' [warn] netStat error: {e}')
            return []
def _stream_pcap_kitsune(pcap_path: Path, *, kit: KitNET, fm_grace: int, ad_grace: int, packet_limit: int, fivetuple_to_rmses: dict, n_packets_total: list) -> None:
    """Stream one capture through ``kit`` and bucket the scores by endpoint key.

    Each feature vector from ``FEWithMeta`` is passed to ``kit.process``.
    Non-zero scores are appended to ``fivetuple_to_rmses`` under a
    direction-independent ``(ip, ip, port, port)`` key, and
    ``n_packets_total[0]`` is incremented for every vector read.
    ``fm_grace`` and ``ad_grace`` are accepted but not used here.
    """
    print(f' [stream] {pcap_path.name}', flush=True)
    extractor = FEWithMeta(str(pcap_path), limit=packet_limit)
    started = time.time()
    seen = 0
    while True:
        vector = extractor.get_next_vector()
        if len(vector) == 0:
            # An empty vector marks the end of the stream.
            break
        seen += 1
        n_packets_total[0] += 1
        score = kit.process(vector)
        # Skip packets without a usable score or without recorded metadata.
        if score is None or score == 0 or extractor._last_5tuple is None:
            continue
        (src_ip, dst_ip, src_port, dst_port) = extractor._last_5tuple
        if (src_ip, src_port) <= (dst_ip, dst_port):
            bucket = (src_ip, dst_ip, src_port, dst_port)
        else:
            bucket = (dst_ip, src_ip, dst_port, src_port)
        fivetuple_to_rmses[bucket].append(score)
        if seen % 200000 == 0:
            print(f' [{seen:,}] elapsed {time.time() - started:.0f}s ({seen / max(time.time() - started, 0.001):.0f} pkt/s)', flush=True)
    print(f' [stream] {pcap_path.name} done: {seen:,} packets in {time.time() - started:.0f}s', flush=True)
def _flows_to_key(flows_df: pd.DataFrame) -> np.ndarray:
keys = []
for (src_ip, dst_ip, sp, dp) in zip(flows_df['src_ip'], flows_df['dst_ip'], flows_df['src_port'], flows_df['dst_port']):
if (str(src_ip), int(sp)) <= (str(dst_ip), int(dp)):
k = (str(src_ip), str(dst_ip), int(sp), int(dp))
else:
k = (str(dst_ip), str(src_ip), int(dp), int(sp))
keys.append(k)
return np.asarray(keys, dtype=object)
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def main():
    """Stream raw pcaps through Kitsune/KitNET and evaluate flow-level scores.

    Every discovered pcap is streamed through one shared KitNET instance,
    per-packet scores are bucketed by a direction-independent endpoint key,
    each row of the flows parquet is matched to its bucket, and AUROC/AUPRC
    are computed over the matched flows. Results are written to
    ``<protocol>_seed<seed>.json`` plus a ``.npz`` with the raw flow scores.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS))
    p.add_argument('--seed', type=int, required=True)
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--fm-grace', type=int, default=5000)
    p.add_argument('--ad-grace', type=int, default=50000)
    p.add_argument('--max-ae-size', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.1)
    p.add_argument('--hidden-ratio', type=float, default=0.75)
    # NOTE(review): the help text mentions None, but with type=int the CLI cannot pass None.
    p.add_argument('--packet-limit-per-pcap', type=int, default=2000000, help='Cap per-pcap packets to keep runtime tractable. None = full.')
    p.add_argument('--max-pcaps', type=int, default=None, help='Cap number of pcap files processed (default: all).')
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # In this function the seed only selects the model directory and names the outputs.
    (template, ds_name, caps) = WITHIN_DIRS[args.protocol]
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] kitsune_path_a protocol={args.protocol} seed={args.seed}')
    print(f'[run] dataset={ds_name} model_dir={model_dir}')
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())
    # The unified split is loaded here; below it is used for the size report
    # and the (currently unused) val_flow_ids lookup.
    data = load_unified_data(packets_npz=Path(cfg['packets_npz']) if cfg.get('packets_npz') else None, source_store=Path(cfg['source_store']) if cfg.get('source_store') else None, flows_parquet=Path(cfg['flows_parquet']), flow_features_path=Path(cfg['flow_features_path']) if cfg.get('flow_features_path') else None, flow_feature_columns=cfg.get('flow_feature_columns'), flow_features_align=str(cfg.get('flow_features_align', 'auto')), T=int(cfg['T']), split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))), train_ratio=float(cfg.get('train_ratio', 0.8)), benign_label=str(cfg.get('benign_label', 'normal')), min_len=int(cfg.get('min_len', 2)), packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')), attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else caps['n_atk'], val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else caps['n_val'])
    flows_full = pd.read_parquet(cfg['flows_parquet'])
    print(f'[data] flows.parquet rows: {len(flows_full):,}; val={len(data.val_flow):,} attack={len(data.attack_flow):,}')
    from glob import glob
    # Sorted so the processing order is deterministic.
    pcaps = sorted(glob(PCAP_GLOBS[ds_name], recursive=True))
    pcaps = [Path(p) for p in pcaps]
    if args.max_pcaps is not None:
        pcaps = pcaps[:args.max_pcaps]
    print(f'[pcap] discovered {len(pcaps)} pcap(s)')
    # Note: the loop variable below rebinds ``p`` (previously the argument parser).
    for p in pcaps[:5]:
        print(f' {p}')
    if len(pcaps) > 5:
        print(f' ...({len(pcaps) - 5} more)')
    # n=100 is presumably the width of the extractor's feature vector — TODO confirm.
    kit = KitNET(n=100, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    fivetuple_to_rmses: dict = defaultdict(list)
    # One-element list so the streaming helper can update the counter in place.
    n_total = [0]
    t0 = time.time()
    for p in pcaps:
        _stream_pcap_kitsune(p, kit=kit, fm_grace=args.fm_grace, ad_grace=args.ad_grace, packet_limit=args.packet_limit_per_pcap, fivetuple_to_rmses=fivetuple_to_rmses, n_packets_total=n_total)
    elapsed = time.time() - t0
    print(f'[stream] total {n_total[0]:,} packets in {elapsed:.0f}s ({n_total[0] / max(elapsed, 0.001):.0f} pkt/s)')
    print(f'[stream] unique 5-tuples seen: {len(fivetuple_to_rmses):,}')
    keys_full = _flows_to_key(flows_full)
    print(f'[match] keying {len(keys_full):,} flows to 5-tuples')
    # Flows without any matching packet scores keep NaN and are excluded from the metrics.
    flow_score_mean = np.full(len(flows_full), np.nan, dtype=np.float64)
    flow_score_max = np.full(len(flows_full), np.nan, dtype=np.float64)
    flow_score_median = np.full(len(flows_full), np.nan, dtype=np.float64)
    n_matched = 0
    for (i, k) in enumerate(keys_full):
        rl = fivetuple_to_rmses.get(tuple(k))
        if rl:
            flow_score_mean[i] = float(np.mean(rl))
            flow_score_max[i] = float(np.max(rl))
            flow_score_median[i] = float(np.median(rl))
            n_matched += 1
    print(f'[match] flows with RMSE coverage: {n_matched:,}/{len(flows_full):,} ({100 * n_matched / max(len(flows_full), 1):.1f}%)')
    # NOTE(review): val_flow_ids is computed but not used in the rest of this function.
    val_flow_ids = set((int(x) for x in data.val_flow_ids)) if hasattr(data, 'val_flow_ids') else None
    # Binary target over all parquet rows: 1 for any label other than the benign label.
    bin_labels = (flows_full['label'].astype(str) != cfg.get('benign_label', 'normal')).astype(int).to_numpy()
    keys = ['mean', 'max', 'median']
    score_arrs = {'mean': flow_score_mean, 'max': flow_score_max, 'median': flow_score_median}
    overall = {}
    for k in keys:
        s = score_arrs[k]
        valid = ~np.isnan(s)
        # Fewer than 10 scored flows: record NaN metrics instead of evaluating.
        if valid.sum() < 10:
            overall[k] = {'auroc': float('nan'), 'auprc': float('nan'), 'n_valid': int(valid.sum())}
            continue
        y = bin_labels[valid]
        sv = s[valid]
        overall[k] = {'auroc': _safe_metric(roc_auc_score, y, sv), 'auprc': _safe_metric(average_precision_score, y, sv), 'n_valid': int(valid.sum())}
        print(f" [{k}] AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f} (n_valid={overall[k]['n_valid']:,})")
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out = {'method': 'kitsune_path_a', 'protocol': args.protocol, 'seed': args.seed, 'dataset': ds_name, 'n_pcaps': len(pcaps), 'n_total_packets': int(n_total[0]), 'n_unique_5tuples': int(len(fivetuple_to_rmses)), 'n_flows_total': int(len(flows_full)), 'n_flows_matched': int(n_matched), 'fm_grace': args.fm_grace, 'ad_grace': args.ad_grace, 'packet_limit_per_pcap': args.packet_limit_per_pcap, 'elapsed_sec': round(elapsed, 1), 'overall_by_agg': overall}
    out_json.write_text(json.dumps(out, indent=2))
    # Raw per-flow scores (NaN where unmatched) and binary labels go next to the JSON.
    np.savez_compressed(out_json.with_suffix('.npz'), flow_score_mean=flow_score_mean.astype(np.float32), flow_score_max=flow_score_max.astype(np.float32), flow_score_median=flow_score_median.astype(np.float32), binary_label=bin_labels.astype(np.int8))
    print(f'[saved] {out_json}')
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,227 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import yaml
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
import optax
from pzflow import Flow
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from model import UnifiedCFMConfig, UnifiedTokenCFM
from packet_store import PacketShardStore
# Within-dataset protocols: name -> (model-dir template, evaluation caps).
# Templates are formatted with the seed; an ``n_atk`` of None means "no cap".
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': None},
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        {'n_val': 10000, 'n_atk': 30000},
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': 20000},
    ),
    # This template has no ``{seed}`` placeholder, so every seed maps to the same directory.
    'ciciot_within': (
        'runs/unified_cfm_ciciot2023_shafir5_2026_04_29',
        {'n_val': 10000, 'n_atk': 30000},
    ),
}

# Cross-dataset protocols: name -> source model template plus the target
# dataset's store / parquet locations and the benign / attack sample sizes.
CROSS_DIRS = {
    'forward_cross': {
        'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        'target_store': 'datasets/cicddos2019/processed/full_store',
        'target_flows': 'datasets/cicddos2019/processed/flows.parquet',
        'target_flow_features': 'datasets/cicddos2019/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
    'reverse_cross': {
        'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        'target_store': 'datasets/cicids2017/processed/full_store',
        'target_flows': 'datasets/cicids2017/processed/flows.parquet',
        'target_flow_features': 'datasets/cicids2017/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
}
def _load_within(model_dir: Path, n_val: int | None, n_atk: int | None, n_train_cap: int, seed: int) -> dict[str, Any]:
    """Load flow-feature arrays for a within-dataset protocol and subsample them.

    The data-loading options come from ``model_dir / 'config.yaml'``. Train,
    validation and attack sets are then capped (in that order, sharing one
    RNG seeded with ``seed``) to ``n_train_cap``, ``n_val`` and ``n_atk``.
    A cap of None leaves the validation / attack set untouched.

    Returns:
        Dict with ``train_flow``, ``val_flow``, ``atk_flow`` and ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Missing or empty config entries map to None.
        raw = cfg.get(key)
        return Path(raw) if raw else None

    attack_cap = int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk
    val_cap = int(cfg['val_cap']) if cfg.get('val_cap') else n_val
    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=attack_cap,
        val_cap=val_cap,
    )
    rng = np.random.default_rng(seed)

    def _draw(population, cap):
        # Sorted, without-replacement row selection.
        return np.sort(rng.choice(population, size=cap, replace=False))

    # The draw order (train, val, attack) fixes the RNG stream; keep it.
    train_flow = data.train_flow
    if len(train_flow) > n_train_cap:
        train_flow = train_flow[_draw(len(train_flow), n_train_cap)]

    val_flow = data.val_flow
    atk_flow = data.attack_flow
    atk_labels = data.attack_labels
    if n_val is not None and len(val_flow) > n_val:
        val_flow = val_flow[_draw(len(val_flow), n_val)]
    if n_atk is not None and len(atk_flow) > n_atk:
        keep = _draw(len(atk_flow), n_atk)
        atk_flow = atk_flow[keep]
        atk_labels = atk_labels[keep]

    return {
        'train_flow': train_flow,
        'val_flow': val_flow,
        'atk_flow': atk_flow,
        'atk_labels': atk_labels,
    }
def _load_cross(spec: dict[str, Any], ckpt_dict: dict[str, Any], seed: int, T: int, n_train_cap: int, *, src_flows_parquet: Path | str | None = None, src_flow_features: Path | str | None = None) -> dict[str, Any]:
    """Build standardised flow-feature arrays for a cross-dataset protocol.

    Target benign/attack rows are sampled from the tables named in ``spec``;
    source benign rows (the training set) come from the source tables. All
    features are standardised with the checkpoint's ``flow_mean``/``flow_std``.

    Args:
        spec: One ``CROSS_DIRS`` entry (``target_flows``, ``target_flow_features``,
            ``n_benign`` and ``n_attack`` are read; ``target_store`` is not needed here).
        ckpt_dict: Mapping with ``flow_mean``, ``flow_std`` and ``flow_feature_names``.
        seed: Seeds target sampling (``seed``) and source subsampling (``seed + 1000``).
        T: Unused; kept so existing positional calls keep working.
        n_train_cap: Maximum number of source benign rows kept for training.
        src_flows_parquet: Source flows table, joined onto ``REPO`` (an absolute
            path is used as-is). When omitted, ``ckpt_dict_paths`` is consulted.
        src_flow_features: Source flow-features table, resolved the same way.

    Returns:
        Dict with ``train_flow``, ``val_flow``, ``atk_flow``, ``atk_labels``
        and ``flow_names``.

    Raises:
        NotImplementedError: If a source path is omitted and ``ckpt_dict_paths``
            cannot supply it.
        ValueError: If a flows table and its feature table are not row-aligned,
            or the target table has no attack rows.
    """
    # Resolve the source tables first, so a missing path fails before any
    # parquet file is read rather than after all target work is done.
    if src_flows_parquet is None or src_flow_features is None:
        fallback = ckpt_dict_paths(ckpt_dict)
        if src_flows_parquet is None:
            src_flows_parquet = fallback['flows']
        if src_flow_features is None:
            src_flow_features = fallback['flow_features']
    src_flows_path = REPO / src_flows_parquet
    src_features_path = REPO / src_flow_features

    flow_mean = np.asarray(ckpt_dict['flow_mean'], dtype=np.float32)
    flow_std = np.asarray(ckpt_dict['flow_std'], dtype=np.float32)
    flow_names = [str(n) for n in ckpt_dict['flow_feature_names']]
    # Floor the scale to avoid division by (near-)zero.
    safe_std = np.maximum(flow_std, 1e-06)

    def _read_aligned(flows_path, features_path, role):
        # Load a flows/feature table pair and insist that they share row order.
        flows = pd.read_parquet(flows_path, columns=['flow_id', 'label'])
        feats = pd.read_parquet(features_path)
        if not np.array_equal(flows['flow_id'].to_numpy(dtype=np.uint64), feats['flow_id'].to_numpy(dtype=np.uint64)):
            raise ValueError(f'{role} flows and flow_features not row-aligned')
        return (flows['label'].astype(str).to_numpy(), feats)

    def _standardize(feats, rows):
        # Select the model's feature columns, zero out non-finite values, z-score.
        raw = feats.iloc[rows][flow_names].to_numpy(dtype=np.float64)
        raw = np.nan_to_num(raw, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        return ((raw - flow_mean) / safe_std).astype(np.float32)

    # ---- target dataset: evaluation rows ----
    (labels, ff) = _read_aligned(REPO / spec['target_flows'], REPO / spec['target_flow_features'], 'target')
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
    rng = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(rng.choice(benign_idx, size=n_benign, replace=False))
    atk_classes = sorted(set(labels[attack_idx]))
    if not atk_classes:
        raise ValueError('target flows contain no attack rows')
    # Equal allotment per attack class, limited by each class's pool size.
    per_class = max(1, n_attack // len(atk_classes))
    a_sel_chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            a_sel_chunks.append(rng.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(a_sel_chunks))
    # max(1, ...) can overshoot n_attack when there are many classes; trim back.
    if len(a_sel) > n_attack:
        a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))
    val_flow = _standardize(ff, b_sel)
    atk_flow = _standardize(ff, a_sel)
    atk_labels = labels[a_sel]

    # ---- source dataset: benign training rows ----
    (src_labels, src_ff) = _read_aligned(src_flows_path, src_features_path, 'source')
    src_benign_idx = np.flatnonzero(src_labels == 'normal')
    # Independent RNG so the train subsample does not depend on target sampling.
    rng2 = np.random.default_rng(seed + 1000)
    if len(src_benign_idx) > n_train_cap:
        src_benign_idx = np.sort(rng2.choice(src_benign_idx, size=n_train_cap, replace=False))
    train_flow = _standardize(src_ff, src_benign_idx)
    return {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels, 'flow_names': flow_names}
def ckpt_dict_paths(ckpt: dict[str, Any]) -> dict[str, str]:
    """Placeholder for looking up dataset paths from a checkpoint; always raises.

    Raises:
        NotImplementedError: Unconditionally; the message says paths must be
            passed via ``main()``.
    """
    reason = 'paths must be passed via main()'
    raise NotImplementedError(reason)
def _train_and_score(train_flow: np.ndarray, val_flow: np.ndarray, atk_flow: np.ndarray, *, epochs: int, lr: float, optimizer: str, verbose: bool):
cols = [f'x{i}' for i in range(train_flow.shape[1])]
df_train = pd.DataFrame(train_flow.astype(np.float32), columns=cols)
df_val = pd.DataFrame(val_flow.astype(np.float32), columns=cols)
df_atk = pd.DataFrame(atk_flow.astype(np.float32), columns=cols)
if optimizer == 'sgd':
opt = optax.sgd(learning_rate=lr)
elif optimizer == 'adam':
opt = optax.adam(learning_rate=lr)
else:
raise ValueError(f'unknown optimizer {optimizer!r}')
flow = Flow(df_train.columns.tolist())
t0 = time.time()
losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=verbose)
t_train = time.time() - t0
t0 = time.time()
lp_val = np.asarray(flow.log_prob(df_val))
lp_atk = np.asarray(flow.log_prob(df_atk))
t_score = time.time() - t0
return {'score_val': (-lp_val).astype(np.float32), 'score_atk': (-lp_atk).astype(np.float32), 'losses': np.asarray(losses, dtype=np.float64), 't_train': t_train, 't_score': t_score}
def _safe_metric(fn, y, s) -> float:
    """Evaluate ``fn(y, s)`` on sanitised scores, mapping ``ValueError`` to NaN.

    Args:
        fn: Metric callable taking ``(y, scores)``.
        y: Ground-truth labels, passed through untouched.
        s: Scores; NaN becomes 0.0 and +/-inf become +/-1e12 before the call.

    Returns:
        The metric as a Python float, or ``nan`` when a ``ValueError`` is
        raised while computing or converting it.
    """
    cleaned = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
    try:
        result = float(fn(y, cleaned))
    except ValueError:
        result = float('nan')
    return result
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
    """Compute one AUROC per attack class against the full validation set.

    Args:
        val_score: Scores of the validation rows (labelled 0).
        atk_score: Scores of the attack rows.
        atk_labels: Class label of each attack row, aligned with ``atk_score``.

    Returns:
        Dict keyed by class label (in sorted order) whose values hold ``_n``
        (the class row count as a float) and ``auroc``.
    """
    negatives = np.zeros(len(val_score))
    report = {}
    for label in sorted(set(atk_labels)):
        mask = atk_labels == label
        class_scores = atk_score[mask]
        targets = np.r_[negatives, np.ones(len(class_scores))]
        scores = np.r_[val_score, class_scores]
        report[label] = {
            '_n': float(int(mask.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return report
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Shafir-NF runner."""
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--n-train-cap', type=int, default=10000, help='Cap benign train (default 10k mirrors Shafir).')
    p.add_argument('--epochs', type=int, default=100)
    p.add_argument('--lr', type=float, default=0.001)
    p.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
    # NOTE(review): --T is parsed but never read by main(); it is kept so
    # existing command lines keep parsing.
    p.add_argument('--T', type=int, default=64)
    p.add_argument('--verbose', action='store_true')
    return p


def _zscore_rows(frame, idx, flow_names, flow_mean, flow_std):
    """Pick rows ``idx`` / columns ``flow_names``, zero non-finite values, z-score."""
    raw = frame.iloc[idx][flow_names].to_numpy(dtype=np.float64)
    raw = np.nan_to_num(raw, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
    # The std is floored at 1e-06 so constant features cannot divide by zero.
    return ((raw - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)


def _pick_target_rows(labels, n_benign, n_attack, rng):
    """Draw sorted benign and class-balanced attack row indices from ``labels``."""
    b_sel = np.sort(rng.choice(np.flatnonzero(labels == 'normal'), size=n_benign, replace=False))
    atk_idx = np.flatnonzero(labels != 'normal')
    atk_classes = sorted(set(labels[atk_idx]))
    if not atk_classes:
        # Previously surfaced as an opaque ZeroDivisionError in the quota below.
        raise ValueError('target flows contain no attack rows')
    per_class_n = max(1, n_attack // len(atk_classes))
    chunks = []
    for cls in atk_classes:
        pool = atk_idx[labels[atk_idx] == cls]
        k = min(per_class_n, len(pool))
        if k:
            chunks.append(rng.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(chunks))
    if len(a_sel) > n_attack:
        # More classes than the budget: trim back down to n_attack rows.
        a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))
    return (b_sel, a_sel)


def _load_cross_protocol(model_dir, spec, ckpt, *, seed, n_train_cap):
    """Assemble train (source benign) and val/attack (target) arrays for a cross protocol."""
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())
    flows_parquet = Path(cfg['flows_parquet'])
    flow_features_path = Path(cfg['flow_features_path'])
    flow_mean = np.asarray(ckpt['flow_mean'], dtype=np.float32)
    flow_std = np.asarray(ckpt['flow_std'], dtype=np.float32)
    flow_names = [str(n) for n in ckpt['flow_feature_names']]

    src_flows = pd.read_parquet(flows_parquet, columns=['flow_id', 'label'])
    src_ff = pd.read_parquet(flow_features_path)
    if not np.array_equal(src_flows['flow_id'].to_numpy(dtype=np.uint64), src_ff['flow_id'].to_numpy(dtype=np.uint64)):
        raise ValueError('source flows and flow_features not row-aligned')
    src_labels = src_flows['label'].astype(str).to_numpy()
    src_benign_idx = np.flatnonzero(src_labels == 'normal')
    # The training subsample uses its own stream (seed + 1000), separate from
    # the target sampling stream below.
    rng_train = np.random.default_rng(seed + 1000)
    if len(src_benign_idx) > n_train_cap:
        src_benign_idx = np.sort(rng_train.choice(src_benign_idx, size=n_train_cap, replace=False))
    train_flow = _zscore_rows(src_ff, src_benign_idx, flow_names, flow_mean, flow_std)

    target_flows = REPO / spec['target_flows']
    target_flow_features = REPO / spec['target_flow_features']
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    ff = pd.read_parquet(target_flow_features)
    # NOTE(review): unlike the source tables, the target tables are not checked
    # for row alignment before positional indexing - confirm this is intended.
    labels = flows['label'].astype(str).to_numpy()
    (b_sel, a_sel) = _pick_target_rows(labels, n_benign, n_attack, np.random.default_rng(seed))
    return {
        'train_flow': train_flow,
        'val_flow': _zscore_rows(ff, b_sel, flow_names, flow_mean, flow_std),
        'atk_flow': _zscore_rows(ff, a_sel, flow_names, flow_mean, flow_std),
        'atk_labels': labels[a_sel],
    }


def main() -> None:
    """Run one (protocol, seed) Shafir-NF experiment from the command line.

    Parses the CLI, resolves the model directory for the protocol, loads the
    checkpoint, builds the train/val/attack arrays (within- or cross-dataset),
    trains and scores the flow, then writes ``<protocol>_seed<seed>.json`` and
    a sibling ``.npz`` with the raw scores into ``--out-dir``.
    """
    args = _build_arg_parser().parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        (template, caps) = WITHIN_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    else:
        spec = CROSS_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
    print(f'[run] shafir_nf protocol={args.protocol} seed={args.seed}')
    print(f'[run] using normalization stats from {model_dir}/model.pt (source ckpt)')
    # SECURITY: weights_only=False lets torch.load unpickle arbitrary objects;
    # only point this at checkpoints produced by this project.
    # The checkpoint is loaded for both protocol kinds, but main() itself only
    # reads it on the cross path.
    ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        arrays = _load_cross_protocol(model_dir, spec, ckpt, seed=args.seed, n_train_cap=args.n_train_cap)
    print(f"[data] train={len(arrays['train_flow']):,} val={len(arrays['val_flow']):,} attack={len(arrays['atk_flow']):,} D={arrays['train_flow'].shape[1]}")
    res = _train_and_score(arrays['train_flow'], arrays['val_flow'], arrays['atk_flow'], epochs=args.epochs, lr=args.lr, optimizer=args.optimizer, verbose=args.verbose)
    (val_score, atk_score) = (res['score_val'], res['score_atk'])
    # Validation rows are the negatives (0), attack rows the positives (1).
    y = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
    s = np.r_[val_score, atk_score]
    overall = {'neg_log_prob': {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}}
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    per_cls = _per_class(val_score, atk_score, atk_labels)
    out = {'method': 'shafir_nf', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train': int(len(arrays['train_flow'])), 'n_val': int(len(arrays['val_flow'])), 'n_atk': int(len(arrays['atk_flow'])), 'epochs': args.epochs, 'lr': args.lr, 'optimizer': args.optimizer, 't_train_sec': round(res['t_train'], 2), 't_score_sec': round(res['t_score'], 2), 'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])], 'overall': overall, 'per_class': per_cls}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    npz_path = out_json.with_suffix('.npz')
    np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=atk_labels, losses=res['losses'])
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')
    print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s score={res['t_score']:.1f}s")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Sweep the Shafir-NF baseline over every protocol x seed pair.
# Override PROTOCOLS, SEEDS, EPOCHS, LR or OPTIMIZER from the environment.
# A pair whose JSON result already exists is skipped, so the sweep can be
# re-invoked after an interruption.
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/shafir_nf_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
# The master log is emptied at the start of every invocation.
: > "$LOG"

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
: "${PROTOCOLS:=$PROTOCOLS_DEFAULT}"
: "${SEEDS:=$SEEDS_DEFAULT}"
: "${EPOCHS:=100}"
: "${LR:=0.001}"
: "${OPTIMIZER:=sgd}"

# Print a message and append it to the master log.
log() {
  echo "$*" | tee -a "$LOG"
}

# Run one (protocol, seed) pair unless its JSON result is already present.
run_one() {
  local protocol="$1"
  local seed="$2"
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    log "[skip] $out_json exists"
    return 0
  fi

  log "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ==="
  started=$(date +%s)
  # With pipefail + errexit a failing run aborts the whole sweep.
  uv run --no-sync python scripts/baselines/run_shafir_nf.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER" \
    2>&1 | tee -a "$LOG"
  finished=$(date +%s)
  log "[done] elapsed=$((finished-started))s $out_json"
}

# PROTOCOLS and SEEDS are deliberately unquoted: they are space-separated lists.
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done
echo "ALL DONE"

View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import warnings
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
warnings.filterwarnings('ignore')
import optax
from pzflow import Flow
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
# Repository root: two directory levels above the folder containing this file.
REPO = Path(__file__).resolve().parents[2]
# Candidate feature-name lists. Only the *_BEST4 / *_BEST5 / CICIOT5 lists are
# wired into DATASETS below; IDS2017_FEATURES and TOR2016_FEATURES are not
# referenced by DATASETS.
IDS2017_FEATURES = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
TOR2016_FEATURES = ['Protocol', 'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
CICIOT5_FEATURES = ['HTTPS', 'Protocol Type', 'Magnitude', 'Variance', 'fin_count']
CICIDS_BEST5_FEATURES = ['Bwd Packet Length Mean', 'Fwd Packets/s', 'ACK Flag Count', 'Total Length of Bwd Packets', 'Flow Duration']
TOR_BEST4_FEATURES = ['Flow IAT Std', 'Flow Bytes/s', 'Flow Packets/s', 'Bwd IAT Max']
# Canonical feature name -> alternative CSV header spellings; _resolve_columns
# tries these when the canonical name is absent from a frame.
COLUMN_ALIASES = {'Total Fwd Packets': ['Total Fwd Packet'], 'Total Backward Packets': ['Total Bwd packets'], 'Total Length of Fwd Packets': ['Total Length of Fwd Packet'], 'Total Length of Bwd Packets': ['Total Length of Bwd Packet'], 'Fwd Header Length': ['Fwd Header Length.1'], 'Init_Win_bytes_forward': ['FWD Init Win Bytes', 'Init Win Bytes Fwd'], 'Init_Win_bytes_backward': ['Bwd Init Win Bytes', 'Init Win Bytes Bwd'], 'act_data_pkt_fwd': ['Fwd Act Data Pkts'], 'min_seg_size_forward': ['Fwd Seg Size Min'], 'Avg Fwd Segment Size': ['Fwd Segment Size Avg'], 'Avg Bwd Segment Size': ['Bwd Segment Size Avg'], 'Min Packet Length': ['Packet Length Min'], 'Max Packet Length': ['Packet Length Max']}
# Per-dataset loading config: the CSV glob, the label column (None means the
# class is taken from the parent folder name and compared to 'benign_folder'),
# the label values treated as benign, substrings whose rows are dropped, and
# the feature list used when this dataset is the source.
DATASETS = {'iscxtor': {'csv_glob': str(REPO / 'datasets/iscxtor2016/raw/csv/Scenario-*-merged_5s.csv'), 'label_col': 'label', 'benign_values': ['nonTOR'], 'drop_patterns': [], 'feature_set': TOR_BEST4_FEATURES}, 'cicids2017': {'csv_glob': str(REPO / 'datasets/cicids2017/raw/csv/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [' - Attempted', '- Attempted'], 'feature_set': CICIDS_BEST5_FEATURES}, 'cicddos2019': {'csv_glob': str(REPO / 'datasets/cicddos2019/raw/csv/**/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [], 'feature_set': CICIDS_BEST5_FEATURES}, 'ciciot2023': {'csv_glob': str(REPO / 'datasets/ciciot2023/raw/csv/CSV/*/*.pcap.csv'), 'label_col': None, 'benign_folder': 'Benign_Final', 'drop_patterns': [], 'feature_set': CICIOT5_FEATURES}}
# Protocol name -> (source dataset, target dataset, sample caps). Source and
# target being equal marks a within-dataset protocol; 'n_attack': None makes
# _sample_within keep every attack row.
PROTOCOL_CONFIG = {'iscxtor_within': ('iscxtor', 'iscxtor', {'n_train': 10000, 'n_val': 10000, 'n_attack': None}), 'cicids_within': ('cicids2017', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'cicddos_within': ('cicddos2019', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 20000}), 'ciciot_within': ('ciciot2023', 'ciciot2023', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'forward_cross': ('cicids2017', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000}), 'reverse_cross': ('cicddos2019', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000})}
def _resolve_columns(df: pd.DataFrame, names: list[str]) -> tuple[list[str], list[str]]:
    """Map requested feature names onto the columns actually present in ``df``.

    Each name is resolved by the first rule that matches:
    1. an exact match against the whitespace-stripped column labels,
    2. the first alias from ``COLUMN_ALIASES`` that is a column,
    3. a case-insensitive match.

    Side effect: ``df.columns`` is replaced in place with the stripped labels,
    so the returned names can be used to index ``df`` directly.

    Args:
        df: Frame whose columns are searched (and stripped in place).
        names: Canonical feature names to look up.

    Returns:
        ``(resolved, missing)``: the actual column names found, in request
        order, and the requested names that matched nothing.
    """
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    # Built once instead of once per unresolved name. Non-string labels are
    # skipped because they have no .lower() and can never match a str name.
    by_lower = {c.lower(): c for c in df.columns if isinstance(c, str)}
    (resolved, missing) = ([], [])
    for name in names:
        if name in df.columns:
            resolved.append(name)
            continue
        found = next((alias for alias in COLUMN_ALIASES.get(name, []) if alias in df.columns), None)
        if found is None:
            found = by_lower.get(name.lower())
        if found is None:
            missing.append(name)
        else:
            resolved.append(found)
    return (resolved, missing)
def _load_csvs(dataset_name: str, return_paths: bool=False):
    """Return the sorted CSV paths matching the dataset's configured glob.

    Args:
        dataset_name: Key into ``DATASETS``.
        return_paths: Has no effect; the path list is returned either way.
            Retained so existing callers that pass it keep working.

    Returns:
        Sorted list of matching file paths.

    Raises:
        FileNotFoundError: If the glob matches no file.
    """
    pattern = DATASETS[dataset_name]['csv_glob']
    paths = sorted(glob(pattern, recursive=True))
    if not paths:
        raise FileNotFoundError(f'no CSVs match {pattern}')
    print(f' [csv] {dataset_name}: {len(paths)} files')
    return paths
def _attach_labels(df: pd.DataFrame, dataset_name: str, source_path: str | None=None) -> pd.DataFrame:
cfg = DATASETS[dataset_name]
if cfg.get('label_col') is None:
folder = Path(source_path).parent.name
df = df.copy()
df['cls_label'] = folder
df['binary_label'] = 0 if folder == cfg['benign_folder'] else 1
else:
lbl_col = cfg['label_col'].strip()
match = None
for c in df.columns:
if isinstance(c, str) and c.strip() == lbl_col:
match = c
break
if match is None:
raise KeyError(f'label column {lbl_col!r} not found in {source_path}')
df = df.copy()
df['cls_label'] = df[match].astype(str).str.strip()
for pat in cfg['drop_patterns']:
df = df[~df['cls_label'].str.contains(pat, na=False, regex=False)]
df['binary_label'] = df['cls_label'].apply(lambda x: 0 if x in cfg['benign_values'] else 1)
return df
def _load_dataset(dataset_name: str, feature_set: list[str]) -> tuple[pd.DataFrame, list[str]]:
    """Read, label and clean every CSV of a dataset into one frame.

    Each CSV is labelled with ``_attach_labels``, its feature columns are
    resolved with ``_resolve_columns`` and renamed to their canonical names,
    and the per-file frames are concatenated. Feature columns are coerced to
    numeric, +/-inf is turned into NaN, and rows with a NaN in any kept
    feature are dropped.

    Args:
        dataset_name: Key into ``DATASETS``.
        feature_set: Canonical feature names to extract.

    Returns:
        ``(full, feat_cols)``: the cleaned frame (features plus
        ``binary_label`` / ``cls_label``) and the features from
        ``feature_set`` that are present in it, in ``feature_set`` order.

    Raises:
        RuntimeError: If no CSV could be read.
    """
    # Warn-once registry, stored on the function object so it lasts across calls.
    warned = _load_dataset.__dict__.setdefault('_warned', set())
    frames = []
    for path in _load_csvs(dataset_name):
        try:
            df = pd.read_csv(path, low_memory=False)
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f' [csv-warn] skip {path}: {e}')
            continue
        df = _attach_labels(df, dataset_name, source_path=path)
        (resolved, missing) = _resolve_columns(df, feature_set)
        if missing:
            key = (dataset_name, tuple(missing))
            if key not in warned:
                warned.add(key)
                print(f' [warn] {Path(path).name}: missing {missing}')
        # `resolved` lists the found columns in feature_set order, so it lines
        # up one-to-one with the canonical names that were not missing.
        canonical = [f for f in feature_set if f not in missing]
        sub = df[resolved + ['binary_label', 'cls_label']].copy()
        frames.append(sub.rename(columns=dict(zip(resolved, canonical))))
    if not frames:
        raise RuntimeError(f'no usable CSVs for {dataset_name}')
    full = pd.concat(frames, axis=0, ignore_index=True)
    feat_cols = [c for c in feature_set if c in full.columns]
    for c in feat_cols:
        full[c] = pd.to_numeric(full[c], errors='coerce')
    full = full.replace([np.inf, -np.inf], np.nan)
    full = full.dropna(subset=feat_cols).reset_index(drop=True)
    print(f' [csv] {dataset_name} concat: {len(full):,} rows benign={int((full.binary_label == 0).sum()):,} attack={int((full.binary_label == 1).sum()):,} features_kept={len(feat_cols)}')
    return (full, feat_cols)
def _sample_within(df: pd.DataFrame, caps: dict, seed: int):
    """Split one dataset into benign train, benign val and attack samples.

    Benign rows are shuffled with a generator seeded by ``seed``; the first
    ``n_train`` go to train and the next ``n_val`` to val. When
    ``caps['n_attack']`` is None every attack row is returned; otherwise each
    attack class contributes up to ``n_attack // n_classes`` rows (at least
    one) and the result is trimmed to ``n_attack`` rows if it is larger.

    Args:
        df: Frame with ``binary_label`` (0 benign, 1 attack) and ``cls_label``.
        caps: Dict holding ``n_train``, ``n_val`` and ``n_attack``.
        seed: Seed for both the benign shuffle and the pandas sampling.

    Returns:
        ``(train, val, atk)`` frames.

    Raises:
        RuntimeError: If there are fewer benign rows than ``n_train + n_val``.
    """
    generator = np.random.default_rng(seed)
    benign_rows = df[df.binary_label == 0]
    attack_rows = df[df.binary_label == 1]
    (n_train, n_val, n_atk) = (caps['n_train'], caps['n_val'], caps['n_attack'])

    required = n_train + n_val
    if len(benign_rows) < required:
        raise RuntimeError(f'only {len(benign_rows)} benign rows, need {required}')

    order = generator.permutation(len(benign_rows))
    train = benign_rows.iloc[order[:n_train]]
    val = benign_rows.iloc[order[n_train:required]]

    if n_atk is None:
        return (train, val, attack_rows)

    class_names = sorted(attack_rows['cls_label'].unique())
    quota = max(1, n_atk // len(class_names))
    picked = []
    for name in class_names:
        pool = attack_rows[attack_rows['cls_label'] == name]
        take = min(quota, len(pool))
        if take:
            picked.append(pool.sample(n=take, random_state=seed))
    atk = pd.concat(picked, axis=0, ignore_index=True)
    if len(atk) > n_atk:
        # With more classes than the budget, one row per class overshoots.
        atk = atk.sample(n=n_atk, random_state=seed)
    return (train, val, atk)
def _sample_cross(src_df, tgt_df, caps, seed):
    """Draw benign train rows from the source and val/attack rows from the target.

    The source shuffle uses a generator seeded with ``seed + 1000`` and the
    target benign shuffle one seeded with ``seed``. Attack rows are sampled
    per class (``n_attack // n_classes`` each, at least one) with
    ``random_state=seed`` and trimmed to ``n_attack`` if the total is larger.

    Args:
        src_df: Source frame with ``binary_label``.
        tgt_df: Target frame with ``binary_label`` and ``cls_label``.
        caps: Dict holding ``n_train``, ``n_val`` and ``n_attack``.
        seed: Base seed.

    Returns:
        ``(train, val, atk)`` frames.

    Raises:
        RuntimeError: If the source or target has too few benign rows.
    """
    (n_train, n_val, n_attack) = (caps['n_train'], caps['n_val'], caps['n_attack'])

    source_rng = np.random.default_rng(seed + 1000)
    source_benign = src_df[src_df.binary_label == 0]
    if len(source_benign) < n_train:
        raise RuntimeError(f'src benign only {len(source_benign)}, need {n_train}')
    train = source_benign.iloc[source_rng.permutation(len(source_benign))[:n_train]]

    target_rng = np.random.default_rng(seed)
    target_benign = tgt_df[tgt_df.binary_label == 0]
    target_attack = tgt_df[tgt_df.binary_label == 1]
    if len(target_benign) < n_val:
        raise RuntimeError(f'tgt benign only {len(target_benign)}')
    val = target_benign.iloc[target_rng.permutation(len(target_benign))[:n_val]]

    class_names = sorted(target_attack['cls_label'].unique())
    quota = max(1, n_attack // len(class_names))
    picked = []
    for name in class_names:
        pool = target_attack[target_attack['cls_label'] == name]
        take = min(quota, len(pool))
        if take:
            picked.append(pool.sample(n=take, random_state=seed))
    atk = pd.concat(picked, axis=0, ignore_index=True)
    if len(atk) > n_attack:
        atk = atk.sample(n=n_attack, random_state=seed)
    return (train, val, atk)
def _safe_metric(fn, y, s) -> float:
    """Call a metric on cleaned scores and fall back to NaN on ``ValueError``.

    Args:
        fn: Metric callable invoked as ``fn(y, scores)``.
        y: Ground-truth labels, forwarded unchanged.
        s: Scores; NaN is replaced by 0.0 and +/-inf by +/-1e12 first.

    Returns:
        ``float(fn(y, scores))``, or ``nan`` if that raises ``ValueError``.
    """
    replacements = {'nan': 0.0, 'posinf': 1000000000000.0, 'neginf': -1000000000000.0}
    scores = np.nan_to_num(s, **replacements)
    try:
        return float(fn(y, scores))
    except ValueError:
        pass
    return float('nan')
def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer):
    """Standardise the features, fit a ``Flow`` on train and score val/attack.

    Columns with zero variance in the training split are dropped. The
    remaining ones are standardised with a ``StandardScaler`` fitted on train
    only and clipped to [-30, 30] before being handed to the flow.

    Args:
        train: Benign training frame.
        val: Benign validation frame.
        atk: Attack frame.
        feat_cols: Feature column names to use.
        epochs: Number of epochs forwarded to ``flow.train``.
        lr: Learning rate for the optax optimizer.
        optimizer: ``'sgd'`` or ``'adam'``.

    Returns:
        Dict with ``score_val`` / ``score_atk`` (negated log-probabilities as
        float32), ``losses`` (float64 array) and the wall-clock durations
        ``t_train`` / ``t_score`` in seconds.

    Raises:
        ValueError: If ``optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    # Fail fast: any other value used to fall through to adam silently.
    if optimizer not in ('sgd', 'adam'):
        raise ValueError(f'unknown optimizer {optimizer!r}')

    raw_train = train[feat_cols].astype(np.float64).values
    keep = raw_train.std(axis=0) > 0
    if not keep.all():
        dropped = [c for (c, k) in zip(feat_cols, keep) if not k]
        print(f' [train] dropping {len(dropped)} zero-variance cols: {dropped}')
        feat_cols = [c for (c, k) in zip(feat_cols, keep) if k]
        raw_train = raw_train[:, keep]
    raw_val = val[feat_cols].astype(np.float64).values
    raw_atk = atk[feat_cols].astype(np.float64).values

    # The scaler is fitted on train only, then applied to val and attack.
    scaler = StandardScaler()
    clip_lim = 30.0
    X_train = np.clip(scaler.fit_transform(raw_train), -clip_lim, clip_lim)
    X_val = np.clip(scaler.transform(raw_val), -clip_lim, clip_lim)
    X_atk = np.clip(scaler.transform(raw_atk), -clip_lim, clip_lim)

    columns = [f'x{i}' for i in range(len(feat_cols))]
    df_train = pd.DataFrame(X_train.astype(np.float32), columns=columns)
    df_val = pd.DataFrame(X_val.astype(np.float32), columns=df_train.columns)
    df_atk = pd.DataFrame(X_atk.astype(np.float32), columns=df_train.columns)

    if optimizer == 'sgd':
        opt = optax.sgd(learning_rate=lr)
    else:
        opt = optax.adam(learning_rate=lr)

    flow = Flow(df_train.columns.tolist())
    t0 = time.time()
    losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False)
    t_train = time.time() - t0
    t0 = time.time()
    lp_val = np.asarray(flow.log_prob(df_val))
    lp_atk = np.asarray(flow.log_prob(df_atk))
    t_score = time.time() - t0
    # Anomaly score = negative log-likelihood under the fitted flow.
    return {'score_val': (-lp_val).astype(np.float32), 'score_atk': (-lp_atk).astype(np.float32), 'losses': np.asarray(losses, dtype=np.float64), 't_train': t_train, 't_score': t_score}
def _per_class(val_score, atk_score, atk_labels):
    """Score each attack class separately against all validation rows.

    Args:
        val_score: Scores of the validation rows (labelled 0).
        atk_score: Scores of the attack rows.
        atk_labels: Class label of each attack row, aligned with ``atk_score``.

    Returns:
        Dict keyed by class label (in sorted order); each value holds ``_n``
        (the class row count as a float) and ``auroc``.
    """

    def _summarise(selector):
        # AUROC of the rows picked by `selector` (positives) against the
        # whole validation set (negatives).
        positives = atk_score[selector]
        targets = np.r_[np.zeros(len(val_score)), np.ones(len(positives))]
        scores = np.r_[val_score, positives]
        return {'_n': float(int(selector.sum())), 'auroc': _safe_metric(roc_auc_score, targets, scores)}

    return {label: _summarise(atk_labels == label) for label in sorted(set(atk_labels))}
def main():
    """Run one (protocol, seed) CSV-based Shafir-NF experiment from the CLI.

    Loads the source dataset (and the target for cross protocols), samples
    the train/val/attack splits, trains and scores the flow, then writes
    ``<protocol>_seed<seed>.json`` and a sibling ``.npz`` holding the raw
    scores into ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(PROTOCOL_CONFIG))
    parser.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    (src_name, tgt_name, caps) = PROTOCOL_CONFIG[args.protocol]
    cross = src_name != tgt_name
    print(f'[run] shafir_nf_csv protocol={args.protocol} seed={args.seed}')
    print(f' src={src_name} tgt={tgt_name} cross={cross}')

    # The feature list always comes from the source dataset's config.
    feat_set = DATASETS[src_name]['feature_set']
    (src_df, src_feat_cols) = _load_dataset(src_name, feat_set)
    if not cross:
        feat_cols = src_feat_cols
        print(f' [features] within: {len(feat_cols)} cols')
        (train, val, atk) = _sample_within(src_df, caps, args.seed)
    else:
        (tgt_df, tgt_feat_cols) = _load_dataset(tgt_name, feat_set)
        # Only features available on both sides can be used.
        feat_cols = [c for c in feat_set if c in src_feat_cols and c in tgt_feat_cols]
        print(f' [features] cross intersection: {len(feat_cols)} cols')
        (train, val, atk) = _sample_cross(src_df, tgt_df, caps, args.seed)
    print(f' [data] train={len(train):,} val={len(val):,} attack={len(atk):,} D={len(feat_cols)}')

    res = _train_and_score(train, val, atk, feat_cols, epochs=args.epochs, lr=args.lr, optimizer=args.optimizer)
    val_score = res['score_val']
    atk_score = res['score_atk']
    # Validation rows are the negatives (0), attack rows the positives (1).
    targets = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
    scores = np.r_[val_score, atk_score]
    auroc = _safe_metric(roc_auc_score, targets, scores)
    auprc = _safe_metric(average_precision_score, targets, scores)
    overall = {'neg_log_prob': {'auroc': auroc, 'auprc': auprc}}
    a_labels = atk['cls_label'].astype(str).to_numpy()
    per_cls = _per_class(val_score, atk_score, a_labels)

    out = {
        'method': 'shafir_nf_csv',
        'protocol': args.protocol,
        'seed': args.seed,
        'src_dataset': src_name,
        'tgt_dataset': tgt_name,
        'feature_set': feat_cols,
        'n_features': len(feat_cols),
        'n_train': len(train),
        'n_val': len(val),
        'n_atk': len(atk),
        'epochs': args.epochs,
        'lr': args.lr,
        'optimizer': args.optimizer,
        't_train_sec': round(res['t_train'], 2),
        't_score_sec': round(res['t_score'], 2),
        'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])],
        'overall': overall,
        'per_class': per_cls,
    }
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    npz_path = out_json.with_suffix('.npz')
    np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=a_labels.astype(str), losses=res['losses'])
    print(f'[saved] {out_json}')
    print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Sweep the CSV-based Shafir-NF baseline over every protocol x seed pair.
# Override PROTOCOLS, SEEDS, EPOCHS, LR or OPTIMIZER from the environment.
# A pair whose JSON result already exists is skipped, so the sweep can be
# re-invoked after an interruption.
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/shafir_nf_csv_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
# The master log is emptied at the start of every invocation.
: > "$LOG"

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within ciciot_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
: "${PROTOCOLS:=$PROTOCOLS_DEFAULT}"
: "${SEEDS:=$SEEDS_DEFAULT}"
: "${EPOCHS:=100}"
: "${LR:=0.001}"
: "${OPTIMIZER:=sgd}"

# Print a message and append it to the master log.
log() {
  echo "$*" | tee -a "$LOG"
}

# Run one (protocol, seed) pair unless its JSON result is already present.
run_one() {
  local protocol="$1"
  local seed="$2"
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    log "[skip] $out_json exists"
    return 0
  fi

  log "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ==="
  started=$(date +%s)
  # With pipefail + errexit a failing run aborts the whole sweep.
  uv run --no-sync python scripts/baselines/run_shafir_nf_csv.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER" \
    2>&1 | tee -a "$LOG"
  finished=$(date +%s)
  log "[done] elapsed=$((finished-started))s $out_json"
}

# PROTOCOLS and SEEDS are deliberately unquoted: they are space-separated lists.
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done
echo "ALL DONE"