Add baseline reproduction: Shafir NF 2-NF ensemble (17/18 cells), ConMD Table I citation, JANUS thresholded F1 across 4 datasets
This commit is contained in:
171
scripts/aggregate/thresholded_metrics.py
Normal file
171
scripts/aggregate/thresholded_metrics.py
Normal file
@@ -0,0 +1,171 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from sklearn.covariance import OAS
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Artifact directory that holds one ``janus_<dataset>_seed<seed>`` folder per
# run; located relative to this file (third ancestor directory of the script).
ROOT = Path(__file__).resolve().parents[2].joinpath("artifacts", "route_comparison")

# Datasets reported by default, in the row order used by the rendered tables.
DATASETS = [
    "cicids2017",
    "cicddos2019",
    "ciciot2023",
    "iscxtor2016",
]

# Training seeds whose ``phase1_scores.npz`` files are aggregated.
SEEDS = (42, 43, 44)

# Fixed RNG seed for the benign-validation A/B split, so the split is the same
# on every invocation.
RNG_SPLIT = 12345
|
||||
|
||||
|
||||
def metrics_at_tau(d2_b, d2_a, tau):
    """Score a fixed decision threshold against benign and attack distances.

    A sample counts as flagged when its distance is ``>= tau``.

    Args:
        d2_b: Array of distances for benign samples (the negatives).
        d2_a: Array of distances for attack samples (the positives).
        tau: Decision threshold.

    Returns:
        Dict with keys ``"f1"``, ``"prec"``, ``"rec"`` and ``"fpr"``.
    """
    attacks_flagged = d2_a >= tau
    attacks_missed = d2_a < tau
    benign_flagged = d2_b >= tau
    benign_passed = d2_b < tau

    true_pos = int(attacks_flagged.sum())
    false_neg = int(attacks_missed.sum())
    false_pos = int(benign_flagged.sum())
    true_neg = int(benign_passed.sum())

    # Denominators are clamped so empty inputs give 0.0 rather than dividing
    # by zero.
    precision = true_pos / max(true_pos + false_pos, 1)
    recall = true_pos / max(true_pos + false_neg, 1)
    false_pos_rate = false_pos / max(false_pos + true_neg, 1)
    f_score = 2 * precision * recall / max(precision + recall, 1e-9)

    return {
        "f1": f_score,
        "prec": precision,
        "rec": recall,
        "fpr": false_pos_rate,
    }
|
||||
|
||||
|
||||
def evaluate_seed(npz_path: Path) -> dict:
    """Compute AUROC and thresholded metrics for one seed's score archive.

    The archive is expected to contain paired ``val_<name>`` / ``atk_<name>``
    arrays; entries whose key ends in ``labels`` are ignored. The score arrays
    are stacked column-wise (columns ordered by sorted ``<name>``), the benign
    validation rows are split into two halves with a fixed RNG seed, and a
    Mahalanobis distance is fitted on half A (mean + OAS covariance). Half B
    and all attack rows are then scored with that distance.

    Args:
        npz_path: Path to the ``.npz`` archive.

    Returns:
        Dict with ``"AUROC"``, ``"n_val"``, ``"n_atk"`` and, for each of
        ``P95`` / ``P99``, the keys ``"F1@<P>"``, ``"Prec@<P>"``,
        ``"Recall@<P>"`` and ``"FPR@<P>"``.

    Raises:
        ValueError: If the archive holds no ``val_*`` score arrays.
        KeyError: If a ``val_<name>`` array has no ``atk_<name>`` counterpart.
    """
    prefix = "val_"
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code; only point this at archives produced by trusted runs.
    # The context manager closes the underlying file once the arrays are read.
    with np.load(npz_path, allow_pickle=True) as z:
        # Strip only the leading prefix: str.replace would also delete any
        # later "val_" inside the name (e.g. "val_interval_x") and break the
        # lookups below.
        keys = sorted(
            k[len(prefix):]
            for k in z.files
            if k.startswith(prefix) and not k.endswith("labels")
        )
        if not keys:
            raise ValueError(f"no 'val_*' score arrays found in {npz_path}")
        val_S = np.stack([z[f"val_{k}"] for k in keys], axis=1)
        atk_S = np.stack([z[f"atk_{k}"] for k in keys], axis=1)

    # Replace NaN with 0 and clip infinities to +/-1e6 so the covariance fit
    # and the distances stay finite.
    val_S = np.nan_to_num(val_S, nan=0.0, posinf=1e6, neginf=-1e6)
    atk_S = np.nan_to_num(atk_S, nan=0.0, posinf=1e6, neginf=-1e6)
    K = val_S.shape[1]

    # Fixed-seed permutation: half A fits (mu, covariance) and the thresholds,
    # half B is held out for evaluation.
    rng = np.random.default_rng(RNG_SPLIT)
    idx = rng.permutation(len(val_S))
    half = len(idx) // 2
    val_A = val_S[idx[:half]]
    val_B = val_S[idx[half:]]

    mu = val_A.mean(axis=0)
    oas = OAS().fit(val_A)
    # Small ridge on the diagonal before inverting the shrunk covariance.
    inv_cov = np.linalg.inv(oas.covariance_ + 1e-9 * np.eye(K))

    def d2(S):
        """Squared Mahalanobis distance of each row of S from mu."""
        d = S - mu
        return np.einsum("ni,ij,nj->n", d, inv_cov, d)

    d2_A = d2(val_A)
    d2_B = d2(val_B)
    d2_atk = d2(atk_S)

    # AUROC on held-out benign (label 0) versus attacks (label 1).
    y_true = np.r_[np.zeros(len(d2_B)), np.ones(len(d2_atk))]
    y_score = np.r_[d2_B, d2_atk]
    auroc = float(roc_auc_score(y_true, y_score))

    out = {"AUROC": auroc, "n_val": len(val_S), "n_atk": len(atk_S)}
    for pct, name in [(95, "P95"), (99, "P99")]:
        # Threshold comes from half A only; metrics are measured on half B
        # plus the attacks.
        tau = float(np.percentile(d2_A, pct))
        m = metrics_at_tau(d2_B, d2_atk, tau)
        out[f"F1@{name}"] = m["f1"]
        out[f"Prec@{name}"] = m["prec"]
        out[f"Recall@{name}"] = m["rec"]
        out[f"FPR@{name}"] = m["fpr"]
    return out
|
||||
|
||||
|
||||
def aggregate(dataset: str) -> dict:
    """Aggregate per-seed metrics for one dataset.

    For every seed in ``SEEDS`` the file
    ``ROOT/janus_<dataset>_seed<seed>/phase1_scores.npz`` is evaluated with
    ``evaluate_seed``; seeds whose file does not exist are skipped with a
    printed notice.

    Args:
        dataset: Dataset name used in the artifact directory name.

    Returns:
        ``{}`` when no seed could be evaluated. Otherwise a dict with
        ``"n_val"`` / ``"n_atk"`` (taken from the first evaluated seed),
        ``"n_seeds"``, ``"seeds"`` (the seeds that contributed, in the same
        order as every ``per_seed`` list) and, per metric, a dict with
        ``"mean"``, ``"std"`` (population std, ``ddof=0``) and ``"per_seed"``.
    """
    rows = []
    seeds_used = []
    for s in SEEDS:
        npz = ROOT / f"janus_{dataset}_seed{s}/phase1_scores.npz"
        if not npz.exists():
            print(f"[skip] {dataset} seed{s} — npz missing")
            continue
        rows.append(evaluate_seed(npz))
        # Remember which seed produced this row: per_seed lists are compacted
        # when a seed is skipped, so position alone does not identify the seed.
        seeds_used.append(s)
    if not rows:
        return {}

    metrics = [k for k in rows[0] if k not in ("n_val", "n_atk")]
    out = {
        "n_val": rows[0]["n_val"],
        "n_atk": rows[0]["n_atk"],
        "n_seeds": len(rows),
        "seeds": seeds_used,
    }
    for m in metrics:
        a = np.array([r[m] for r in rows])
        out[m] = {
            "mean": float(a.mean()),
            "std": float(a.std()),
            "per_seed": [float(x) for x in a],
        }
    return out
|
||||
|
||||
|
||||
# Supervised reference numbers, keyed by dataset name. Each entry carries the
# method label, its AUROC and F1, and the path of the file the numbers were
# taken from.
SUPERVISED_REF = {
    "cicddos2019": {
        "method": "TIPSO-GAN (supervised, single seed)",
        "AUROC": 0.9999,
        "F1": 0.9989,
        "source": "baselines/TIPSO-GAN/ndss_tipso_artifact/artifacts/perf_summary_cicddos2019.json",
    },
}
|
||||
|
||||
|
||||
def _fmt_mean_std(stat: dict) -> str:
    """Format one aggregate entry as ``mean ± std`` with four decimals."""
    return f"{stat['mean']:.4f} ± {stat['std']:.4f}"


def _seed_columns(d: dict, n: int) -> list:
    """Return one column label for each of the *n* per-seed values in *d*."""
    seeds = d.get("seeds")
    if seeds is not None and len(seeds) == n:
        return [f"seed{s}" for s in seeds]
    if n == len(SEEDS):
        return [f"seed{s}" for s in SEEDS]
    # Some seeds were skipped and their identity is unknown: use neutral
    # labels rather than attributing a value to the wrong seed.
    return [f"run{i}" for i in range(1, n + 1)]


def render_md(by_ds: dict) -> str:
    """Render the aggregated metrics as a Markdown report.

    The report has a protocol section, a headline table with one row per
    dataset, an optional comparison against ``SUPERVISED_REF`` and one
    per-seed table per dataset that has results.

    Args:
        by_ds: Mapping of dataset name to the dict returned by ``aggregate``.
            An empty dict (or a missing name) marks a dataset without results.

    Returns:
        The Markdown document, terminated by a newline.
    """
    # Known datasets keep their canonical order; any extra dataset the caller
    # computed is appended so its results are not dropped from the report.
    ordered = list(DATASETS) + [ds for ds in by_ds if ds not in DATASETS]

    lines = [
        "# JANUS Thresholded Metrics — Mahalanobis-OAS, 3-seed mean ± std",
        "",
        "Computed post-hoc from `janus_<ds>_seed{42,43,44}/phase1_scores.npz` — no retraining.",
        "",
        "## Protocol",
        "",
        "- Aggregator: **Mahalanobis-OAS** distance over the 10-d JANUS raw score vector",
        f"- (μ, Σ) fit on **benign val half A** (random split seed={RNG_SPLIT}); F1/Precision/Recall/FPR measured on **benign val half B + ALL attacks**",
        "- AUROC measured on (half B + attacks)",
        "- Thresholds: τ95 = 95th percentile of d² on half A; τ99 = 99th percentile",
        "",
        "## Headline (4 datasets × 3 seeds)",
        "",
        "| Dataset | n_val | n_atk | AUROC | F1@P95 | Prec@P95 | Recall@P95 | FPR@P95 | F1@P99 | TPR@P99 |",
        "|---|---|---|---|---|---|---|---|---|---|",
    ]

    headline_metrics = ["AUROC", "F1@P95", "Prec@P95", "Recall@P95", "FPR@P95", "F1@P99", "Recall@P99"]
    for ds in ordered:
        d = by_ds.get(ds)
        if not d:
            lines.append(f"| {ds} | — | — | — | — | — | — | — | — | — |")
            continue
        cells = " | ".join(_fmt_mean_std(d[k]) for k in headline_metrics)
        lines.append(f"| {ds} | {d['n_val']} | {d['n_atk']} | {cells} |")
    lines.append("")

    # The comparison section is emitted only when at least one dataset has
    # both a reference entry and results.
    with_ref = [ds for ds in ordered if ds in SUPERVISED_REF and by_ds.get(ds)]
    if with_ref:
        lines.append("## Supervised SOTA reference (cell-by-cell)")
        lines.append("")
        lines.append("Single-seed published numbers from supervised methods, where available, for context. The protocols are not directly comparable (supervised uses attack labels at training); this is meant to show the ceiling, not for head-to-head SOTA claim.")
        lines.append("")
        lines.append("| Dataset | Supervised method | Sup AUROC | Sup F1 | JANUS AUROC | JANUS F1@P95 | Δ AUROC | Δ F1 |")
        lines.append("|---|---|---|---|---|---|---|---|")
        for ds in with_ref:
            ref = SUPERVISED_REF[ds]
            d = by_ds[ds]
            lines.append(
                f"| {ds} | {ref['method']} | {ref['AUROC']:.4f} | {ref['F1']:.4f} | "
                f"{_fmt_mean_std(d['AUROC'])} | "
                f"{_fmt_mean_std(d['F1@P95'])} | "
                f"{d['AUROC']['mean'] - ref['AUROC']:+.4f} | "
                f"{d['F1@P95']['mean'] - ref['F1']:+.4f} |"
            )
        lines.append("")

    detail_metrics = ["AUROC", "F1@P95", "Prec@P95", "Recall@P95", "FPR@P95", "F1@P99", "Prec@P99", "Recall@P99", "FPR@P99"]
    for ds in ordered:
        d = by_ds.get(ds)
        if not d:
            continue
        # Size the table from the data: fewer per-seed values than SEEDS are
        # present when a seed's archive was missing.
        n_cols = len(d["AUROC"]["per_seed"])
        columns = _seed_columns(d, n_cols)
        lines.append(f"## {ds}")
        lines.append("")
        lines.append(f"n_val={d['n_val']}, n_atk={d['n_atk']}, n_seeds={d['n_seeds']}")
        lines.append("")
        lines.append("| Metric | " + " | ".join(columns) + " | mean ± std |")
        lines.append("|" + "---|" * (n_cols + 2))
        for m in detail_metrics:
            v = d[m]
            per_seed = " | ".join(f"{x:.4f}" for x in v["per_seed"])
            lines.append(f"| {m} | {per_seed} | {_fmt_mean_std(v)} |")
        lines.append("")

    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def main():
    """Aggregate the requested datasets and write the Markdown report.

    Command-line options:
        --datasets: Dataset names to aggregate (defaults to ``DATASETS``).
        --out: Output file (defaults to ``ROOT/THRESHOLDED.md``).

    The report is written to ``--out`` and also printed to stdout.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--datasets", nargs="*", default=DATASETS)
    p.add_argument("--out", type=Path, default=ROOT / "THRESHOLDED.md")
    args = p.parse_args()

    by_ds = {ds: aggregate(ds) for ds in args.datasets}
    md = render_md(by_ds)

    # Create the target directory if needed so the write cannot fail on a
    # missing parent.
    args.out.parent.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII characters (μ, Σ, τ, Δ, ±, —); an explicit
    # encoding avoids UnicodeEncodeError where the default is not UTF-8.
    args.out.write_text(md, encoding="utf-8")
    print(md)
    print(f"\n[wrote] {args.out}")


if __name__ == "__main__":
    main()
|
||||
@@ -12,7 +12,7 @@ import pandas as pd
|
||||
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
|
||||
warnings.filterwarnings('ignore')
|
||||
import optax
|
||||
from pzflow import Flow
|
||||
from pzflow import Flow, FlowEnsemble
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
@@ -175,7 +175,7 @@ def _safe_metric(fn, y, s) -> float:
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer):
|
||||
def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer, n_flows=1, seed=0):
|
||||
raw_train = train[feat_cols].astype(np.float64).values
|
||||
keep = raw_train.std(axis=0) > 0
|
||||
if not keep.all():
|
||||
@@ -200,9 +200,16 @@ def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer):
|
||||
opt = optax.sgd(learning_rate=lr)
|
||||
else:
|
||||
opt = optax.adam(learning_rate=lr)
|
||||
flow = Flow(df_train.columns.tolist())
|
||||
if n_flows > 1:
|
||||
flow = FlowEnsemble(df_train.columns.tolist(), N=n_flows)
|
||||
else:
|
||||
flow = Flow(df_train.columns.tolist())
|
||||
t0 = time.time()
|
||||
losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False)
|
||||
if n_flows > 1:
|
||||
losses_dict = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False, seed=seed)
|
||||
losses = list(losses_dict.values())[0]
|
||||
else:
|
||||
losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False)
|
||||
t_train = time.time() - t0
|
||||
t0 = time.time()
|
||||
lp_val = np.asarray(flow.log_prob(df_val))
|
||||
@@ -229,6 +236,7 @@ def main():
|
||||
p.add_argument('--epochs', type=int, default=100)
|
||||
p.add_argument('--lr', type=float, default=0.001)
|
||||
p.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
|
||||
p.add_argument('--n-flows', type=int, default=1, help='1 = single NF (Shafir paper baseline mode); 2 = paper headline ensemble')
|
||||
args = p.parse_args()
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
(src_name, tgt_name, caps) = PROTOCOL_CONFIG[args.protocol]
|
||||
@@ -247,15 +255,16 @@ def main():
|
||||
print(f' [features] within: {len(feat_cols)} cols')
|
||||
(train, val, atk) = _sample_within(src_df, caps, args.seed)
|
||||
print(f' [data] train={len(train):,} val={len(val):,} attack={len(atk):,} D={len(feat_cols)}')
|
||||
res = _train_and_score(train, val, atk, feat_cols, epochs=args.epochs, lr=args.lr, optimizer=args.optimizer)
|
||||
res = _train_and_score(train, val, atk, feat_cols, epochs=args.epochs, lr=args.lr, optimizer=args.optimizer, n_flows=args.n_flows, seed=args.seed)
|
||||
(val_score, atk_score) = (res['score_val'], res['score_atk'])
|
||||
y = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
|
||||
s = np.r_[val_score, atk_score]
|
||||
overall = {'neg_log_prob': {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}}
|
||||
a_labels = atk['cls_label'].astype(str).to_numpy()
|
||||
per_cls = _per_class(val_score, atk_score, a_labels)
|
||||
out = {'method': 'shafir_nf_csv', 'protocol': args.protocol, 'seed': args.seed, 'src_dataset': src_name, 'tgt_dataset': tgt_name, 'feature_set': feat_cols, 'n_features': len(feat_cols), 'n_train': len(train), 'n_val': len(val), 'n_atk': len(atk), 'epochs': args.epochs, 'lr': args.lr, 'optimizer': args.optimizer, 't_train_sec': round(res['t_train'], 2), 't_score_sec': round(res['t_score'], 2), 'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])], 'overall': overall, 'per_class': per_cls}
|
||||
out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
|
||||
out = {'method': 'shafir_nf_csv', 'protocol': args.protocol, 'seed': args.seed, 'n_flows': args.n_flows, 'src_dataset': src_name, 'tgt_dataset': tgt_name, 'feature_set': feat_cols, 'n_features': len(feat_cols), 'n_train': len(train), 'n_val': len(val), 'n_atk': len(atk), 'epochs': args.epochs, 'lr': args.lr, 'optimizer': args.optimizer, 't_train_sec': round(res['t_train'], 2), 't_score_sec': round(res['t_score'], 2), 'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])], 'overall': overall, 'per_class': per_cls}
|
||||
suffix = f"_n{args.n_flows}" if args.n_flows > 1 else ""
|
||||
out_json = args.out_dir / f'{args.protocol}_seed{args.seed}{suffix}.json'
|
||||
out_json.write_text(json.dumps(out, indent=2))
|
||||
npz_path = out_json.with_suffix('.npz')
|
||||
np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=a_labels.astype(str), losses=res['losses'])
|
||||
|
||||
Reference in New Issue
Block a user