baselines: add 3x3 cross-dataset runners for IF/OCSVM (path A + B) and Shafir NF
New scripts under scripts/baselines/:
- run_if_ocsvm_cross.py - 20-d canonical flow features (path A)
- run_if_ocsvm_cross_packets.py - raw 576-d packet sequence (path B)
- run_shafir_nf_cross.py - single-NF on 5-d SHAFIR5 subset or 20-d
- *_all.sh - 3 sources x 3 targets x 3 seeds sweepers

New aggregator scripts/aggregate/baselines_cross_3x3_table.py builds a Markdown 3x3 matrix per method from per-cell NPZ outputs.

RESULTS.md gains a "Shallow-baseline 3x3 cross matrices" subsection pointing at the new artifact directories.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
121
scripts/aggregate/baselines_cross_3x3_table.py
Normal file
121
scripts/aggregate/baselines_cross_3x3_table.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Aggregate IF/OCSVM 3x3 cross-dataset AUROC matrices (3-seed mean ± std).
|
||||
|
||||
Reads NPZs produced by scripts/baselines/run_if_ocsvm_cross.py:
|
||||
{method}_{src}_to_{tgt}_seed{S}.npz with keys b_score, a_score, a_labels
|
||||
|
||||
Writes one Markdown table per method.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Repository root: this file lives at <repo>/scripts/aggregate/, so the root
# is two levels above the file's own directory.
REPO = Path(__file__).resolve().parents[2]

# Dataset identifiers as they appear in NPZ filenames; the order here fixes
# both the row (source) and column (target) order of every emitted table.
DATASETS = [
    "cicids2017",
    "cicddos2019",
    "ciciot2023",
]

# Seeds whose per-cell NPZ files are averaged together.
SEEDS = [42, 43, 44]

# Methods aggregated when --methods is not given on the command line.
DEFAULT_METHODS = [
    "iforest",
    "ocsvm",
]

# Human-readable method names used in table titles; methods not listed here
# fall back to their raw identifier.
TITLE_NAMES = {
    "iforest": "Isolation Forest",
    "ocsvm": "OCSVM (RBF)",
    "shafir_nf": "Shafir NF (single-flow, 20-d, fast)",
}

# Compact dataset labels used for table headers and row names.
SHORT = {
    "cicids2017": "CICIDS17",
    "cicddos2019": "CICDDoS19",
    "ciciot2023": "CICIoT23",
}
|
||||
|
||||
|
||||
def cell_auroc(npz_path: Path) -> tuple[float, int, int]:
    """Compute the AUROC stored in one per-cell NPZ result file.

    The archive's ``b_score`` entries are labelled 0 and its ``a_score``
    entries are labelled 1; both are concatenated and passed to
    ``roc_auc_score``.

    Args:
        npz_path: Path to an NPZ archive holding ``b_score`` and ``a_score``.

    Returns:
        A ``(auroc, n_b, n_a)`` tuple where ``n_b`` and ``n_a`` are the
        lengths of ``b_score`` and ``a_score`` respectively.

    Raises:
        KeyError: If ``b_score`` or ``a_score`` is absent from the archive.
    """
    # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code. It is kept so existing archives keep
    # loading; only point this at NPZ files produced by trusted runs, and
    # consider dropping it once the stored arrays are confirmed numeric.
    #
    # The context manager closes the underlying zip file handle, which the
    # lazily-loading NpzFile otherwise keeps open.
    with np.load(npz_path, allow_pickle=True) as archive:
        b = archive["b_score"]
        a = archive["a_score"]

    y = np.r_[np.zeros(len(b)), np.ones(len(a))]
    s = np.r_[b, a]
    # roc_auc_score rejects NaN/inf, so map NaN to 0 and clamp infinities to
    # large finite values that preserve their ordering.
    s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
    return float(roc_auc_score(y, s)), len(b), len(a)
|
||||
|
||||
|
||||
def _collect_cells(method: str, in_dir: Path):
    """Load every (source, target) cell for *method* from *in_dir*.

    Returns ``(stats, counts, seeds_found, missing)``: per-cell
    ``(mean, std)`` AUROC, per-cell ``(n_b, n_a)`` sample counts taken from
    the last seed file found, per-cell number of seed files found, and the
    names of the NPZ files that did not exist.
    """
    stats = {}
    counts = {}
    seeds_found = {}
    missing: list[str] = []
    for src in DATASETS:
        for tgt in DATASETS:
            aucs = []
            n_b = n_a = None
            for seed in SEEDS:
                path = in_dir / f"{method}_{src}_to_{tgt}_seed{seed}.npz"
                if not path.exists():
                    missing.append(path.name)
                    continue
                auc, n_b, n_a = cell_auroc(path)
                aucs.append(auc)
            if aucs:
                arr = np.asarray(aucs)
                stats[(src, tgt)] = (arr.mean(), arr.std())
            else:
                # No seed produced output for this cell: report it as NaN.
                stats[(src, tgt)] = (float("nan"), float("nan"))
            counts[(src, tgt)] = (n_b, n_a)
            seeds_found[(src, tgt)] = len(aucs)
    return stats, counts, seeds_found, missing


def _markdown_row(cells: list[str]) -> str:
    """Join already-formatted cell strings into one Markdown table row."""
    return "| " + " | ".join(cells) + " |"


def build_method_table(method: str, in_dir: Path) -> tuple[str, list[str]]:
    """Render the cross-dataset AUROC report for one method as Markdown.

    For every (source, target) pair in ``DATASETS`` the per-seed NPZ files
    ``{method}_{src}_to_{tgt}_seed{S}.npz`` are read from *in_dir* and their
    AUROCs are summarised as mean ± std (population std, numpy's default).
    Cells with no input at all are rendered as NaN; cells built from fewer
    seeds than ``SEEDS`` lists are tagged with ``(n=k)`` so a partial average
    is not mistaken for a complete one.

    Args:
        method: Method identifier used as the NPZ filename prefix.
        in_dir: Directory containing the per-cell NPZ files.

    Returns:
        ``(markdown, missing)`` where ``markdown`` holds the AUROC matrix
        followed by a sample-count matrix, and ``missing`` lists the names of
        the expected NPZ files that were not found.
    """
    stats, counts, seeds_found, missing = _collect_cells(method, in_dir)

    n_datasets = len(DATASETS)
    n_seeds = len(SEEDS)
    title_name = TITLE_NAMES.get(method, method)

    # Matrix size and seed count are derived from the constants so the title
    # cannot drift from what was actually aggregated.
    lines: list[str] = [
        f"# {n_datasets}×{n_datasets} cross-dataset AUROC matrix — {title_name} ({n_seeds}-seed mean ± std)\n",
        "Rows = source (10K benign training); columns = target (10K benign + balanced ≤1M attacks).",
        # NOTE(review): this feature description is emitted for every method;
        # confirm it is accurate for methods that use other inputs.
        "Trained on raw 20-d canonical flow features after `StandardScaler` fit on source benign train.",
        "Diagonal italic = within-dataset (target benign sampled from rows disjoint from training).\n",
    ]

    header = "| Source ↓ / Target → | " + " | ".join(SHORT[t] for t in DATASETS) + " |"
    sep = "|" + "|".join(["---"] * (n_datasets + 1)) + "|"

    # AUROC matrix.
    lines.append(header)
    lines.append(sep)
    for src in DATASETS:
        row = [f"**{SHORT[src]}**"]
        for tgt in DATASETS:
            m, sd = stats[(src, tgt)]
            cell = f"{m:.4f} ± {sd:.4f}"
            found = seeds_found[(src, tgt)]
            if 0 < found < n_seeds:
                # Flag averages computed over an incomplete set of seeds.
                cell = f"{cell} (n={found})"
            if src == tgt:
                cell = f"_{cell}_"
            row.append(cell)
        lines.append(_markdown_row(row))

    # Sample-count matrix.
    lines.append("\n## Sample counts (target benign / target attacks)\n")
    lines.append(header)
    lines.append(sep)
    for src in DATASETS:
        row = [SHORT[src]]
        for tgt in DATASETS:
            n_b, n_a = counts[(src, tgt)]
            row.append(f"{n_b}b / {n_a}a" if n_b is not None else "missing")
        lines.append(_markdown_row(row))

    return "\n".join(lines) + "\n", missing
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: aggregate each method and write one report.

    Parses ``--in-dir``, ``--out-md`` and ``--methods``, builds one Markdown
    block per method with ``build_method_table``, echoes each block (and the
    list of missing input files, if any) to stdout, and writes the blocks,
    separated by blank lines, to the output Markdown file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-dir",
        type=Path,
        default=REPO / "artifacts/baselines/if_ocsvm_cross_2026_05_11",
    )
    parser.add_argument(
        "--out-md",
        type=Path,
        default=None,
        help="Combined markdown output path. Defaults to <in-dir>/CROSS_MATRIX_3x3.md",
    )
    parser.add_argument(
        "--methods",
        nargs="+",
        default=DEFAULT_METHODS,
        help="Method names to aggregate (matching NPZ filename prefixes).",
    )
    args = parser.parse_args()

    out_md = args.out_md or (args.in_dir / "CROSS_MATRIX_3x3.md")
    parts = []
    all_missing: list[str] = []
    for method in args.methods:
        block, missing = build_method_table(method, args.in_dir)
        parts.append(block)
        all_missing.extend(missing)
        print(block)
        print()

    if all_missing:
        print("# Missing inputs (counted as NaN cells)")
        for name in all_missing:
            print(f" - {name}")

    # Make sure the destination directory exists so a custom --out-md (or a
    # not-yet-created --in-dir) does not fail at the very last step.
    out_md.parent.mkdir(parents=True, exist_ok=True)
    # The report contains non-ASCII characters (×, ±, ↓, →, ≤); write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    out_md.write_text("\n\n".join(parts), encoding="utf-8")
    print(f"[wrote] {out_md}")
|
||||
|
||||
|
||||
# Run the aggregator only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user