baselines: add 3x3 cross-dataset runners for IF/OCSVM (path A + B) and Shafir NF

New scripts under scripts/baselines/:
- run_if_ocsvm_cross.py            - 20-d canonical flow features (path A)
- run_if_ocsvm_cross_packets.py    - raw 576-d packet sequence (path B)
- run_shafir_nf_cross.py           - single-NF on 5-d SHAFIR5 subset or 20-d
- *_all.sh                         - 3 sources x 3 targets x 3 seeds sweepers

New aggregator scripts/aggregate/baselines_cross_3x3_table.py builds a
Markdown 3x3 matrix per method from per-cell NPZ outputs.

RESULTS.md gains a "Shallow-baseline 3x3 cross matrices" subsection
pointing at the new artifact directories.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-12 17:41:20 +08:00
parent ff0efa97bf
commit 6e5f753c01
8 changed files with 979 additions and 0 deletions

View File

@@ -0,0 +1,121 @@
"""Aggregate IF/OCSVM 3x3 cross-dataset AUROC matrices (3-seed mean ± std).
Reads the per-cell NPZ files produced by scripts/baselines/run_if_ocsvm_cross.py,
named ``{method}_{src}_to_{tgt}_seed{S}.npz``, with keys b_score, a_score, a_labels
(only b_score and a_score are used to compute the AUROC).
Writes one Markdown table per method.
"""
from __future__ import annotations
import argparse
from pathlib import Path
import numpy as np
from sklearn.metrics import roc_auc_score
# Repository root: this file's directory is two levels below it.
REPO: Path = Path(__file__).resolve().parents[2]

# Dataset identifiers as they appear in NPZ filenames; the same ordering is
# used for both the rows (source) and the columns (target) of every matrix.
DATASETS: list[str] = [
    "cicids2017",
    "cicddos2019",
    "ciciot2023",
]

# Seeds whose per-cell results are averaged.
SEEDS: list[int] = [42, 43, 44]

# Methods aggregated when --methods is not given on the command line.
DEFAULT_METHODS: list[str] = ["iforest", "ocsvm"]

# Human-readable method names for table titles, keyed by NPZ filename prefix.
TITLE_NAMES: dict[str, str] = {
    "iforest": "Isolation Forest",
    "ocsvm": "OCSVM (RBF)",
    "shafir_nf": "Shafir NF (single-flow, 20-d, fast)",
}

# Compact dataset labels used in table headers and row labels.
SHORT: dict[str, str] = {
    "cicids2017": "CICIDS17",
    "cicddos2019": "CICDDoS19",
    "ciciot2023": "CICIoT23",
}
def cell_auroc(npz_path: Path) -> tuple[float, int, int]:
    """Compute the benign-vs-attack AUROC stored in one per-cell NPZ file.

    Args:
        npz_path: Path to an NPZ archive containing the arrays ``b_score``
            and ``a_score``.

    Returns:
        ``(auroc, n_b, n_a)``: the AUROC obtained by labelling every
        ``b_score`` entry 0 and every ``a_score`` entry 1, followed by the
        lengths of the two score arrays.

    Raises:
        KeyError: If ``b_score`` or ``a_score`` is absent from the archive.
        ValueError: Propagated from ``roc_auc_score``, e.g. when one of the
            two score arrays is empty so only a single class is present.
    """
    # NOTE(review): allow_pickle=True is kept for backward compatibility, but
    # it unpickles object arrays; only point this at trusted NPZ files.
    # The context manager closes the underlying zip handle, which the lazy
    # NpzFile would otherwise keep open for every cell/seed that is read.
    with np.load(npz_path, allow_pickle=True) as archive:
        benign_scores = archive["b_score"]
        attack_scores = archive["a_score"]

    labels = np.r_[np.zeros(len(benign_scores)), np.ones(len(attack_scores))]
    scores = np.r_[benign_scores, attack_scores]
    # roc_auc_score rejects NaN/inf, so map them to finite stand-ins first.
    scores = np.nan_to_num(scores, nan=0.0, posinf=1e12, neginf=-1e12)
    return float(roc_auc_score(labels, scores)), len(benign_scores), len(attack_scores)
def _collect_cells(
    method: str, in_dir: Path
) -> tuple[
    dict[tuple[str, str], tuple[float, float]],
    dict[tuple[str, str], tuple[int | None, int | None]],
    list[str],
]:
    """Load every (source, target, seed) NPZ for *method* and summarize each cell."""
    cells: dict[tuple[str, str], tuple[float, float]] = {}
    counts: dict[tuple[str, str], tuple[int | None, int | None]] = {}
    missing: list[str] = []
    for src in DATASETS:
        for tgt in DATASETS:
            aucs = []
            # Stay None when no seed file exists; otherwise hold the counts
            # of the last seed file that was found.
            n_b = n_a = None
            for seed in SEEDS:
                path = in_dir / f"{method}_{src}_to_{tgt}_seed{seed}.npz"
                if not path.exists():
                    missing.append(path.name)
                    continue
                auc, n_b, n_a = cell_auroc(path)
                aucs.append(auc)
            if aucs:
                values = np.asarray(aucs)
                # Population std (numpy default ddof=0) over the seeds found.
                cells[(src, tgt)] = (values.mean(), values.std())
            else:
                cells[(src, tgt)] = (float("nan"), float("nan"))
            counts[(src, tgt)] = (n_b, n_a)
    return cells, counts, missing


def _md_row(fields: list[str]) -> str:
    """Join *fields* into a single Markdown table row."""
    return "| " + " | ".join(fields) + " |"


def build_method_table(method: str, in_dir: Path) -> tuple[str, list[str]]:
    """Build the Markdown AUROC matrix and sample-count table for one method.

    Args:
        method: NPZ filename prefix; also looked up in ``TITLE_NAMES`` for the
            heading (falls back to the raw prefix).
        in_dir: Directory holding ``{method}_{src}_to_{tgt}_seed{S}.npz`` files.

    Returns:
        ``(markdown, missing)``: the rendered Markdown block (newline
        terminated) and the names of the expected NPZ files that were not
        found. Missing seeds are left out of a cell's mean/std; a cell with
        no seed at all is rendered as ``nan ± nan`` and its count as
        ``missing``.
    """
    cells, counts, missing = _collect_cells(method, in_dir)

    n_datasets = len(DATASETS)
    title_name = TITLE_NAMES.get(method, method)
    header = "| Source ↓ / Target → | " + " | ".join(SHORT[t] for t in DATASETS) + " |"
    sep = "|" + "|".join(["---"] * (n_datasets + 1)) + "|"

    # NOTE(review): the descriptive lines below are fixed text and are emitted
    # for every method, including ones not trained on the 20-d features.
    lines: list[str] = [
        f"# {n_datasets}×{n_datasets} cross-dataset AUROC matrix — {title_name} "
        f"({len(SEEDS)}-seed mean ± std)\n",
        "Rows = source (10K benign training); columns = target (10K benign + balanced ≤1M attacks).",
        "Trained on raw 20-d canonical flow features after `StandardScaler` fit on source benign train.",
        "Diagonal italic = within-dataset (target benign sampled from rows disjoint from training).\n",
        header,
        sep,
    ]

    # AUROC matrix: bold row labels, italic diagonal.
    for src in DATASETS:
        row = [f"**{SHORT[src]}**"]
        for tgt in DATASETS:
            mean, std = cells[(src, tgt)]
            cell = f"{mean:.4f} ± {std:.4f}"
            row.append(f"_{cell}_" if src == tgt else cell)
        lines.append(_md_row(row))

    # Sample-count matrix, same layout as the AUROC matrix.
    lines.append("\n## Sample counts (target benign / target attacks)\n")
    lines.append(header)
    lines.append(sep)
    for src in DATASETS:
        row = [SHORT[src]]
        for tgt in DATASETS:
            n_b, n_a = counts[(src, tgt)]
            row.append(f"{n_b}b / {n_a}a" if n_b is not None else "missing")
        lines.append(_md_row(row))

    return "\n".join(lines) + "\n", missing
def main() -> None:
    """Command-line entry point: aggregate each requested method and write the report.

    Prints every method's Markdown block to stdout, lists any expected NPZ
    inputs that were not found, and writes all blocks (separated by blank
    lines) to ``--out-md``, which defaults to ``<in-dir>/CROSS_MATRIX_3x3.md``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-dir", type=Path,
                        default=REPO / "artifacts/baselines/if_ocsvm_cross_2026_05_11")
    parser.add_argument("--out-md", type=Path,
                        default=None,
                        help="Combined markdown output path. Defaults to <in-dir>/CROSS_MATRIX_3x3.md")
    parser.add_argument("--methods", nargs="+", default=DEFAULT_METHODS,
                        help="Method names to aggregate (matching NPZ filename prefixes).")
    args = parser.parse_args()

    out_md = args.out_md or (args.in_dir / "CROSS_MATRIX_3x3.md")

    parts = []
    all_missing: list[str] = []
    for method in args.methods:
        block, missing = build_method_table(method, args.in_dir)
        parts.append(block)
        all_missing.extend(missing)
        print(block)
        print()

    if all_missing:
        # A missing seed is skipped; a cell is NaN only if all its seeds are missing.
        print("# Missing inputs (counted as NaN cells)")
        for name in all_missing:
            print(f"  - {name}")

    # The report contains non-ASCII characters (×, ±, ↓, →, ≤, —); write it as
    # UTF-8 explicitly so the result does not depend on the locale's default
    # encoding (which can raise UnicodeEncodeError, e.g. cp1252 on Windows).
    out_md.write_text("\n\n".join(parts), encoding="utf-8")
    print(f"[wrote] {out_md}")


if __name__ == "__main__":
    main()