diff --git a/RESULTS.md b/RESULTS.md index 6a69551..8e886b6 100644 --- a/RESULTS.md +++ b/RESULTS.md @@ -133,6 +133,25 @@ Full 4×4 cross matrix at `artifacts/route_comparison/CROSS_MATRIX.md`. All See `artifacts/route_comparison/SCORE_ROUTER.md` for full ablation across max-of-z, plain Mahalanobis, Ledoit-Wolf, OAS, and score-subset variants. +#### Shallow-baseline 3×3 cross matrices (Isolation Forest, OCSVM) — 2026-05-12 add + +Two input modalities tested as cross-dataset reference points: + +- **Path A** (`artifacts/baselines/if_ocsvm_cross_2026_05_11/`): IF and OCSVM + on the 20-d canonical flow features (`StandardScaler`). Strong shallow + baseline — best off-diagonal AUROC is OCSVM 0.966 on CICIDS17→CICDDoS19. + JANUS still wins all 9 cells; largest margin is CICDDoS19→CICIDS17 + (JANUS 0.941 vs OCSVM 0.571, **+0.370 AUROC**). +- **Path B** (`artifacts/baselines/if_ocsvm_cross_packets_2026_05_11/`): IF + and OCSVM on the raw 576-d packet-token sequence (T=64×9, flattened), + matching the input modality JANUS itself consumes. Numbers are weaker + across the board (avg −0.16 AUROC vs path A); 3 IF cells and 1 OCSVM cell + drop **below random**. This is the input-controlled comparison and is the + recommended baseline column for the paper's cross-dataset table. + +Full 3×3 matrices for both paths and a JANUS-vs-baselines off-diagonal +margin table are appended to `artifacts/baselines/COMPARISON_TABLE.md`. + ### Reverse cross (CICDDoS2019 → CICIDS2017) — 2026-05-01 update The reverse direction was the project's "stuck" failure mode (memory note @@ -376,6 +395,11 @@ artifacts. per-seed eval results across all experiments. - `artifacts/phase25_sigma06_cross_2026_04_25/cicids2017_to_cicddos2019_seed*.json` — 3-seed cross-dataset eval JSONs. +- `artifacts/baselines/if_ocsvm_cross_2026_05_11/CROSS_MATRIX_3x3.md` — + IF/OCSVM 3×3 cross matrix on 20-d canonical flow features (path A). +- `artifacts/baselines/if_ocsvm_cross_packets_2026_05_11/CROSS_MATRIX_3x3.md` — + IF/OCSVM 3×3 cross matrix on raw 576-d packet sequence (path B, + input-modality controlled with JANUS). - Aggregator scripts: `artifacts/verify_2026_04_24/aggregate_phase{0,1,2,25,sigma06,per_attack_multiseed}.py`. - Orchestrator scripts: `artifacts/verify_2026_04_24/run_phase*.sh`. diff --git a/scripts/aggregate/baselines_cross_3x3_table.py b/scripts/aggregate/baselines_cross_3x3_table.py new file mode 100644 index 0000000..dbb6889 --- /dev/null +++ b/scripts/aggregate/baselines_cross_3x3_table.py @@ -0,0 +1,121 @@ +"""Aggregate IF/OCSVM 3x3 cross-dataset AUROC matrices (3-seed mean ± std). + +Reads NPZs produced by scripts/baselines/run_if_ocsvm_cross.py: + {method}_{src}_to_{tgt}_seed{S}.npz with keys b_score, a_score, a_labels + +Writes one Markdown table per method. +""" +from __future__ import annotations +import argparse +from pathlib import Path +import numpy as np +from sklearn.metrics import roc_auc_score + +REPO = Path(__file__).resolve().parents[2] + +DATASETS = ["cicids2017", "cicddos2019", "ciciot2023"] +SEEDS = [42, 43, 44] +DEFAULT_METHODS = ["iforest", "ocsvm"] +TITLE_NAMES = { + "iforest": "Isolation Forest", + "ocsvm": "OCSVM (RBF)", + "shafir_nf": "Shafir NF (single-flow, 20-d, fast)", +} +SHORT = {"cicids2017": "CICIDS17", "cicddos2019": "CICDDoS19", "ciciot2023": "CICIoT23"} + + +def cell_auroc(npz_path: Path) -> tuple[float, int, int]: + z = np.load(npz_path, allow_pickle=True) + b = z["b_score"] + a = z["a_score"] + y = np.r_[np.zeros(len(b)), np.ones(len(a))] + s = np.r_[b, a] + s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12) + return float(roc_auc_score(y, s)), len(b), len(a) + + +def build_method_table(method: str, in_dir: Path) -> tuple[str, list[str]]: + cells = {} + counts = {} + missing = [] + for src in DATASETS: + for tgt in DATASETS: + aucs = [] + n_b = n_a = None + for s in SEEDS: + p = in_dir / f"{method}_{src}_to_{tgt}_seed{s}.npz" + if not p.exists(): + missing.append(p.name) + continue + auc, n_b, n_a = cell_auroc(p) + aucs.append(auc) + if not aucs: + cells[(src, tgt)] = (float("nan"), float("nan")) + else: + a = np.asarray(aucs) + cells[(src, tgt)] = (a.mean(), a.std()) + counts[(src, tgt)] = (n_b, n_a) + + lines: list[str] = [] + title_name = TITLE_NAMES.get(method, method) + lines.append(f"# 3×3 cross-dataset AUROC matrix — {title_name} (3-seed mean ± std)\n") + lines.append("Rows = source (10K benign training); columns = target (10K benign + balanced ≤1M attacks).") + lines.append("Trained on raw 20-d canonical flow features after `StandardScaler` fit on source benign train.") + lines.append("Diagonal italic = within-dataset (target benign sampled from rows disjoint from training).\n") + + header = "| Source ↓ / Target → | " + " | ".join(SHORT[t] for t in DATASETS) + " |" + sep = "|" + "|".join(["---"] * (len(DATASETS) + 1)) + "|" + lines.append(header) + lines.append(sep) + for src in DATASETS: + row = [f"**{SHORT[src]}**"] + for tgt in DATASETS: + m, sd = cells[(src, tgt)] + cell = f"{m:.4f} ± {sd:.4f}" + if src == tgt: + cell = f"_{cell}_" + row.append(cell) + lines.append("| " + " | ".join(row) + " |") + + lines.append("\n## Sample counts (target benign / target attacks)\n") + lines.append(header) + lines.append(sep) + for src in DATASETS: + row = [SHORT[src]] + for tgt in DATASETS: + n_b, n_a = counts[(src, tgt)] + row.append(f"{n_b}b / {n_a}a" if n_b is not None else "missing") + lines.append("| " + " | ".join(row) + " |") + return "\n".join(lines) + "\n", missing + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--in-dir", type=Path, + default=REPO / "artifacts/baselines/if_ocsvm_cross_2026_05_11") + p.add_argument("--out-md", type=Path, + default=None, + help="Combined markdown output path. Defaults to /CROSS_MATRIX_3x3.md") + p.add_argument("--methods", nargs="+", default=DEFAULT_METHODS, + help="Method names to aggregate (matching NPZ filename prefixes).") + args = p.parse_args() + + out_md = args.out_md or (args.in_dir / "CROSS_MATRIX_3x3.md") + parts = [] + all_missing: list[str] = [] + for method in args.methods: + block, missing = build_method_table(method, args.in_dir) + parts.append(block) + all_missing.extend(missing) + print(block) + print() + if all_missing: + print("# Missing inputs (counted as NaN cells)") + for m in all_missing: + print(f" - {m}") + out_md.write_text("\n\n".join(parts)) + print(f"[wrote] {out_md}") + + +if __name__ == "__main__": + main() diff --git a/scripts/baselines/run_if_ocsvm_cross.py b/scripts/baselines/run_if_ocsvm_cross.py new file mode 100644 index 0000000..ba1ea3e --- /dev/null +++ b/scripts/baselines/run_if_ocsvm_cross.py @@ -0,0 +1,237 @@ +"""Cross-dataset baselines (Isolation Forest, OCSVM) on the 20-d canonical +flow-feature contract. + +Protocol per (method, src, tgt, seed): + - Train: 10,000 source benign rows (random sample seeded with --seed + 1000) + - Test: 10,000 target benign rows (random sample seeded with --seed) + + balanced per-class attack sample with n_attack cap (--n-attack + default 1,000,000, divided across all attack classes, matching + Mixed_CFM/eval_cross.py) + - For diagonal src == tgt, target benign is sampled from the source-pool + complement (the rows not used for training) so train and test are disjoint. + +Outputs (in --out-dir): + {method}_{src}_to_{tgt}_seed{seed}.npz -- b_score, a_score, a_labels + {method}_{src}_to_{tgt}_seed{seed}.json -- AUROC, AUPRC, sample counts, timing +""" +from __future__ import annotations +import argparse +import json +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.ensemble import IsolationForest +from sklearn.metrics import average_precision_score, roc_auc_score +from sklearn.preprocessing import StandardScaler +from sklearn.svm import OneClassSVM + +REPO = Path(__file__).resolve().parents[2] + +DATASETS = { + "cicids2017": { + "flows": REPO / "datasets/cicids2017/processed/flows.parquet", + "flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet", + }, + "cicddos2019": { + "flows": REPO / "datasets/cicddos2019/processed/flows.parquet", + "flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet", + }, + "ciciot2023": { + "flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet", + "flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet", + }, +} + +FEATURE_COLS = ( + "log_duration", "log_n_pkts", "fwd_count", "bwd_count", + "pkt_size_mean", "pkt_size_std", "pkt_size_max", + "fwd_size_mean", "bwd_size_mean", "bwd_size_std", + "iat_mean", "fwd_iat_max", "bwd_iat_max", "bwd_iat_std", + "active_mean", "idle_mean", + "log_pkts_per_s", "log_total_bytes", + "ack_cnt", "syn_cnt", +) + + +def _load_dataset(name: str): + paths = DATASETS[name] + flows = pd.read_parquet(paths["flows"], columns=["flow_id", "label"]) + ff = pd.read_parquet(paths["flow_features"]) + if not np.array_equal( + flows["flow_id"].to_numpy(dtype=np.uint64), + ff["flow_id"].to_numpy(dtype=np.uint64), + ): + raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned") + X = ff[list(FEATURE_COLS)].to_numpy(dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32) + labels = flows["label"].astype(str).to_numpy() + return X, labels + + +def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray: + attack_idx = np.flatnonzero(labels != "normal") + atk_labels = labels[attack_idx] + classes = sorted(set(atk_labels)) + per_class = max(1, n_attack // len(classes)) + chunks = [] + for cls in classes: + pool = attack_idx[atk_labels == cls] + k = min(per_class, len(pool)) + if k: + chunks.append(rng.choice(pool, size=k, replace=False)) + sel = np.sort(np.concatenate(chunks)) + if len(sel) > n_attack: + sel = np.sort(rng.choice(sel, size=n_attack, replace=False)) + return sel + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--method", choices=["iforest", "ocsvm"], required=True) + p.add_argument("--src", choices=list(DATASETS), required=True) + p.add_argument("--tgt", choices=list(DATASETS), required=True) + p.add_argument("--seed", type=int, required=True) + p.add_argument("--out-dir", type=Path, required=True) + p.add_argument("--n-train", type=int, default=10000) + p.add_argument("--n-benign", type=int, default=10000) + p.add_argument("--n-attack", type=int, default=1_000_000, + help="Per-class balanced cap (matches Mixed_CFM/eval_cross.py).") + # Method hyperparams + p.add_argument("--iforest-n-estimators", type=int, default=200) + p.add_argument("--ocsvm-nu", type=float, default=0.1) + p.add_argument("--ocsvm-gamma", type=str, default="scale") + p.add_argument("--ocsvm-cache-mb", type=int, default=2000) + args = p.parse_args() + + args.out_dir.mkdir(parents=True, exist_ok=True) + tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}" + print(f"[run] {tag}") + + # --- source training --- + t0 = time.time() + src_X, src_labels = _load_dataset(args.src) + src_benign_idx = np.flatnonzero(src_labels == "normal") + rng_train = np.random.default_rng(args.seed + 1000) + if len(src_benign_idx) < args.n_train: + raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}") + train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False)) + train_X = src_X[train_sel] + t_load_src = time.time() - t0 + + # --- target eval --- + t0 = time.time() + if args.tgt == args.src: + tgt_X, tgt_labels = src_X, src_labels + used_for_train = np.zeros(len(tgt_labels), dtype=bool) + used_for_train[train_sel] = True + eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used_for_train) + else: + tgt_X, tgt_labels = _load_dataset(args.tgt) + eligible_benign = np.flatnonzero(tgt_labels == "normal") + rng_eval = np.random.default_rng(args.seed) + n_benign = min(args.n_benign, len(eligible_benign)) + if n_benign < args.n_benign: + print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})") + b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False)) + a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval) + val_X = tgt_X[b_sel] + atk_X = tgt_X[a_sel] + a_labels = tgt_labels[a_sel] + t_load_tgt = time.time() - t0 + print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}" + f" classes={len(set(a_labels))} D={train_X.shape[1]}") + + # --- standardize on source train --- + scaler = StandardScaler().fit(train_X) + train_Z = scaler.transform(train_X).astype(np.float32) + val_Z = scaler.transform(val_X).astype(np.float32) + atk_Z = scaler.transform(atk_X).astype(np.float32) + + # --- fit --- + t0 = time.time() + if args.method == "iforest": + model = IsolationForest( + n_estimators=args.iforest_n_estimators, + random_state=args.seed, + n_jobs=-1, + contamination="auto", + ) + model.fit(train_Z) + else: + model = OneClassSVM( + kernel="rbf", + nu=args.ocsvm_nu, + gamma=args.ocsvm_gamma, + cache_size=args.ocsvm_cache_mb, + ) + model.fit(train_Z) + t_fit = time.time() - t0 + + # --- score: higher = more anomalous --- + # IsolationForest.score_samples returns higher-for-normal, so negate. + # OneClassSVM.score_samples returns signed distance to boundary + # (higher = more normal), so negate too. + t0 = time.time() + if args.method == "iforest": + b_score = (-model.score_samples(val_Z)).astype(np.float32) + a_score = (-model.score_samples(atk_Z)).astype(np.float32) + else: + b_score = (-model.decision_function(val_Z)).astype(np.float32) + a_score = (-model.decision_function(atk_Z)).astype(np.float32) + t_score = time.time() - t0 + + # --- metrics --- + y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))] + s = np.r_[b_score, a_score] + s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12) + auroc = float(roc_auc_score(y, s)) + auprc = float(average_precision_score(y, s)) + + per_class = {} + for cls in sorted(set(a_labels)): + m = a_labels == cls + y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))] + s_c = np.r_[b_score, a_score[m]] + s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12) + try: + auc_c = float(roc_auc_score(y_c, s_c)) + except ValueError: + auc_c = float("nan") + per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c} + + out = { + "method": args.method, + "src": args.src, + "tgt": args.tgt, + "seed": args.seed, + "n_train": int(len(train_X)), + "n_benign": int(len(val_X)), + "n_attack": int(len(atk_X)), + "n_attack_classes": int(len(set(a_labels))), + "t_load_src_sec": round(t_load_src, 2), + "t_load_tgt_sec": round(t_load_tgt, 2), + "t_fit_sec": round(t_fit, 2), + "t_score_sec": round(t_score, 2), + "overall": {"auroc": auroc, "auprc": auprc}, + "per_class": per_class, + } + if args.method == "iforest": + out["hparams"] = {"n_estimators": args.iforest_n_estimators} + else: + out["hparams"] = {"nu": args.ocsvm_nu, "gamma": args.ocsvm_gamma} + + json_path = args.out_dir / f"{tag}.json" + json_path.write_text(json.dumps(out, indent=2)) + npz_path = args.out_dir / f"{tag}.npz" + np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str)) + print(f"[saved] {json_path}") + print(f"[saved] {npz_path}") + print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} " + f"AUROC={auroc:.4f} AUPRC={auprc:.4f} " + f"fit={t_fit:.1f}s score={t_score:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/scripts/baselines/run_if_ocsvm_cross_all.sh b/scripts/baselines/run_if_ocsvm_cross_all.sh new file mode 100755 index 0000000..f324d60 --- /dev/null +++ b/scripts/baselines/run_if_ocsvm_cross_all.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Orchestrate the full 3x3 cross-dataset sweep for IF/OCSVM baselines. +# 3 sources x 3 targets x 3 seeds x 2 methods = 54 runs. +set -euo pipefail + +REPO="/home/chy/JANUS" +cd "$REPO" + +OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_2026_05_11}" +mkdir -p "$OUT_DIR" +LOG_DIR="$OUT_DIR/logs" +mkdir -p "$LOG_DIR" + +DATASETS=(cicids2017 cicddos2019 ciciot2023) +SEEDS=(42 43 44) +METHODS=(iforest ocsvm) + +START=$(date +%s) +for method in "${METHODS[@]}"; do + for src in "${DATASETS[@]}"; do + for tgt in "${DATASETS[@]}"; do + for seed in "${SEEDS[@]}"; do + tag="${method}_${src}_to_${tgt}_seed${seed}" + if [[ -f "$OUT_DIR/${tag}.json" ]]; then + echo "[skip] $tag (json exists)" + continue + fi + echo "[start] $tag" + uv run --no-sync python scripts/baselines/run_if_ocsvm_cross.py \ + --method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \ + --out-dir "$OUT_DIR" \ + > "$LOG_DIR/${tag}.log" 2>&1 + echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))" + done + done + done +done +END=$(date +%s) +echo "[all done] elapsed $((END - START))s" diff --git a/scripts/baselines/run_if_ocsvm_cross_packets.py b/scripts/baselines/run_if_ocsvm_cross_packets.py new file mode 100644 index 0000000..3b10935 --- /dev/null +++ b/scripts/baselines/run_if_ocsvm_cross_packets.py @@ -0,0 +1,233 @@ +"""Path-B: IF/OCSVM cross-dataset baselines on RAW PACKET SEQUENCES. + +Same protocol as run_if_ocsvm_cross.py, but the input feature vector is the +flattened first T=64 packet tokens (9-d each) -> 576-d. No flow-stat +aggregation — this is the input modality JANUS itself consumes, so it +measures what classical AD can do without hand-engineered features. + +Outputs: + {method}_{src}_to_{tgt}_seed{seed}.{json,npz} +""" +from __future__ import annotations +import argparse +import json +import sys +import time +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.ensemble import IsolationForest +from sklearn.metrics import average_precision_score, roc_auc_score +from sklearn.preprocessing import StandardScaler +from sklearn.svm import OneClassSVM + +REPO = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(REPO)) +from common.packet_store import PacketShardStore # noqa: E402 + +DATASETS = { + "cicids2017": { + "flows": REPO / "datasets/cicids2017/processed/flows.parquet", + "packets_npz": REPO / "datasets/cicids2017/processed/packets.npz", + "source_store": None, + }, + "cicddos2019": { + "flows": REPO / "datasets/cicddos2019/processed/flows.parquet", + "packets_npz": None, + "source_store": REPO / "datasets/cicddos2019/processed/full_store", + }, + "ciciot2023": { + "flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet", + "packets_npz": None, + "source_store": REPO / "datasets/ciciot2023/processed/full_store", + }, +} + + +def _load_labels(name: str) -> np.ndarray: + paths = DATASETS[name] + flows = pd.read_parquet(paths["flows"], columns=["flow_id", "label"]) + return flows["label"].astype(str).to_numpy() + + +def _materialize_packets(name: str, indices: np.ndarray, T: int) -> np.ndarray: + paths = DATASETS[name] + if paths["packets_npz"] is not None: + pz = np.load(paths["packets_npz"], mmap_mode="r") + tokens = pz["packet_tokens"] + if T > tokens.shape[1]: + raise ValueError(f"requested T={T} > stored {tokens.shape[1]}") + out = np.asarray(tokens[indices, :T, :]).astype(np.float32, copy=True) + return out + else: + store = PacketShardStore.open(paths["source_store"]) + tok, _ = store.read_packets(indices.astype(np.int64), T=T) + return tok.astype(np.float32, copy=False) + + +def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray: + attack_idx = np.flatnonzero(labels != "normal") + atk_labels = labels[attack_idx] + classes = sorted(set(atk_labels)) + per_class = max(1, n_attack // len(classes)) + chunks = [] + for cls in classes: + pool = attack_idx[atk_labels == cls] + k = min(per_class, len(pool)) + if k: + chunks.append(rng.choice(pool, size=k, replace=False)) + sel = np.sort(np.concatenate(chunks)) + if len(sel) > n_attack: + sel = np.sort(rng.choice(sel, size=n_attack, replace=False)) + return sel + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--method", choices=["iforest", "ocsvm"], required=True) + p.add_argument("--src", choices=list(DATASETS), required=True) + p.add_argument("--tgt", choices=list(DATASETS), required=True) + p.add_argument("--seed", type=int, required=True) + p.add_argument("--out-dir", type=Path, required=True) + p.add_argument("--T", type=int, default=64, help="Packets-per-flow cap (matches JANUS T=64).") + p.add_argument("--n-train", type=int, default=10000) + p.add_argument("--n-benign", type=int, default=10000) + p.add_argument("--n-attack", type=int, default=200000, + help="Per-class balanced cap on target attacks. Smaller than the " + "20-d run (1M) because 576-d OCSVM scoring is much slower.") + p.add_argument("--min-len", type=int, default=2) + # Method hyperparams + p.add_argument("--iforest-n-estimators", type=int, default=200) + p.add_argument("--ocsvm-nu", type=float, default=0.1) + p.add_argument("--ocsvm-gamma", type=str, default="scale") + p.add_argument("--ocsvm-cache-mb", type=int, default=2000) + args = p.parse_args() + + args.out_dir.mkdir(parents=True, exist_ok=True) + tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}" + print(f"[run] {tag} (raw {args.T}x9 packets = {args.T * 9}-d)") + + # --- source training --- + t0 = time.time() + src_labels = _load_labels(args.src) + src_benign_idx = np.flatnonzero(src_labels == "normal") + rng_train = np.random.default_rng(args.seed + 1000) + if len(src_benign_idx) < args.n_train: + raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}") + train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False)) + train_tokens = _materialize_packets(args.src, train_sel, T=args.T) + train_X = train_tokens.reshape(len(train_sel), -1) + t_load_src = time.time() - t0 + + # --- target eval --- + t0 = time.time() + if args.tgt == args.src: + tgt_labels = src_labels + used = np.zeros(len(tgt_labels), dtype=bool) + used[train_sel] = True + eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used) + else: + tgt_labels = _load_labels(args.tgt) + eligible_benign = np.flatnonzero(tgt_labels == "normal") + rng_eval = np.random.default_rng(args.seed) + n_benign = min(args.n_benign, len(eligible_benign)) + if n_benign < args.n_benign: + print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})") + b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False)) + a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval) + val_tokens = _materialize_packets(args.tgt, b_sel, T=args.T) + atk_tokens = _materialize_packets(args.tgt, a_sel, T=args.T) + val_X = val_tokens.reshape(len(b_sel), -1) + atk_X = atk_tokens.reshape(len(a_sel), -1) + a_labels = tgt_labels[a_sel] + t_load_tgt = time.time() - t0 + print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}" + f" classes={len(set(a_labels))} D={train_X.shape[1]}") + + # --- standardize --- + scaler = StandardScaler().fit(train_X) + train_Z = scaler.transform(train_X).astype(np.float32) + val_Z = scaler.transform(val_X).astype(np.float32) + atk_Z = scaler.transform(atk_X).astype(np.float32) + + # --- fit --- + t0 = time.time() + if args.method == "iforest": + model = IsolationForest( + n_estimators=args.iforest_n_estimators, + random_state=args.seed, + n_jobs=-1, + contamination="auto", + ) + model.fit(train_Z) + else: + model = OneClassSVM( + kernel="rbf", + nu=args.ocsvm_nu, + gamma=args.ocsvm_gamma, + cache_size=args.ocsvm_cache_mb, + ) + model.fit(train_Z) + t_fit = time.time() - t0 + + # --- score (higher = more anomalous) --- + t0 = time.time() + if args.method == "iforest": + b_score = (-model.score_samples(val_Z)).astype(np.float32) + a_score = (-model.score_samples(atk_Z)).astype(np.float32) + else: + b_score = (-model.decision_function(val_Z)).astype(np.float32) + a_score = (-model.decision_function(atk_Z)).astype(np.float32) + t_score = time.time() - t0 + + # --- metrics --- + y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))] + s = np.r_[b_score, a_score] + s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12) + auroc = float(roc_auc_score(y, s)) + auprc = float(average_precision_score(y, s)) + + per_class = {} + for cls in sorted(set(a_labels)): + m = a_labels == cls + y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))] + s_c = np.r_[b_score, a_score[m]] + s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12) + try: + auc_c = float(roc_auc_score(y_c, s_c)) + except ValueError: + auc_c = float("nan") + per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c} + + out = { + "method": args.method, + "src": args.src, + "tgt": args.tgt, + "seed": args.seed, + "T": args.T, + "feature_dim": int(train_X.shape[1]), + "input_mode": "raw_packet_sequence", + "n_train": int(len(train_X)), + "n_benign": int(len(val_X)), + "n_attack": int(len(atk_X)), + "n_attack_classes": int(len(set(a_labels))), + "t_load_src_sec": round(t_load_src, 2), + "t_load_tgt_sec": round(t_load_tgt, 2), + "t_fit_sec": round(t_fit, 2), + "t_score_sec": round(t_score, 2), + "overall": {"auroc": auroc, "auprc": auprc}, + "per_class": per_class, + } + json_path = args.out_dir / f"{tag}.json" + json_path.write_text(json.dumps(out, indent=2)) + npz_path = args.out_dir / f"{tag}.npz" + np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str)) + print(f"[saved] {json_path}") + print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} " + f"AUROC={auroc:.4f} AUPRC={auprc:.4f} " + f"fit={t_fit:.1f}s score={t_score:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/scripts/baselines/run_if_ocsvm_cross_packets_all.sh b/scripts/baselines/run_if_ocsvm_cross_packets_all.sh new file mode 100755 index 0000000..28cd87b --- /dev/null +++ b/scripts/baselines/run_if_ocsvm_cross_packets_all.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Path-B sweep: IF/OCSVM on raw 64x9 packet sequence (576-d), 3x3 cross-dataset. +set -euo pipefail + +REPO="/home/chy/JANUS" +cd "$REPO" + +OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_packets_2026_05_11}" +mkdir -p "$OUT_DIR" +LOG_DIR="$OUT_DIR/logs" +mkdir -p "$LOG_DIR" + +DATASETS=(cicids2017 cicddos2019 ciciot2023) +SEEDS=(42 43 44) +METHODS=(iforest ocsvm) + +START=$(date +%s) +for method in "${METHODS[@]}"; do + for src in "${DATASETS[@]}"; do + for tgt in "${DATASETS[@]}"; do + for seed in "${SEEDS[@]}"; do + tag="${method}_${src}_to_${tgt}_seed${seed}" + if [[ -f "$OUT_DIR/${tag}.json" ]]; then + echo "[skip] $tag (json exists)" + continue + fi + echo "[start] $tag" + uv run --no-sync python scripts/baselines/run_if_ocsvm_cross_packets.py \ + --method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \ + --out-dir "$OUT_DIR" \ + > "$LOG_DIR/${tag}.log" 2>&1 + echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))" + done + done + done +done +END=$(date +%s) +echo "[all done] elapsed $((END - START))s" diff --git a/scripts/baselines/run_shafir_nf_cross.py b/scripts/baselines/run_shafir_nf_cross.py new file mode 100644 index 0000000..722ae6e --- /dev/null +++ b/scripts/baselines/run_shafir_nf_cross.py @@ -0,0 +1,247 @@ +"""Lightweight Shafir-NF cross-dataset runner. + +Same data protocol as scripts/baselines/run_if_ocsvm_cross.py (path A): + - 10K source benign training rows + - 10K target benign + balanced per-class target attacks (default cap 200K) + - 20-d canonical flow features (CANONICAL_FLOW_FEATURE_NAMES) + - StandardScaler-style z-score using source-trained flow_mean/flow_std saved + in JANUS within-dataset checkpoints under artifacts/route_comparison/ + +Anomaly score = -log_prob from a single pzflow NormalizingFlow trained on +source benign for `--epochs` (default 100). No SHAP-subset, no 2-NF ensemble. +Single-flow, default hyperparams — meant as a quick cross-dataset baseline +matching the IF/OCSVM protocol, NOT a faithful Shafir reproduction. + +Outputs: + {tag}.json - summary + {tag}.npz - b_score, a_score, a_labels (same key schema as IF/OCSVM runner) +""" +from __future__ import annotations +import argparse +import json +import os +import time +from pathlib import Path + +import numpy as np +import pandas as pd +import torch +from sklearn.metrics import average_precision_score, roc_auc_score + +os.environ.setdefault("JAX_PLATFORMS", "cpu") +import optax # noqa: E402 +from pzflow import Flow # noqa: E402 + +REPO = Path(__file__).resolve().parents[2] + +# Shafir-style 5-d SHAP-top subset of the 20-d canonical flow features. +# Picks the 5 entries that loosely correspond to Shafir's CICIDS_BEST5 +# CICFlowMeter columns (Bwd Packet Length Mean, Fwd Packets/s, ACK Flag Count, +# Total Length of Bwd Packets, Flow Duration). This keeps the input +# dimensionality and feature semantics close to the paper protocol while +# staying on our packet-derived 20-d contract. +SHAFIR5_SUBSET = ("bwd_size_mean", "log_pkts_per_s", "ack_cnt", "log_total_bytes", "log_duration") + +DATASETS = { + "cicids2017": { + "flows": REPO / "datasets/cicids2017/processed/flows.parquet", + "flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet", + "model_template": REPO / "artifacts/route_comparison/janus_cicids2017_seed{seed}", + }, + "cicddos2019": { + "flows": REPO / "datasets/cicddos2019/processed/flows.parquet", + "flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet", + "model_template": REPO / "artifacts/route_comparison/janus_cicddos2019_seed{seed}", + }, + "ciciot2023": { + "flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet", + "flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet", + "model_template": REPO / "artifacts/route_comparison/janus_ciciot2023_seed{seed}", + }, +} + + +def _load_src_stats(src: str, seed: int) -> tuple[np.ndarray, np.ndarray, list[str]]: + model_dir = Path(str(DATASETS[src]["model_template"]).format(seed=seed)) + ckpt = torch.load(model_dir / "model.pt", map_location="cpu", weights_only=False) + flow_mean = np.asarray(ckpt["flow_mean"], dtype=np.float32) + flow_std = np.asarray(ckpt["flow_std"], dtype=np.float32) + flow_names = [str(n) for n in ckpt["flow_feature_names"]] + return flow_mean, flow_std, flow_names + + +def _load_dataset_aligned(name: str, flow_names: list[str]) -> tuple[np.ndarray, np.ndarray]: + flows = pd.read_parquet(DATASETS[name]["flows"], columns=["flow_id", "label"]) + ff = pd.read_parquet(DATASETS[name]["flow_features"]) + if not np.array_equal( + flows["flow_id"].to_numpy(dtype=np.uint64), + ff["flow_id"].to_numpy(dtype=np.uint64), + ): + raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned") + X = ff[flow_names].to_numpy(dtype=np.float64) + X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32) + labels = flows["label"].astype(str).to_numpy() + return X, labels + + +def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray: + attack_idx = np.flatnonzero(labels != "normal") + atk_labels = labels[attack_idx] + classes = sorted(set(atk_labels)) + per_class = max(1, n_attack // len(classes)) + chunks = [] + for cls in classes: + pool = attack_idx[atk_labels == cls] + k = min(per_class, len(pool)) + if k: + chunks.append(rng.choice(pool, size=k, replace=False)) + sel = np.sort(np.concatenate(chunks)) + if len(sel) > n_attack: + sel = np.sort(rng.choice(sel, size=n_attack, replace=False)) + return sel + + +def _safe_metric(fn, y, s) -> float: + s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12) + try: + return float(fn(y, s)) + except ValueError: + return float("nan") + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--src", choices=list(DATASETS), required=True) + p.add_argument("--tgt", choices=list(DATASETS), required=True) + p.add_argument("--seed", type=int, required=True) + p.add_argument("--out-dir", type=Path, required=True) + p.add_argument("--n-train", type=int, default=10000) + p.add_argument("--n-benign", type=int, default=10000) + p.add_argument("--n-attack", type=int, default=200000) + p.add_argument("--epochs", type=int, default=100) + p.add_argument("--lr", type=float, default=1e-3) + p.add_argument("--optimizer", choices=["sgd", "adam"], default="sgd") + p.add_argument("--feature-subset", choices=["shafir5", "full20"], default="shafir5", + help="shafir5: 5-d SHAP-top loose match (default, matches paper protocol); " + "full20: all 20-d canonical features (stronger but not Shafir-faithful)") + p.add_argument("--verbose", action="store_true") + args = p.parse_args() + args.out_dir.mkdir(parents=True, exist_ok=True) + tag = f"shafir_nf_{args.src}_to_{args.tgt}_seed{args.seed}" + print(f"[run] {tag}") + + # --- source stats from JANUS ckpt --- + flow_mean_full, flow_std_full, flow_names_full = _load_src_stats(args.src, args.seed) + if args.feature_subset == "shafir5": + keep_idx = [flow_names_full.index(n) for n in SHAFIR5_SUBSET] + flow_mean = flow_mean_full[keep_idx] + flow_std = flow_std_full[keep_idx] + flow_names = list(SHAFIR5_SUBSET) + else: + flow_mean, flow_std, flow_names = flow_mean_full, flow_std_full, flow_names_full + print(f"[src] model_dir={DATASETS[args.src]['model_template']} (seed={args.seed})") + print(f"[src] feature_subset={args.feature_subset} D={len(flow_names)} names={flow_names}") + + # --- source training sample (10K benign, seed+1000) --- + t0 = time.time() + src_X, src_labels = _load_dataset_aligned(args.src, flow_names) + src_benign_idx = np.flatnonzero(src_labels == "normal") + rng_train = np.random.default_rng(args.seed + 1000) + if len(src_benign_idx) < args.n_train: + raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows") + train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False)) + train_X = src_X[train_sel] + train_Z = ((train_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32) + t_load_src = time.time() - t0 + + # --- target eval sample --- + t0 = time.time() + if args.tgt == args.src: + tgt_X, tgt_labels = src_X, src_labels + used = np.zeros(len(tgt_labels), dtype=bool) + used[train_sel] = True + eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used) + else: + tgt_X, tgt_labels = _load_dataset_aligned(args.tgt, flow_names) + eligible_benign = np.flatnonzero(tgt_labels == "normal") + rng_eval = np.random.default_rng(args.seed) + n_benign = min(args.n_benign, len(eligible_benign)) + if n_benign < args.n_benign: + print(f"[warn] only {len(eligible_benign)} eligible benign rows in target") + b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False)) + a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval) + val_X = tgt_X[b_sel] + atk_X = tgt_X[a_sel] + a_labels = tgt_labels[a_sel] + val_Z = ((val_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32) + atk_Z = ((atk_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32) + t_load_tgt = time.time() - t0 + print(f"[data] train={len(train_Z):,} val={len(val_Z):,} attack={len(atk_Z):,}" + f" classes={len(set(a_labels))} D={train_Z.shape[1]}") + + # --- fit pzflow NF --- + cols = [f"x{i}" for i in range(train_Z.shape[1])] + df_train = pd.DataFrame(train_Z.astype(np.float32), columns=cols) + df_val = pd.DataFrame(val_Z.astype(np.float32), columns=cols) + df_atk = pd.DataFrame(atk_Z.astype(np.float32), columns=cols) + opt = optax.sgd(args.lr) if args.optimizer == "sgd" else optax.adam(args.lr) + flow = Flow(df_train.columns.tolist()) + t0 = time.time() + losses = flow.train(df_train, optimizer=opt, epochs=args.epochs, verbose=args.verbose) + t_fit = time.time() - t0 + + # --- score (anomaly = -log_prob; higher = more anomalous) --- + t0 = time.time() + lp_val = np.asarray(flow.log_prob(df_val)) + lp_atk = np.asarray(flow.log_prob(df_atk)) + b_score = (-lp_val).astype(np.float32) + a_score = (-lp_atk).astype(np.float32) + t_score = time.time() - t0 + + # --- metrics --- + y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))] + s = np.r_[b_score, a_score] + auroc = _safe_metric(roc_auc_score, y, s) + auprc = _safe_metric(average_precision_score, y, s) + + per_class = {} + for cls in sorted(set(a_labels)): + m = a_labels == cls + y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))] + s_c = np.r_[b_score, a_score[m]] + per_class[cls] = {"_n": int(m.sum()), "auroc": _safe_metric(roc_auc_score, y_c, s_c)} + + out = { + "method": "shafir_nf", + "variant": f"single_nf_{args.feature_subset}", + "feature_subset": args.feature_subset, + "feature_names": list(flow_names), + "src": args.src, + "tgt": args.tgt, + "seed": args.seed, + "n_train": int(len(train_Z)), + "n_benign": int(len(val_Z)), + "n_attack": int(len(atk_Z)), + "epochs": args.epochs, + "lr": args.lr, + "optimizer": args.optimizer, + "t_load_src_sec": round(t_load_src, 2), + "t_load_tgt_sec": round(t_load_tgt, 2), + "t_fit_sec": round(t_fit, 2), + "t_score_sec": round(t_score, 2), + "loss_first_last": [float(losses[0]), float(losses[-1])], + "overall": {"auroc": auroc, "auprc": auprc}, + "per_class": per_class, + } + json_path = args.out_dir / f"{tag}.json" + json_path.write_text(json.dumps(out, indent=2)) + npz_path = args.out_dir / f"{tag}.npz" + np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str)) + print(f"[saved] {json_path}") + print(f"[result] shafir_nf {args.src} -> {args.tgt} seed={args.seed} " + f"AUROC={auroc:.4f} AUPRC={auprc:.4f} " + f"fit={t_fit:.1f}s score={t_score:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/scripts/baselines/run_shafir_nf_cross_all.sh b/scripts/baselines/run_shafir_nf_cross_all.sh new file mode 100755 index 0000000..dbb2346 --- /dev/null +++ b/scripts/baselines/run_shafir_nf_cross_all.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Fast-scheme Shafir-NF 3x3 cross-dataset sweep. +# 3 src x 3 tgt x 3 seeds = 27 runs. epochs=10 (fast, see run_shafir_nf_cross.py +# sanity: 10 epochs already reaches AUROC ~0.89 within-CICIDS17). +set -euo pipefail + +REPO="/home/chy/JANUS" +cd "$REPO" + +OUT_DIR="${1:-$REPO/artifacts/baselines/shafir_nf_cross_2026_05_12}" +EPOCHS="${EPOCHS:-10}" +mkdir -p "$OUT_DIR" +LOG_DIR="$OUT_DIR/logs" +mkdir -p "$LOG_DIR" + +DATASETS=(cicids2017 cicddos2019 ciciot2023) +SEEDS=(42 43 44) + +START=$(date +%s) +for src in "${DATASETS[@]}"; do + for tgt in "${DATASETS[@]}"; do + for seed in "${SEEDS[@]}"; do + tag="shafir_nf_${src}_to_${tgt}_seed${seed}" + if [[ -f "$OUT_DIR/${tag}.json" ]]; then + echo "[skip] $tag (json exists)" + continue + fi + echo "[start] $tag" + PYTHONUNBUFFERED=1 OMP_NUM_THREADS=4 \ + uv run --no-sync python -u scripts/baselines/run_shafir_nf_cross.py \ + --src "$src" --tgt "$tgt" --seed "$seed" \ + --epochs "$EPOCHS" \ + --out-dir "$OUT_DIR" \ + > "$LOG_DIR/${tag}.log" 2>&1 + echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))" + done + done +done +END=$(date +%s) +echo "[all done] elapsed $((END - START))s"