baselines: add 3x3 cross-dataset runners for IF/OCSVM (path A + B) and Shafir NF

New scripts under scripts/baselines/:
- run_if_ocsvm_cross.py            - 20-d canonical flow features (path A)
- run_if_ocsvm_cross_packets.py    - raw 576-d packet sequence (path B)
- run_shafir_nf_cross.py           - single-NF on 5-d SHAFIR5 subset or 20-d
- *_all.sh                         - 3 sources x 3 targets x 3 seeds sweepers

New aggregator scripts/aggregate/baselines_cross_3x3_table.py builds a
Markdown 3x3 matrix per method from per-cell NPZ outputs.

RESULTS.md gains a "Shallow-baseline 3x3 cross matrices" subsection
pointing at the new artifact directories.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-12 17:41:20 +08:00
parent ff0efa97bf
commit 6e5f753c01
8 changed files with 979 additions and 0 deletions

View File

@@ -0,0 +1,237 @@
"""Cross-dataset baselines (Isolation Forest, OCSVM) on the 20-d canonical
flow-feature contract.
Protocol per (method, src, tgt, seed):
- Train: 10,000 source benign rows (random sample seeded with --seed + 1000)
- Test: 10,000 target benign rows (random sample seeded with --seed)
+ balanced per-class attack sample with n_attack cap (--n-attack
default 1,000,000, divided across all attack classes, matching
Mixed_CFM/eval_cross.py)
- For diagonal src == tgt, target benign is sampled from the source-pool
complement (the rows not used for training) so train and test are disjoint.
Outputs (in --out-dir):
{method}_{src}_to_{tgt}_seed{seed}.npz -- b_score, a_score, a_labels
{method}_{src}_to_{tgt}_seed{seed}.json -- AUROC, AUPRC, sample counts, timing
"""
from __future__ import annotations
import argparse
import json
import time
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
# Repository root: the grandparent of the directory that contains this file.
REPO = Path(__file__).resolve().parents[2]
# Per-dataset parquet locations. `_load_dataset` reads "flow_id"/"label" from
# "flows" and the feature columns from "flow_features", and requires the two
# files to list the same flow_ids in the same order.
DATASETS = {
"cicids2017": {
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
"flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet",
},
"cicddos2019": {
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
"flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet",
},
"ciciot2023": {
# NOTE: unlike the other two datasets, flows.parquet lives under full_store/.
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
"flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet",
},
}
# The 20 flow-feature columns selected from flow_features.parquet, in the
# column order used for the model input matrix.
FEATURE_COLS = (
"log_duration", "log_n_pkts", "fwd_count", "bwd_count",
"pkt_size_mean", "pkt_size_std", "pkt_size_max",
"fwd_size_mean", "bwd_size_mean", "bwd_size_std",
"iat_mean", "fwd_iat_max", "bwd_iat_max", "bwd_iat_std",
"active_mean", "idle_mean",
"log_pkts_per_s", "log_total_bytes",
"ack_cnt", "syn_cnt",
)
def _load_dataset(name: str):
    """Load one dataset as a ``(features, labels)`` pair.

    Reads the flow index (``flow_id``, ``label``) and the flow-feature table
    registered for *name* in ``DATASETS``. The two tables must list the same
    ``flow_id`` values in the same order.

    Returns:
        ``X``: float32 matrix with one column per entry of ``FEATURE_COLS``;
        NaN and +/-inf are replaced by 0.
        ``labels``: array of string labels, one per row of ``X``.

    Raises:
        ValueError: if the two parquet files are not row-aligned.
    """
    cfg = DATASETS[name]
    flow_index = pd.read_parquet(cfg["flows"], columns=["flow_id", "label"])
    feature_table = pd.read_parquet(cfg["flow_features"])

    index_ids = flow_index["flow_id"].to_numpy(dtype=np.uint64)
    feature_ids = feature_table["flow_id"].to_numpy(dtype=np.uint64)
    aligned = np.array_equal(index_ids, feature_ids)
    if not aligned:
        raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned")

    # Sanitise in float64 first, then downcast to float32.
    raw = feature_table[list(FEATURE_COLS)].to_numpy(dtype=np.float64)
    cleaned = np.nan_to_num(raw, nan=0.0, posinf=0.0, neginf=0.0)
    return cleaned.astype(np.float32), flow_index["label"].astype(str).to_numpy()
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
attack_idx = np.flatnonzero(labels != "normal")
atk_labels = labels[attack_idx]
classes = sorted(set(atk_labels))
per_class = max(1, n_attack // len(classes))
chunks = []
for cls in classes:
pool = attack_idx[atk_labels == cls]
k = min(per_class, len(pool))
if k:
chunks.append(rng.choice(pool, size=k, replace=False))
sel = np.sort(np.concatenate(chunks))
if len(sel) > n_attack:
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
return sel
def _gamma_arg(value: str) -> str | float:
    """argparse ``type=`` for --ocsvm-gamma: keep 'scale'/'auto', else parse a float."""
    if value in ("scale", "auto"):
        return value
    try:
        return float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"gamma must be 'scale', 'auto' or a number, got {value!r}"
        ) from None


def main() -> None:
    """Run one (method, src, tgt, seed) cell of the cross-dataset baseline.

    Trains Isolation Forest or One-Class SVM on a benign sample of the source
    dataset, scores a benign + attack sample of the target dataset, and
    writes ``{tag}.npz`` (raw scores) and ``{tag}.json`` (metrics, counts,
    timings) into ``--out-dir``.

    Raises:
        RuntimeError: if the source has fewer benign rows than ``--n-train``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--method", choices=["iforest", "ocsvm"], required=True)
    p.add_argument("--src", choices=list(DATASETS), required=True)
    p.add_argument("--tgt", choices=list(DATASETS), required=True)
    p.add_argument("--seed", type=int, required=True)
    p.add_argument("--out-dir", type=Path, required=True)
    p.add_argument("--n-train", type=int, default=10000)
    p.add_argument("--n-benign", type=int, default=10000)
    p.add_argument("--n-attack", type=int, default=1_000_000,
                   help="Total attack-sample cap, split evenly across attack classes "
                        "(matches Mixed_CFM/eval_cross.py).")
    # Method hyperparams
    p.add_argument("--iforest-n-estimators", type=int, default=200)
    p.add_argument("--ocsvm-nu", type=float, default=0.1)
    # Parsed with _gamma_arg so numeric values reach OneClassSVM as floats
    # rather than strings.
    p.add_argument("--ocsvm-gamma", type=_gamma_arg, default="scale")
    p.add_argument("--ocsvm-cache-mb", type=int, default=2000)
    args = p.parse_args()

    args.out_dir.mkdir(parents=True, exist_ok=True)
    tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}"
    print(f"[run] {tag}")

    # --- source training ---
    t0 = time.time()
    src_X, src_labels = _load_dataset(args.src)
    src_benign_idx = np.flatnonzero(src_labels == "normal")
    # Train sampling uses its own stream (seed + 1000), separate from eval sampling.
    rng_train = np.random.default_rng(args.seed + 1000)
    if len(src_benign_idx) < args.n_train:
        raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}")
    train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
    train_X = src_X[train_sel]
    t_load_src = time.time() - t0

    # --- target eval ---
    t0 = time.time()
    if args.tgt == args.src:
        # Diagonal cell: reuse the loaded arrays and exclude the training rows
        # so train and test benign sets are disjoint.
        tgt_X, tgt_labels = src_X, src_labels
        used_for_train = np.zeros(len(tgt_labels), dtype=bool)
        used_for_train[train_sel] = True
        eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used_for_train)
    else:
        tgt_X, tgt_labels = _load_dataset(args.tgt)
        eligible_benign = np.flatnonzero(tgt_labels == "normal")
    rng_eval = np.random.default_rng(args.seed)
    n_benign = min(args.n_benign, len(eligible_benign))
    if n_benign < args.n_benign:
        print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})")
    b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
    a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
    val_X = tgt_X[b_sel]
    atk_X = tgt_X[a_sel]
    a_labels = tgt_labels[a_sel]
    t_load_tgt = time.time() - t0
    print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}"
          f" classes={len(set(a_labels))} D={train_X.shape[1]}")

    # --- standardize on source train ---
    scaler = StandardScaler().fit(train_X)
    train_Z = scaler.transform(train_X).astype(np.float32)
    val_Z = scaler.transform(val_X).astype(np.float32)
    atk_Z = scaler.transform(atk_X).astype(np.float32)

    # --- fit ---
    t0 = time.time()
    if args.method == "iforest":
        model = IsolationForest(
            n_estimators=args.iforest_n_estimators,
            random_state=args.seed,
            n_jobs=-1,
            contamination="auto",
        )
        model.fit(train_Z)
    else:
        model = OneClassSVM(
            kernel="rbf",
            nu=args.ocsvm_nu,
            gamma=args.ocsvm_gamma,
            cache_size=args.ocsvm_cache_mb,
        )
        model.fit(train_Z)
    t_fit = time.time() - t0

    # --- score: higher = more anomalous ---
    # IsolationForest.score_samples is higher-for-normal, so negate.
    # OneClassSVM.decision_function is the signed distance to the boundary
    # (positive = inlier), so negate too.
    t0 = time.time()
    if args.method == "iforest":
        b_score = (-model.score_samples(val_Z)).astype(np.float32)
        a_score = (-model.score_samples(atk_Z)).astype(np.float32)
    else:
        b_score = (-model.decision_function(val_Z)).astype(np.float32)
        a_score = (-model.decision_function(atk_Z)).astype(np.float32)
    t_score = time.time() - t0

    # --- metrics ---
    y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
    s = np.r_[b_score, a_score]
    s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
    auroc = float(roc_auc_score(y, s))
    auprc = float(average_precision_score(y, s))
    # Per-class AUROC: all benign scores vs. the scores of one attack class.
    per_class = {}
    for cls in sorted(set(a_labels)):
        m = a_labels == cls
        y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
        s_c = np.r_[b_score, a_score[m]]
        s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12)
        try:
            auc_c = float(roc_auc_score(y_c, s_c))
        except ValueError:
            auc_c = float("nan")
        per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c}

    out = {
        "method": args.method,
        "src": args.src,
        "tgt": args.tgt,
        "seed": args.seed,
        "n_train": int(len(train_X)),
        "n_benign": int(len(val_X)),
        "n_attack": int(len(atk_X)),
        "n_attack_classes": int(len(set(a_labels))),
        "t_load_src_sec": round(t_load_src, 2),
        "t_load_tgt_sec": round(t_load_tgt, 2),
        "t_fit_sec": round(t_fit, 2),
        "t_score_sec": round(t_score, 2),
        "overall": {"auroc": auroc, "auprc": auprc},
        "per_class": per_class,
    }
    if args.method == "iforest":
        out["hparams"] = {"n_estimators": args.iforest_n_estimators}
    else:
        out["hparams"] = {"nu": args.ocsvm_nu, "gamma": args.ocsvm_gamma}

    json_path = args.out_dir / f"{tag}.json"
    npz_path = args.out_dir / f"{tag}.npz"
    # Write the NPZ first: the sweep script skips a cell once its JSON exists,
    # so the JSON must only appear after the scores are safely on disk.
    np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
    json_path.write_text(json.dumps(out, indent=2))
    print(f"[saved] {json_path}")
    print(f"[saved] {npz_path}")
    print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} "
          f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
          f"fit={t_fit:.1f}s score={t_score:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Orchestrate the full 3x3 cross-dataset sweep for IF/OCSVM baselines.
# 3 sources x 3 targets x 3 seeds x 2 methods = 54 runs.
#
# Usage: <this script> [OUT_DIR]
# Env:   JANUS_REPO  repository root (default: /home/chy/JANUS)
#
# Cells whose JSON already exists in OUT_DIR are skipped, so the sweep can be
# re-run to resume. With `set -e`, the first failing run aborts the sweep; its
# output is in OUT_DIR/logs/<tag>.log.
set -euo pipefail

REPO="${JANUS_REPO:-/home/chy/JANUS}"
cd "$REPO"

OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_2026_05_11}"
mkdir -p "$OUT_DIR"
LOG_DIR="$OUT_DIR/logs"
mkdir -p "$LOG_DIR"

DATASETS=(cicids2017 cicddos2019 ciciot2023)
SEEDS=(42 43 44)
METHODS=(iforest ocsvm)

START=$(date +%s)
for method in "${METHODS[@]}"; do
  for src in "${DATASETS[@]}"; do
    for tgt in "${DATASETS[@]}"; do
      for seed in "${SEEDS[@]}"; do
        tag="${method}_${src}_to_${tgt}_seed${seed}"
        if [[ -f "$OUT_DIR/${tag}.json" ]]; then
          echo "[skip] $tag (json exists)"
          continue
        fi
        echo "[start] $tag"
        uv run --no-sync python scripts/baselines/run_if_ocsvm_cross.py \
          --method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \
          --out-dir "$OUT_DIR" \
          > "$LOG_DIR/${tag}.log" 2>&1
        # Echo the run's final "[result]" line as a one-line summary.
        echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
      done
    done
  done
done
END=$(date +%s)
echo "[all done] elapsed $((END - START))s"

View File

@@ -0,0 +1,233 @@
"""Path-B: IF/OCSVM cross-dataset baselines on RAW PACKET SEQUENCES.
Same protocol as run_if_ocsvm_cross.py, but the input feature vector is the
flattened first T=64 packet tokens (9-d each) -> 576-d. No flow-stat
aggregation — this is the input modality JANUS itself consumes, so it
measures what classical AD can do without hand-engineered features.
Outputs:
{method}_{src}_to_{tgt}_seed{seed}.{json,npz}
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO))
from common.packet_store import PacketShardStore # noqa: E402
# Per-dataset locations. "flows" is the parquet holding the per-flow labels.
# Packet tokens come from either a single "packets_npz" archive or a
# "source_store" directory opened with PacketShardStore; in each entry below
# exactly one of the two is set and the other is None.
DATASETS = {
"cicids2017": {
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
"packets_npz": REPO / "datasets/cicids2017/processed/packets.npz",
"source_store": None,
},
"cicddos2019": {
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
"packets_npz": None,
"source_store": REPO / "datasets/cicddos2019/processed/full_store",
},
"ciciot2023": {
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
"packets_npz": None,
"source_store": REPO / "datasets/ciciot2023/processed/full_store",
},
}
def _load_labels(name: str) -> np.ndarray:
    """Return the per-flow labels of dataset *name* as an array of strings."""
    flows_path = DATASETS[name]["flows"]
    # flow_id is requested alongside label, although only label is returned.
    table = pd.read_parquet(flows_path, columns=["flow_id", "label"])
    label_column = table["label"].astype(str)
    return label_column.to_numpy()
def _materialize_packets(name: str, indices: np.ndarray, T: int) -> np.ndarray:
    """Load the first *T* packet tokens of the selected flows as float32.

    Args:
        name: dataset key in ``DATASETS``.
        indices: row indices of the flows to read.
        T: number of leading packets to keep per flow.

    Returns:
        float32 array of packet tokens, indexed as ``[flow, packet, feature]``
        on the NPZ path. The shard-store path returns whatever token array
        ``PacketShardStore.read_packets`` yields, cast to float32.

    Raises:
        ValueError: on the NPZ path, if *T* exceeds the stored sequence length.
    """
    paths = DATASETS[name]
    npz_path = paths["packets_npz"]
    if npz_path is not None:
        # mmap_mode has no effect on .npz archives, so it is not passed here.
        # The context manager closes the archive's file handle, which the
        # bare np.load() call left open.
        with np.load(npz_path) as pz:
            tokens = pz["packet_tokens"]
        if T > tokens.shape[1]:
            raise ValueError(f"requested T={T} > stored {tokens.shape[1]}")
        return np.asarray(tokens[indices, :T, :]).astype(np.float32, copy=True)

    store = PacketShardStore.open(paths["source_store"])
    tok, _ = store.read_packets(indices.astype(np.int64), T=T)
    return tok.astype(np.float32, copy=False)
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
attack_idx = np.flatnonzero(labels != "normal")
atk_labels = labels[attack_idx]
classes = sorted(set(atk_labels))
per_class = max(1, n_attack // len(classes))
chunks = []
for cls in classes:
pool = attack_idx[atk_labels == cls]
k = min(per_class, len(pool))
if k:
chunks.append(rng.choice(pool, size=k, replace=False))
sel = np.sort(np.concatenate(chunks))
if len(sel) > n_attack:
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
return sel
def _gamma_arg(value: str) -> str | float:
    """argparse ``type=`` for --ocsvm-gamma: keep 'scale'/'auto', else parse a float."""
    if value in ("scale", "auto"):
        return value
    try:
        return float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"gamma must be 'scale', 'auto' or a number, got {value!r}"
        ) from None


def main() -> None:
    """Run one (method, src, tgt, seed) cell on raw packet sequences.

    Trains Isolation Forest or One-Class SVM on flattened packet tokens of a
    benign source sample, scores a benign + attack target sample, and writes
    ``{tag}.npz`` (raw scores) and ``{tag}.json`` (metrics, counts, timings)
    into ``--out-dir``.

    Raises:
        RuntimeError: if the source has fewer benign rows than ``--n-train``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--method", choices=["iforest", "ocsvm"], required=True)
    p.add_argument("--src", choices=list(DATASETS), required=True)
    p.add_argument("--tgt", choices=list(DATASETS), required=True)
    p.add_argument("--seed", type=int, required=True)
    p.add_argument("--out-dir", type=Path, required=True)
    p.add_argument("--T", type=int, default=64, help="Packets-per-flow cap (matches JANUS T=64).")
    p.add_argument("--n-train", type=int, default=10000)
    p.add_argument("--n-benign", type=int, default=10000)
    p.add_argument("--n-attack", type=int, default=200000,
                   help="Total cap on target attacks, split evenly across attack classes. "
                        "Smaller than the 20-d run (1M) because 576-d OCSVM scoring is much slower.")
    # NOTE(review): --min-len is parsed but never read anywhere in this script;
    # kept so existing invocations keep working.
    p.add_argument("--min-len", type=int, default=2)
    # Method hyperparams
    p.add_argument("--iforest-n-estimators", type=int, default=200)
    p.add_argument("--ocsvm-nu", type=float, default=0.1)
    # Parsed with _gamma_arg so numeric values reach OneClassSVM as floats
    # rather than strings.
    p.add_argument("--ocsvm-gamma", type=_gamma_arg, default="scale")
    p.add_argument("--ocsvm-cache-mb", type=int, default=2000)
    args = p.parse_args()

    args.out_dir.mkdir(parents=True, exist_ok=True)
    tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}"
    print(f"[run] {tag} (raw {args.T}x9 packets = {args.T * 9}-d)")

    # --- source training ---
    t0 = time.time()
    src_labels = _load_labels(args.src)
    src_benign_idx = np.flatnonzero(src_labels == "normal")
    # Train sampling uses its own stream (seed + 1000), separate from eval sampling.
    rng_train = np.random.default_rng(args.seed + 1000)
    if len(src_benign_idx) < args.n_train:
        raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}")
    train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
    train_tokens = _materialize_packets(args.src, train_sel, T=args.T)
    # Flatten each flow's token sequence into a single feature vector.
    train_X = train_tokens.reshape(len(train_sel), -1)
    t_load_src = time.time() - t0

    # --- target eval ---
    t0 = time.time()
    if args.tgt == args.src:
        # Diagonal cell: exclude the training rows so train and test benign
        # sets are disjoint.
        tgt_labels = src_labels
        used = np.zeros(len(tgt_labels), dtype=bool)
        used[train_sel] = True
        eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used)
    else:
        tgt_labels = _load_labels(args.tgt)
        eligible_benign = np.flatnonzero(tgt_labels == "normal")
    rng_eval = np.random.default_rng(args.seed)
    n_benign = min(args.n_benign, len(eligible_benign))
    if n_benign < args.n_benign:
        print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})")
    b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
    a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
    val_tokens = _materialize_packets(args.tgt, b_sel, T=args.T)
    atk_tokens = _materialize_packets(args.tgt, a_sel, T=args.T)
    val_X = val_tokens.reshape(len(b_sel), -1)
    atk_X = atk_tokens.reshape(len(a_sel), -1)
    a_labels = tgt_labels[a_sel]
    t_load_tgt = time.time() - t0
    print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}"
          f" classes={len(set(a_labels))} D={train_X.shape[1]}")

    # --- standardize (statistics come from the source training sample only) ---
    scaler = StandardScaler().fit(train_X)
    train_Z = scaler.transform(train_X).astype(np.float32)
    val_Z = scaler.transform(val_X).astype(np.float32)
    atk_Z = scaler.transform(atk_X).astype(np.float32)

    # --- fit ---
    t0 = time.time()
    if args.method == "iforest":
        model = IsolationForest(
            n_estimators=args.iforest_n_estimators,
            random_state=args.seed,
            n_jobs=-1,
            contamination="auto",
        )
        model.fit(train_Z)
    else:
        model = OneClassSVM(
            kernel="rbf",
            nu=args.ocsvm_nu,
            gamma=args.ocsvm_gamma,
            cache_size=args.ocsvm_cache_mb,
        )
        model.fit(train_Z)
    t_fit = time.time() - t0

    # --- score (higher = more anomalous) ---
    # Both score_samples (IF) and decision_function (OCSVM) are
    # higher-for-normal, so both are negated.
    t0 = time.time()
    if args.method == "iforest":
        b_score = (-model.score_samples(val_Z)).astype(np.float32)
        a_score = (-model.score_samples(atk_Z)).astype(np.float32)
    else:
        b_score = (-model.decision_function(val_Z)).astype(np.float32)
        a_score = (-model.decision_function(atk_Z)).astype(np.float32)
    t_score = time.time() - t0

    # --- metrics ---
    y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
    s = np.r_[b_score, a_score]
    s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
    auroc = float(roc_auc_score(y, s))
    auprc = float(average_precision_score(y, s))
    # Per-class AUROC: all benign scores vs. the scores of one attack class.
    per_class = {}
    for cls in sorted(set(a_labels)):
        m = a_labels == cls
        y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
        s_c = np.r_[b_score, a_score[m]]
        s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12)
        try:
            auc_c = float(roc_auc_score(y_c, s_c))
        except ValueError:
            auc_c = float("nan")
        per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c}

    out = {
        "method": args.method,
        "src": args.src,
        "tgt": args.tgt,
        "seed": args.seed,
        "T": args.T,
        "feature_dim": int(train_X.shape[1]),
        "input_mode": "raw_packet_sequence",
        "n_train": int(len(train_X)),
        "n_benign": int(len(val_X)),
        "n_attack": int(len(atk_X)),
        "n_attack_classes": int(len(set(a_labels))),
        "t_load_src_sec": round(t_load_src, 2),
        "t_load_tgt_sec": round(t_load_tgt, 2),
        "t_fit_sec": round(t_fit, 2),
        "t_score_sec": round(t_score, 2),
        "overall": {"auroc": auroc, "auprc": auprc},
        "per_class": per_class,
    }
    # Record hyperparameters, mirroring the 20-d (path A) runner's JSON.
    if args.method == "iforest":
        out["hparams"] = {"n_estimators": args.iforest_n_estimators}
    else:
        out["hparams"] = {"nu": args.ocsvm_nu, "gamma": args.ocsvm_gamma}

    json_path = args.out_dir / f"{tag}.json"
    npz_path = args.out_dir / f"{tag}.npz"
    # Write the NPZ first: the sweep script skips a cell once its JSON exists,
    # so the JSON must only appear after the scores are safely on disk.
    np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
    json_path.write_text(json.dumps(out, indent=2))
    print(f"[saved] {json_path}")
    print(f"[saved] {npz_path}")
    print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} "
          f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
          f"fit={t_fit:.1f}s score={t_score:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Path-B sweep: IF/OCSVM on raw 64x9 packet sequence (576-d), 3x3 cross-dataset.
#
# Usage: <this script> [OUT_DIR]
# Env:   JANUS_REPO  repository root (default: /home/chy/JANUS)
#
# Cells whose JSON already exists in OUT_DIR are skipped, so the sweep can be
# re-run to resume. With `set -e`, the first failing run aborts the sweep; its
# output is in OUT_DIR/logs/<tag>.log.
set -euo pipefail

REPO="${JANUS_REPO:-/home/chy/JANUS}"
cd "$REPO"

OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_packets_2026_05_11}"
mkdir -p "$OUT_DIR"
LOG_DIR="$OUT_DIR/logs"
mkdir -p "$LOG_DIR"

DATASETS=(cicids2017 cicddos2019 ciciot2023)
SEEDS=(42 43 44)
METHODS=(iforest ocsvm)

START=$(date +%s)
for method in "${METHODS[@]}"; do
  for src in "${DATASETS[@]}"; do
    for tgt in "${DATASETS[@]}"; do
      for seed in "${SEEDS[@]}"; do
        tag="${method}_${src}_to_${tgt}_seed${seed}"
        if [[ -f "$OUT_DIR/${tag}.json" ]]; then
          echo "[skip] $tag (json exists)"
          continue
        fi
        echo "[start] $tag"
        uv run --no-sync python scripts/baselines/run_if_ocsvm_cross_packets.py \
          --method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \
          --out-dir "$OUT_DIR" \
          > "$LOG_DIR/${tag}.log" 2>&1
        # Echo the run's final "[result]" line as a one-line summary.
        echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
      done
    done
  done
done
END=$(date +%s)
echo "[all done] elapsed $((END - START))s"

View File

@@ -0,0 +1,247 @@
"""Lightweight Shafir-NF cross-dataset runner.
Same data protocol as scripts/baselines/run_if_ocsvm_cross.py (path A):
- 10K source benign training rows
- 10K target benign + balanced per-class target attacks (default cap 200K)
- 5-d SHAFIR5_SUBSET of the canonical flow features by default, or all 20
canonical features (CANONICAL_FLOW_FEATURE_NAMES) with --feature-subset full20
- StandardScaler-style z-score using source-trained flow_mean/flow_std saved
in JANUS within-dataset checkpoints under artifacts/route_comparison/
Anomaly score = -log_prob from a single pzflow NormalizingFlow trained on
source benign for `--epochs` (default 100). No per-dataset SHAP feature
selection (the fixed SHAFIR5_SUBSET stands in for it), no 2-NF ensemble.
Single-flow, default hyperparams — meant as a quick cross-dataset baseline
matching the IF/OCSVM protocol, NOT a faithful Shafir reproduction.
Outputs:
{tag}.json - summary
{tag}.npz - b_score, a_score, a_labels (same key schema as IF/OCSVM runner)
"""
from __future__ import annotations
import argparse
import json
import os
import time
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import average_precision_score, roc_auc_score
os.environ.setdefault("JAX_PLATFORMS", "cpu")
import optax # noqa: E402
from pzflow import Flow # noqa: E402
# Repository root: the grandparent of the directory that contains this file.
REPO = Path(__file__).resolve().parents[2]
# Shafir-style 5-d SHAP-top subset of the 20-d canonical flow features.
# Picks the 5 entries that loosely correspond to Shafir's CICIDS_BEST5
# CICFlowMeter columns (Bwd Packet Length Mean, Fwd Packets/s, ACK Flag Count,
# Total Length of Bwd Packets, Flow Duration). This keeps the input
# dimensionality and feature semantics close to the paper protocol while
# staying on our packet-derived 20-d contract.
SHAFIR5_SUBSET = ("bwd_size_mean", "log_pkts_per_s", "ack_cnt", "log_total_bytes", "log_duration")
# Per-dataset locations. "flows" and "flow_features" are row-aligned parquet
# files (checked in _load_dataset_aligned). "model_template" is a checkpoint
# directory path containing a literal "{seed}" placeholder that
# _load_src_stats fills in with str.format.
DATASETS = {
"cicids2017": {
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
"flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet",
"model_template": REPO / "artifacts/route_comparison/janus_cicids2017_seed{seed}",
},
"cicddos2019": {
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
"flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet",
"model_template": REPO / "artifacts/route_comparison/janus_cicddos2019_seed{seed}",
},
"ciciot2023": {
# NOTE: unlike the other two datasets, flows.parquet lives under full_store/.
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
"flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet",
"model_template": REPO / "artifacts/route_comparison/janus_ciciot2023_seed{seed}",
},
}
def _load_src_stats(src: str, seed: int) -> tuple[np.ndarray, np.ndarray, list[str]]:
    """Read the normalisation statistics stored in a source checkpoint.

    Loads ``model.pt`` from the checkpoint directory registered for *src*
    (with the ``{seed}`` placeholder filled in) and returns
    ``(flow_mean, flow_std, flow_feature_names)``; the two arrays are float32.
    """
    template = str(DATASETS[src]["model_template"])
    checkpoint_path = Path(template.format(seed=seed)) / "model.pt"
    # weights_only=False unpickles arbitrary objects: only use on trusted,
    # locally produced checkpoints.
    state = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    names = [str(entry) for entry in state["flow_feature_names"]]
    mean = np.asarray(state["flow_mean"], dtype=np.float32)
    std = np.asarray(state["flow_std"], dtype=np.float32)
    return mean, std, names
def _load_dataset_aligned(name: str, flow_names: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """Load the *flow_names* feature columns and labels of dataset *name*.

    Returns:
        ``(X, labels)``: ``X`` is a float32 matrix whose columns follow the
        order of *flow_names*, with NaN and +/-inf replaced by 0; ``labels``
        holds one string label per row.

    Raises:
        ValueError: if flows.parquet and flow_features.parquet do not list
            the same flow_ids in the same order.
    """
    cfg = DATASETS[name]
    flow_index = pd.read_parquet(cfg["flows"], columns=["flow_id", "label"])
    feature_table = pd.read_parquet(cfg["flow_features"])

    index_ids = flow_index["flow_id"].to_numpy(dtype=np.uint64)
    feature_ids = feature_table["flow_id"].to_numpy(dtype=np.uint64)
    aligned = np.array_equal(index_ids, feature_ids)
    if not aligned:
        raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned")

    # Sanitise in float64 first, then downcast to float32.
    raw = feature_table[flow_names].to_numpy(dtype=np.float64)
    cleaned = np.nan_to_num(raw, nan=0.0, posinf=0.0, neginf=0.0)
    return cleaned.astype(np.float32), flow_index["label"].astype(str).to_numpy()
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
attack_idx = np.flatnonzero(labels != "normal")
atk_labels = labels[attack_idx]
classes = sorted(set(atk_labels))
per_class = max(1, n_attack // len(classes))
chunks = []
for cls in classes:
pool = attack_idx[atk_labels == cls]
k = min(per_class, len(pool))
if k:
chunks.append(rng.choice(pool, size=k, replace=False))
sel = np.sort(np.concatenate(chunks))
if len(sel) > n_attack:
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
return sel
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
try:
return float(fn(y, s))
except ValueError:
return float("nan")
def main() -> None:
    """Run one (src, tgt, seed) cell of the single-NF cross-dataset baseline.

    Trains one pzflow ``Flow`` on z-scored benign source rows, scores a
    benign + attack target sample with ``-log_prob``, and writes
    ``{tag}.npz`` (raw scores) and ``{tag}.json`` (metrics, counts, timings)
    into ``--out-dir``. The z-score statistics come from the source
    checkpoint loaded by ``_load_src_stats``.

    Raises:
        RuntimeError: if the source has fewer benign rows than ``--n-train``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--src", choices=list(DATASETS), required=True)
    p.add_argument("--tgt", choices=list(DATASETS), required=True)
    p.add_argument("--seed", type=int, required=True)
    p.add_argument("--out-dir", type=Path, required=True)
    p.add_argument("--n-train", type=int, default=10000)
    p.add_argument("--n-benign", type=int, default=10000)
    p.add_argument("--n-attack", type=int, default=200000)
    p.add_argument("--epochs", type=int, default=100)
    p.add_argument("--lr", type=float, default=1e-3)
    p.add_argument("--optimizer", choices=["sgd", "adam"], default="sgd")
    p.add_argument("--feature-subset", choices=["shafir5", "full20"], default="shafir5",
                   help="shafir5: 5-d SHAP-top loose match (default, matches paper protocol); "
                        "full20: all 20-d canonical features (stronger but not Shafir-faithful)")
    p.add_argument("--verbose", action="store_true")
    args = p.parse_args()

    args.out_dir.mkdir(parents=True, exist_ok=True)
    tag = f"shafir_nf_{args.src}_to_{args.tgt}_seed{args.seed}"
    print(f"[run] {tag}")

    # --- source stats from JANUS ckpt ---
    flow_mean_full, flow_std_full, flow_names_full = _load_src_stats(args.src, args.seed)
    if args.feature_subset == "shafir5":
        # Select the subset's entries of the checkpoint statistics by name.
        keep_idx = [flow_names_full.index(n) for n in SHAFIR5_SUBSET]
        flow_mean = flow_mean_full[keep_idx]
        flow_std = flow_std_full[keep_idx]
        flow_names = list(SHAFIR5_SUBSET)
    else:
        flow_mean, flow_std, flow_names = flow_mean_full, flow_std_full, flow_names_full
    # Log the resolved checkpoint directory rather than the raw template,
    # which still contains the literal "{seed}" placeholder.
    model_dir = Path(str(DATASETS[args.src]["model_template"]).format(seed=args.seed))
    print(f"[src] model_dir={model_dir} (seed={args.seed})")
    print(f"[src] feature_subset={args.feature_subset} D={len(flow_names)} names={flow_names}")

    # --- source training sample (10K benign, seed+1000) ---
    t0 = time.time()
    src_X, src_labels = _load_dataset_aligned(args.src, flow_names)
    src_benign_idx = np.flatnonzero(src_labels == "normal")
    rng_train = np.random.default_rng(args.seed + 1000)
    if len(src_benign_idx) < args.n_train:
        raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows")
    train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
    train_X = src_X[train_sel]
    # Floor the std at 1e-6 to avoid dividing by (near-)zero for constant features.
    train_Z = ((train_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
    t_load_src = time.time() - t0

    # --- target eval sample ---
    t0 = time.time()
    if args.tgt == args.src:
        # Diagonal cell: exclude the training rows so train and test benign
        # sets are disjoint.
        tgt_X, tgt_labels = src_X, src_labels
        used = np.zeros(len(tgt_labels), dtype=bool)
        used[train_sel] = True
        eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used)
    else:
        tgt_X, tgt_labels = _load_dataset_aligned(args.tgt, flow_names)
        eligible_benign = np.flatnonzero(tgt_labels == "normal")
    rng_eval = np.random.default_rng(args.seed)
    n_benign = min(args.n_benign, len(eligible_benign))
    if n_benign < args.n_benign:
        print(f"[warn] only {len(eligible_benign)} eligible benign rows in target")
    b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
    a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
    val_X = tgt_X[b_sel]
    atk_X = tgt_X[a_sel]
    a_labels = tgt_labels[a_sel]
    val_Z = ((val_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
    atk_Z = ((atk_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
    t_load_tgt = time.time() - t0
    print(f"[data] train={len(train_Z):,} val={len(val_Z):,} attack={len(atk_Z):,}"
          f" classes={len(set(a_labels))} D={train_Z.shape[1]}")

    # --- fit pzflow NF ---
    # pzflow works on DataFrames; columns are given placeholder names x0..x{D-1}.
    cols = [f"x{i}" for i in range(train_Z.shape[1])]
    df_train = pd.DataFrame(train_Z.astype(np.float32), columns=cols)
    df_val = pd.DataFrame(val_Z.astype(np.float32), columns=cols)
    df_atk = pd.DataFrame(atk_Z.astype(np.float32), columns=cols)
    opt = optax.sgd(args.lr) if args.optimizer == "sgd" else optax.adam(args.lr)
    # NOTE(review): --seed is not passed to Flow(...) or flow.train(...), so it
    # only drives the NumPy sampling above. Confirm whether the flow's own
    # initialisation/training randomness should also vary with the seed.
    flow = Flow(df_train.columns.tolist())
    t0 = time.time()
    losses = flow.train(df_train, optimizer=opt, epochs=args.epochs, verbose=args.verbose)
    t_fit = time.time() - t0

    # --- score (anomaly = -log_prob; higher = more anomalous) ---
    t0 = time.time()
    lp_val = np.asarray(flow.log_prob(df_val))
    lp_atk = np.asarray(flow.log_prob(df_atk))
    b_score = (-lp_val).astype(np.float32)
    a_score = (-lp_atk).astype(np.float32)
    t_score = time.time() - t0

    # --- metrics ---
    y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
    s = np.r_[b_score, a_score]
    auroc = _safe_metric(roc_auc_score, y, s)
    auprc = _safe_metric(average_precision_score, y, s)
    # Per-class AUROC: all benign scores vs. the scores of one attack class.
    per_class = {}
    for cls in sorted(set(a_labels)):
        m = a_labels == cls
        y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
        s_c = np.r_[b_score, a_score[m]]
        per_class[cls] = {"_n": int(m.sum()), "auroc": _safe_metric(roc_auc_score, y_c, s_c)}

    out = {
        "method": "shafir_nf",
        "variant": f"single_nf_{args.feature_subset}",
        "feature_subset": args.feature_subset,
        "feature_names": list(flow_names),
        "src": args.src,
        "tgt": args.tgt,
        "seed": args.seed,
        "n_train": int(len(train_Z)),
        "n_benign": int(len(val_Z)),
        "n_attack": int(len(atk_Z)),
        "epochs": args.epochs,
        "lr": args.lr,
        "optimizer": args.optimizer,
        "t_load_src_sec": round(t_load_src, 2),
        "t_load_tgt_sec": round(t_load_tgt, 2),
        "t_fit_sec": round(t_fit, 2),
        "t_score_sec": round(t_score, 2),
        "loss_first_last": [float(losses[0]), float(losses[-1])],
        "overall": {"auroc": auroc, "auprc": auprc},
        "per_class": per_class,
    }

    json_path = args.out_dir / f"{tag}.json"
    npz_path = args.out_dir / f"{tag}.npz"
    # Write the NPZ first: the sweep script skips a cell once its JSON exists,
    # so the JSON must only appear after the scores are safely on disk.
    np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
    json_path.write_text(json.dumps(out, indent=2))
    print(f"[saved] {json_path}")
    print(f"[result] shafir_nf {args.src} -> {args.tgt} seed={args.seed} "
          f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
          f"fit={t_fit:.1f}s score={t_score:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
# Fast-scheme Shafir-NF 3x3 cross-dataset sweep.
# 3 src x 3 tgt x 3 seeds = 27 runs. epochs=10 (fast, see run_shafir_nf_cross.py
# sanity: 10 epochs already reaches AUROC ~0.89 within-CICIDS17).
#
# Usage: <this script> [OUT_DIR]
# Env:   JANUS_REPO  repository root (default: /home/chy/JANUS)
#        EPOCHS      training epochs passed to the runner (default: 10)
#
# Cells whose JSON already exists in OUT_DIR are skipped, so the sweep can be
# re-run to resume. With `set -e`, the first failing run aborts the sweep; its
# output is in OUT_DIR/logs/<tag>.log.
set -euo pipefail

REPO="${JANUS_REPO:-/home/chy/JANUS}"
cd "$REPO"

OUT_DIR="${1:-$REPO/artifacts/baselines/shafir_nf_cross_2026_05_12}"
EPOCHS="${EPOCHS:-10}"
mkdir -p "$OUT_DIR"
LOG_DIR="$OUT_DIR/logs"
mkdir -p "$LOG_DIR"

DATASETS=(cicids2017 cicddos2019 ciciot2023)
SEEDS=(42 43 44)

START=$(date +%s)
for src in "${DATASETS[@]}"; do
  for tgt in "${DATASETS[@]}"; do
    for seed in "${SEEDS[@]}"; do
      tag="shafir_nf_${src}_to_${tgt}_seed${seed}"
      if [[ -f "$OUT_DIR/${tag}.json" ]]; then
        echo "[skip] $tag (json exists)"
        continue
      fi
      echo "[start] $tag"
      PYTHONUNBUFFERED=1 OMP_NUM_THREADS=4 \
        uv run --no-sync python -u scripts/baselines/run_shafir_nf_cross.py \
        --src "$src" --tgt "$tgt" --seed "$seed" \
        --epochs "$EPOCHS" \
        --out-dir "$OUT_DIR" \
        > "$LOG_DIR/${tag}.log" 2>&1
      # Echo the run's final "[result]" line as a one-line summary.
      echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
    done
  done
done
END=$(date +%s)
echo "[all done] elapsed $((END - START))s"