baselines: add 3x3 cross-dataset runners for IF/OCSVM (path A + B) and Shafir NF
New scripts under scripts/baselines/: - run_if_ocsvm_cross.py - 20-d canonical flow features (path A) - run_if_ocsvm_cross_packets.py - raw 576-d packet sequence (path B) - run_shafir_nf_cross.py - single-NF on 5-d SHAFIR5 subset or 20-d - *_all.sh - 3 sources x 3 targets x 3 seeds sweepers New aggregator scripts/aggregate/baselines_cross_3x3_table.py builds a Markdown 3x3 matrix per method from per-cell NPZ outputs. RESULTS.md gains a "Shallow-baseline 3x3 cross matrices" subsection pointing at the new artifact directories. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
237
scripts/baselines/run_if_ocsvm_cross.py
Normal file
237
scripts/baselines/run_if_ocsvm_cross.py
Normal file
@@ -0,0 +1,237 @@
|
||||
"""Cross-dataset baselines (Isolation Forest, OCSVM) on the 20-d canonical
|
||||
flow-feature contract.
|
||||
|
||||
Protocol per (method, src, tgt, seed):
|
||||
- Train: 10,000 source benign rows (random sample seeded with --seed + 1000)
|
||||
- Test: 10,000 target benign rows (random sample seeded with --seed)
|
||||
+ balanced per-class attack sample with n_attack cap (--n-attack
|
||||
default 1,000,000, divided across all attack classes, matching
|
||||
Mixed_CFM/eval_cross.py)
|
||||
- For diagonal src == tgt, target benign is sampled from the source-pool
|
||||
complement (the rows not used for training) so train and test are disjoint.
|
||||
|
||||
Outputs (in --out-dir):
|
||||
{method}_{src}_to_{tgt}_seed{seed}.npz -- b_score, a_score, a_labels
|
||||
{method}_{src}_to_{tgt}_seed{seed}.json -- AUROC, AUPRC, sample counts, timing
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import OneClassSVM
|
||||
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
|
||||
DATASETS = {
|
||||
"cicids2017": {
|
||||
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
|
||||
"flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet",
|
||||
},
|
||||
"cicddos2019": {
|
||||
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
|
||||
"flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet",
|
||||
},
|
||||
"ciciot2023": {
|
||||
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
|
||||
"flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet",
|
||||
},
|
||||
}
|
||||
|
||||
FEATURE_COLS = (
|
||||
"log_duration", "log_n_pkts", "fwd_count", "bwd_count",
|
||||
"pkt_size_mean", "pkt_size_std", "pkt_size_max",
|
||||
"fwd_size_mean", "bwd_size_mean", "bwd_size_std",
|
||||
"iat_mean", "fwd_iat_max", "bwd_iat_max", "bwd_iat_std",
|
||||
"active_mean", "idle_mean",
|
||||
"log_pkts_per_s", "log_total_bytes",
|
||||
"ack_cnt", "syn_cnt",
|
||||
)
|
||||
|
||||
|
||||
def _load_dataset(name: str):
|
||||
paths = DATASETS[name]
|
||||
flows = pd.read_parquet(paths["flows"], columns=["flow_id", "label"])
|
||||
ff = pd.read_parquet(paths["flow_features"])
|
||||
if not np.array_equal(
|
||||
flows["flow_id"].to_numpy(dtype=np.uint64),
|
||||
ff["flow_id"].to_numpy(dtype=np.uint64),
|
||||
):
|
||||
raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned")
|
||||
X = ff[list(FEATURE_COLS)].to_numpy(dtype=np.float64)
|
||||
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
||||
labels = flows["label"].astype(str).to_numpy()
|
||||
return X, labels
|
||||
|
||||
|
||||
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
|
||||
attack_idx = np.flatnonzero(labels != "normal")
|
||||
atk_labels = labels[attack_idx]
|
||||
classes = sorted(set(atk_labels))
|
||||
per_class = max(1, n_attack // len(classes))
|
||||
chunks = []
|
||||
for cls in classes:
|
||||
pool = attack_idx[atk_labels == cls]
|
||||
k = min(per_class, len(pool))
|
||||
if k:
|
||||
chunks.append(rng.choice(pool, size=k, replace=False))
|
||||
sel = np.sort(np.concatenate(chunks))
|
||||
if len(sel) > n_attack:
|
||||
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
|
||||
return sel
|
||||
|
||||
|
||||
def main() -> None:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--method", choices=["iforest", "ocsvm"], required=True)
|
||||
p.add_argument("--src", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--tgt", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--seed", type=int, required=True)
|
||||
p.add_argument("--out-dir", type=Path, required=True)
|
||||
p.add_argument("--n-train", type=int, default=10000)
|
||||
p.add_argument("--n-benign", type=int, default=10000)
|
||||
p.add_argument("--n-attack", type=int, default=1_000_000,
|
||||
help="Per-class balanced cap (matches Mixed_CFM/eval_cross.py).")
|
||||
# Method hyperparams
|
||||
p.add_argument("--iforest-n-estimators", type=int, default=200)
|
||||
p.add_argument("--ocsvm-nu", type=float, default=0.1)
|
||||
p.add_argument("--ocsvm-gamma", type=str, default="scale")
|
||||
p.add_argument("--ocsvm-cache-mb", type=int, default=2000)
|
||||
args = p.parse_args()
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}"
|
||||
print(f"[run] {tag}")
|
||||
|
||||
# --- source training ---
|
||||
t0 = time.time()
|
||||
src_X, src_labels = _load_dataset(args.src)
|
||||
src_benign_idx = np.flatnonzero(src_labels == "normal")
|
||||
rng_train = np.random.default_rng(args.seed + 1000)
|
||||
if len(src_benign_idx) < args.n_train:
|
||||
raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}")
|
||||
train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
|
||||
train_X = src_X[train_sel]
|
||||
t_load_src = time.time() - t0
|
||||
|
||||
# --- target eval ---
|
||||
t0 = time.time()
|
||||
if args.tgt == args.src:
|
||||
tgt_X, tgt_labels = src_X, src_labels
|
||||
used_for_train = np.zeros(len(tgt_labels), dtype=bool)
|
||||
used_for_train[train_sel] = True
|
||||
eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used_for_train)
|
||||
else:
|
||||
tgt_X, tgt_labels = _load_dataset(args.tgt)
|
||||
eligible_benign = np.flatnonzero(tgt_labels == "normal")
|
||||
rng_eval = np.random.default_rng(args.seed)
|
||||
n_benign = min(args.n_benign, len(eligible_benign))
|
||||
if n_benign < args.n_benign:
|
||||
print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})")
|
||||
b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
|
||||
a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
|
||||
val_X = tgt_X[b_sel]
|
||||
atk_X = tgt_X[a_sel]
|
||||
a_labels = tgt_labels[a_sel]
|
||||
t_load_tgt = time.time() - t0
|
||||
print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}"
|
||||
f" classes={len(set(a_labels))} D={train_X.shape[1]}")
|
||||
|
||||
# --- standardize on source train ---
|
||||
scaler = StandardScaler().fit(train_X)
|
||||
train_Z = scaler.transform(train_X).astype(np.float32)
|
||||
val_Z = scaler.transform(val_X).astype(np.float32)
|
||||
atk_Z = scaler.transform(atk_X).astype(np.float32)
|
||||
|
||||
# --- fit ---
|
||||
t0 = time.time()
|
||||
if args.method == "iforest":
|
||||
model = IsolationForest(
|
||||
n_estimators=args.iforest_n_estimators,
|
||||
random_state=args.seed,
|
||||
n_jobs=-1,
|
||||
contamination="auto",
|
||||
)
|
||||
model.fit(train_Z)
|
||||
else:
|
||||
model = OneClassSVM(
|
||||
kernel="rbf",
|
||||
nu=args.ocsvm_nu,
|
||||
gamma=args.ocsvm_gamma,
|
||||
cache_size=args.ocsvm_cache_mb,
|
||||
)
|
||||
model.fit(train_Z)
|
||||
t_fit = time.time() - t0
|
||||
|
||||
# --- score: higher = more anomalous ---
|
||||
# IsolationForest.score_samples returns higher-for-normal, so negate.
|
||||
# OneClassSVM.score_samples returns signed distance to boundary
|
||||
# (higher = more normal), so negate too.
|
||||
t0 = time.time()
|
||||
if args.method == "iforest":
|
||||
b_score = (-model.score_samples(val_Z)).astype(np.float32)
|
||||
a_score = (-model.score_samples(atk_Z)).astype(np.float32)
|
||||
else:
|
||||
b_score = (-model.decision_function(val_Z)).astype(np.float32)
|
||||
a_score = (-model.decision_function(atk_Z)).astype(np.float32)
|
||||
t_score = time.time() - t0
|
||||
|
||||
# --- metrics ---
|
||||
y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
|
||||
s = np.r_[b_score, a_score]
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
|
||||
auroc = float(roc_auc_score(y, s))
|
||||
auprc = float(average_precision_score(y, s))
|
||||
|
||||
per_class = {}
|
||||
for cls in sorted(set(a_labels)):
|
||||
m = a_labels == cls
|
||||
y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
|
||||
s_c = np.r_[b_score, a_score[m]]
|
||||
s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12)
|
||||
try:
|
||||
auc_c = float(roc_auc_score(y_c, s_c))
|
||||
except ValueError:
|
||||
auc_c = float("nan")
|
||||
per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c}
|
||||
|
||||
out = {
|
||||
"method": args.method,
|
||||
"src": args.src,
|
||||
"tgt": args.tgt,
|
||||
"seed": args.seed,
|
||||
"n_train": int(len(train_X)),
|
||||
"n_benign": int(len(val_X)),
|
||||
"n_attack": int(len(atk_X)),
|
||||
"n_attack_classes": int(len(set(a_labels))),
|
||||
"t_load_src_sec": round(t_load_src, 2),
|
||||
"t_load_tgt_sec": round(t_load_tgt, 2),
|
||||
"t_fit_sec": round(t_fit, 2),
|
||||
"t_score_sec": round(t_score, 2),
|
||||
"overall": {"auroc": auroc, "auprc": auprc},
|
||||
"per_class": per_class,
|
||||
}
|
||||
if args.method == "iforest":
|
||||
out["hparams"] = {"n_estimators": args.iforest_n_estimators}
|
||||
else:
|
||||
out["hparams"] = {"nu": args.ocsvm_nu, "gamma": args.ocsvm_gamma}
|
||||
|
||||
json_path = args.out_dir / f"{tag}.json"
|
||||
json_path.write_text(json.dumps(out, indent=2))
|
||||
npz_path = args.out_dir / f"{tag}.npz"
|
||||
np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
|
||||
print(f"[saved] {json_path}")
|
||||
print(f"[saved] {npz_path}")
|
||||
print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} "
|
||||
f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
|
||||
f"fit={t_fit:.1f}s score={t_score:.1f}s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
39
scripts/baselines/run_if_ocsvm_cross_all.sh
Executable file
39
scripts/baselines/run_if_ocsvm_cross_all.sh
Executable file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
|
||||
# Orchestrate the full 3x3 cross-dataset sweep for IF/OCSVM baselines.
|
||||
# 3 sources x 3 targets x 3 seeds x 2 methods = 54 runs.
|
||||
set -euo pipefail
|
||||
|
||||
REPO="/home/chy/JANUS"
|
||||
cd "$REPO"
|
||||
|
||||
OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_2026_05_11}"
|
||||
mkdir -p "$OUT_DIR"
|
||||
LOG_DIR="$OUT_DIR/logs"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
DATASETS=(cicids2017 cicddos2019 ciciot2023)
|
||||
SEEDS=(42 43 44)
|
||||
METHODS=(iforest ocsvm)
|
||||
|
||||
START=$(date +%s)
|
||||
for method in "${METHODS[@]}"; do
|
||||
for src in "${DATASETS[@]}"; do
|
||||
for tgt in "${DATASETS[@]}"; do
|
||||
for seed in "${SEEDS[@]}"; do
|
||||
tag="${method}_${src}_to_${tgt}_seed${seed}"
|
||||
if [[ -f "$OUT_DIR/${tag}.json" ]]; then
|
||||
echo "[skip] $tag (json exists)"
|
||||
continue
|
||||
fi
|
||||
echo "[start] $tag"
|
||||
uv run --no-sync python scripts/baselines/run_if_ocsvm_cross.py \
|
||||
--method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \
|
||||
--out-dir "$OUT_DIR" \
|
||||
> "$LOG_DIR/${tag}.log" 2>&1
|
||||
echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
END=$(date +%s)
|
||||
echo "[all done] elapsed $((END - START))s"
|
||||
233
scripts/baselines/run_if_ocsvm_cross_packets.py
Normal file
233
scripts/baselines/run_if_ocsvm_cross_packets.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Path-B: IF/OCSVM cross-dataset baselines on RAW PACKET SEQUENCES.
|
||||
|
||||
Same protocol as run_if_ocsvm_cross.py, but the input feature vector is the
|
||||
flattened first T=64 packet tokens (9-d each) -> 576-d. No flow-stat
|
||||
aggregation — this is the input modality JANUS itself consumes, so it
|
||||
measures what classical AD can do without hand-engineered features.
|
||||
|
||||
Outputs:
|
||||
{method}_{src}_to_{tgt}_seed{seed}.{json,npz}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import OneClassSVM
|
||||
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(REPO))
|
||||
from common.packet_store import PacketShardStore # noqa: E402
|
||||
|
||||
DATASETS = {
|
||||
"cicids2017": {
|
||||
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
|
||||
"packets_npz": REPO / "datasets/cicids2017/processed/packets.npz",
|
||||
"source_store": None,
|
||||
},
|
||||
"cicddos2019": {
|
||||
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
|
||||
"packets_npz": None,
|
||||
"source_store": REPO / "datasets/cicddos2019/processed/full_store",
|
||||
},
|
||||
"ciciot2023": {
|
||||
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
|
||||
"packets_npz": None,
|
||||
"source_store": REPO / "datasets/ciciot2023/processed/full_store",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _load_labels(name: str) -> np.ndarray:
|
||||
paths = DATASETS[name]
|
||||
flows = pd.read_parquet(paths["flows"], columns=["flow_id", "label"])
|
||||
return flows["label"].astype(str).to_numpy()
|
||||
|
||||
|
||||
def _materialize_packets(name: str, indices: np.ndarray, T: int) -> np.ndarray:
|
||||
paths = DATASETS[name]
|
||||
if paths["packets_npz"] is not None:
|
||||
pz = np.load(paths["packets_npz"], mmap_mode="r")
|
||||
tokens = pz["packet_tokens"]
|
||||
if T > tokens.shape[1]:
|
||||
raise ValueError(f"requested T={T} > stored {tokens.shape[1]}")
|
||||
out = np.asarray(tokens[indices, :T, :]).astype(np.float32, copy=True)
|
||||
return out
|
||||
else:
|
||||
store = PacketShardStore.open(paths["source_store"])
|
||||
tok, _ = store.read_packets(indices.astype(np.int64), T=T)
|
||||
return tok.astype(np.float32, copy=False)
|
||||
|
||||
|
||||
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
|
||||
attack_idx = np.flatnonzero(labels != "normal")
|
||||
atk_labels = labels[attack_idx]
|
||||
classes = sorted(set(atk_labels))
|
||||
per_class = max(1, n_attack // len(classes))
|
||||
chunks = []
|
||||
for cls in classes:
|
||||
pool = attack_idx[atk_labels == cls]
|
||||
k = min(per_class, len(pool))
|
||||
if k:
|
||||
chunks.append(rng.choice(pool, size=k, replace=False))
|
||||
sel = np.sort(np.concatenate(chunks))
|
||||
if len(sel) > n_attack:
|
||||
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
|
||||
return sel
|
||||
|
||||
|
||||
def main() -> None:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--method", choices=["iforest", "ocsvm"], required=True)
|
||||
p.add_argument("--src", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--tgt", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--seed", type=int, required=True)
|
||||
p.add_argument("--out-dir", type=Path, required=True)
|
||||
p.add_argument("--T", type=int, default=64, help="Packets-per-flow cap (matches JANUS T=64).")
|
||||
p.add_argument("--n-train", type=int, default=10000)
|
||||
p.add_argument("--n-benign", type=int, default=10000)
|
||||
p.add_argument("--n-attack", type=int, default=200000,
|
||||
help="Per-class balanced cap on target attacks. Smaller than the "
|
||||
"20-d run (1M) because 576-d OCSVM scoring is much slower.")
|
||||
p.add_argument("--min-len", type=int, default=2)
|
||||
# Method hyperparams
|
||||
p.add_argument("--iforest-n-estimators", type=int, default=200)
|
||||
p.add_argument("--ocsvm-nu", type=float, default=0.1)
|
||||
p.add_argument("--ocsvm-gamma", type=str, default="scale")
|
||||
p.add_argument("--ocsvm-cache-mb", type=int, default=2000)
|
||||
args = p.parse_args()
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
tag = f"{args.method}_{args.src}_to_{args.tgt}_seed{args.seed}"
|
||||
print(f"[run] {tag} (raw {args.T}x9 packets = {args.T * 9}-d)")
|
||||
|
||||
# --- source training ---
|
||||
t0 = time.time()
|
||||
src_labels = _load_labels(args.src)
|
||||
src_benign_idx = np.flatnonzero(src_labels == "normal")
|
||||
rng_train = np.random.default_rng(args.seed + 1000)
|
||||
if len(src_benign_idx) < args.n_train:
|
||||
raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows < n_train={args.n_train}")
|
||||
train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
|
||||
train_tokens = _materialize_packets(args.src, train_sel, T=args.T)
|
||||
train_X = train_tokens.reshape(len(train_sel), -1)
|
||||
t_load_src = time.time() - t0
|
||||
|
||||
# --- target eval ---
|
||||
t0 = time.time()
|
||||
if args.tgt == args.src:
|
||||
tgt_labels = src_labels
|
||||
used = np.zeros(len(tgt_labels), dtype=bool)
|
||||
used[train_sel] = True
|
||||
eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used)
|
||||
else:
|
||||
tgt_labels = _load_labels(args.tgt)
|
||||
eligible_benign = np.flatnonzero(tgt_labels == "normal")
|
||||
rng_eval = np.random.default_rng(args.seed)
|
||||
n_benign = min(args.n_benign, len(eligible_benign))
|
||||
if n_benign < args.n_benign:
|
||||
print(f"[warn] only {len(eligible_benign)} eligible benign rows in target (asked {args.n_benign})")
|
||||
b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
|
||||
a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
|
||||
val_tokens = _materialize_packets(args.tgt, b_sel, T=args.T)
|
||||
atk_tokens = _materialize_packets(args.tgt, a_sel, T=args.T)
|
||||
val_X = val_tokens.reshape(len(b_sel), -1)
|
||||
atk_X = atk_tokens.reshape(len(a_sel), -1)
|
||||
a_labels = tgt_labels[a_sel]
|
||||
t_load_tgt = time.time() - t0
|
||||
print(f"[data] train={len(train_X):,} val={len(val_X):,} attack={len(atk_X):,}"
|
||||
f" classes={len(set(a_labels))} D={train_X.shape[1]}")
|
||||
|
||||
# --- standardize ---
|
||||
scaler = StandardScaler().fit(train_X)
|
||||
train_Z = scaler.transform(train_X).astype(np.float32)
|
||||
val_Z = scaler.transform(val_X).astype(np.float32)
|
||||
atk_Z = scaler.transform(atk_X).astype(np.float32)
|
||||
|
||||
# --- fit ---
|
||||
t0 = time.time()
|
||||
if args.method == "iforest":
|
||||
model = IsolationForest(
|
||||
n_estimators=args.iforest_n_estimators,
|
||||
random_state=args.seed,
|
||||
n_jobs=-1,
|
||||
contamination="auto",
|
||||
)
|
||||
model.fit(train_Z)
|
||||
else:
|
||||
model = OneClassSVM(
|
||||
kernel="rbf",
|
||||
nu=args.ocsvm_nu,
|
||||
gamma=args.ocsvm_gamma,
|
||||
cache_size=args.ocsvm_cache_mb,
|
||||
)
|
||||
model.fit(train_Z)
|
||||
t_fit = time.time() - t0
|
||||
|
||||
# --- score (higher = more anomalous) ---
|
||||
t0 = time.time()
|
||||
if args.method == "iforest":
|
||||
b_score = (-model.score_samples(val_Z)).astype(np.float32)
|
||||
a_score = (-model.score_samples(atk_Z)).astype(np.float32)
|
||||
else:
|
||||
b_score = (-model.decision_function(val_Z)).astype(np.float32)
|
||||
a_score = (-model.decision_function(atk_Z)).astype(np.float32)
|
||||
t_score = time.time() - t0
|
||||
|
||||
# --- metrics ---
|
||||
y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
|
||||
s = np.r_[b_score, a_score]
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
|
||||
auroc = float(roc_auc_score(y, s))
|
||||
auprc = float(average_precision_score(y, s))
|
||||
|
||||
per_class = {}
|
||||
for cls in sorted(set(a_labels)):
|
||||
m = a_labels == cls
|
||||
y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
|
||||
s_c = np.r_[b_score, a_score[m]]
|
||||
s_c = np.nan_to_num(s_c, nan=0.0, posinf=1e12, neginf=-1e12)
|
||||
try:
|
||||
auc_c = float(roc_auc_score(y_c, s_c))
|
||||
except ValueError:
|
||||
auc_c = float("nan")
|
||||
per_class[cls] = {"_n": int(m.sum()), "auroc": auc_c}
|
||||
|
||||
out = {
|
||||
"method": args.method,
|
||||
"src": args.src,
|
||||
"tgt": args.tgt,
|
||||
"seed": args.seed,
|
||||
"T": args.T,
|
||||
"feature_dim": int(train_X.shape[1]),
|
||||
"input_mode": "raw_packet_sequence",
|
||||
"n_train": int(len(train_X)),
|
||||
"n_benign": int(len(val_X)),
|
||||
"n_attack": int(len(atk_X)),
|
||||
"n_attack_classes": int(len(set(a_labels))),
|
||||
"t_load_src_sec": round(t_load_src, 2),
|
||||
"t_load_tgt_sec": round(t_load_tgt, 2),
|
||||
"t_fit_sec": round(t_fit, 2),
|
||||
"t_score_sec": round(t_score, 2),
|
||||
"overall": {"auroc": auroc, "auprc": auprc},
|
||||
"per_class": per_class,
|
||||
}
|
||||
json_path = args.out_dir / f"{tag}.json"
|
||||
json_path.write_text(json.dumps(out, indent=2))
|
||||
npz_path = args.out_dir / f"{tag}.npz"
|
||||
np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
|
||||
print(f"[saved] {json_path}")
|
||||
print(f"[result] {args.method:7s} {args.src} -> {args.tgt} seed={args.seed} "
|
||||
f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
|
||||
f"fit={t_fit:.1f}s score={t_score:.1f}s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
38
scripts/baselines/run_if_ocsvm_cross_packets_all.sh
Executable file
38
scripts/baselines/run_if_ocsvm_cross_packets_all.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
# Path-B sweep: IF/OCSVM on raw 64x9 packet sequence (576-d), 3x3 cross-dataset.
|
||||
set -euo pipefail
|
||||
|
||||
REPO="/home/chy/JANUS"
|
||||
cd "$REPO"
|
||||
|
||||
OUT_DIR="${1:-$REPO/artifacts/baselines/if_ocsvm_cross_packets_2026_05_11}"
|
||||
mkdir -p "$OUT_DIR"
|
||||
LOG_DIR="$OUT_DIR/logs"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
DATASETS=(cicids2017 cicddos2019 ciciot2023)
|
||||
SEEDS=(42 43 44)
|
||||
METHODS=(iforest ocsvm)
|
||||
|
||||
START=$(date +%s)
|
||||
for method in "${METHODS[@]}"; do
|
||||
for src in "${DATASETS[@]}"; do
|
||||
for tgt in "${DATASETS[@]}"; do
|
||||
for seed in "${SEEDS[@]}"; do
|
||||
tag="${method}_${src}_to_${tgt}_seed${seed}"
|
||||
if [[ -f "$OUT_DIR/${tag}.json" ]]; then
|
||||
echo "[skip] $tag (json exists)"
|
||||
continue
|
||||
fi
|
||||
echo "[start] $tag"
|
||||
uv run --no-sync python scripts/baselines/run_if_ocsvm_cross_packets.py \
|
||||
--method "$method" --src "$src" --tgt "$tgt" --seed "$seed" \
|
||||
--out-dir "$OUT_DIR" \
|
||||
> "$LOG_DIR/${tag}.log" 2>&1
|
||||
echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
END=$(date +%s)
|
||||
echo "[all done] elapsed $((END - START))s"
|
||||
247
scripts/baselines/run_shafir_nf_cross.py
Normal file
247
scripts/baselines/run_shafir_nf_cross.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Lightweight Shafir-NF cross-dataset runner.
|
||||
|
||||
Same data protocol as scripts/baselines/run_if_ocsvm_cross.py (path A):
|
||||
- 10K source benign training rows
|
||||
- 10K target benign + balanced per-class target attacks (default cap 200K)
|
||||
- 20-d canonical flow features (CANONICAL_FLOW_FEATURE_NAMES)
|
||||
- StandardScaler-style z-score using source-trained flow_mean/flow_std saved
|
||||
in JANUS within-dataset checkpoints under artifacts/route_comparison/
|
||||
|
||||
Anomaly score = -log_prob from a single pzflow NormalizingFlow trained on
|
||||
source benign for `--epochs` (default 100). No SHAP-subset, no 2-NF ensemble.
|
||||
Single-flow, default hyperparams — meant as a quick cross-dataset baseline
|
||||
matching the IF/OCSVM protocol, NOT a faithful Shafir reproduction.
|
||||
|
||||
Outputs:
|
||||
{tag}.json - summary
|
||||
{tag}.npz - b_score, a_score, a_labels (same key schema as IF/OCSVM runner)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
|
||||
os.environ.setdefault("JAX_PLATFORMS", "cpu")
|
||||
import optax # noqa: E402
|
||||
from pzflow import Flow # noqa: E402
|
||||
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
|
||||
# Shafir-style 5-d SHAP-top subset of the 20-d canonical flow features.
|
||||
# Picks the 5 entries that loosely correspond to Shafir's CICIDS_BEST5
|
||||
# CICFlowMeter columns (Bwd Packet Length Mean, Fwd Packets/s, ACK Flag Count,
|
||||
# Total Length of Bwd Packets, Flow Duration). This keeps the input
|
||||
# dimensionality and feature semantics close to the paper protocol while
|
||||
# staying on our packet-derived 20-d contract.
|
||||
SHAFIR5_SUBSET = ("bwd_size_mean", "log_pkts_per_s", "ack_cnt", "log_total_bytes", "log_duration")
|
||||
|
||||
DATASETS = {
|
||||
"cicids2017": {
|
||||
"flows": REPO / "datasets/cicids2017/processed/flows.parquet",
|
||||
"flow_features": REPO / "datasets/cicids2017/processed/flow_features.parquet",
|
||||
"model_template": REPO / "artifacts/route_comparison/janus_cicids2017_seed{seed}",
|
||||
},
|
||||
"cicddos2019": {
|
||||
"flows": REPO / "datasets/cicddos2019/processed/flows.parquet",
|
||||
"flow_features": REPO / "datasets/cicddos2019/processed/flow_features.parquet",
|
||||
"model_template": REPO / "artifacts/route_comparison/janus_cicddos2019_seed{seed}",
|
||||
},
|
||||
"ciciot2023": {
|
||||
"flows": REPO / "datasets/ciciot2023/processed/full_store/flows.parquet",
|
||||
"flow_features": REPO / "datasets/ciciot2023/processed/flow_features.parquet",
|
||||
"model_template": REPO / "artifacts/route_comparison/janus_ciciot2023_seed{seed}",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _load_src_stats(src: str, seed: int) -> tuple[np.ndarray, np.ndarray, list[str]]:
|
||||
model_dir = Path(str(DATASETS[src]["model_template"]).format(seed=seed))
|
||||
ckpt = torch.load(model_dir / "model.pt", map_location="cpu", weights_only=False)
|
||||
flow_mean = np.asarray(ckpt["flow_mean"], dtype=np.float32)
|
||||
flow_std = np.asarray(ckpt["flow_std"], dtype=np.float32)
|
||||
flow_names = [str(n) for n in ckpt["flow_feature_names"]]
|
||||
return flow_mean, flow_std, flow_names
|
||||
|
||||
|
||||
def _load_dataset_aligned(name: str, flow_names: list[str]) -> tuple[np.ndarray, np.ndarray]:
|
||||
flows = pd.read_parquet(DATASETS[name]["flows"], columns=["flow_id", "label"])
|
||||
ff = pd.read_parquet(DATASETS[name]["flow_features"])
|
||||
if not np.array_equal(
|
||||
flows["flow_id"].to_numpy(dtype=np.uint64),
|
||||
ff["flow_id"].to_numpy(dtype=np.uint64),
|
||||
):
|
||||
raise ValueError(f"{name}: flows.parquet and flow_features.parquet are not row-aligned")
|
||||
X = ff[flow_names].to_numpy(dtype=np.float64)
|
||||
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
||||
labels = flows["label"].astype(str).to_numpy()
|
||||
return X, labels
|
||||
|
||||
|
||||
def _balanced_attack_sample(labels: np.ndarray, n_attack: int, rng: np.random.Generator) -> np.ndarray:
|
||||
attack_idx = np.flatnonzero(labels != "normal")
|
||||
atk_labels = labels[attack_idx]
|
||||
classes = sorted(set(atk_labels))
|
||||
per_class = max(1, n_attack // len(classes))
|
||||
chunks = []
|
||||
for cls in classes:
|
||||
pool = attack_idx[atk_labels == cls]
|
||||
k = min(per_class, len(pool))
|
||||
if k:
|
||||
chunks.append(rng.choice(pool, size=k, replace=False))
|
||||
sel = np.sort(np.concatenate(chunks))
|
||||
if len(sel) > n_attack:
|
||||
sel = np.sort(rng.choice(sel, size=n_attack, replace=False))
|
||||
return sel
|
||||
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1e12, neginf=-1e12)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float("nan")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument("--src", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--tgt", choices=list(DATASETS), required=True)
|
||||
p.add_argument("--seed", type=int, required=True)
|
||||
p.add_argument("--out-dir", type=Path, required=True)
|
||||
p.add_argument("--n-train", type=int, default=10000)
|
||||
p.add_argument("--n-benign", type=int, default=10000)
|
||||
p.add_argument("--n-attack", type=int, default=200000)
|
||||
p.add_argument("--epochs", type=int, default=100)
|
||||
p.add_argument("--lr", type=float, default=1e-3)
|
||||
p.add_argument("--optimizer", choices=["sgd", "adam"], default="sgd")
|
||||
p.add_argument("--feature-subset", choices=["shafir5", "full20"], default="shafir5",
|
||||
help="shafir5: 5-d SHAP-top loose match (default, matches paper protocol); "
|
||||
"full20: all 20-d canonical features (stronger but not Shafir-faithful)")
|
||||
p.add_argument("--verbose", action="store_true")
|
||||
args = p.parse_args()
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
tag = f"shafir_nf_{args.src}_to_{args.tgt}_seed{args.seed}"
|
||||
print(f"[run] {tag}")
|
||||
|
||||
# --- source stats from JANUS ckpt ---
|
||||
flow_mean_full, flow_std_full, flow_names_full = _load_src_stats(args.src, args.seed)
|
||||
if args.feature_subset == "shafir5":
|
||||
keep_idx = [flow_names_full.index(n) for n in SHAFIR5_SUBSET]
|
||||
flow_mean = flow_mean_full[keep_idx]
|
||||
flow_std = flow_std_full[keep_idx]
|
||||
flow_names = list(SHAFIR5_SUBSET)
|
||||
else:
|
||||
flow_mean, flow_std, flow_names = flow_mean_full, flow_std_full, flow_names_full
|
||||
print(f"[src] model_dir={DATASETS[args.src]['model_template']} (seed={args.seed})")
|
||||
print(f"[src] feature_subset={args.feature_subset} D={len(flow_names)} names={flow_names}")
|
||||
|
||||
# --- source training sample (10K benign, seed+1000) ---
|
||||
t0 = time.time()
|
||||
src_X, src_labels = _load_dataset_aligned(args.src, flow_names)
|
||||
src_benign_idx = np.flatnonzero(src_labels == "normal")
|
||||
rng_train = np.random.default_rng(args.seed + 1000)
|
||||
if len(src_benign_idx) < args.n_train:
|
||||
raise RuntimeError(f"{args.src}: only {len(src_benign_idx)} benign rows")
|
||||
train_sel = np.sort(rng_train.choice(src_benign_idx, size=args.n_train, replace=False))
|
||||
train_X = src_X[train_sel]
|
||||
train_Z = ((train_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
|
||||
t_load_src = time.time() - t0
|
||||
|
||||
# --- target eval sample ---
|
||||
t0 = time.time()
|
||||
if args.tgt == args.src:
|
||||
tgt_X, tgt_labels = src_X, src_labels
|
||||
used = np.zeros(len(tgt_labels), dtype=bool)
|
||||
used[train_sel] = True
|
||||
eligible_benign = np.flatnonzero((tgt_labels == "normal") & ~used)
|
||||
else:
|
||||
tgt_X, tgt_labels = _load_dataset_aligned(args.tgt, flow_names)
|
||||
eligible_benign = np.flatnonzero(tgt_labels == "normal")
|
||||
rng_eval = np.random.default_rng(args.seed)
|
||||
n_benign = min(args.n_benign, len(eligible_benign))
|
||||
if n_benign < args.n_benign:
|
||||
print(f"[warn] only {len(eligible_benign)} eligible benign rows in target")
|
||||
b_sel = np.sort(rng_eval.choice(eligible_benign, size=n_benign, replace=False))
|
||||
a_sel = _balanced_attack_sample(tgt_labels, args.n_attack, rng_eval)
|
||||
val_X = tgt_X[b_sel]
|
||||
atk_X = tgt_X[a_sel]
|
||||
a_labels = tgt_labels[a_sel]
|
||||
val_Z = ((val_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
|
||||
atk_Z = ((atk_X - flow_mean) / np.maximum(flow_std, 1e-6)).astype(np.float32)
|
||||
t_load_tgt = time.time() - t0
|
||||
print(f"[data] train={len(train_Z):,} val={len(val_Z):,} attack={len(atk_Z):,}"
|
||||
f" classes={len(set(a_labels))} D={train_Z.shape[1]}")
|
||||
|
||||
# --- fit pzflow NF ---
|
||||
cols = [f"x{i}" for i in range(train_Z.shape[1])]
|
||||
df_train = pd.DataFrame(train_Z.astype(np.float32), columns=cols)
|
||||
df_val = pd.DataFrame(val_Z.astype(np.float32), columns=cols)
|
||||
df_atk = pd.DataFrame(atk_Z.astype(np.float32), columns=cols)
|
||||
opt = optax.sgd(args.lr) if args.optimizer == "sgd" else optax.adam(args.lr)
|
||||
flow = Flow(df_train.columns.tolist())
|
||||
t0 = time.time()
|
||||
losses = flow.train(df_train, optimizer=opt, epochs=args.epochs, verbose=args.verbose)
|
||||
t_fit = time.time() - t0
|
||||
|
||||
# --- score (anomaly = -log_prob; higher = more anomalous) ---
|
||||
t0 = time.time()
|
||||
lp_val = np.asarray(flow.log_prob(df_val))
|
||||
lp_atk = np.asarray(flow.log_prob(df_atk))
|
||||
b_score = (-lp_val).astype(np.float32)
|
||||
a_score = (-lp_atk).astype(np.float32)
|
||||
t_score = time.time() - t0
|
||||
|
||||
# --- metrics ---
|
||||
y = np.r_[np.zeros(len(b_score)), np.ones(len(a_score))]
|
||||
s = np.r_[b_score, a_score]
|
||||
auroc = _safe_metric(roc_auc_score, y, s)
|
||||
auprc = _safe_metric(average_precision_score, y, s)
|
||||
|
||||
per_class = {}
|
||||
for cls in sorted(set(a_labels)):
|
||||
m = a_labels == cls
|
||||
y_c = np.r_[np.zeros(len(b_score)), np.ones(int(m.sum()))]
|
||||
s_c = np.r_[b_score, a_score[m]]
|
||||
per_class[cls] = {"_n": int(m.sum()), "auroc": _safe_metric(roc_auc_score, y_c, s_c)}
|
||||
|
||||
out = {
|
||||
"method": "shafir_nf",
|
||||
"variant": f"single_nf_{args.feature_subset}",
|
||||
"feature_subset": args.feature_subset,
|
||||
"feature_names": list(flow_names),
|
||||
"src": args.src,
|
||||
"tgt": args.tgt,
|
||||
"seed": args.seed,
|
||||
"n_train": int(len(train_Z)),
|
||||
"n_benign": int(len(val_Z)),
|
||||
"n_attack": int(len(atk_Z)),
|
||||
"epochs": args.epochs,
|
||||
"lr": args.lr,
|
||||
"optimizer": args.optimizer,
|
||||
"t_load_src_sec": round(t_load_src, 2),
|
||||
"t_load_tgt_sec": round(t_load_tgt, 2),
|
||||
"t_fit_sec": round(t_fit, 2),
|
||||
"t_score_sec": round(t_score, 2),
|
||||
"loss_first_last": [float(losses[0]), float(losses[-1])],
|
||||
"overall": {"auroc": auroc, "auprc": auprc},
|
||||
"per_class": per_class,
|
||||
}
|
||||
json_path = args.out_dir / f"{tag}.json"
|
||||
json_path.write_text(json.dumps(out, indent=2))
|
||||
npz_path = args.out_dir / f"{tag}.npz"
|
||||
np.savez_compressed(npz_path, b_score=b_score, a_score=a_score, a_labels=a_labels.astype(str))
|
||||
print(f"[saved] {json_path}")
|
||||
print(f"[result] shafir_nf {args.src} -> {args.tgt} seed={args.seed} "
|
||||
f"AUROC={auroc:.4f} AUPRC={auprc:.4f} "
|
||||
f"fit={t_fit:.1f}s score={t_score:.1f}s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
40
scripts/baselines/run_shafir_nf_cross_all.sh
Executable file
40
scripts/baselines/run_shafir_nf_cross_all.sh
Executable file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fast-scheme Shafir-NF 3x3 cross-dataset sweep.
|
||||
# 3 src x 3 tgt x 3 seeds = 27 runs. epochs=10 (fast, see run_shafir_nf_cross.py
|
||||
# sanity: 10 epochs already reaches AUROC ~0.89 within-CICIDS17).
|
||||
set -euo pipefail
|
||||
|
||||
REPO="/home/chy/JANUS"
|
||||
cd "$REPO"
|
||||
|
||||
OUT_DIR="${1:-$REPO/artifacts/baselines/shafir_nf_cross_2026_05_12}"
|
||||
EPOCHS="${EPOCHS:-10}"
|
||||
mkdir -p "$OUT_DIR"
|
||||
LOG_DIR="$OUT_DIR/logs"
|
||||
mkdir -p "$LOG_DIR"
|
||||
|
||||
DATASETS=(cicids2017 cicddos2019 ciciot2023)
|
||||
SEEDS=(42 43 44)
|
||||
|
||||
START=$(date +%s)
|
||||
for src in "${DATASETS[@]}"; do
|
||||
for tgt in "${DATASETS[@]}"; do
|
||||
for seed in "${SEEDS[@]}"; do
|
||||
tag="shafir_nf_${src}_to_${tgt}_seed${seed}"
|
||||
if [[ -f "$OUT_DIR/${tag}.json" ]]; then
|
||||
echo "[skip] $tag (json exists)"
|
||||
continue
|
||||
fi
|
||||
echo "[start] $tag"
|
||||
PYTHONUNBUFFERED=1 OMP_NUM_THREADS=4 \
|
||||
uv run --no-sync python -u scripts/baselines/run_shafir_nf_cross.py \
|
||||
--src "$src" --tgt "$tgt" --seed "$seed" \
|
||||
--epochs "$EPOCHS" \
|
||||
--out-dir "$OUT_DIR" \
|
||||
> "$LOG_DIR/${tag}.log" 2>&1
|
||||
echo "[done] $tag ($(grep -F '[result]' "$LOG_DIR/${tag}.log" | tail -1))"
|
||||
done
|
||||
done
|
||||
done
|
||||
END=$(date +%s)
|
||||
echo "[all done] elapsed $((END - START))s"
|
||||
Reference in New Issue
Block a user