ablation: add Group A (aggregator) + Group B (architecture) infrastructure

Extends MixedCFMConfig with 5 backwards-compatible flags (use_flow_token,
n_packet_tokens, disc_as_cont, cont_as_disc + cont_n_bins) so existing
JANUS-full checkpoints load with 0 missing/unexpected keys.

Adds:
- 60 ablation training configs (5 variants × 4 datasets × 3 seeds)
- scripts/ablation/{generate_configs.py, run_groupB.sh, run_cross_groupB.sh,
  smoke_test.sh} — config generation + GPU drivers
- scripts/aggregate/aggregate_ablation{,_cross,_cross_B}.py — produces
  within-dataset and cross-dataset (3×3) ablation tables with 3-seed mean
  ± 95% t-CI plus optional paired DeLong p-values

README updated with ablation section pointing at
artifacts/ablation/ABLATION_SUMMARY.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-08 23:59:27 +08:00
parent 1d8862fbeb
commit a6bcbbd299
72 changed files with 3642 additions and 96 deletions

View File

@@ -0,0 +1,56 @@
"""Generate 60 B-group ablation configs from existing 12 base configs.
Reads:
Mixed_CFM/configs/<ds>_seed<S>.yaml (4 datasets × 3 seeds = 12 base)
Writes:
Mixed_CFM/configs/ablation/<gid>/<ds>_seed<S>.yaml (5 variants × 12 = 60)
Each variant overrides save_dir → artifacts/ablation/janus_<ds>_seed<S>_<gid>/
plus the variant-specific flags. CICIoT2023 base is `ciciot2023_seed42.yaml`
(NOT `ciciot2023_route_c_seed42.yaml`, which is a different score-router config).
"""
from __future__ import annotations
from pathlib import Path
import yaml
# Third ancestor directory of this file (parents[2]); presumably the repository
# root, given the script lives two directories below it -- confirm if it moves.
ROOT = Path(__file__).resolve().parents[2]
# Directory holding the base `<ds>_seed<S>.yaml` configs that main() reads.
BASE_DIR = ROOT / "Mixed_CFM" / "configs"
# Destination root; main() creates one sub-directory per variant id beneath it.
OUT_DIR = ROOT / "Mixed_CFM" / "configs" / "ablation"
# Dataset ids and seeds; main() looks up a base config for every (dataset, seed) pair.
DATASETS = ["iscxtor2016", "cicids2017", "cicddos2019", "ciciot2023"]
SEEDS = [42, 43, 44]
# Variant id -> top-level config keys merged over the base config by main()
# (applied after `save_dir` has been set, so a variant could override it too).
# NOTE(review): the commit message lists a `cont_n_bins` flag, but b4_alldisc
# sets `n_disc_classes` instead -- confirm this is the intended key.
VARIANTS = {
"b1_noflow": {"use_flow_token": False},
"b2_flowonly": {"n_packet_tokens": 0, "lambda_disc": 0.0},
"b3_allcont": {"disc_as_cont": True, "lambda_disc": 0.0},
"b4_alldisc": {"cont_as_disc": True, "n_disc_classes": 8},
"b5_nodisc": {"lambda_disc": 0.0},
}
def main() -> None:
    """Expand each available base config into one YAML file per ablation variant.

    For every (dataset, seed) pair whose base config exists under BASE_DIR, a
    copy is written to OUT_DIR/<variant id>/ with `save_dir` redirected to
    artifacts/ablation/janus_<ds>_seed<S>_<variant id> and the variant's
    override keys applied on top. Missing base configs are reported with a
    `[miss]` line and skipped. A final line reports how many files were written.
    """
    # Create the output tree up front so every variant directory exists even
    # when no base config is found.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    for variant_id in VARIANTS:
        (OUT_DIR / variant_id).mkdir(parents=True, exist_ok=True)

    n_written = 0
    combos = [(ds, seed) for ds in DATASETS for seed in SEEDS]
    for ds, seed in combos:
        yaml_name = f"{ds}_seed{seed}.yaml"
        base_path = BASE_DIR / yaml_name
        if base_path.exists():
            base_cfg = yaml.safe_load(base_path.read_text())
            for variant_id, flag_overrides in VARIANTS.items():
                run_dir = ROOT / "artifacts" / "ablation" / f"janus_{ds}_seed{seed}_{variant_id}"
                # Shallow copy: only top-level keys are replaced below.
                cfg = dict(base_cfg)
                cfg["save_dir"] = str(run_dir)
                cfg.update(flag_overrides)
                target = OUT_DIR / variant_id / yaml_name
                target.write_text(yaml.safe_dump(cfg, sort_keys=False))
                n_written += 1
        else:
            print(f"[miss] {base_path}")

    print(f"[wrote] {n_written} config files under {OUT_DIR}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env bash
# Cross-dataset evaluation for B-group ablation models.
# 5 variants × 6 off-diagonal directions × 3 seeds = 90 cross evals.
#
# Each B-variant model dir is artifacts/ablation/janus_<ds>_seed<S>_<gid>/.
# We only cross within the 3-dataset matrix (cicids2017, cicddos2019, ciciot2023);
# ISCXTor2016 has a different feature space, so it is excluded from cross evaluation.
#
# Usage:
# bash scripts/ablation/run_cross_groupB.sh # all 90
# bash scripts/ablation/run_cross_groupB.sh b1_noflow b3_allcont
set -euo pipefail

# Repository root. Defaults to the original checkout location; export
# JANUS_ROOT to run the sweep against a checkout somewhere else.
ROOT="${JANUS_ROOT:-/home/chy/JANUS}"
EVAL="${ROOT}/Mixed_CFM/eval_cross.py"
# All result JSONs and per-eval logs are written here.
OUT_DIR="${ROOT}/artifacts/ablation/cross"
mkdir -p "${OUT_DIR}"

# Per-dataset inputs handed to eval_cross.py when that dataset is the target,
# keyed by dataset id.
declare -A STORE FLOWS FEATS
STORE[cicids2017]="${ROOT}/datasets/cicids2017/processed/full_store"
FLOWS[cicids2017]="${ROOT}/datasets/cicids2017/processed/flows.parquet"
FEATS[cicids2017]="${ROOT}/datasets/cicids2017/processed/flow_features.parquet"
STORE[cicddos2019]="${ROOT}/datasets/cicddos2019/processed/full_store"
FLOWS[cicddos2019]="${ROOT}/datasets/cicddos2019/processed/flows.parquet"
FEATS[cicddos2019]="${ROOT}/datasets/cicddos2019/processed/flow_features.parquet"
STORE[ciciot2023]="${ROOT}/datasets/ciciot2023/processed/full_store"
# NOTE(review): unlike the other two datasets, this flows.parquet sits inside
# full_store/ -- confirm this matches the on-disk layout and is not a typo.
FLOWS[ciciot2023]="${ROOT}/datasets/ciciot2023/processed/full_store/flows.parquet"
FEATS[ciciot2023]="${ROOT}/datasets/ciciot2023/processed/flow_features.parquet"

ALL_GIDS=(b1_noflow b2_flowonly b3_allcont b4_alldisc b5_nodisc)
DATASETS=(cicids2017 cicddos2019 ciciot2023)
SEEDS=(42 43 44)
# Single GPU index used for every eval; override with the GPU env var.
GPU="${GPU:-0}"

# Positional arguments, when given, select a subset of variant ids.
if [[ $# -gt 0 ]]; then
  GIDS=("$@")
else
  GIDS=("${ALL_GIDS[@]}")
fi
#######################################
# Evaluate one trained ablation model on one target dataset.
# Globals:   ROOT, OUT_DIR, EVAL, GPU, STORE, FLOWS, FEATS (read)
# Arguments: $1 - variant id, $2 - source dataset (selects the model dir),
#            $3 - target dataset (selects the evaluation inputs), $4 - seed
# Outputs:   one status line on stdout; evaluator output goes to
#            ${OUT_DIR}/<gid>__seed<S>_<src>_to_<tgt>.log
# Returns:   0 when skipped (result already present or model missing),
#            otherwise the evaluator's exit status
# Side effects: changes the working directory to ${ROOT}/Mixed_CFM.
#######################################
run_one() {
  local gid=$1 src=$2 tgt=$3 seed=$4
  local md="${ROOT}/artifacts/ablation/janus_${src}_seed${seed}_${gid}"
  local stem="${OUT_DIR}/${gid}__seed${seed}_${src}_to_${tgt}"
  local out="${stem}.json"

  # An existing result JSON marks the eval as done, which makes reruns resumable.
  if [[ -f "${out}" ]]; then
    echo "[skip] ${gid} ${src}->${tgt} seed${seed}"
    return
  fi
  if [[ ! -f "${md}/model.pt" ]]; then
    echo "[missing model] ${md}/model.pt"
    return
  fi

  echo "[gpu${GPU}] $(date +%H:%M:%S) ${gid} ${src}->${tgt} seed${seed}"
  cd "${ROOT}/Mixed_CFM"
  CUDA_VISIBLE_DEVICES="${GPU}" uv run --no-sync python -u "${EVAL}" \
    --model-dir "${md}" \
    --target-store "${STORE[$tgt]}" \
    --target-flows "${FLOWS[$tgt]}" \
    --target-flow-features "${FEATS[$tgt]}" \
    --benign-label normal --n-benign 10000 --n-attack 1000000 \
    --out "${out}" --seed "${seed}" --T 64 --batch-size 512 --n-steps 16 \
    > "${stem}.log" 2>&1
}
# Sweep every selected variant over all ordered (source, target) dataset pairs
# with source != target, once per seed.
for variant in "${GIDS[@]}"; do
  for source_ds in "${DATASETS[@]}"; do
    for target_ds in "${DATASETS[@]}"; do
      if [[ "$source_ds" != "$target_ds" ]]; then
        for run_seed in "${SEEDS[@]}"; do
          run_one "$variant" "$source_ds" "$target_ds" "$run_seed"
        done
      fi
    done
  done
done
echo "[done] cross evals complete"

76
scripts/ablation/run_groupB.sh Executable file
View File

@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Run all 60 B-group ablation training + phase1-eval runs.
#
# Splits work across two GPUs round-robin (set GPUS env to override).
# Per-run logs go to <save_dir>/{train,phase1}.log, where <save_dir> is read
# from each run's config (artifacts/ablation/janus_<ds>_seed<S>_<gid>/).
#
# Usage:
# bash scripts/ablation/run_groupB.sh # all 60 runs
# bash scripts/ablation/run_groupB.sh b1_noflow b5_nodisc # subset of groups
# GPUS=0 bash scripts/ablation/run_groupB.sh # single-GPU serial
set -euo pipefail
# Work from two directories above this script so the relative
# Mixed_CFM/... paths used below resolve.
cd "$(dirname "$0")/../.."

ALL_GIDS=(b1_noflow b2_flowonly b3_allcont b4_alldisc b5_nodisc)
DATASETS=(iscxtor2016 cicids2017 cicddos2019 ciciot2023)
SEEDS=(42 43 44)

# Comma-separated GPU ids; the number of ids determines the parallelism.
GPUS="${GPUS:-0,1}"
IFS=',' read -r -a GPU_ARR <<< "${GPUS}"
N_GPU=${#GPU_ARR[@]}

# Default to every variant; positional arguments narrow the selection.
GIDS=("${ALL_GIDS[@]}")
if (( $# > 0 )); then
  GIDS=("$@")
fi

# Flatten variant x dataset x seed into "gid|ds|seed" specs, variant-major.
runs=()
for gid in "${GIDS[@]}"; do
  for ds in "${DATASETS[@]}"; do
    for seed in "${SEEDS[@]}"; do
      runs+=("${gid}|${ds}|${seed}")
    done
  done
done
n_runs=${#runs[@]}
echo "[plan] ${n_runs} runs across GPUs ${GPUS} (gids=${GIDS[*]})"
#######################################
# Train one ablation config, then run the phase-1 evaluation on its output.
# Arguments: $1 - run spec "gid|ds|seed", $2 - GPU id exported as
#            CUDA_VISIBLE_DEVICES for both steps
# Outputs:   START/DONE lines on stdout, failure messages on stderr;
#            train and eval output go to <save_dir>/{train,phase1}.log
# Returns:   0 on success; 1 if the config is missing, save_dir cannot be
#            resolved or created, or either step fails (eval is skipped when
#            training fails)
#######################################
run_one() {
  local spec="$1" gpu_id="$2"
  # Declared local so the parsed fields do not clobber the caller's variables.
  local gid ds seed
  IFS='|' read -r gid ds seed <<< "$spec"

  local cfg="Mixed_CFM/configs/ablation/${gid}/${ds}_seed${seed}.yaml"
  if [[ ! -f "$cfg" ]]; then
    echo "[missing config] $cfg" >&2
    return 1
  fi

  # Read save_dir from the config. The path is passed as an argument instead
  # of being spliced into the Python source, so unusual characters are safe.
  local save_dir
  save_dir=$(uv run --no-sync python -c \
    "import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))['save_dir'])" \
    "$cfg") || return 1
  mkdir -p "$save_dir" || return 1

  echo "[gpu${gpu_id}] $(date +%H:%M:%S) START $gid $ds seed${seed}"
  # Explicit checks keep the stop-on-failure behaviour even when the caller
  # invokes this function in a context where `set -e` is ignored.
  if ! CUDA_VISIBLE_DEVICES="$gpu_id" uv run --no-sync python Mixed_CFM/train.py \
    --config "$cfg" >"$save_dir/train.log" 2>&1; then
    echo "[gpu${gpu_id}] FAILED train $gid $ds seed${seed} (see $save_dir/train.log)" >&2
    return 1
  fi
  if ! CUDA_VISIBLE_DEVICES="$gpu_id" uv run --no-sync python Mixed_CFM/eval_phase1.py \
    --model-dir "$save_dir" --out-dir "$save_dir" \
    --batch-size 256 --n-steps 16 \
    --n-val-cap 30000 --n-atk-cap 30000 >"$save_dir/phase1.log" 2>&1; then
    echo "[gpu${gpu_id}] FAILED phase1 $gid $ds seed${seed} (see $save_dir/phase1.log)" >&2
    return 1
  fi
  echo "[gpu${gpu_id}] $(date +%H:%M:%S) DONE $gid $ds seed${seed}"
}
# Round-robin assignment of runs to GPUs.
pids=()
pid_specs=()
n_failed=0

# Wait for every job in the current batch, record failures, then clear the
# batch. Failed runs are reported on stderr instead of being dropped silently;
# the sweep still continues with the remaining runs.
reap_batch() {
  # Guard the empty case: expanding an empty array under `set -u` is an
  # error on bash older than 4.4.
  if (( ${#pids[@]} == 0 )); then
    return 0
  fi
  local k
  for k in "${!pids[@]}"; do
    if ! wait "${pids[$k]}"; then
      n_failed=$((n_failed + 1))
      echo "[warn] run failed: ${pid_specs[$k]} (check train.log / phase1.log in its save_dir)" >&2
    fi
  done
  pids=()
  pid_specs=()
}

for i in "${!runs[@]}"; do
  spec="${runs[$i]}"
  gpu_id="${GPU_ARR[$((i % N_GPU))]}"
  # If single GPU: serial; if multi-GPU: parallel up to N_GPU at a time.
  # Note: in serial mode a failing run still aborts the script via `set -e`.
  if [[ $N_GPU -eq 1 ]]; then
    run_one "$spec" "$gpu_id"
  else
    run_one "$spec" "$gpu_id" &
    pids+=("$!")
    pid_specs+=("$spec")
    # Cap concurrency at N_GPU: drain the batch once every GPU has a job.
    if (( (i + 1) % N_GPU == 0 )); then
      reap_batch
    fi
  fi
done
# Drain the final, possibly partial, batch.
reap_batch
if (( n_failed > 0 )); then
  echo "[warn] ${n_failed} of ${n_runs} runs failed" >&2
fi
echo "[done] all ${n_runs} runs complete"

39
scripts/ablation/smoke_test.sh Executable file
View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Smoke-test all 5 B-group variants on cicids2017 seed42 with reduced epochs
# and tiny train set, on CPU (so VLLM workers on the GPUs are not disturbed).
#
# After: each artifacts/ablation_smoke/<gid>/ should contain model.pt
# + phase1_scores.npz with the variant-specific score keys.
set -euo pipefail
# Work from two directories above this script so the relative
# Mixed_CFM/... paths used below resolve.
cd "$(dirname "$0")/../.."

GIDS=(b1_noflow b2_flowonly b3_allcont b4_alldisc b5_nodisc)
DS=cicids2017
SEED=42
# Root for smoke-test artifacts, one sub-directory per variant. Defaults to
# the original hard-coded location; export SMOKE_ROOT to write elsewhere.
SMOKE_ROOT="${SMOKE_ROOT:-/home/chy/JANUS/artifacts/ablation_smoke}"

for gid in "${GIDS[@]}"; do
  cfg="Mixed_CFM/configs/ablation/${gid}/${DS}_seed${SEED}.yaml"
  smoke_dir="${SMOKE_ROOT}/${gid}"
  echo "=================================================="
  echo "[smoke] $gid"
  echo "=================================================="
  # Tiny CPU training run; save_dir is overridden so the smoke artifacts stay
  # separate from the config's own save_dir. Only the output tail is shown.
  uv run --no-sync python Mixed_CFM/train.py \
    --config "$cfg" \
    --override "device=cpu" "epochs=2" "n_train=500" "eval_n=200" "eval_every=2" \
    "save_dir=${smoke_dir}" 2>&1 | tail -8
  uv run --no-sync python Mixed_CFM/eval_phase1.py \
    --model-dir "${smoke_dir}" \
    --out-dir "${smoke_dir}" \
    --device cpu --batch-size 64 --n-steps 4 \
    --n-val-cap 200 --n-atk-cap 200 2>&1 | tail -4
  echo
done

echo "=== Smoke summary ==="
for gid in "${GIDS[@]}"; do
  npz="${SMOKE_ROOT}/${gid}/phase1_scores.npz"
  if [[ -f "$npz" ]]; then
    # List the val_terminal* / val_disc* arrays present in the score file.
    # The path is passed as an argument rather than spliced into the Python
    # source, so quotes or other special characters in it are safe.
    keys=$(uv run --no-sync python -c "import sys, numpy as np; z=np.load(sys.argv[1], allow_pickle=True); print(','.join(sorted(k for k in z.files if k.startswith(('val_terminal','val_disc')))))" "$npz")
    echo "$gid: $keys"
  else
    echo "$gid: MISSING"
  fi
done