Files
JANUS/scripts/repr_experiment.sh

125 lines
5.0 KiB
Bash
Executable File

#!/bin/bash
# End-to-end representation experiment: re-extract CICIDS2017 + CICDDoS2019
# with metadata columns, then train E0/E1/E2 with fixed 10k benign and
# evaluate on CICDDoS2019.
#
# Stages (each with wall-clock logging + per-stage log file):
# S1 re-extract CICIDS2017 → datasets/cicids2017/processed/*
# S2a re-extract CICDDoS2019 03-11 shard
# S2b re-extract CICDDoS2019 01-12 shard
# S2c merge CICDDoS2019 shards
# S3 train E0 (mixed_dequant, no ctx) [configs/n10k_baseline.yaml]
# S4 train E1 (relative_v2, no ctx) [configs/n10k_relv2.yaml]
# S5 train E2 (relative_v2, with 8-d ctx) [configs/n10k_relv2_ctx.yaml]
# S6 detect+per_class for each on CICDDoS2019
# S7 summary table
#
# Any stage's failure aborts the rest and leaves the partial log intact.
# -u: referencing an unset variable is an error; pipefail: a pipeline fails if
# any of its stages fails. errexit (-e) is intentionally not enabled, so every
# command whose failure matters is checked explicitly.
set -uo pipefail
ROOT=/home/chy/mambafortrafficmodeling
# All dataset/config/run paths below are relative to $ROOT. Without this check
# a failed cd would let the pipeline continue in the caller's directory.
cd "$ROOT" || { echo "!!! cannot cd to $ROOT" >&2; exit 1; }
STAMP=$(date +%Y%m%d_%H%M%S)
OUT_DIR="runs/repr_experiment_${STAMP}"
mkdir -p "$OUT_DIR" || { echo "!!! cannot create $OUT_DIR" >&2; exit 1; }
MAIN_LOG="$OUT_DIR/orch.log"
# From here on, stdout and stderr go both to the terminal and to $MAIN_LOG.
exec > >(tee -a "$MAIN_LOG") 2>&1
# Forwarded to the detect stage as --n-val / --n-atk / --seed.
N_VAL=20000
N_ATK=100000
SPLIT_SEED=42
echo "========================================================================"
echo "= $(date): repr_experiment start ="
echo "= output root: $OUT_DIR ="
echo "========================================================================"
#######################################
# Run one pipeline stage with wall-clock logging and a per-stage log file.
# Globals:   OUT_DIR (read) - directory that receives <name>.log
# Arguments: $1 - stage name; remaining args - the command to execute
# Outputs:   START / OK / FAILED banners plus a tail of the stage log on stdout
# Returns:   0 when the command succeeds; on failure the whole script exits 1
#######################################
run_stage() {
  local stage=$1
  shift
  local stage_log="$OUT_DIR/${stage}.log"
  local began finished
  local status=0

  echo ""
  echo ">>> $(date): [$stage] START"
  echo ">>> $(date): [$stage] command: $*"

  # The command's stdout and stderr both go to the stage log only.
  began=$(date +%s)
  "$@" > "$stage_log" 2>&1 || status=$?
  finished=$(date +%s)

  if (( status != 0 )); then
    echo "!!! $(date): [$stage] FAILED after $((finished-began))s — see $stage_log"
    # Show the end of the failing log right in the orchestrator output.
    tail -30 "$stage_log"
    exit 1
  fi

  echo "<<< $(date): [$stage] OK in $((finished-began))s (log: $stage_log)"
  # Echo the last lines, prefixed, so the main log shows meaningful progress.
  tail -10 "$stage_log" | sed 's/^/ | /'
}
# ====================================================================
# S1 — re-extract CICIDS2017
# ====================================================================
cicids_extract_cmd=(
  uv run python scripts/extract_cicids2017.py
  --jobs 5 --time-offset 28800
)
run_stage "s1_extract_cicids2017" "${cicids_extract_cmd[@]}"

# ====================================================================
# S2 — re-extract CICDDoS2019 (per-shard) + merge
# ====================================================================
ddos_dir=datasets/cicddos2019/processed

# Extract a single CICDDoS2019 shard into shard-suffixed artifacts.
#   $1 - stage id prefix used in the stage name (e.g. s2a)
#   $2 - shard name passed to --shards (e.g. 03-11)
extract_ddos_shard() {
  local stage_id=$1 shard=$2
  run_stage "${stage_id}_extract_cicddos2019_${shard}" \
    uv run python scripts/extract_cicddos2019.py \
    --shards "$shard" --jobs 1 \
    --out-packets "$ddos_dir/packets.${shard}.npz" \
    --out-flows "$ddos_dir/flows.${shard}.parquet"
}

extract_ddos_shard s2a 03-11
extract_ddos_shard s2b 01-12

# Combine both shards into the unsuffixed packets/flows artifacts.
run_stage "s2c_merge_cicddos2019" \
  uv run python scripts/merge_shard_artifacts.py \
  --in "$ddos_dir/packets.03-11.npz" \
  --in "$ddos_dir/packets.01-12.npz" \
  --out-packets "$ddos_dir/packets.npz" \
  --out-flows "$ddos_dir/flows.parquet"
# ====================================================================
# S3..S6 — train E0 / E1 / E2 with the same 10k benign, then run detect + per-class eval for each on CICDDoS2019
# ====================================================================
#######################################
# Train one experiment variant and evaluate it on CICDDoS2019.
# Copies the config into a per-tag run directory, rewrites its top-level
# save_dir to that directory, then runs the train, detect and per_class stages.
# Globals:   OUT_DIR, N_VAL, N_ATK, SPLIT_SEED (read)
# Arguments: $1 - experiment tag (run sub-directory and stage-name prefix)
#            $2 - path to the YAML config to copy and patch
# Returns:   0 on success; exits the script with 1 if the config cannot be
#            staged or patched (stage failures abort inside run_stage)
#######################################
train_and_eval() {
  local tag=$1 cfg=$2
  local run_dir="$OUT_DIR/$tag"
  local run_cfg="$run_dir/config.yaml"

  # The script does not use `set -e`, so setup failures must be caught here
  # rather than showing up later as a confusing training error.
  if ! mkdir -p "$run_dir" || ! cp -- "$cfg" "$run_cfg"; then
    echo "!!! $(date): [$tag] cannot stage config $cfg into $run_dir" >&2
    exit 1
  fi

  # Patch save_dir to our per-tag directory.
  if ! sed -i "s#^save_dir:.*#save_dir: $run_dir#" "$run_cfg"; then
    echo "!!! $(date): [$tag] failed to patch save_dir in $run_cfg" >&2
    exit 1
  fi
  # sed exits 0 even when no line matched. If the patch did not land, the
  # config's original save_dir would be used while the detect stage below
  # reads from $run_dir, so refuse to start.
  if ! grep -qxF -- "save_dir: $run_dir" "$run_cfg"; then
    echo "!!! $(date): [$tag] no top-level 'save_dir:' line to patch in $cfg" >&2
    exit 1
  fi

  run_stage "${tag}_train" \
    uv run python -m train --config "$run_cfg"
  run_stage "${tag}_detect_ddos" \
    uv run python -m detect \
    --save-dir "$run_dir" \
    --packets-npz datasets/cicddos2019/processed/packets.npz \
    --flows-parquet datasets/cicddos2019/processed/flows.parquet \
    --n-val "$N_VAL" --n-atk "$N_ATK" --seed "$SPLIT_SEED"
  run_stage "${tag}_per_class" \
    uv run python -m eval.per_class --save-dir "$run_dir"
}
# Experiment table: "<tag>:<config path>", run in order E0, E1, E2.
experiments=(
  "e0_baseline:configs/n10k_baseline.yaml"
  "e1_relv2:configs/n10k_relv2.yaml"
  "e2_relv2_ctx:configs/n10k_relv2_ctx.yaml"
)
for experiment in "${experiments[@]}"; do
  # Split on the first ':' into tag and config path.
  train_and_eval "${experiment%%:*}" "${experiment#*:}"
done

# ====================================================================
# S7 — summary table
# ====================================================================
summary_cmd=(uv run python scripts/summarize_repr_exp.py --root "$OUT_DIR")
run_stage "s7_summary" "${summary_cmd[@]}"

# Closing banner.
banner_rule="========================================================================"
printf '\n'
printf '%s\n' "$banner_rule"
printf '= %s: repr_experiment DONE =\n' "$(date)"
printf '= results under: %s =\n' "$OUT_DIR"
printf '%s\n' "$banner_rule"