Initial commit: code, paper, small artifacts
This commit is contained in:
205
scripts/auto_transfer_after_merge.sh
Executable file
205
scripts/auto_transfer_after_merge.sh
Executable file
@@ -0,0 +1,205 @@
|
||||
#!/bin/bash
# Wait for the user's merge_shard_artifacts.py to finish, then run transfer eval.
#
# The previous auto_transfer_ddos.sh aborted at STAGE 3 because
# np.savez_compressed on a 50+ GB array produces a corrupt zip. The user's
# merge_shard_artifacts.py uses uncompressed np.savez and is currently
# running. We just wait for it, validate, then do the 4-cell transfer.
#
# Exit codes set in this part: 1 = setup failure or artifacts missing,
# 2 = validation failed.
#
# NOTE: -e is not enabled, so every step whose failure matters is checked
# explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Every path below is relative to ROOT; carrying on after a failed cd would
# read and write in whatever directory the script was launched from.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

SRC_SWEEP="runs/n10k_refactor_20260422_220351"
DST="runs/transfer_ddos"
LOG="$DST/run.log"
DDOS_DIR="datasets/cicddos2019/processed"
mkdir -p "$DST" || { echo "ERROR: cannot create $DST" >&2; exit 1; }

# Append to existing log.
exec > >(tee -a "$LOG") 2>&1

N_VAL=20000
N_ATK=100000
SPLIT_SEED=42

echo ""
echo "=== $(date): [after-merge script] starts ==="
echo "waiting for merge_shard_artifacts.py to finish..."

# Poll every 30 s until no process command line matches the merge script;
# print a heartbeat every 3 minutes.
elapsed=0
while pgrep -f "merge_shard_artifacts" > /dev/null; do
  sleep 30
  elapsed=$((elapsed + 30))
  if (( elapsed % 180 == 0 )); then
    # Report the RSS of the first matching PID; the process may have exited
    # between the loop test and here, in which case "?" is printed.
    pid=$(pgrep -f "merge_shard_artifacts" | head -n 1)
    rss=""
    if [[ -n "$pid" ]]; then
      rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d ' ')
    fi
    echo "[heartbeat $(date +%H:%M:%S)] merge still running, waited ${elapsed}s rss=${rss:-?} kB"
  fi
done
echo "=== $(date): merge exited after ${elapsed}s ==="
sleep 10

PACKETS="$DDOS_DIR/packets.npz"
FLOWS="$DDOS_DIR/flows.parquet"

# ---- validate ----
if [[ ! -f "$PACKETS" || ! -f "$FLOWS" ]]; then
  echo "ERROR: artifacts missing after merge"
  echo " $PACKETS : $([[ -f $PACKETS ]] && echo OK || echo MISSING)"
  echo " $FLOWS : $([[ -f $FLOWS ]] && echo OK || echo MISSING)"
  exit 1
fi
ls -lh "$PACKETS" "$FLOWS"

# The artifact paths are passed as argv so the validator always checks the
# same files the later stages consume.
if ! uv run python - "$PACKETS" "$FLOWS" <<'EOF'
import sys
import numpy as np, pandas as pd

packets_path, flows_path = sys.argv[1], sys.argv[2]
try:
    p = np.load(packets_path)
    f = pd.read_parquet(flows_path)
    assert set(p.files) == {'packet_tokens', 'packet_lengths', 'flow_id'}, p.files
    assert set(f.columns) == {'flow_id', 'label'}, f.columns
    # Each p[...] access reads that member from the archive again, so bind
    # every array once instead of indexing repeatedly.
    flow_id = p['flow_id']
    assert flow_id.shape[0] == len(f)
    assert np.array_equal(flow_id, f['flow_id'].to_numpy())
    n = len(f)
    n_benign = int((f['label'] == 'normal').sum())
    print(f'[validate] OK: N={n:,} benign={n_benign:,} attack={n-n_benign:,}')
    tokens = p['packet_tokens']
    print(f'[validate] packet_tokens shape/dtype: {tokens.shape} {tokens.dtype}')
    del tokens
    print('[validate] label value_counts (top 20):')
    print(f['label'].value_counts().head(20).to_string())
except Exception as e:
    print(f'[validate] FAILED: {type(e).__name__}: {e}')
    sys.exit(2)
EOF
then
  echo "ERROR: validation failed"
  exit 2
fi

# ---- transfer detect + per_class per cell ----
echo ""
echo "=== $(date): transfer detect + per_class (4 cells) ==="
CELLS=( "seed42/s0" "seed42/s0.6" "seed43/s0" "seed43/s0.6" )
|
||||
|
||||
#######################################
# Score one sweep cell on the merged artifacts: stage the cell's model into
# the destination directory, run `detect`, then `eval.per_class`.
# Globals (read): SRC_SWEEP, DST, PACKETS, FLOWS, N_VAL, N_ATK, SPLIT_SEED
# Arguments:      $1 - cell path relative to the sweep, e.g. "seed42/s0"
# Outputs:        progress lines plus the tail of each tool's output
# Returns:        0 success
#                 1 source model.pt missing (cell skipped)
#                 2 detect exited non-zero
#                 3 detect left no scores.npz
#                 4 eval.per_class exited non-zero
#                 5 eval.per_class left no per_class.json
#                 6 could not stage model.pt into the output directory
#######################################
run_cell() {
  local cell=$1
  local src_dir="$SRC_SWEEP/$cell"
  local out_dir="$DST/$cell"
  local rc

  if [[ ! -f "$src_dir/model.pt" ]]; then
    echo "SKIP $cell : no model.pt"
    return 1
  fi

  # If staging fails, a model.pt left over from an earlier run could be
  # scored instead, so stop here rather than continue.
  if ! mkdir -p "$out_dir" || ! cp "$src_dir/model.pt" "$out_dir/model.pt"; then
    echo "ERROR: cannot stage model.pt for $cell"
    return 6
  fi
  # history.json is optional; a failed copy is not treated as an error.
  if [[ -f "$src_dir/history.json" ]]; then
    cp "$src_dir/history.json" "$out_dir/"
  fi

  echo ""
  echo "----- [$cell] $(date) -----"
  echo "[detect] starting"
  uv run python -m detect \
    --save-dir "$out_dir" \
    --packets-npz "$PACKETS" \
    --flows-parquet "$FLOWS" \
    --n-val "$N_VAL" --n-atk "$N_ATK" \
    --seed "$SPLIT_SEED" \
    2>&1 | tail -25
  # Read the tool's own status so the check does not depend on the caller
  # having enabled pipefail.
  rc=${PIPESTATUS[0]}
  if (( rc != 0 )); then
    echo "ERROR: detect failed for $cell"
    return 2
  fi
  if [[ ! -f "$out_dir/scores.npz" ]]; then
    echo "ERROR: no scores.npz for $cell"
    return 3
  fi

  echo "[per_class] starting"
  uv run python -m eval.per_class --save-dir "$out_dir" 2>&1 | tail -60
  rc=${PIPESTATUS[0]}
  # Without this check a per_class.json left by a previous run would make a
  # failed evaluation look successful.
  if (( rc != 0 )); then
    echo "ERROR: per_class failed for $cell"
    return 4
  fi
  if [[ ! -f "$out_dir/per_class.json" ]]; then
    echo "ERROR: no per_class.json for $cell"
    return 5
  fi

  echo "[$cell] OK"
  return 0
}
|
||||
|
||||
# Run every cell; a failing cell is recorded and the loop carries on.
OK_CELLS=()
FAIL_CELLS=()
for cell in "${CELLS[@]}"; do
  if run_cell "$cell"; then
    OK_CELLS+=("$cell")
  else
    FAIL_CELLS+=("$cell")
  fi
done

echo ""
echo "=== per-cell status ==="
echo "OK (${#OK_CELLS[@]}): ${OK_CELLS[*]:-none}"
echo "FAIL (${#FAIL_CELLS[@]}): ${FAIL_CELLS[*]:-none}"

# ---- summary ----
echo ""
echo "=== $(date): summary ==="
# The cell list is passed on argv so the report always covers the same cells
# as the loop above.
if ! uv run python - "$DST" "${CELLS[@]}" <<'EOF'
import json, sys
from pathlib import Path
import numpy as np

dst = Path(sys.argv[1])
cells = sys.argv[2:] or ["seed42/s0", "seed42/s0.6", "seed43/s0", "seed43/s0.6"]
ch = "terminal_norm"
keys = [("overall_auroc","overall AUROC"),("overall_auprc","overall AUPRC"),
        ("macro_auroc","macro AUROC"),("macro_auprc","macro AUPRC"),
        ("tpr_at_1fpr","TPR@1%FPR"),("fpr_at_95tpr","FPR@95%TPR")]


def is_nan(v):
    return isinstance(v, float) and v != v


def fmt(v):
    return "—" if is_nan(v) else f"{v:.4f}"


header = f"{'cell':<15s}" + "".join(f" {t:>14s}" for _, t in keys) + f" {'flipped':>8s}"
print(header); print("-"*len(header))
loaded = {}
for c in cells:
    jp = dst / c / "per_class.json"
    if not jp.exists():
        print(f"{c:<15s} (missing)"); continue
    tn = json.loads(jp.read_text())[ch]
    loaded[c] = tn
    line = f"{c:<15s}"
    for k, _ in keys:
        v = tn[k]
        line += f" {'NaN':>14s}" if is_nan(v) else f" {v:>14.4f}"
    line += f" {str(tn['flipped']):>8s}"
    print(line)

# Nothing below can be computed without at least one loaded cell.
if not loaded:
    print("\n(no cells loaded — nothing to aggregate)")
    sys.exit(0)

print("\n=== mean ± std across seeds (per σ) ===")
for sg in ["0", "0.6"]:
    pair = [(c, d) for c, d in loaded.items() if c.endswith(f"/s{sg}")]
    if len(pair) < 2:
        print(f"σ={sg}: {len(pair)} seed(s), skip"); continue
    print(f"\n--- σ={sg} ---")
    for k, t in keys:
        vs = [d[k] for _, d in pair if isinstance(d[k], float) and d[k] == d[k]]
        if vs:
            print(f" {t:<18s} {np.mean(vs):.4f} ± {np.std(vs,ddof=0):.4f}")

# Prefer seed42/s0.6 as the per-class reference; otherwise use the first
# loaded cell and name it in the heading.
ref_name = "seed42/s0.6" if "seed42/s0.6" in loaded else next(iter(loaded))
pc = loaded[ref_name]["per_class"]
print(f"\n=== per-class ({ref_name} reference) ===")
print(f"{'class':<30s} {'n':>8s} {'auroc':>8s} {'auprc':>8s} {'tpr@1%':>8s}")
print("-"*70)
for r in pc:
    print(f" {r['class']:<28s} {r['n']:>8d} "
          f"{fmt(r['auroc']):>8s} {fmt(r['auprc']):>8s} {fmt(r['tpr_at_1fpr']):>8s}")

# Merged-label view.
print("\n=== per-class after label merge (DrDoS_* → stripped) ===")


def norm(name):
    return name[len("DrDoS_"):] if name.startswith("DrDoS_") else name


buckets = {}
for r in pc:
    if isinstance(r["auroc"], float) and r["auroc"] == r["auroc"]:
        buckets.setdefault(norm(r["class"]), []).append(r)
if buckets:
    print(f"{'merged class':<20s} {'#shards':>8s} {'n_total':>8s} "
          f"{'auroc_wtd':>10s} {'auprc_wtd':>10s}")
    print("-"*68)
    for k, rs in sorted(buckets.items(), key=lambda x: -sum(r["n"] for r in x[1])):
        n_tot = sum(r["n"] for r in rs)
        wtd_a = sum(r["auroc"] * r["n"] for r in rs) / max(n_tot, 1)
        wtd_ap = sum(r["auprc"] * r["n"] for r in rs) / max(n_tot, 1)
        print(f" {k:<18s} {len(rs):>8d} {n_tot:>8d} "
              f"{wtd_a:>10.4f} {wtd_ap:>10.4f}")
EOF
then
  echo "WARNING: summary step failed (see traceback above)"
fi

echo ""
echo "=== $(date): done ==="
|
||||
293
scripts/auto_transfer_ddos.sh
Executable file
293
scripts/auto_transfer_ddos.sh
Executable file
@@ -0,0 +1,293 @@
|
||||
#!/bin/bash
# Unattended cross-dataset transfer eval (v2, with per-shard merge).
#
# Pipeline:
#   STAGE 1 : wait for 01-12 re-extraction to finish
#   STAGE 2 : merge packets.{01-12,03-11}.npz + flows.{01-12,03-11}.parquet
#             → unified packets.npz + flows.parquet
#   STAGE 3 : validate unified artifacts
#   STAGE 4 : detect + per_class across 4 cells (seed × σ from CICIDS2017 sweep)
#   STAGE 5 : summary table + merged-label view
#
# Log: runs/transfer_ddos/run.log
#
# Exit codes set in stages 1-3: 1 = setup failure or shard artifact missing,
# 2 = merge failed, 3 = merge output missing, 4 = validation failed.
#
# NOTE: -e is not enabled, so every step whose failure matters is checked
# explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Every path below is relative to ROOT; carrying on after a failed cd would
# read and write in whatever directory the script was launched from.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

SRC_SWEEP="runs/n10k_refactor_20260422_220351"
DST="runs/transfer_ddos"
LOG="$DST/run.log"
DDOS_DIR="datasets/cicddos2019/processed"
mkdir -p "$DST" || { echo "ERROR: cannot create $DST" >&2; exit 1; }
exec > >(tee -a "$LOG") 2>&1

N_VAL=20000
N_ATK=100000
SPLIT_SEED=42

echo "=== $(date): script starts (v2 with merge) ==="
echo "source sweep : $SRC_SWEEP"
echo "destination : $DST"
echo "scoring : n_val=$N_VAL n_atk=$N_ATK split_seed=$SPLIT_SEED"

# =====================================================================
# STAGE 1: wait for extraction_cicddos2019 (01-12 shard) to finish
# =====================================================================
echo ""
echo "=== $(date): STAGE 1 — waiting for 01-12 re-extraction ==="
elapsed=0
while pgrep -f "scripts/extract_cicddos2019" > /dev/null; do
  sleep 60
  elapsed=$((elapsed + 60))
  if (( elapsed % 600 == 0 )); then
    # Heartbeat every 10 min. Reports the RSS of the first matching PID, or
    # "?" if the process exited between the loop test and here.
    pid=$(pgrep -f "scripts/extract_cicddos2019" | head -n 1)
    rss=""
    if [[ -n "$pid" ]]; then
      rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d ' ')
    fi
    echo "[heartbeat $(date +%H:%M:%S)] 01-12 extraction running, waited ${elapsed}s rss(parent)=${rss:-?} kB"
  fi
done
echo "=== $(date): extraction process exited after ${elapsed}s wait ==="
sleep 15

# =====================================================================
# STAGE 2: merge per-shard artifacts
# =====================================================================
echo ""
echo "=== $(date): STAGE 2 — merge shards ==="

# Arrays rather than space-separated strings, so the paths are never
# word-split or glob-expanded.
SHARDS_PACKETS=( "$DDOS_DIR/packets.01-12.npz" "$DDOS_DIR/packets.03-11.npz" )
SHARDS_FLOWS=( "$DDOS_DIR/flows.01-12.parquet" "$DDOS_DIR/flows.03-11.parquet" )

missing=0
for f in "${SHARDS_PACKETS[@]}" "${SHARDS_FLOWS[@]}"; do
  if [[ ! -f "$f" ]]; then
    echo "ERROR: shard artifact missing: $f"
    missing=1
  fi
done
if (( missing )); then
  echo "--- tail of 01-12 extraction log ---"
  tail -40 runs/extract_logs/extract_ddos_0112.log 2>&1 || true
  exit 1
fi

echo "all 4 shard artifacts present; running merge"
if ! uv run python scripts/merge_cicddos_shards.py 2>&1; then
  echo "ERROR: merge failed"
  exit 2
fi

# =====================================================================
# STAGE 3: validate unified artifacts
# =====================================================================
echo ""
echo "=== $(date): STAGE 3 — validate unified artifacts ==="
PACKETS="$DDOS_DIR/packets.npz"
FLOWS="$DDOS_DIR/flows.parquet"
if [[ ! -f "$PACKETS" || ! -f "$FLOWS" ]]; then
  echo "ERROR: merge output missing"
  echo " $PACKETS : $([[ -f $PACKETS ]] && echo OK || echo MISSING)"
  echo " $FLOWS : $([[ -f $FLOWS ]] && echo OK || echo MISSING)"
  exit 3
fi
ls -lh "$PACKETS" "$FLOWS"

# The artifact paths are passed as argv so the validator always checks the
# same files STAGE 4 consumes.
if ! uv run python - "$PACKETS" "$FLOWS" <<'EOF'
import sys
import numpy as np, pandas as pd

packets_path, flows_path = sys.argv[1], sys.argv[2]
try:
    p = np.load(packets_path)
    f = pd.read_parquet(flows_path)
    assert set(p.files) == {'packet_tokens', 'packet_lengths', 'flow_id'}, p.files
    assert set(f.columns) == {'flow_id', 'label'}, f.columns
    # Each p[...] access reads that member from the archive again, so bind
    # every array once instead of indexing repeatedly.
    flow_id = p['flow_id']
    assert flow_id.shape[0] == len(f)
    assert np.array_equal(flow_id, f['flow_id'].to_numpy())
    n = len(f)
    n_benign = int((f['label'] == 'normal').sum())
    print(f'[validate] OK: N={n:,} benign={n_benign:,} attack={n-n_benign:,}')
    tokens = p['packet_tokens']
    print(f'[validate] packet_tokens shape/dtype: {tokens.shape} {tokens.dtype}')
    del tokens
    print('[validate] label value_counts (top 20):')
    print(f['label'].value_counts().head(20).to_string())
except Exception as e:
    print(f'[validate] FAILED: {type(e).__name__}: {e}')
    sys.exit(2)
EOF
then
  echo "ERROR: unified artifact validation failed"
  exit 4
fi

# =====================================================================
# STAGE 4: transfer detect + per_class
# =====================================================================
echo ""
echo "=== $(date): STAGE 4 — transfer detect + per_class (4 cells) ==="
CELLS=( "seed42/s0" "seed42/s0.6" "seed43/s0" "seed43/s0.6" )
|
||||
|
||||
#######################################
# Score one sweep cell on the unified artifacts: stage the cell's model into
# the destination directory, run `detect`, then `eval.per_class`.
# Globals (read): SRC_SWEEP, DST, PACKETS, FLOWS, N_VAL, N_ATK, SPLIT_SEED
# Arguments:      $1 - cell path relative to the sweep, e.g. "seed42/s0"
# Outputs:        progress lines plus the tail of each tool's output
# Returns:        0 success
#                 1 source model.pt missing (cell skipped)
#                 2 detect exited non-zero
#                 3 detect left no scores.npz
#                 4 eval.per_class exited non-zero
#                 5 eval.per_class left no per_class.json
#                 6 could not stage model.pt into the output directory
#######################################
run_cell() {
  local cell=$1
  local src_dir="$SRC_SWEEP/$cell"
  local out_dir="$DST/$cell"
  local rc

  if [[ ! -f "$src_dir/model.pt" ]]; then
    echo "SKIP $cell : no model.pt at $src_dir"
    return 1
  fi

  # If staging fails, a model.pt left over from an earlier run could be
  # scored instead, so stop here rather than continue.
  if ! mkdir -p "$out_dir" || ! cp "$src_dir/model.pt" "$out_dir/model.pt"; then
    echo "ERROR: cannot stage model.pt for $cell into $out_dir"
    return 6
  fi
  # history.json is optional; a failed copy is not treated as an error.
  if [[ -f "$src_dir/history.json" ]]; then
    cp "$src_dir/history.json" "$out_dir/"
  fi

  echo ""
  echo "----- [$cell] $(date) -----"
  echo "[detect] starting"
  uv run python -m detect \
    --save-dir "$out_dir" \
    --packets-npz "$PACKETS" \
    --flows-parquet "$FLOWS" \
    --n-val "$N_VAL" --n-atk "$N_ATK" \
    --seed "$SPLIT_SEED" \
    2>&1 | tail -25
  # Read the tool's own status so the check does not depend on the caller
  # having enabled pipefail.
  rc=${PIPESTATUS[0]}
  if (( rc != 0 )); then
    echo "ERROR: detect failed for $cell"
    return 2
  fi

  if [[ ! -f "$out_dir/scores.npz" ]]; then
    echo "ERROR: detect produced no scores.npz for $cell"
    return 3
  fi

  echo "[per_class] starting"
  uv run python -m eval.per_class --save-dir "$out_dir" 2>&1 | tail -80
  rc=${PIPESTATUS[0]}
  if (( rc != 0 )); then
    echo "ERROR: per_class failed for $cell"
    return 4
  fi

  if [[ ! -f "$out_dir/per_class.json" ]]; then
    echo "ERROR: per_class.json missing for $cell"
    return 5
  fi

  echo "[$cell] OK"
  return 0
}
|
||||
|
||||
# Run every cell; a failing cell is recorded and the loop carries on.
OK_CELLS=()
FAIL_CELLS=()
for cell in "${CELLS[@]}"; do
  if run_cell "$cell"; then
    OK_CELLS+=("$cell")
  else
    FAIL_CELLS+=("$cell")
    echo "[$cell] continuing despite failure"
  fi
done

echo ""
echo "=== per-cell status ==="
echo "OK (${#OK_CELLS[@]}): ${OK_CELLS[*]:-none}"
echo "FAIL (${#FAIL_CELLS[@]}): ${FAIL_CELLS[*]:-none}"

# =====================================================================
# STAGE 5: summary
# =====================================================================
echo ""
echo "=== $(date): STAGE 5 — summary ==="

# The cell list is passed on argv so the report always covers the same cells
# as the loop above.
if ! uv run python - "$DST" "${CELLS[@]}" <<'EOF'
import json, sys
from pathlib import Path
import numpy as np

dst = Path(sys.argv[1])
cells = sys.argv[2:] or ["seed42/s0", "seed42/s0.6", "seed43/s0", "seed43/s0.6"]

ch = "terminal_norm"
keys = [("overall_auroc", "overall AUROC"),
        ("overall_auprc", "overall AUPRC"),
        ("macro_auroc", "macro AUROC"),
        ("macro_auprc", "macro AUPRC"),
        ("tpr_at_1fpr", "TPR@1%FPR"),
        ("fpr_at_95tpr", "FPR@95%TPR")]


def is_nan(v):
    return isinstance(v, float) and (v != v)


def fmt(v):
    return "—" if is_nan(v) else f"{v:.4f}"


header = f"{'cell':<15s}" + "".join(f" {t:>14s}" for _, t in keys) + f" {'flipped':>8s}"
print(header)
print("-" * len(header))

loaded: dict[str, dict] = {}
for c in cells:
    jp = dst / c / "per_class.json"
    if not jp.exists():
        print(f"{c:<15s} (missing per_class.json)")
        continue
    # Metric lookups stay inside the try so a file that lacks a key is
    # reported for this cell instead of aborting the whole summary.
    try:
        tn = json.loads(jp.read_text())[ch]
        row = [tn[k] for k, _ in keys]
        flipped = str(tn['flipped'])
    except Exception as e:
        print(f"{c:<15s} (parse error: {e})")
        continue
    loaded[c] = tn
    line = f"{c:<15s}"
    for v in row:
        if is_nan(v):
            line += f" {'NaN':>14s}"
        else:
            line += f" {v:>14.4f}"
    line += f" {flipped:>8s}"
    print(line)

if not loaded:
    print("\n(no cells loaded — nothing to aggregate)")
    sys.exit(0)

print("")
print("=== mean ± std across seeds (per σ) ===")
for sg in ["0", "0.6"]:
    pair = [(c, d) for c, d in loaded.items() if c.endswith(f"/s{sg}")]
    if len(pair) < 2:
        print(f"σ={sg}: only {len(pair)} seed(s), skip aggregate")
        continue
    print(f"\n--- σ={sg} ({len(pair)} seeds) ---")
    for k, t in keys:
        vals = [d[k] for _, d in pair if isinstance(d[k], float) and d[k] == d[k]]
        if not vals:
            continue
        m = float(np.mean(vals)); s = float(np.std(vals, ddof=0))
        print(f" {t:<18s} {m:.4f} ± {s:.4f}")

# Prefer seed42/s0.6 as the per-class reference; otherwise use the first
# loaded cell and name it in the heading.
ref_name = "seed42/s0.6" if "seed42/s0.6" in loaded else next(iter(loaded))
ref = loaded[ref_name]
print("")
print(f"=== per-class AUROC ({ref_name} reference) ===")
pc = ref["per_class"]
print(f"{'class':<30s} {'n':>8s} {'auroc':>8s} {'auprc':>8s} {'tpr@1%':>8s}")
print("-" * 70)
for r in pc:
    print(f" {r['class']:<28s} {r['n']:>8d} "
          f"{fmt(r['auroc']):>8s} {fmt(r['auprc']):>8s} {fmt(r['tpr_at_1fpr']):>8s}")

print("")
print("=== per-class after label merge (DrDoS_* → stripped) ===")


def norm(name):
    if name.startswith("DrDoS_"):
        return name[len("DrDoS_"):]
    return name


buckets: dict[str, list] = {}
for r in pc:
    if isinstance(r["auroc"], float) and r["auroc"] == r["auroc"]:
        buckets.setdefault(norm(r["class"]), []).append(r)
if buckets:
    print(f"{'merged class':<20s} {'shards':>6s} {'n_total':>8s} "
          f"{'auroc_wtd':>10s} {'auroc_mean':>10s}")
    print("-" * 64)
    for k, rs in sorted(buckets.items(), key=lambda x: -sum(r["n"] for r in x[1])):
        n_tot = sum(r["n"] for r in rs)
        wtd = sum(r["auroc"] * r["n"] for r in rs) / max(n_tot, 1)
        mean = sum(r["auroc"] for r in rs) / len(rs)
        print(f" {k:<18s} {len(rs):>6d} {n_tot:>8d} "
              f"{wtd:>10.4f} {mean:>10.4f}")
EOF
then
  echo "WARNING: summary step failed (see traceback above)"
fi

echo ""
echo "=== $(date): script done ==="
|
||||
106
scripts/baselines/aggregate_anomaly_transformer.py
Normal file
106
scripts/baselines/aggregate_anomaly_transformer.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
# Repository root: two directory levels above the directory holding this file.
REPO = Path(__file__).resolve().parents[2]
# Directory holding the per-seed result JSONs; the summaries are written here too.
ROOT = REPO / 'artifacts/baselines/anomaly_transformer_2026_04_29'
# Evaluation protocols, seeds and per-flow score aggregators to look for.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
SEEDS = (42, 43, 44)
AGGS = ('mean', 'max', 'median', 'p90')
# Reference terminal_norm AUROC per protocol as (mean, std); std is None when unavailable.
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Human-readable protocol names used in the report tables.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
|
||||
|
||||
def _load(protocol, seed):
    """Load the result JSON for one (protocol, seed) run.

    Reads ``ROOT / f'{protocol}_seed{seed}.json'``.

    Returns the decoded JSON object, or ``None`` when the file does not exist.
    """
    p = ROOT / f'{protocol}_seed{seed}.json'
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the locale.
        text = p.read_text(encoding='utf-8')
    except FileNotFoundError:
        return None
    return json.loads(text)
|
||||
|
||||
def _ms(vals):
    """Return ``(mean, std)`` of *vals*, ignoring ``None`` and NaN entries.

    The std is the sample standard deviation (``ddof=1``). With exactly one
    usable value it is reported as ``0.0``; with none, both fields are NaN.
    """
    kept = [v for v in vals if v is not None and not np.isnan(v)]
    if not kept:
        return (float('nan'), float('nan'))
    arr = np.asarray(kept, dtype=np.float64)
    spread = float(arr.std(ddof=1)) if arr.size > 1 else 0.0
    return (float(arr.mean()), spread)
|
||||
|
||||
def _abs_auroc(v):
    """Return the direction-agnostic AUROC ``max(v, 1 - v)``.

    ``None`` and NaN are returned as NaN instead of raising, matching how
    ``_ms`` treats missing values.
    """
    if v is None or v != v:
        return float('nan')
    return max(v, 1.0 - v)
|
||||
|
||||
def main():
    """Aggregate per-seed Anomaly-Transformer results into a report.

    For every protocol in ``PROTOCOLS`` and seed in ``SEEDS`` the result JSON
    is loaded when present. The overall AUROC of each aggregator in ``AGGS``
    is summarised across seeds, both raw and direction-agnostic, and the
    aggregator with the highest direction-agnostic mean is used for the
    headline row. Per-class AUROCs are collected from the ``mean`` aggregator.

    Writes ``summary.md`` and ``summary.json`` into ``ROOT`` and prints a
    short recap to stdout.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_abs_aurocs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                auroc = r['overall_by_agg'][agg]['auroc']
                agg_aurocs[agg].append(auroc)
                # A missing AUROC stays missing; _ms drops None entries.
                agg_abs_aurocs[agg].append(None if auroc is None else _abs_auroc(auroc))
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                # 'n' is taken from the first seed that reports the class.
                entry = per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                entry['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (am, asd) = _ms(agg_abs_aurocs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'abs_auroc_mean': am, 'abs_auroc_std': asd}
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['abs_auroc_mean'])
        best = agg_summary[best_agg]
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': best['auroc_mean'], 'auroc_std': best['auroc_std'], 'abs_auroc_mean': best['abs_auroc_mean'], 'abs_auroc_std': best['abs_auroc_std'], 'all_aggs': agg_summary})

    lines = [
        '# Anomaly-Transformer (ICLR 2022) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: ICLR 2022 Anomaly-Transformer (association-discrepancy minimax). Vendored model class from `baselines/Anomaly-Transformer/model/AnomalyTransformer.py`; training + scoring loop reimplemented to match our protocol (input shape [B, T=64, D=9] = our z-scored packet sequences, same train/val/attack splits as eval_new_scores.py).',
        'Hyperparams: d_model=128, n_heads=4, e_layers=3, batch=128, lr=1e-4, k_disc=3.0, temperature=50.0, epochs=15.',
        'Score: per-position softmax(-association_KL · T) · MSE(rec, x), then aggregated per flow (mean / max / median / p90).',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (Unified_CFM) | **AT (ours)** | abs AUROC | best agg | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        (am, asd) = (row['abs_auroc_mean'], row['abs_auroc_std'])
        if np.isnan(m):
            # Keep a visible placeholder row instead of dropping the protocol.
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | (no runs) | — | — | — |')
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        d_terminal = m - tn_m
        lines.append(f"| {PRETTY[p]} | {tn_str} | **{m:.4f} ± {sd:.4f}** | {am:.4f} ± {asd:.4f} | `{row['best_agg']}` | {d_terminal:+.4f} |")

    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('—')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')

    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | AT AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (σ, →, ±, —); write UTF-8
    # explicitly so the result does not depend on the locale.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} raw={row['auroc_mean']:.4f}±{row['auroc_std']:.4f} abs={row['abs_auroc_mean']:.4f}")


if __name__ == '__main__':
    main()
|
||||
109
scripts/baselines/aggregate_kitsune.py
Normal file
109
scripts/baselines/aggregate_kitsune.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
# Repository root: two directory levels above the directory holding this file.
REPO = Path(__file__).resolve().parents[2]
# Directory holding the per-seed result JSONs; the summaries are written here too.
ROOT = REPO / 'artifacts/baselines/kitsune_2026_04_29'
# Evaluation protocols, seeds and per-flow score aggregators to look for.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
SEEDS = (42, 43, 44)
AGGS = ('mean', 'max', 'median', 'p90')
# Reference terminal_norm AUROC per protocol as (mean, std); std is None when unavailable.
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Kitsune reference AUROC per protocol as (mean, std); None where no figure is recorded.
KITSUNE_PAPER = {'iscxtor_within': (0.78, None), 'cicids_within': (0.85, None), 'cicddos_within': (None, None), 'forward_cross': (None, None), 'reverse_cross': (None, None)}
# Human-readable protocol names used in the report tables.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
|
||||
|
||||
def _load(protocol, seed):
    """Load the result JSON for one (protocol, seed) run.

    Reads ``ROOT / f'{protocol}_seed{seed}.json'``.

    Returns the decoded JSON object, or ``None`` when the file does not exist.
    """
    p = ROOT / f'{protocol}_seed{seed}.json'
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the locale.
        text = p.read_text(encoding='utf-8')
    except FileNotFoundError:
        return None
    return json.loads(text)
|
||||
|
||||
def _ms(vals):
    """Return ``(mean, std)`` of *vals*, ignoring ``None`` and NaN entries.

    The std is the sample standard deviation (``ddof=1``). With exactly one
    usable value it is reported as ``0.0``; with none, both fields are NaN.
    """
    kept = [v for v in vals if v is not None and not np.isnan(v)]
    if not kept:
        return (float('nan'), float('nan'))
    arr = np.asarray(kept, dtype=np.float64)
    spread = float(arr.std(ddof=1)) if arr.size > 1 else 0.0
    return (float(arr.mean()), spread)
|
||||
|
||||
def main():
    """Aggregate per-seed Kitsune (Path B) results into a report.

    For every protocol in ``PROTOCOLS`` and seed in ``SEEDS`` the result JSON
    is loaded when present. The overall AUROC and AUPRC of each aggregator in
    ``AGGS`` are summarised across seeds, and the aggregator with the highest
    mean AUROC is used for the headline row. Per-class AUROCs are collected
    from the ``mean`` aggregator.

    Writes ``summary.md`` and ``summary.json`` into ``ROOT`` and prints a
    short recap to stdout.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_auprcs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                ov = r['overall_by_agg'][agg]
                agg_aurocs[agg].append(ov['auroc'])
                agg_auprcs[agg].append(ov['auprc'])
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                # 'n' is taken from the first seed that reports the class.
                entry = per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                entry['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (ma, sda) = _ms(agg_auprcs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda}
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['auroc_mean'])
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': agg_summary[best_agg]['auroc_mean'], 'auroc_std': agg_summary[best_agg]['auroc_std'], 'all_aggs': agg_summary})

    lines = [
        '# Kitsune (Path B) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: KitNET ensemble autoencoder (the ML core of Kitsune).',
        "**Path B**: feeds our **z-scored 9-d packet features** directly through `KitNET.process()` for the FM+AD grace, then `KitNET.execute()` per packet during eval. **AfterImage's 100-d host/session statistics are skipped** (they require sequential pcap streams which our (B,T,9) tensor abstraction discards). This keeps data usage unified with `eval_new_scores.py`.",
        'Train: 5000 source-benign flows → ~75-320k packets (≥ FM+AD=55k grace).',
        'Score: per-flow aggregate of per-packet RMSE (mean / max / median / p90).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm | Kitsune paper (Shafir reproduction) | **Kitsune Path B (ours)** | best agg | Δ vs paper | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (kp_m, _) = KITSUNE_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | — | (no runs) | — | — | — |')
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        kp_str = f'{kp_m:.4f}' if kp_m is not None else '—'
        d_terminal = m - tn_m
        d_paper = m - kp_m if kp_m is not None else None
        d_paper_str = f'{d_paper:+.4f}' if d_paper is not None else '—'
        lines.append(f"| {PRETTY[p]} | {tn_str} | {kp_str} | **{m:.4f} ± {sd:.4f}** | `{row['best_agg']}` | {d_paper_str} | {d_terminal:+.4f} |")

    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('—')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')

    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Kitsune AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (σ, →, ±, —, ≥); write UTF-8
    # explicitly so the result does not depend on the locale.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM, 'kitsune_paper': KITSUNE_PAPER}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")


if __name__ == '__main__':
    main()
|
||||
93
scripts/baselines/aggregate_shafir_nf.py
Normal file
93
scripts/baselines/aggregate_shafir_nf.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
ROOT = REPO / 'artifacts/baselines/shafir_nf_2026_04_29'
|
||||
# Protocol keys, in the order they are aggregated and reported.
PROTOCOLS = (
    'iscxtor_within',
    'cicids_within',
    'cicddos_within',
    'forward_cross',
    'reverse_cross',
)

# Seeds whose per-run result files are looked up for every protocol.
SEEDS = (42, 43, 44)

# Reference AUROC per protocol as (mean, std); std is None where no spread is
# recorded. Shown in the "terminal_norm (ours)" column of the summary table.
TERMINAL_NORM = {
    'iscxtor_within': (0.9945, 0.0011),
    'cicids_within': (0.9858, 0.0021),
    'cicddos_within': (0.996, 0.001),
    'forward_cross': (0.9109, 0.0032),
    'reverse_cross': (0.5999, None),
}

# Reference AUROC per protocol as (mean, std) for the "Shafir NF — paper"
# column; no std is recorded for any entry.
SHAFIR_PAPER = {
    'iscxtor_within': (0.8731, None),
    'cicids_within': (0.9303, None),
    'cicddos_within': (0.93, None),
    'forward_cross': (0.89, None),
    'reverse_cross': (0.93, None),
}

# Human-readable protocol names used in the Markdown tables and console output.
PRETTY = {
    'iscxtor_within': 'ISCXTor2016 within',
    'cicids_within': 'CICIDS2017 within (σ=0.6)',
    'cicddos_within': 'CICDDoS2019 within',
    'forward_cross': 'IDS2017→DDoS2019 forward',
    'reverse_cross': 'DDoS2019→IDS2017 reverse',
}
|
||||
|
||||
def _load(protocol, seed):
    """Return the parsed ``{protocol}_seed{seed}.json`` result from ROOT.

    Returns None when that file does not exist, so callers can skip runs
    that have not been produced.
    """
    result_path = ROOT / f'{protocol}_seed{seed}.json'
    if result_path.exists():
        return json.loads(result_path.read_text())
    return None
|
||||
|
||||
def _ms(vals):
|
||||
arr = np.asarray([v for v in vals if v is not None and (not np.isnan(v))], dtype=np.float64)
|
||||
if len(arr) == 0:
|
||||
return (float('nan'), float('nan'))
|
||||
return (float(arr.mean()), float(arr.std(ddof=1)) if len(arr) > 1 else 0.0)
|
||||
|
||||
def main():
    """Aggregate per-seed Shafir-NF results into ``summary.md`` / ``summary.json``.

    For every protocol in PROTOCOLS, the result JSON of each seed in SEEDS is
    loaded via ``_load``; missing files are skipped. AUROC, AUPRC and training
    time are reduced with ``_ms``; per-attack AUROCs are collected per class.
    Both summary files are written into ROOT, and a short per-protocol AUROC
    recap is printed to stdout.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}

    # ---- collect per-seed numbers -------------------------------------
    for protocol in PROTOCOLS:
        (aurocs, auprcs, t_train) = ([], [], [])
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                # No result file for this seed: aggregate over what exists.
                continue
            aurocs.append(r['overall']['neg_log_prob']['auroc'])
            auprcs.append(r['overall']['neg_log_prob']['auprc'])
            t_train.append(r.get('t_train_sec', 0.0))
            for (cls, info) in r.get('per_class', {}).items():
                # 'n' is taken from the first seed that reports this class.
                per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                per_class_collect[protocol][cls]['aurocs'].append(info['auroc'])
        (m, sd) = _ms(aurocs)
        (ma, sda) = _ms(auprcs)
        (tt, _) = _ms(t_train)
        rows.append({'protocol': protocol, 'n_seeds': len(aurocs), 'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda, 't_train_sec_mean': tt})

    # ---- Markdown report: header + headline AUROC table ---------------
    lines = [
        '# Shafir 2026 NF Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        "Method: Shafir's official `pzflow.Flow` (single basic NF).",
        'Features: our **20-d canonical packet-derived flow features** (`common.data_contract.CANONICAL_FLOW_FEATURE_NAMES`), z-scored with the **same source training stats** that the Unified_CFM checkpoint uses.',
        'Train cap: 10,000 source-benign samples (Shafir paper protocol).',
        'Optimizer: SGD lr=1e-3, 100 epochs (Shafir paper defaults).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (ours) | Shafir NF — paper | **Shafir NF — our features** | Δ vs paper | Δ vs terminal_norm |',
        '|---|---:|---:|---:|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (sp_m, _) = SHAFIR_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            # _ms returns NaN when no seed produced a usable AUROC.
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | {sp_m:.4f} | (no runs yet) | — | — |')
            continue
        d_paper = m - sp_m
        d_terminal = m - tn_m
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        lines.append(f'| {PRETTY[p]} | {tn_str} | {sp_m:.4f} | **{m:.4f} ± {sd:.4f}** | {d_paper:+.4f} | {d_terminal:+.4f} |')

    # ---- per-protocol AUPRC / training-time table ---------------------
    lines.append('')
    lines.append('## Per-protocol stats')
    lines.append('')
    lines.append('| Protocol | n_seeds | AUPRC mean ± std | Train time (s, mean) |')
    lines.append('|---|---:|---:|---:|')
    for row in rows:
        p = row['protocol']
        (m, sd) = (row['auprc_mean'], row['auprc_std'])
        if np.isnan(m):
            continue
        lines.append(f"| {PRETTY[p]} | {row['n_seeds']} | {m:.4f} ± {sd:.4f} | {row['t_train_sec_mean']:.1f} |")

    # ---- per-attack tables (cross-dataset protocols only) -------------
    lines.append('')
    lines.append('## Per-attack (forward + reverse)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Shafir NF AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')

    # ---- write outputs -------------------------------------------------
    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (±, →, σ, Δ, —); pin the
    # encoding so the write does not depend on the process locale.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {
        'rows': rows,
        'per_class': {
            p: {cls: {'n': v['n'], **dict(zip(['mean', 'std'], _ms(v['aurocs'])))} for (cls, v) in dd.items()}
            for (p, dd) in per_class_collect.items()
        },
        'baselines': {'terminal_norm': TERMINAL_NORM, 'shafir_paper': SHAFIR_PAPER},
    }
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")
|
||||
# Script entry point: build the Shafir-NF summary only when this file is run
# directly, not when it is imported.
if __name__ == '__main__':
    main()
|
||||
267
scripts/baselines/run_anomaly_transformer.py
Normal file
267
scripts/baselines/run_anomaly_transformer.py
Normal file
@@ -0,0 +1,267 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import yaml
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(REPO / 'Packet_CFM'))
|
||||
sys.path.insert(0, str(REPO / 'Unified_CFM'))
|
||||
sys.path.insert(0, str(REPO / 'baselines/Anomaly-Transformer'))
|
||||
from data import _apply_mixed_dequant, _zscore, load_unified_data
|
||||
from packet_store import PacketShardStore
|
||||
from model.AnomalyTransformer import AnomalyTransformer
|
||||
# Within-dataset protocols: protocol name -> (model directory template, sample caps).
# The template is formatted with the run seed and resolved under REPO / 'artifacts'
# by main(); the caps are passed to _load_within as n_val / n_atk (None = no cap).
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': None},
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        {'n_val': 10000, 'n_atk': 30000},
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': 20000},
    ),
}

# Cross-dataset protocols: protocol name -> spec consumed by _load_cross.
# 'model_template' points at the source-dataset run; 'target_*' paths are
# relative to REPO; 'n_benign' / 'n_attack' are the target sample sizes.
CROSS_DIRS = {
    'forward_cross': {
        'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        'target_store': 'datasets/cicddos2019/processed/full_store',
        'target_flows': 'datasets/cicddos2019/processed/flows.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
    'reverse_cross': {
        'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        'target_store': 'datasets/cicids2017/processed/full_store',
        'target_flows': 'datasets/cicids2017/processed/flows.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
}
|
||||
|
||||
def _load_within(model_dir, n_val, n_atk, n_train_cap, seed):
    """Load train / benign-val / attack packet arrays for a within-dataset run.

    The data-loading settings come from ``model_dir / 'config.yaml'`` and are
    forwarded to ``load_unified_data``. ``attack_cap`` / ``val_cap`` from the
    config take precedence; ``n_atk`` / ``n_val`` are the fallbacks.

    Afterwards each split is randomly subsampled (without replacement, using
    one generator seeded with ``seed``, in the order train -> val -> attack)
    when it exceeds ``n_train_cap`` / ``n_val`` / ``n_atk``. A cap of None on
    val or attack disables that subsampling.

    Returns a dict with keys train_packets, train_len, val_packets, val_len,
    atk_packets, atk_len and atk_labels.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Missing or falsy config entries become None instead of a Path.
        return Path(cfg[key]) if cfg.get(key) else None

    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    )

    rng = np.random.default_rng(seed)

    def _take(count, *arrays):
        # Draw `count` distinct row indices (kept in ascending order) and
        # apply the same selection to every array.
        keep = np.sort(rng.choice(len(arrays[0]), size=count, replace=False))
        return tuple(arr[keep] for arr in arrays)

    train_packets, train_len = data.train_packets, data.train_len
    if len(train_packets) > n_train_cap:
        train_packets, train_len = _take(n_train_cap, train_packets, train_len)

    val_packets, val_len = data.val_packets, data.val_len
    if n_val is not None and len(val_packets) > n_val:
        val_packets, val_len = _take(n_val, val_packets, val_len)

    atk_packets, atk_len, atk_labels = data.attack_packets, data.attack_len, data.attack_labels
    if n_atk is not None and len(atk_packets) > n_atk:
        atk_packets, atk_len, atk_labels = _take(n_atk, atk_packets, atk_len, atk_labels)

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
|
||||
|
||||
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Build arrays for a cross-dataset run: source training flows, target eval flows.

    Args:
        spec: one CROSS_DIRS entry (model_template, target_store, target_flows,
            n_benign, n_attack).
        ckpt: loaded source checkpoint; ``packet_mean`` / ``packet_std`` and the
            optional ``packet_preprocess`` entry are read from it and used to
            normalise the target packets.
        seed: run seed. Target sampling uses a generator seeded with ``seed``;
            the source training subsample uses a separate one seeded with
            ``seed + 1000``.
        n_train_cap: maximum number of source training flows to keep.
        T: number of packets read per target flow.

    Returns:
        Dict with keys train_packets, train_len, val_packets, val_len,
        atk_packets, atk_len, atk_labels. ``val_*`` are target benign flows and
        ``atk_*`` target attack flows; positions at or beyond each flow's
        length are zeroed.

    Raises:
        ValueError: if the target flow table has no benign or no attack flows.
    """
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))

    # ---- source side: training flows, loaded with the source run's config ----
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())
    src_data = load_unified_data(
        packets_npz=Path(src_cfg['packets_npz']) if src_cfg.get('packets_npz') else None,
        source_store=Path(src_cfg['source_store']) if src_cfg.get('source_store') else None,
        flows_parquet=Path(src_cfg['flows_parquet']),
        flow_features_path=Path(src_cfg['flow_features_path']) if src_cfg.get('flow_features_path') else None,
        flow_feature_columns=src_cfg.get('flow_feature_columns'),
        flow_features_align=str(src_cfg.get('flow_features_align', 'auto')),
        T=int(src_cfg['T']),
        split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))),
        train_ratio=float(src_cfg.get('train_ratio', 0.8)),
        benign_label=str(src_cfg.get('benign_label', 'normal')),
        min_len=int(src_cfg.get('min_len', 2)),
        packet_preprocess=packet_preprocess,
        attack_cap=None,
        val_cap=None,
    )
    rng = np.random.default_rng(seed + 1000)
    (train_packets, train_len) = (src_data.train_packets, src_data.train_len)
    if len(train_packets) > n_train_cap:
        idx = np.sort(rng.choice(len(train_packets), size=n_train_cap, replace=False))
        (train_packets, train_len) = (train_packets[idx], train_len[idx])

    # ---- target side: pick benign / attack rows from the flow table ----
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()
    rng2 = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    if len(benign_idx) == 0 or len(attack_idx) == 0:
        # Fail with an explicit message instead of an opaque numpy error
        # (no benign flows) or ZeroDivisionError (no attack classes).
        raise ValueError(
            f'{target_flows}: need both benign and attack flows '
            f'(benign={len(benign_idx)}, attack={len(attack_idx)})'
        )

    # Clamp the benign sample the same way the attack pools are clamped below;
    # when enough benign flows exist the draw is unchanged.
    n_benign_eff = min(n_benign, len(benign_idx))
    if n_benign_eff < n_benign:
        print(f'[warn] target has only {len(benign_idx):,} benign flows; using {n_benign_eff:,} instead of {n_benign:,}', flush=True)
    b_sel = np.sort(rng2.choice(benign_idx, size=n_benign_eff, replace=False))

    # Stratified attack sample: an equal quota per class, limited by pool size.
    atk_classes = sorted(set(labels[attack_idx]))
    per_class = max(1, n_attack // len(atk_classes))
    chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            chunks.append(rng2.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(chunks))
    if len(a_sel) > n_attack:
        # Possible when there are more classes than n_attack (quota floor of 1).
        a_sel = np.sort(rng2.choice(a_sel, size=n_attack, replace=False))

    store = PacketShardStore.open(target_store)

    def _materialize(idx):
        # Read packets for the selected rows; lengths are capped at T.
        (tok, ll) = store.read_packets(idx, T=T)
        ll = np.minimum(ll, T).astype(np.int32)
        return (tok.astype(np.float32), ll)

    (b_tok, b_len) = _materialize(b_sel)
    (a_tok, a_len) = _materialize(a_sel)

    # ---- normalise target packets with the source checkpoint statistics ----
    if packet_preprocess == 'mixed_dequant':
        val_packets = _apply_mixed_dequant(b_tok, b_len, packet_mean, packet_std, split_tag='val', seed=seed)
        atk_packets = _apply_mixed_dequant(a_tok, a_len, packet_mean, packet_std, split_tag='attack', seed=seed)
    else:
        val_packets = _zscore(b_tok, packet_mean, packet_std)
        atk_packets = _zscore(a_tok, packet_mean, packet_std)

    # Zero every position at or beyond the flow's length.
    msk_b = np.arange(T)[None, :] < b_len[:, None]
    msk_a = np.arange(T)[None, :] < a_len[:, None]
    val_packets = (val_packets * msk_b[:, :, None]).astype(np.float32)
    atk_packets = (atk_packets * msk_a[:, :, None]).astype(np.float32)
    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': b_len,
        'atk_packets': atk_packets,
        'atk_len': a_len,
        'atk_labels': labels[a_sel],
    }
|
||||
|
||||
def _kl(p, q):
|
||||
return torch.sum(p * (torch.log(p + 0.0001) - torch.log(q + 0.0001)), dim=-1)
|
||||
|
||||
def _norm_prior(prior, win_size: int) -> torch.Tensor:
|
||||
return prior / torch.unsqueeze(torch.sum(prior, dim=-1), dim=-1).repeat(1, 1, 1, win_size)
|
||||
|
||||
def _train(model: AnomalyTransformer, train_packets: np.ndarray, train_len: np.ndarray, *, batch_size: int, epochs: int, lr: float, k_disc: float, win_size: int, device: torch.device) -> dict:
    """Train ``model`` with Adam on reconstruction plus association-discrepancy losses.

    Each epoch shuffles the flows with a generator seeded by the epoch index
    and iterates over mini-batches. Per batch, two losses are back-propagated
    before one optimizer step:

    * ``rec_loss - k_disc * series_loss`` (prior detached inside the KL terms)
    * ``rec_loss + k_disc * prior_loss``  (series detached inside the KL terms)

    ``train_len`` is accepted but not used by this function.

    Returns:
        ``{'losses': [mean reconstruction loss per epoch], 't_train_sec': wall time}``.
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    n_flows = len(train_packets)
    epoch_means = []
    started = time.time()

    for epoch in range(epochs):
        model.train()
        order = np.random.default_rng(epoch).permutation(n_flows)
        batch_rec = []
        for start in range(0, n_flows, batch_size):
            batch_idx = order[start:start + batch_size]
            x = torch.from_numpy(train_packets[batch_idx]).float().to(device)
            opt.zero_grad()
            (output, series, prior, _) = model(x)

            # Symmetric KL between each layer's series association and its
            # normalised prior, averaged over the returned layers.
            series_loss = 0.0
            prior_loss = 0.0
            for (u, raw_prior) in enumerate(prior):
                norm_p = _norm_prior(raw_prior, win_size)
                assoc = series[u]
                series_loss += torch.mean(_kl(assoc, norm_p.detach())) + torch.mean(_kl(norm_p.detach(), assoc))
                prior_loss += torch.mean(_kl(norm_p, assoc.detach())) + torch.mean(_kl(assoc.detach(), norm_p))
            series_loss /= len(prior)
            prior_loss /= len(prior)

            rec_loss = mse(output, x)
            loss1 = rec_loss - k_disc * series_loss
            loss2 = rec_loss + k_disc * prior_loss
            # The first backward keeps the graph so the second can reuse it.
            loss1.backward(retain_graph=True)
            loss2.backward()
            opt.step()
            batch_rec.append(rec_loss.item())

        epoch_means.append(float(np.mean(batch_rec)))
        is_last = epoch == epochs - 1
        if (epoch + 1) % 5 == 0 or is_last:
            print(f' [epoch {epoch + 1}/{epochs}] rec_loss={epoch_means[-1]:.4f} ({time.time() - started:.1f}s elapsed)', flush=True)

    return {'losses': epoch_means, 't_train_sec': time.time() - started}
|
||||
|
||||
@torch.no_grad()
def _score(model: AnomalyTransformer, packets: np.ndarray, lens: np.ndarray, *, batch_size: int, win_size: int, temperature: float, device: torch.device) -> dict[str, np.ndarray]:
    """Compute per-flow anomaly scores under four aggregations.

    For each batch the per-position score is the feature-averaged squared
    reconstruction error, multiplied by ``softmax(-temperature * D)`` over the
    time axis, where ``D`` is the symmetric series/prior KL summed over the
    returned layers and averaged over dim 1 of the KL tensor. If the model
    returns no priors, the plain reconstruction error is used.

    Only the first ``lens[i]`` positions of flow ``i`` are aggregated; flows
    with length 0 keep a score of 0.

    Returns:
        Dict with float32 arrays of length ``len(packets)`` under the keys
        'mean', 'max', 'median' and 'p90'.
    """
    model.eval()
    n = len(packets)
    means = np.zeros(n, dtype=np.float32)
    maxes = np.zeros(n, dtype=np.float32)
    medians = np.zeros(n, dtype=np.float32)
    p90s = np.zeros(n, dtype=np.float32)
    crit = nn.MSELoss(reduction='none')

    for s in range(0, n, batch_size):
        x = torch.from_numpy(packets[s:s + batch_size]).float().to(device)
        batch_lens = lens[s:s + batch_size]
        (output, series, prior, _) = model(x)
        rec = crit(output, x).mean(dim=-1)

        # Stays a float when the model returns no priors.
        series_loss = 0.0
        for u in range(len(prior)):
            norm_p = _norm_prior(prior[u], win_size)
            series_loss = series_loss + (_kl(series[u], norm_p) + _kl(norm_p, series[u]))

        if isinstance(series_loss, torch.Tensor):
            sl = series_loss.mean(dim=1)
            metric = torch.softmax(-sl * temperature, dim=-1) * rec
        else:
            metric = rec

        # One device-to-host transfer per batch instead of one per flow.
        metric_np = metric.cpu().numpy()
        for i in range(metric_np.shape[0]):
            li = int(batch_lens[i])
            if li == 0:
                continue
            row = metric_np[i, :li]
            means[s + i] = row.mean()
            maxes[s + i] = row.max()
            medians[s + i] = float(np.median(row))
            p90s[s + i] = float(np.percentile(row, 90))

    return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def _per_class(val_score, atk_score, atk_labels):
    """Compute one AUROC per attack class against the full benign score set.

    For every distinct label in ``atk_labels`` (in sorted order) the benign
    scores are labelled 0 and that class's attack scores 1. Returns
    ``{label: {'_n': class size as float, 'auroc': AUROC}}``.
    """
    benign_targets = np.zeros(len(val_score))
    results = {}
    for cls in sorted(set(atk_labels)):
        selector = atk_labels == cls
        class_scores = atk_score[selector]
        targets = np.concatenate([benign_targets, np.ones(len(class_scores))])
        scores = np.concatenate([val_score, class_scores])
        results[cls] = {
            '_n': float(int(selector.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return results
|
||||
|
||||
def main():
    """CLI entry point: train and evaluate the Anomaly Transformer baseline once.

    Steps: parse arguments, resolve the source model directory for the chosen
    protocol and seed, load its checkpoint, build the data arrays (within- or
    cross-dataset), train the model, score benign and attack flows, then write
    ``{protocol}_seed{seed}.json`` (metrics and run metadata) and a matching
    ``.npz`` (raw per-flow scores and attack labels) into ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    parser.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--n-train-cap', type=int, default=10000)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--k-disc', type=float, default=3.0, help='weight on association-discrepancy KL term')
    parser.add_argument('--temperature', type=float, default=50.0)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--d-model', type=int, default=128)
    parser.add_argument('--n-heads', type=int, default=4)
    parser.add_argument('--e-layers', type=int, default=3)
    parser.add_argument('--T', type=int, default=64)
    parser.add_argument('--device', type=str, default='auto')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # 'auto' selects CUDA when available and CPU otherwise; any other value
    # is passed to torch.device unchanged.
    if args.device != 'auto':
        device_name = args.device
    elif torch.cuda.is_available():
        device_name = 'cuda'
    else:
        device_name = 'cpu'
    device = torch.device(device_name)

    # ---- resolve the source model directory ----
    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        (template, caps) = WITHIN_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    else:
        spec = CROSS_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
    print(f'[run] anomaly_transformer protocol={args.protocol} seed={args.seed}')

    # NOTE(review): weights_only=False unpickles arbitrary objects; only
    # appropriate for trusted checkpoints.
    ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)

    # ---- data ----
    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)
    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D} device={device}')

    # ---- model + training ----
    torch.manual_seed(args.seed)
    model = AnomalyTransformer(
        win_size=args.T,
        enc_in=D,
        c_out=D,
        d_model=args.d_model,
        n_heads=args.n_heads,
        e_layers=args.e_layers,
        d_ff=args.d_model,
        dropout=0.0,
        output_attention=True,
    ).to(device)
    n_params = sum((w.numel() for w in model.parameters()))
    print(f'[model] params={n_params:,}')
    train_meta = _train(
        model,
        arrays['train_packets'],
        arrays['train_len'],
        batch_size=args.batch_size,
        epochs=args.epochs,
        lr=args.lr,
        k_disc=args.k_disc,
        win_size=args.T,
        device=device,
    )
    print(f"[train] {train_meta['t_train_sec']:.1f}s, final rec_loss={train_meta['losses'][-1]:.4f}")

    # ---- scoring ----
    def _timed_score(split, tag):
        # Score one split and report how long it took.
        started = time.time()
        aggs = _score(
            model,
            arrays[f'{split}_packets'],
            arrays[f'{split}_len'],
            batch_size=args.batch_size,
            win_size=args.T,
            temperature=args.temperature,
            device=device,
        )
        print(f'[score] {tag} in {time.time() - started:.1f}s')
        return aggs

    val_aggs = _timed_score('val', 'benign')
    atk_aggs = _timed_score('atk', 'attack')

    # ---- metrics per aggregation ----
    agg_names = ('mean', 'max', 'median', 'p90')
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    overall = {}
    per_class_by_agg = {}
    for agg in agg_names:
        benign_scores = val_aggs[agg]
        attack_scores = atk_aggs[agg]
        y = np.r_[np.zeros(len(benign_scores)), np.ones(len(attack_scores))]
        s = np.r_[benign_scores, attack_scores]
        overall[agg] = {
            'auroc': _safe_metric(roc_auc_score, y, s),
            'auprc': _safe_metric(average_precision_score, y, s),
        }
        per_class_by_agg[agg] = _per_class(benign_scores, attack_scores, atk_labels)

    # ---- persist results ----
    out = {
        'method': 'anomaly_transformer',
        'protocol': args.protocol,
        'seed': args.seed,
        'model_dir': str(model_dir),
        'n_train': n_train,
        'n_val': n_val,
        'n_atk': n_atk,
        'D': int(D),
        'epochs': args.epochs,
        'lr': args.lr,
        'k_disc': args.k_disc,
        'temperature': args.temperature,
        'd_model': args.d_model,
        't_train_sec': round(train_meta['t_train_sec'], 2),
        'loss_first_last': [train_meta['losses'][0], train_meta['losses'][-1]],
        'overall_by_agg': overall,
        'per_class_by_agg': per_class_by_agg,
    }
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))

    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': atk_labels}
    for agg in agg_names:
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')

    # ---- console recap, best aggregation first ----
    best = max(overall, key=lambda name: overall[name]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    for name in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {name:<8s} AUROC={overall[name]['auroc']:.4f} AUPRC={overall[name]['auprc']:.4f}")
|
||||
# Script entry point: run one train/evaluate job only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()
|
||||
37
scripts/baselines/run_anomaly_transformer_all.sh
Executable file
37
scripts/baselines/run_anomaly_transformer_all.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
# Sweep driver: runs scripts/baselines/run_anomaly_transformer.py once for
# every (protocol, seed) pair and skips pairs whose result JSON already exists.
#
# Environment overrides (defaults in parentheses):
#   PROTOCOLS  space-separated protocol names (all five protocols)
#   SEEDS      space-separated seeds (42 43 44)
#   EPOCHS     training epochs (15)
#   BATCH      batch size (128)
#   D_MODEL    model width (128)
set -euo pipefail

# Work from the repository root, two levels above this script.
REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/anomaly_transformer_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
: > "$LOG"  # every sweep starts with an empty master log

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
EPOCHS="${EPOCHS:-15}"
BATCH="${BATCH:-128}"
D_MODEL="${D_MODEL:-128}"

# Print a message to stdout and append it to the master log.
log() {
  printf '%s\n' "$*" | tee -a "$LOG"
}

# Train and evaluate one (protocol, seed) pair unless its JSON already exists.
# Arguments: $1 - protocol name, $2 - seed
run_one() {
  local protocol=$1
  local seed=$2
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local ts te

  if [[ -f "$out_json" ]]; then
    log "[skip] $out_json exists"
    return 0
  fi

  log "=== protocol=$protocol seed=$seed epochs=$EPOCHS batch=$BATCH ==="
  ts=$(date +%s)
  uv run --no-sync python scripts/baselines/run_anomaly_transformer.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --epochs "$EPOCHS" --batch-size "$BATCH" --d-model "$D_MODEL" \
    2>&1 | tee -a "$LOG"
  te=$(date +%s)
  log "[done] elapsed=$((te-ts))s $out_json"
}

# Word splitting of the space-separated lists is intentional here.
# shellcheck disable=SC2086
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done

echo "ALL DONE"
|
||||
223
scripts/baselines/run_kitsune.py
Normal file
223
scripts/baselines/run_kitsune.py
Normal file
@@ -0,0 +1,223 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import yaml
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(REPO / 'Packet_CFM'))
|
||||
sys.path.insert(0, str(REPO / 'Unified_CFM'))
|
||||
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
|
||||
from data import _apply_mixed_dequant, _zscore, load_unified_data
|
||||
from packet_store import PacketShardStore
|
||||
from KitNET.KitNET import KitNET
|
||||
# Within-dataset protocols: protocol name -> (model directory template, sample caps).
# The template takes the run seed; the caps are the n_val / n_atk arguments of
# _load_within (None = no cap).
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': None},
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        {'n_val': 10000, 'n_atk': 30000},
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        {'n_val': 10000, 'n_atk': 20000},
    ),
}

# Cross-dataset protocols: protocol name -> spec consumed by _load_cross.
# 'model_template' points at the source-dataset run; 'target_*' paths are
# relative to REPO; 'n_benign' / 'n_attack' are the target sample sizes.
CROSS_DIRS = {
    'forward_cross': {
        'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        'target_store': 'datasets/cicddos2019/processed/full_store',
        'target_flows': 'datasets/cicddos2019/processed/flows.parquet',
        'target_flow_features': 'datasets/cicddos2019/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
    'reverse_cross': {
        'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        'target_store': 'datasets/cicids2017/processed/full_store',
        'target_flows': 'datasets/cicids2017/processed/flows.parquet',
        'target_flow_features': 'datasets/cicids2017/processed/flow_features.parquet',
        'n_benign': 10000,
        'n_attack': 10000,
    },
}
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def _load_within(model_dir: Path, n_val, n_atk, n_train_cap, seed):
    """Load train / benign-val / attack packet arrays for a within-dataset run.

    Loader settings are read from ``model_dir / 'config.yaml'`` and passed to
    ``load_unified_data``; the config's ``attack_cap`` / ``val_cap`` win over
    the ``n_atk`` / ``n_val`` fallbacks.

    Each split is then subsampled without replacement when larger than its
    cap, using a single generator seeded with ``seed`` and drawing in the
    order train -> val -> attack. ``n_val`` / ``n_atk`` of None skip the
    corresponding subsampling.

    Returns a dict with keys train_packets, train_len, val_packets, val_len,
    atk_packets, atk_len and atk_labels.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Missing or falsy config entries become None instead of a Path.
        return Path(cfg[key]) if cfg.get(key) else None

    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    )

    rng = np.random.default_rng(seed)

    def _take(count, *arrays):
        # Draw `count` distinct row indices (kept in ascending order) and
        # apply the same selection to every array.
        keep = np.sort(rng.choice(len(arrays[0]), size=count, replace=False))
        return tuple(arr[keep] for arr in arrays)

    train_packets, train_len = data.train_packets, data.train_len
    if len(train_packets) > n_train_cap:
        train_packets, train_len = _take(n_train_cap, train_packets, train_len)

    val_packets, val_len = data.val_packets, data.val_len
    if n_val is not None and len(val_packets) > n_val:
        val_packets, val_len = _take(n_val, val_packets, val_len)

    atk_packets, atk_len, atk_labels = data.attack_packets, data.attack_len, data.attack_labels
    if n_atk is not None and len(atk_packets) > n_atk:
        atk_packets, atk_len, atk_labels = _take(n_atk, atk_packets, atk_len, atk_labels)

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
|
||||
|
||||
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Build arrays for a cross-dataset run: source training flows, target eval flows.

    Args:
        spec: one CROSS_DIRS entry; model_template, target_store, target_flows,
            n_benign and n_attack are read here.
        ckpt: loaded source checkpoint; ``packet_mean`` / ``packet_std`` and the
            optional ``packet_preprocess`` entry are read from it and used to
            normalise the target packets.
        seed: run seed. Target sampling uses a generator seeded with ``seed``;
            the source training subsample uses a separate one seeded with
            ``seed + 1000``.
        n_train_cap: maximum number of source training flows to keep.
        T: number of packets read per target flow.

    Returns:
        Dict with keys train_packets, train_len, val_packets, val_len,
        atk_packets, atk_len, atk_labels. ``val_*`` are target benign flows and
        ``atk_*`` target attack flows; positions at or beyond each flow's
        length are zeroed.

    Raises:
        ValueError: if the target flow table has no benign or no attack flows.
    """
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))

    # ---- source side: training flows, loaded with the source run's config ----
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())
    src_data = load_unified_data(
        packets_npz=Path(src_cfg['packets_npz']) if src_cfg.get('packets_npz') else None,
        source_store=Path(src_cfg['source_store']) if src_cfg.get('source_store') else None,
        flows_parquet=Path(src_cfg['flows_parquet']),
        flow_features_path=Path(src_cfg['flow_features_path']) if src_cfg.get('flow_features_path') else None,
        flow_feature_columns=src_cfg.get('flow_feature_columns'),
        flow_features_align=str(src_cfg.get('flow_features_align', 'auto')),
        T=int(src_cfg['T']),
        split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))),
        train_ratio=float(src_cfg.get('train_ratio', 0.8)),
        benign_label=str(src_cfg.get('benign_label', 'normal')),
        min_len=int(src_cfg.get('min_len', 2)),
        packet_preprocess=packet_preprocess,
        attack_cap=None,
        val_cap=None,
    )
    rng = np.random.default_rng(seed + 1000)
    (train_packets, train_len) = (src_data.train_packets, src_data.train_len)
    if len(train_packets) > n_train_cap:
        idx = np.sort(rng.choice(len(train_packets), size=n_train_cap, replace=False))
        (train_packets, train_len) = (train_packets[idx], train_len[idx])

    # ---- target side: pick benign / attack rows from the flow table ----
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()
    rng2 = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    if len(benign_idx) == 0 or len(attack_idx) == 0:
        # Fail with an explicit message instead of an opaque numpy error
        # (no benign flows) or ZeroDivisionError (no attack classes).
        raise ValueError(
            f'{target_flows}: need both benign and attack flows '
            f'(benign={len(benign_idx)}, attack={len(attack_idx)})'
        )

    # Clamp the benign sample the same way the attack pools are clamped below;
    # when enough benign flows exist the draw is unchanged.
    n_benign_eff = min(n_benign, len(benign_idx))
    if n_benign_eff < n_benign:
        print(f'[warn] target has only {len(benign_idx):,} benign flows; using {n_benign_eff:,} instead of {n_benign:,}', flush=True)
    b_sel = np.sort(rng2.choice(benign_idx, size=n_benign_eff, replace=False))

    # Stratified attack sample: an equal quota per class, limited by pool size.
    atk_classes = sorted(set(labels[attack_idx]))
    per_class = max(1, n_attack // len(atk_classes))
    chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            chunks.append(rng2.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(chunks))
    if len(a_sel) > n_attack:
        # Possible when there are more classes than n_attack (quota floor of 1).
        a_sel = np.sort(rng2.choice(a_sel, size=n_attack, replace=False))

    store = PacketShardStore.open(target_store)

    def _materialize(idx):
        # Read packets for the selected rows; lengths are capped at T.
        (tok, ll) = store.read_packets(idx, T=T)
        ll = np.minimum(ll, T).astype(np.int32)
        return (tok.astype(np.float32), ll)

    (b_tok, b_len) = _materialize(b_sel)
    (a_tok, a_len) = _materialize(a_sel)

    # ---- normalise target packets with the source checkpoint statistics ----
    if packet_preprocess == 'mixed_dequant':
        val_packets = _apply_mixed_dequant(b_tok, b_len, packet_mean, packet_std, split_tag='val', seed=seed)
        atk_packets = _apply_mixed_dequant(a_tok, a_len, packet_mean, packet_std, split_tag='attack', seed=seed)
    else:
        val_packets = _zscore(b_tok, packet_mean, packet_std)
        atk_packets = _zscore(a_tok, packet_mean, packet_std)

    # Zero every position at or beyond the flow's length.
    msk_b = np.arange(T)[None, :] < b_len[:, None]
    msk_a = np.arange(T)[None, :] < a_len[:, None]
    val_packets = (val_packets * msk_b[:, :, None]).astype(np.float32)
    atk_packets = (atk_packets * msk_a[:, :, None]).astype(np.float32)
    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': b_len,
        'atk_packets': atk_packets,
        'atk_len': a_len,
        'atk_labels': labels[a_sel],
    }
|
||||
|
||||
def _flatten_packets(packets: np.ndarray, lens: np.ndarray) -> np.ndarray:
|
||||
out_chunks = []
|
||||
for i in range(len(packets)):
|
||||
L = int(lens[i])
|
||||
if L > 0:
|
||||
out_chunks.append(packets[i, :L])
|
||||
if not out_chunks:
|
||||
return np.empty((0, packets.shape[-1]), dtype=np.float32)
|
||||
return np.concatenate(out_chunks, axis=0).astype(np.float32)
|
||||
|
||||
def _train_kitnet(kit: KitNET, train_flat: np.ndarray) -> dict[str, float]:
|
||||
t0 = time.time()
|
||||
last_rmse = 0.0
|
||||
for i in range(len(train_flat)):
|
||||
last_rmse = kit.process(train_flat[i])
|
||||
if (i + 1) % 50000 == 0:
|
||||
print(f' [train] processed {i + 1:,}/{len(train_flat):,} last_rmse={last_rmse:.4f}', flush=True)
|
||||
return {'t_train_sec': round(time.time() - t0, 2), 'n_trained_packets': len(train_flat)}
|
||||
|
||||
def _score_flows(kit: KitNET, packets: np.ndarray, lens: np.ndarray) -> dict[str, np.ndarray]:
|
||||
N = len(packets)
|
||||
means = np.zeros(N, dtype=np.float32)
|
||||
maxes = np.zeros(N, dtype=np.float32)
|
||||
medians = np.zeros(N, dtype=np.float32)
|
||||
p90s = np.zeros(N, dtype=np.float32)
|
||||
for i in range(N):
|
||||
L = int(lens[i])
|
||||
if L == 0:
|
||||
continue
|
||||
rmses = np.zeros(L, dtype=np.float32)
|
||||
for t in range(L):
|
||||
rmses[t] = kit.execute(packets[i, t])
|
||||
means[i] = rmses.mean()
|
||||
maxes[i] = rmses.max()
|
||||
medians[i] = np.median(rmses)
|
||||
p90s[i] = np.percentile(rmses, 90)
|
||||
return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
|
||||
|
||||
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
    """AUROC of every attack class against the full benign score set.

    Returns:
        ``{class_label: {'_n': attack sample count (float), 'auroc': float}}``
        with classes visited in sorted order.
    """
    benign_targets = np.zeros(len(val_score))
    report = {}
    for label in sorted(set(atk_labels)):
        selected = atk_labels == label
        class_scores = atk_score[selected]
        targets = np.r_[benign_targets, np.ones(len(class_scores))]
        scores = np.r_[val_score, class_scores]
        report[label] = {
            '_n': float(int(selected.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return report
|
||||
|
||||
def main():
    """CLI entry point: train KitNET on benign packets and score val/attack flows.

    Steps: parse arguments, resolve the source model directory under
    ``REPO/artifacts``, load its checkpoint, build train/val/attack packet
    arrays, train KitNET on the flattened benign packets, score every flow with
    four aggregators, then write ``<protocol>_seed<seed>.json`` (metrics) and a
    sibling ``.npz`` (raw per-flow scores) into ``--out-dir``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--n-train-cap', type=int, default=2000, help='Cap source-benign train flows (each contributes ~T packets).')
    p.add_argument('--fm-grace', type=int, default=2000, help='Kitsune feature-mapper grace period (packets).')
    p.add_argument('--ad-grace', type=int, default=20000, help='Kitsune anomaly-detector grace period (packets).')
    p.add_argument('--max-ae-size', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.1)
    p.add_argument('--hidden-ratio', type=float, default=0.75)
    p.add_argument('--T', type=int, default=64)
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    # Within-dataset protocols carry (template, caps); cross protocols carry a spec dict.
    # `caps` only exists on the within branch and `spec` only on the cross branch.
    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        (template, caps) = WITHIN_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    else:
        spec = CROSS_DIRS[args.protocol]
        model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
    print(f'[run] kitsune protocol={args.protocol} seed={args.seed}')
    print(f'[run] using packet stats from {model_dir}/model.pt')
    # NOTE(review): weights_only=False performs full unpickling; only load trusted checkpoints.
    ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
    if is_within:
        # The checkpoint is not passed on this branch; only the cross loader receives it.
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)
    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    # Feature dimension is taken from the trailing axis of the train packets.
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D}')
    train_flat = _flatten_packets(arrays['train_packets'], arrays['train_len'])
    print(f'[data] train_flat packets={len(train_flat):,} FM_grace={args.fm_grace} AD_grace={args.ad_grace}')
    # Refuse to run when the training stream cannot cover both grace periods.
    if len(train_flat) < args.fm_grace + args.ad_grace:
        raise ValueError(f'Need at least FM+AD={args.fm_grace + args.ad_grace} packets, have {len(train_flat)} (try increasing --n-train-cap).')
    kit = KitNET(n=D, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    train_meta = _train_kitnet(kit, train_flat)
    print(f'[train] {train_meta}')
    # Score benign validation flows, then attack flows, timing each pass.
    t0 = time.time()
    val_aggs = _score_flows(kit, arrays['val_packets'], arrays['val_len'])
    print(f'[score] benign in {time.time() - t0:.1f}s')
    t0 = time.time()
    atk_aggs = _score_flows(kit, arrays['atk_packets'], arrays['atk_len'])
    print(f'[score] attack in {time.time() - t0:.1f}s')
    overall = {}
    per_class_by_agg = {}
    # One overall and one per-class metric table per aggregator (benign=0, attack=1).
    for agg in ('mean', 'max', 'median', 'p90'):
        v = val_aggs[agg]
        a = atk_aggs[agg]
        y = np.r_[np.zeros(len(v)), np.ones(len(a))]
        s = np.r_[v, a]
        overall[agg] = {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}
        per_class_by_agg[agg] = _per_class(v, a, np.asarray(arrays['atk_labels']).astype(str))
    out = {'method': 'kitsune_path_b', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train_flows': n_train, 'n_train_packets': int(len(train_flat)), 'n_val': n_val, 'n_atk': n_atk, 'D': int(D), 'fm_grace': args.fm_grace, 'ad_grace': args.ad_grace, 'max_ae_size': args.max_ae_size, 't_train_sec': train_meta['t_train_sec'], 'overall_by_agg': overall, 'per_class_by_agg': per_class_by_agg}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    # Raw per-flow scores go next to the JSON: b_* = benign, a_* = attack.
    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': np.asarray(arrays['atk_labels']).astype(str)}
    for agg in ('mean', 'max', 'median', 'p90'):
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')
    # NOTE(review): if an AUROC is NaN (see _safe_metric) the max()/sort ordering below is unreliable.
    best = max(overall, key=lambda k: overall[k]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    print()
    print('=== overall AUROC by aggregator ===')
    for k in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {k:<8s} AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f}")


if __name__ == '__main__':
    main()
|
||||
35
scripts/baselines/run_kitsune_all.sh
Executable file
35
scripts/baselines/run_kitsune_all.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
# Run scripts/baselines/run_kitsune.py for every (protocol, seed) pair,
# skipping pairs whose result JSON already exists.
#
# Optional environment overrides:
#   PROTOCOLS    whitespace-separated protocol names
#   SEEDS        whitespace-separated seeds
#   N_TRAIN_CAP  value forwarded to --n-train-cap (default 5000)
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/kitsune_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
: > "$LOG"  # every invocation starts with an empty master log

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
N_TRAIN_CAP="${N_TRAIN_CAP:-5000}"

# Print one message to stdout and append it to the master log.
say() {
  echo "$1" | tee -a "$LOG"
}

# Run a single (protocol, seed) cell unless its JSON is already present.
# With errexit + pipefail a failing run aborts the whole sweep.
run_one() {
  local protocol=$1 seed=$2
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local ts te

  if [[ -f "$out_json" ]]; then
    say "[skip] $out_json exists"
    return 0
  fi

  say "=== protocol=$protocol seed=$seed n_train_cap=$N_TRAIN_CAP ==="
  ts=$(date +%s)
  uv run --no-sync python scripts/baselines/run_kitsune.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --n-train-cap "$N_TRAIN_CAP" \
    2>&1 | tee -a "$LOG"
  te=$(date +%s)
  say "[done] elapsed=$((te-ts))s $out_json"
}

# Word-splitting of the two lists is intentional (space-separated values).
# shellcheck disable=SC2086
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done

echo "ALL DONE"
|
||||
211
scripts/baselines/run_kitsune_path_a.py
Normal file
211
scripts/baselines/run_kitsune_path_a.py
Normal file
@@ -0,0 +1,211 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import yaml
|
||||
if not hasattr(np, 'Inf'):
|
||||
np.Inf = np.inf
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
|
||||
sys.path.insert(0, str(REPO / 'Unified_CFM'))
|
||||
from FeatureExtractor import FE
|
||||
from KitNET.KitNET import KitNET
|
||||
from data import load_unified_data
|
||||
# Dataset key -> glob pattern (under REPO) used by main() to discover raw pcap inputs.
PCAP_GLOBS = {'iscxtor': str(REPO / 'datasets/iscxtor2016/raw/pcap_extracted/**/*.pcap'), 'cicids2017': str(REPO / 'datasets/cicids2017/raw/pcap/*.pcap'), 'cicddos2019': str(REPO / 'datasets/cicddos2019/raw/pcap/*')}

# Protocol name -> (artifact dir template taking {seed}, dataset key into PCAP_GLOBS,
# fallback val/attack caps used when the model's config.yaml does not set them).
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', 'iscxtor', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'cicids2017', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'cicddos2019', {'n_val': 10000, 'n_atk': 20000})}
|
||||
|
||||
def _canonical_key(src_ip, dst_ip, src_port, dst_port, protocol) -> tuple:
|
||||
a = (src_ip, src_port)
|
||||
b = (dst_ip, dst_port)
|
||||
if a <= b:
|
||||
return (a[0], b[0], a[1], b[1], int(protocol))
|
||||
return (b[0], a[0], b[1], a[1], int(protocol))
|
||||
|
||||
def _proto_from_kitsune(srcproto: str, dstproto: str) -> int:
|
||||
if srcproto == 'icmp':
|
||||
return 1
|
||||
if srcproto == 'arp':
|
||||
return 0
|
||||
return -1
|
||||
|
||||
class FEWithMeta(FE):
    """FE subclass that also remembers metadata of the most recently parsed packet.

    After each successful parse in ``get_next_vector`` the instance exposes
    ``_last_ts`` (float timestamp or NaN), ``_last_5tuple`` (despite the name a
    4-tuple ``(srcIP, dstIP, src_port, dst_port)``) and ``_last_framelen``.
    """

    def __init__(self, path, limit=np.inf):
        """Initialise the base extractor and clear the per-packet metadata slots."""
        super().__init__(path, limit)
        self._last_ts = None
        self._last_5tuple = None
        self._last_framelen = None

    def get_next_vector(self):
        """Parse the next TSV row, record its metadata, and return its feature vector.

        Returns ``[]`` when the packet limit is reached, when ``parse_type`` is
        not 'tsv', or when ``nstat.updateGetStats`` raises.
        """
        # NOTE(review): nesting was reconstructed from flattened source; the `else: return []`
        # is assumed to pair with the outer parse_type check (as in upstream Kitsune FE) — confirm.
        if self.curPacketIndx == self.limit:
            if self.parse_type == 'tsv':
                self.tsvinf.close()
            return []
        if self.parse_type == 'tsv':
            row = self.tsvin.__next__()
            IPtype = np.nan
            timestamp = row[0]
            framelen = row[1]
            srcIP = ''
            dstIP = ''
            # Two alternative address column pairs; presumably IPv4 (4/5) and IPv6 (17/18) — confirm.
            if row[4] != '':
                (srcIP, dstIP, IPtype) = (row[4], row[5], 0)
            elif row[17] != '':
                (srcIP, dstIP, IPtype) = (row[17], row[18], 1)
            # Port strings are built by concatenating two column pairs (presumably TCP/UDP — confirm).
            srcproto = row[6] + row[8]
            dstproto = row[7] + row[9]
            (srcMAC, dstMAC) = (row[2], row[3])
            if srcproto == '':
                # No port information: classify as ARP, ICMP, or fall back to the MAC columns.
                if row[12] != '':
                    (srcproto, dstproto) = ('arp', 'arp')
                    (srcIP, dstIP, IPtype) = (row[14], row[16], 0)
                elif row[10] != '':
                    (srcproto, dstproto, IPtype) = ('icmp', 'icmp', 0)
                elif srcIP + srcproto + dstIP + dstproto == '':
                    (srcIP, dstIP) = (row[2], row[3])
        else:
            return []
        # Numeric ports for the metadata tuple; non-digit tags such as 'arp'/'icmp' become 0.
        try:
            sp = int(srcproto) if srcproto.isdigit() else 0
            dp = int(dstproto) if dstproto.isdigit() else 0
        except Exception:
            (sp, dp) = (0, 0)
        try:
            self._last_ts = float(timestamp)
        except Exception:
            self._last_ts = np.nan
        self._last_5tuple = (srcIP, dstIP, sp, dp)
        try:
            self._last_framelen = int(framelen)
        except Exception:
            self._last_framelen = 0
        self.curPacketIndx += 1
        try:
            return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto, int(framelen), float(timestamp))
        except Exception as e:
            # An empty list is also the end-of-stream signal, so callers cannot tell the two apart.
            print(f' [warn] netStat error: {e}')
            return []
|
||||
|
||||
def _stream_pcap_kitsune(pcap_path: Path, *, kit: KitNET, fm_grace: int, ad_grace: int, packet_limit: int, fivetuple_to_rmses: dict, n_packets_total: list) -> None:
    """Stream one capture through ``kit`` and bucket non-zero scores per flow key.

    Every feature vector produced by ``FEWithMeta`` is passed to
    ``kit.process``. Scores that are ``None`` or 0 are dropped; the rest are
    appended to ``fivetuple_to_rmses`` under a direction-independent
    ``(ip, ip, port, port)`` key. ``n_packets_total[0]`` is incremented per
    packet. ``fm_grace`` / ``ad_grace`` are accepted but not used here.
    """
    print(f' [stream] {pcap_path.name}', flush=True)
    extractor = FEWithMeta(str(pcap_path), limit=packet_limit)
    t0 = time.time()
    n_local = 0
    while True:
        vector = extractor.get_next_vector()
        if len(vector) == 0:
            # Empty vector marks the end of the stream.
            break
        n_local += 1
        n_packets_total[0] += 1
        score = kit.process(vector)
        if score is None or score == 0 or extractor._last_5tuple is None:
            continue
        (ip_a, ip_b, port_a, port_b) = extractor._last_5tuple
        if (ip_a, port_a) <= (ip_b, port_b):
            flow_key = (ip_a, ip_b, port_a, port_b)
        else:
            flow_key = (ip_b, ip_a, port_b, port_a)
        fivetuple_to_rmses[flow_key].append(score)
        # Progress is only reported on packets that produced a recorded score.
        if n_local % 200000 == 0:
            print(f' [{n_local:,}] elapsed {time.time() - t0:.0f}s ({n_local / max(time.time() - t0, 0.001):.0f} pkt/s)', flush=True)
    print(f' [stream] {pcap_path.name} done: {n_local:,} packets in {time.time() - t0:.0f}s', flush=True)
|
||||
|
||||
def _flows_to_key(flows_df: pd.DataFrame) -> np.ndarray:
|
||||
keys = []
|
||||
for (src_ip, dst_ip, sp, dp) in zip(flows_df['src_ip'], flows_df['dst_ip'], flows_df['src_port'], flows_df['dst_port']):
|
||||
if (str(src_ip), int(sp)) <= (str(dst_ip), int(dp)):
|
||||
k = (str(src_ip), str(dst_ip), int(sp), int(dp))
|
||||
else:
|
||||
k = (str(dst_ip), str(src_ip), int(dp), int(sp))
|
||||
keys.append(k)
|
||||
return np.asarray(keys, dtype=object)
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def main():
    """CLI entry point: stream raw pcaps through one KitNET and score flows by 5-tuple match.

    Steps: parse arguments, load the model's config and dataset split, discover
    pcaps for the dataset, stream every pcap through a single KitNET instance
    collecting per-key RMSE lists, map each row of flows.parquet to a key,
    aggregate mean/max/median per flow, compute AUROC/AUPRC over flows that
    received at least one RMSE, and write a JSON summary plus an NPZ of scores.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS))
    p.add_argument('--seed', type=int, required=True)
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--fm-grace', type=int, default=5000)
    p.add_argument('--ad-grace', type=int, default=50000)
    p.add_argument('--max-ae-size', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.1)
    p.add_argument('--hidden-ratio', type=float, default=0.75)
    # NOTE(review): help text mentions None, but type=int offers no way to pass it from the CLI.
    p.add_argument('--packet-limit-per-pcap', type=int, default=2000000, help='Cap per-pcap packets to keep runtime tractable. None = full.')
    p.add_argument('--max-pcaps', type=int, default=None, help='Cap number of pcap files processed (default: all).')
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    (template, ds_name, caps) = WITHIN_DIRS[args.protocol]
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] kitsune_path_a protocol={args.protocol} seed={args.seed}')
    print(f'[run] dataset={ds_name} model_dir={model_dir}')
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())
    # Config values win over the protocol's fallback caps for attack_cap / val_cap.
    data = load_unified_data(packets_npz=Path(cfg['packets_npz']) if cfg.get('packets_npz') else None, source_store=Path(cfg['source_store']) if cfg.get('source_store') else None, flows_parquet=Path(cfg['flows_parquet']), flow_features_path=Path(cfg['flow_features_path']) if cfg.get('flow_features_path') else None, flow_feature_columns=cfg.get('flow_feature_columns'), flow_features_align=str(cfg.get('flow_features_align', 'auto')), T=int(cfg['T']), split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))), train_ratio=float(cfg.get('train_ratio', 0.8)), benign_label=str(cfg.get('benign_label', 'normal')), min_len=int(cfg.get('min_len', 2)), packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')), attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else caps['n_atk'], val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else caps['n_val'])
    flows_full = pd.read_parquet(cfg['flows_parquet'])
    print(f'[data] flows.parquet rows: {len(flows_full):,}; val={len(data.val_flow):,} attack={len(data.attack_flow):,}')
    from glob import glob
    # Sorted discovery keeps the streaming order deterministic; `p` is reused as a loop variable below.
    pcaps = sorted(glob(PCAP_GLOBS[ds_name], recursive=True))
    pcaps = [Path(p) for p in pcaps]
    if args.max_pcaps is not None:
        pcaps = pcaps[:args.max_pcaps]
    print(f'[pcap] discovered {len(pcaps)} pcap(s)')
    for p in pcaps[:5]:
        print(f' {p}')
    if len(pcaps) > 5:
        print(f' ...({len(pcaps) - 5} more)')
    # n=100 is hard-coded; presumably the width of the feature vector produced by FE — TODO confirm.
    kit = KitNET(n=100, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    fivetuple_to_rmses: dict = defaultdict(list)
    # One-element list so the streaming helper can update the running total in place.
    n_total = [0]
    t0 = time.time()
    for p in pcaps:
        _stream_pcap_kitsune(p, kit=kit, fm_grace=args.fm_grace, ad_grace=args.ad_grace, packet_limit=args.packet_limit_per_pcap, fivetuple_to_rmses=fivetuple_to_rmses, n_packets_total=n_total)
    elapsed = time.time() - t0
    print(f'[stream] total {n_total[0]:,} packets in {elapsed:.0f}s ({n_total[0] / max(elapsed, 0.001):.0f} pkt/s)')
    print(f'[stream] unique 5-tuples seen: {len(fivetuple_to_rmses):,}')
    keys_full = _flows_to_key(flows_full)
    print(f'[match] keying {len(keys_full):,} flows to 5-tuples')
    # NaN marks flows for which no packet RMSE was collected.
    flow_score_mean = np.full(len(flows_full), np.nan, dtype=np.float64)
    flow_score_max = np.full(len(flows_full), np.nan, dtype=np.float64)
    flow_score_median = np.full(len(flows_full), np.nan, dtype=np.float64)
    n_matched = 0
    for (i, k) in enumerate(keys_full):
        # .get avoids inserting empty lists into the defaultdict for unmatched flows.
        rl = fivetuple_to_rmses.get(tuple(k))
        if rl:
            flow_score_mean[i] = float(np.mean(rl))
            flow_score_max[i] = float(np.max(rl))
            flow_score_median[i] = float(np.median(rl))
            n_matched += 1
    print(f'[match] flows with RMSE coverage: {n_matched:,}/{len(flows_full):,} ({100 * n_matched / max(len(flows_full), 1):.1f}%)')
    # NOTE(review): val_flow_ids is computed but never used below.
    val_flow_ids = set((int(x) for x in data.val_flow_ids)) if hasattr(data, 'val_flow_ids') else None
    # Binary target over every row of flows.parquet: 1 = label differs from the benign label.
    bin_labels = (flows_full['label'].astype(str) != cfg.get('benign_label', 'normal')).astype(int).to_numpy()
    keys = ['mean', 'max', 'median']
    score_arrs = {'mean': flow_score_mean, 'max': flow_score_max, 'median': flow_score_median}
    overall = {}
    for k in keys:
        s = score_arrs[k]
        valid = ~np.isnan(s)
        # Too few matched flows: record NaN metrics and skip the summary print.
        if valid.sum() < 10:
            overall[k] = {'auroc': float('nan'), 'auprc': float('nan'), 'n_valid': int(valid.sum())}
            continue
        y = bin_labels[valid]
        sv = s[valid]
        overall[k] = {'auroc': _safe_metric(roc_auc_score, y, sv), 'auprc': _safe_metric(average_precision_score, y, sv), 'n_valid': int(valid.sum())}
        print(f" [{k}] AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f} (n_valid={overall[k]['n_valid']:,})")
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out = {'method': 'kitsune_path_a', 'protocol': args.protocol, 'seed': args.seed, 'dataset': ds_name, 'n_pcaps': len(pcaps), 'n_total_packets': int(n_total[0]), 'n_unique_5tuples': int(len(fivetuple_to_rmses)), 'n_flows_total': int(len(flows_full)), 'n_flows_matched': int(n_matched), 'fm_grace': args.fm_grace, 'ad_grace': args.ad_grace, 'packet_limit_per_pcap': args.packet_limit_per_pcap, 'elapsed_sec': round(elapsed, 1), 'overall_by_agg': overall}
    out_json.write_text(json.dumps(out, indent=2))
    # Per-flow scores (NaN where unmatched) and labels are stored next to the JSON.
    np.savez_compressed(out_json.with_suffix('.npz'), flow_score_mean=flow_score_mean.astype(np.float32), flow_score_max=flow_score_max.astype(np.float32), flow_score_median=flow_score_median.astype(np.float32), binary_label=bin_labels.astype(np.int8))
    print(f'[saved] {out_json}')


if __name__ == '__main__':
    main()
|
||||
227
scripts/baselines/run_shafir_nf.py
Normal file
227
scripts/baselines/run_shafir_nf.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import yaml
|
||||
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
|
||||
import optax
|
||||
from pzflow import Flow
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(REPO / 'Packet_CFM'))
|
||||
sys.path.insert(0, str(REPO / 'Unified_CFM'))
|
||||
from data import _apply_mixed_dequant, _zscore, load_unified_data
|
||||
from model import UnifiedCFMConfig, UnifiedTokenCFM
|
||||
from packet_store import PacketShardStore
|
||||
# Within-dataset protocols: name -> (artifact dir template, fallback val/attack caps).
# main() resolves the template as REPO / 'artifacts' / template.format(seed=...).
# NOTE(review): the 'ciciot_within' template has no {seed} placeholder, so every seed
# resolves to the same directory — confirm this is intended.
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': 20000}), 'ciciot_within': ('runs/unified_cfm_ciciot2023_shafir5_2026_04_29', {'n_val': 10000, 'n_atk': 30000})}

# Cross-dataset protocols: name -> spec with the source model template, the target
# dataset's store / flows / flow-feature paths (relative to REPO) and sample counts.
CROSS_DIRS = {'forward_cross': {'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'target_store': 'datasets/cicddos2019/processed/full_store', 'target_flows': 'datasets/cicddos2019/processed/flows.parquet', 'target_flow_features': 'datasets/cicddos2019/processed/flow_features.parquet', 'n_benign': 10000, 'n_attack': 10000}, 'reverse_cross': {'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'target_store': 'datasets/cicids2017/processed/full_store', 'target_flows': 'datasets/cicids2017/processed/flows.parquet', 'target_flow_features': 'datasets/cicids2017/processed/flow_features.parquet', 'n_benign': 10000, 'n_attack': 10000}}
|
||||
|
||||
def _load_within(model_dir: Path, n_val: int | None, n_atk: int | None, n_train_cap: int, seed: int) -> dict[str, Any]:
    """Load the within-dataset split described by ``model_dir/config.yaml``.

    The split is produced by ``load_unified_data``; afterwards train, val and
    attack flow arrays are randomly sub-sampled (in that order, from one
    generator seeded with ``seed``) down to ``n_train_cap`` / ``n_val`` /
    ``n_atk``. A cap of ``None`` for val/attack disables that sub-sampling.

    Returns:
        dict with 'train_flow', 'val_flow', 'atk_flow', 'atk_labels'.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Missing or falsy config entries are forwarded as None.
        return Path(cfg[key]) if cfg.get(key) else None

    load_kwargs = {
        'packets_npz': _optional_path('packets_npz'),
        'source_store': _optional_path('source_store'),
        'flows_parquet': Path(cfg['flows_parquet']),
        'flow_features_path': _optional_path('flow_features_path'),
        'flow_feature_columns': cfg.get('flow_feature_columns'),
        'flow_features_align': str(cfg.get('flow_features_align', 'auto')),
        'T': int(cfg['T']),
        'split_seed': int(cfg.get('data_seed', cfg.get('seed', 42))),
        'train_ratio': float(cfg.get('train_ratio', 0.8)),
        'benign_label': str(cfg.get('benign_label', 'normal')),
        'min_len': int(cfg.get('min_len', 2)),
        'packet_preprocess': str(cfg.get('packet_preprocess', 'mixed_dequant')),
        # Caps from the config take precedence over the caller-provided ones.
        'attack_cap': int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        'val_cap': int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    }
    data = load_unified_data(**load_kwargs)

    rng = np.random.default_rng(seed)

    def _draw(population: int, cap: int) -> np.ndarray:
        # Sorted sample without replacement keeps the original row order.
        return np.sort(rng.choice(population, size=cap, replace=False))

    # The order of the three draws fixes the random stream: train, val, attack.
    train_flow = data.train_flow
    if len(train_flow) > n_train_cap:
        train_flow = train_flow[_draw(len(train_flow), n_train_cap)]

    val_flow = data.val_flow
    (atk_flow, atk_labels) = (data.attack_flow, data.attack_labels)
    if n_val is not None and len(val_flow) > n_val:
        val_flow = val_flow[_draw(len(val_flow), n_val)]

    if n_atk is not None and len(atk_flow) > n_atk:
        keep = _draw(len(atk_flow), n_atk)
        atk_flow = atk_flow[keep]
        atk_labels = atk_labels[keep]

    return {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels}
|
||||
|
||||
def _load_cross(spec: dict[str, Any], ckpt_dict: dict[str, Any], seed: int, T: int, n_train_cap: int, *, source_flows: str | Path | None = None, source_flow_features: str | Path | None = None) -> dict[str, Any]:
    """Build flow-feature arrays for a cross-dataset run.

    Benign training flows come from the *source* dataset, benign validation and
    class-balanced attack flows from the *target* dataset named in ``spec``.
    All features are standardised with ``flow_mean`` / ``flow_std`` from the
    source checkpoint.

    Previously this function could never succeed: it always reached
    ``ckpt_dict_paths()``, which raises ``NotImplementedError``, and only after
    the target tables had already been read. The source tables can now be
    passed explicitly; when they are not, the legacy lookup is attempted
    first so the failure is immediate.

    Args:
        spec: entry of ``CROSS_DIRS`` ('target_flows', 'target_flow_features',
            'n_benign', 'n_attack' are read).
        ckpt_dict: checkpoint dict providing 'flow_mean', 'flow_std',
            'flow_feature_names'.
        seed: seed for target sampling; ``seed + 1000`` seeds source sampling.
        T: unused; kept for signature compatibility.
        n_train_cap: maximum number of source benign flows.
        source_flows: source flows parquet (joined onto ``REPO``).
        source_flow_features: source flow-features parquet (joined onto ``REPO``).

    Returns:
        dict with 'train_flow', 'val_flow', 'atk_flow', 'atk_labels', 'flow_names'.

    Raises:
        NotImplementedError: source paths omitted (legacy ``ckpt_dict_paths``).
        ValueError: a flows table and its feature table are not row-aligned.
    """
    # Resolve the source tables up front so a missing path fails before any I/O.
    if source_flows is None or source_flow_features is None:
        legacy = ckpt_dict_paths(ckpt_dict)
        if source_flows is None:
            source_flows = legacy['flows']
        if source_flow_features is None:
            source_flow_features = legacy['flow_features']

    flow_mean = np.asarray(ckpt_dict['flow_mean'], dtype=np.float32)
    flow_std = np.asarray(ckpt_dict['flow_std'], dtype=np.float32)
    flow_names = [str(n) for n in ckpt_dict['flow_feature_names']]
    (n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))

    def _read_aligned(flows_path, features_path, side):
        # Load labels + features and insist that both tables share row order.
        flows = pd.read_parquet(flows_path, columns=['flow_id', 'label'])
        feats = pd.read_parquet(features_path)
        if not np.array_equal(flows['flow_id'].to_numpy(dtype=np.uint64), feats['flow_id'].to_numpy(dtype=np.uint64)):
            raise ValueError(f'{side} flows and flow_features not row-aligned')
        return flows['label'].astype(str).to_numpy(), feats

    def _standardize(feats, idx):
        # Select rows, zero out non-finite values, then z-score with source stats.
        raw = feats.iloc[idx][flow_names].to_numpy(dtype=np.float64)
        raw = np.nan_to_num(raw, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        return ((raw - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)

    # ---- target dataset: benign validation + class-balanced attacks ----
    labels, ff = _read_aligned(REPO / spec['target_flows'], REPO / spec['target_flow_features'], 'target')
    rng = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(rng.choice(benign_idx, size=n_benign, replace=False))
    atk_classes = sorted(set(labels[attack_idx]))
    per_class = max(1, n_attack // len(atk_classes))
    a_sel_chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            a_sel_chunks.append(rng.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(a_sel_chunks))
    # max(1, ...) above can overshoot when there are more classes than n_attack.
    if len(a_sel) > n_attack:
        a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))
    val_flow = _standardize(ff, b_sel)
    atk_flow = _standardize(ff, a_sel)
    atk_labels = labels[a_sel]

    # ---- source dataset: capped benign training set ----
    src_labels, src_ff = _read_aligned(REPO / source_flows, REPO / source_flow_features, 'source')
    src_benign_idx = np.flatnonzero(src_labels == 'normal')
    rng2 = np.random.default_rng(seed + 1000)
    if len(src_benign_idx) > n_train_cap:
        src_benign_idx = np.sort(rng2.choice(src_benign_idx, size=n_train_cap, replace=False))
    train_flow = _standardize(src_ff, src_benign_idx)

    return {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels, 'flow_names': flow_names}
|
||||
|
||||
def ckpt_dict_paths(ckpt: dict[str, Any]) -> dict[str, str]:
    """Placeholder: always raises ``NotImplementedError``; ``ckpt`` is not inspected."""
    reason = 'paths must be passed via main()'
    raise NotImplementedError(reason)
|
||||
|
||||
def _train_and_score(train_flow: np.ndarray, val_flow: np.ndarray, atk_flow: np.ndarray, *, epochs: int, lr: float, optimizer: str, verbose: bool):
    """Fit a pzflow ``Flow`` on the training features and score val/attack sets.

    Columns are named ``x0 .. x{D-1}``. The anomaly score is the negated
    ``log_prob`` returned by the fitted flow.

    Returns:
        dict with 'score_val', 'score_atk' (float32), 'losses' (float64),
        't_train' and 't_score' (seconds).

    Raises:
        ValueError: ``optimizer`` is neither 'sgd' nor 'adam'.
    """
    names = [f'x{i}' for i in range(train_flow.shape[1])]

    def _frame(values: np.ndarray) -> pd.DataFrame:
        return pd.DataFrame(values.astype(np.float32), columns=names)

    df_train = _frame(train_flow)
    df_val = _frame(val_flow)
    df_atk = _frame(atk_flow)

    factories = {'sgd': optax.sgd, 'adam': optax.adam}
    if optimizer not in factories:
        raise ValueError(f'unknown optimizer {optimizer!r}')
    opt = factories[optimizer](learning_rate=lr)

    flow = Flow(df_train.columns.tolist())

    fit_start = time.time()
    losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=verbose)
    t_train = time.time() - fit_start

    score_start = time.time()
    lp_val = np.asarray(flow.log_prob(df_val))
    lp_atk = np.asarray(flow.log_prob(df_atk))
    t_score = time.time() - score_start

    return {
        'score_val': (-lp_val).astype(np.float32),
        'score_atk': (-lp_atk).astype(np.float32),
        'losses': np.asarray(losses, dtype=np.float64),
        't_train': t_train,
        't_score': t_score,
    }
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
    """Per-attack-class AUROC, each class compared against all benign scores.

    Returns:
        ``{class_label: {'_n': number of attack samples (float), 'auroc': float}}``,
        built in sorted class order.
    """
    n_benign = len(val_score)
    table = {}
    for name in sorted(set(atk_labels)):
        mask = atk_labels == name
        count = int(mask.sum())
        picked = atk_score[mask]
        truth = np.r_[np.zeros(n_benign), np.ones(len(picked))]
        merged = np.r_[val_score, picked]
        auroc = _safe_metric(roc_auc_score, truth, merged)
        table[name] = {'_n': float(count), 'auroc': auroc}
    return table
|
||||
|
||||
def main() -> None:
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
|
||||
p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
|
||||
p.add_argument('--out-dir', type=Path, required=True)
|
||||
p.add_argument('--n-train-cap', type=int, default=10000, help='Cap benign train (default 10k mirrors Shafir).')
|
||||
p.add_argument('--epochs', type=int, default=100)
|
||||
p.add_argument('--lr', type=float, default=0.001)
|
||||
p.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
|
||||
p.add_argument('--T', type=int, default=64)
|
||||
p.add_argument('--verbose', action='store_true')
|
||||
args = p.parse_args()
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
is_within = args.protocol in WITHIN_DIRS
|
||||
if is_within:
|
||||
(template, caps) = WITHIN_DIRS[args.protocol]
|
||||
model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
|
||||
else:
|
||||
spec = CROSS_DIRS[args.protocol]
|
||||
model_dir = REPO / 'artifacts' / spec['model_template'].format(seed=args.seed)
|
||||
print(f'[run] shafir_nf protocol={args.protocol} seed={args.seed}')
|
||||
print(f'[run] using normalization stats from {model_dir}/model.pt (source ckpt)')
|
||||
ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
|
||||
if is_within:
|
||||
arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
|
||||
else:
|
||||
cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())
|
||||
flows_parquet = Path(cfg['flows_parquet'])
|
||||
flow_features_path = Path(cfg['flow_features_path'])
|
||||
flow_mean = np.asarray(ckpt['flow_mean'], dtype=np.float32)
|
||||
flow_std = np.asarray(ckpt['flow_std'], dtype=np.float32)
|
||||
flow_names = [str(n) for n in ckpt['flow_feature_names']]
|
||||
src_flows = pd.read_parquet(flows_parquet, columns=['flow_id', 'label'])
|
||||
src_ff = pd.read_parquet(flow_features_path)
|
||||
if not np.array_equal(src_flows['flow_id'].to_numpy(dtype=np.uint64), src_ff['flow_id'].to_numpy(dtype=np.uint64)):
|
||||
raise ValueError('source flows and flow_features not row-aligned')
|
||||
src_labels = src_flows['label'].astype(str).to_numpy()
|
||||
src_benign_idx = np.flatnonzero(src_labels == 'normal')
|
||||
rng2 = np.random.default_rng(args.seed + 1000)
|
||||
if len(src_benign_idx) > args.n_train_cap:
|
||||
src_benign_idx = np.sort(rng2.choice(src_benign_idx, size=args.n_train_cap, replace=False))
|
||||
src_train = src_ff.iloc[src_benign_idx][flow_names].to_numpy(dtype=np.float64)
|
||||
src_train = np.nan_to_num(src_train, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
||||
train_flow = ((src_train - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)
|
||||
target_store = REPO / spec['target_store']
|
||||
target_flows = REPO / spec['target_flows']
|
||||
target_flow_features = REPO / spec['target_flow_features']
|
||||
(n_benign, n_attack) = (int(spec['n_benign']), int(spec['n_attack']))
|
||||
flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
|
||||
ff = pd.read_parquet(target_flow_features)
|
||||
labels = flows['label'].astype(str).to_numpy()
|
||||
rng = np.random.default_rng(args.seed)
|
||||
b_sel = np.sort(rng.choice(np.flatnonzero(labels == 'normal'), size=n_benign, replace=False))
|
||||
atk_idx = np.flatnonzero(labels != 'normal')
|
||||
atk_classes = sorted(set(labels[atk_idx]))
|
||||
per_class_n = max(1, n_attack // len(atk_classes))
|
||||
chunks = []
|
||||
for cls in atk_classes:
|
||||
pool = atk_idx[labels[atk_idx] == cls]
|
||||
k = min(per_class_n, len(pool))
|
||||
if k:
|
||||
chunks.append(rng.choice(pool, size=k, replace=False))
|
||||
a_sel = np.sort(np.concatenate(chunks))
|
||||
if len(a_sel) > n_attack:
|
||||
a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))
|
||||
|
||||
def _flow_only(idx):
|
||||
f = ff.iloc[idx][flow_names].to_numpy(dtype=np.float64)
|
||||
f = np.nan_to_num(f, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
|
||||
return ((f - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)
|
||||
val_flow = _flow_only(b_sel)
|
||||
atk_flow = _flow_only(a_sel)
|
||||
atk_labels = labels[a_sel]
|
||||
arrays = {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels}
|
||||
print(f"[data] train={len(arrays['train_flow']):,} val={len(arrays['val_flow']):,} attack={len(arrays['atk_flow']):,} D={arrays['train_flow'].shape[1]}")
|
||||
res = _train_and_score(arrays['train_flow'], arrays['val_flow'], arrays['atk_flow'], epochs=args.epochs, lr=args.lr, optimizer=args.optimizer, verbose=args.verbose)
|
||||
(val_score, atk_score) = (res['score_val'], res['score_atk'])
|
||||
y = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
|
||||
s = np.r_[val_score, atk_score]
|
||||
overall = {'neg_log_prob': {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}}
|
||||
per_cls = _per_class(val_score, atk_score, np.asarray(arrays['atk_labels']).astype(str))
|
||||
out = {'method': 'shafir_nf', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train': int(len(arrays['train_flow'])), 'n_val': int(len(arrays['val_flow'])), 'n_atk': int(len(arrays['atk_flow'])), 'epochs': args.epochs, 'lr': args.lr, 'optimizer': args.optimizer, 't_train_sec': round(res['t_train'], 2), 't_score_sec': round(res['t_score'], 2), 'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])], 'overall': overall, 'per_class': per_cls}
|
||||
out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
|
||||
out_json.write_text(json.dumps(out, indent=2))
|
||||
npz_path = out_json.with_suffix('.npz')
|
||||
np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=np.asarray(arrays['atk_labels']).astype(str), losses=res['losses'])
|
||||
print(f'[saved] {out_json}')
|
||||
print(f'[saved] {npz_path}')
|
||||
print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s score={res['t_score']:.1f}s")
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
38
scripts/baselines/run_shafir_nf_all.sh
Executable file
38
scripts/baselines/run_shafir_nf_all.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Sweep driver for the shafir_nf baseline.
#
# Invokes scripts/baselines/run_shafir_nf.py once for every (protocol, seed)
# combination and mirrors all output to the terminal and to
# $OUT_DIR/master.log. Combinations whose result JSON already exists are
# skipped.
#
# Optional environment overrides (the first two are space-separated lists):
#   PROTOCOLS, SEEDS, EPOCHS, LR, OPTIMIZER
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/shafir_nf_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
# Every invocation starts from an empty master log.
: > "$LOG"

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
EPOCHS="${EPOCHS:-100}"
LR="${LR:-0.001}"
OPTIMIZER="${OPTIMIZER:-sgd}"

# Print one message to stdout and append it to the master log.
log() {
  echo "$1" | tee -a "$LOG"
}

# Run a single (protocol, seed) cell unless its result JSON already exists.
# Arguments: $1 - protocol name, $2 - seed
run_cell() {
  local protocol=$1
  local seed=$2
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    log "[skip] $out_json exists"
    return 0
  fi

  log "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ==="
  started=$(date +%s)

  local -a cmd=(
    uv run --no-sync python scripts/baselines/run_shafir_nf.py
    --protocol "$protocol" --seed "$seed"
    --out-dir "$OUT_DIR"
    --epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER"
  )
  # With pipefail + errexit a failing run aborts the whole sweep here.
  "${cmd[@]}" 2>&1 | tee -a "$LOG"

  finished=$(date +%s)
  log "[done] elapsed=$((finished - started))s $out_json"
}

# PROTOCOLS and SEEDS are whitespace-separated lists; splitting is intended.
# shellcheck disable=SC2086
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_cell "$protocol" "$seed"
  done
done

echo "ALL DONE"
|
||||
265
scripts/baselines/run_shafir_nf_csv.py
Normal file
265
scripts/baselines/run_shafir_nf_csv.py
Normal file
@@ -0,0 +1,265 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
|
||||
warnings.filterwarnings('ignore')
|
||||
import optax
|
||||
from pzflow import Flow
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
REPO = Path(__file__).resolve().parents[2]
|
||||
IDS2017_FEATURES = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
|
||||
TOR2016_FEATURES = ['Protocol', 'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
|
||||
CICIOT5_FEATURES = ['HTTPS', 'Protocol Type', 'Magnitude', 'Variance', 'fin_count']
|
||||
CICIDS_BEST5_FEATURES = ['Bwd Packet Length Mean', 'Fwd Packets/s', 'ACK Flag Count', 'Total Length of Bwd Packets', 'Flow Duration']
|
||||
TOR_BEST4_FEATURES = ['Flow IAT Std', 'Flow Bytes/s', 'Flow Packets/s', 'Bwd IAT Max']
|
||||
COLUMN_ALIASES = {'Total Fwd Packets': ['Total Fwd Packet'], 'Total Backward Packets': ['Total Bwd packets'], 'Total Length of Fwd Packets': ['Total Length of Fwd Packet'], 'Total Length of Bwd Packets': ['Total Length of Bwd Packet'], 'Fwd Header Length': ['Fwd Header Length.1'], 'Init_Win_bytes_forward': ['FWD Init Win Bytes', 'Init Win Bytes Fwd'], 'Init_Win_bytes_backward': ['Bwd Init Win Bytes', 'Init Win Bytes Bwd'], 'act_data_pkt_fwd': ['Fwd Act Data Pkts'], 'min_seg_size_forward': ['Fwd Seg Size Min'], 'Avg Fwd Segment Size': ['Fwd Segment Size Avg'], 'Avg Bwd Segment Size': ['Bwd Segment Size Avg'], 'Min Packet Length': ['Packet Length Min'], 'Max Packet Length': ['Packet Length Max']}
|
||||
DATASETS = {'iscxtor': {'csv_glob': str(REPO / 'datasets/iscxtor2016/raw/csv/Scenario-*-merged_5s.csv'), 'label_col': 'label', 'benign_values': ['nonTOR'], 'drop_patterns': [], 'feature_set': TOR_BEST4_FEATURES}, 'cicids2017': {'csv_glob': str(REPO / 'datasets/cicids2017/raw/csv/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [' - Attempted', '- Attempted'], 'feature_set': CICIDS_BEST5_FEATURES}, 'cicddos2019': {'csv_glob': str(REPO / 'datasets/cicddos2019/raw/csv/**/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [], 'feature_set': CICIDS_BEST5_FEATURES}, 'ciciot2023': {'csv_glob': str(REPO / 'datasets/ciciot2023/raw/csv/CSV/*/*.pcap.csv'), 'label_col': None, 'benign_folder': 'Benign_Final', 'drop_patterns': [], 'feature_set': CICIOT5_FEATURES}}
|
||||
PROTOCOL_CONFIG = {'iscxtor_within': ('iscxtor', 'iscxtor', {'n_train': 10000, 'n_val': 10000, 'n_attack': None}), 'cicids_within': ('cicids2017', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'cicddos_within': ('cicddos2019', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 20000}), 'ciciot_within': ('ciciot2023', 'ciciot2023', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'forward_cross': ('cicids2017', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000}), 'reverse_cross': ('cicddos2019', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000})}
|
||||
|
||||
def _resolve_columns(df: pd.DataFrame, names: list[str]) -> tuple[list[str], list[str]]:
|
||||
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
|
||||
(resolved, missing) = ([], [])
|
||||
for n in names:
|
||||
if n in df.columns:
|
||||
resolved.append(n)
|
||||
continue
|
||||
found = None
|
||||
for alias in COLUMN_ALIASES.get(n, []):
|
||||
if alias in df.columns:
|
||||
found = alias
|
||||
break
|
||||
if found is None:
|
||||
low = {c.lower(): c for c in df.columns}
|
||||
if n.lower() in low:
|
||||
found = low[n.lower()]
|
||||
if found is None:
|
||||
missing.append(n)
|
||||
else:
|
||||
resolved.append(found)
|
||||
return (resolved, missing)
|
||||
|
||||
def _load_csvs(dataset_name: str, return_paths: bool=False):
    """Return the sorted list of CSV paths configured for ``dataset_name``.

    Despite the name, no file is read here: the function expands the
    dataset's ``csv_glob`` pattern (recursively) and reports how many files
    matched.

    Args:
        dataset_name: key into ``DATASETS``.
        return_paths: accepted for backward compatibility only. Both settings
            have always produced the same path list, so the flag is ignored.

    Returns:
        Sorted list of matching file paths.

    Raises:
        KeyError: if ``dataset_name`` is not in ``DATASETS``.
        FileNotFoundError: if the glob pattern matches nothing.
    """
    cfg = DATASETS[dataset_name]
    paths = sorted(glob(cfg['csv_glob'], recursive=True))
    if not paths:
        raise FileNotFoundError(f"no CSVs match {cfg['csv_glob']}")
    print(f' [csv] {dataset_name}: {len(paths)} files')
    return paths
|
||||
|
||||
def _attach_labels(df: pd.DataFrame, dataset_name: str, source_path: str | None=None) -> pd.DataFrame:
    """Return a copy of ``df`` with ``cls_label`` and ``binary_label`` columns.

    Two labelling modes are selected by the dataset's ``label_col`` setting:

    * ``label_col is None``: the name of the directory containing
      ``source_path`` is the class label, and it is benign (0) only when it
      equals the configured ``benign_folder``.
    * otherwise: the label column is located by whitespace-insensitive name
      match, rows whose label contains any ``drop_patterns`` substring are
      removed, and a label is benign (0) when it is in ``benign_values``.

    Raises:
        KeyError: if the configured label column is absent from ``df``.
    """
    cfg = DATASETS[dataset_name]

    if cfg.get('label_col') is None:
        # Folder-labelled dataset: the parent directory names the class.
        folder = Path(source_path).parent.name
        labelled = df.copy()
        labelled['cls_label'] = folder
        labelled['binary_label'] = 0 if folder == cfg['benign_folder'] else 1
        return labelled

    wanted = cfg['label_col'].strip()
    source_col = next(
        (col for col in df.columns if isinstance(col, str) and col.strip() == wanted),
        None,
    )
    if source_col is None:
        raise KeyError(f'label column {wanted!r} not found in {source_path}')

    labelled = df.copy()
    labelled['cls_label'] = labelled[source_col].astype(str).str.strip()
    for pattern in cfg['drop_patterns']:
        unwanted = labelled['cls_label'].str.contains(pattern, na=False, regex=False)
        labelled = labelled[~unwanted]
    labelled['binary_label'] = labelled['cls_label'].apply(
        lambda value: 0 if value in cfg['benign_values'] else 1
    )
    return labelled
|
||||
|
||||
def _load_dataset(dataset_name: str, feature_set: list[str]) -> tuple[pd.DataFrame, list[str]]:
    """Load, label and clean every CSV of a dataset into one frame.

    For each CSV: read it, attach ``cls_label``/``binary_label``, resolve the
    requested features to the file's actual column names, and keep only those
    columns (renamed back to the canonical feature names). The per-file frames
    are concatenated, feature columns are coerced to numeric, infinities are
    turned into NaN, and rows with a NaN in any kept feature are dropped.

    Args:
        dataset_name: key into ``DATASETS``.
        feature_set: canonical feature names to extract.

    Returns:
        ``(full, feat_cols)`` - the cleaned frame and the subset of
        ``feature_set`` (in that order) present in it.

    Raises:
        RuntimeError: if no CSV could be read.
    """
    paths = _load_csvs(dataset_name)
    dfs = []
    for p in paths:
        try:
            df = pd.read_csv(p, low_memory=False)
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f' [csv-warn] skip {p}: {e}')
            continue
        df = _attach_labels(df, dataset_name, source_path=p)
        # Also strips df's column labels in place, which the selection below needs.
        (resolved, missing) = _resolve_columns(df, feature_set)
        if missing:
            # Warn once per (dataset, missing-set); the memo lives on the function object.
            if not hasattr(_load_dataset, '_warned'):
                _load_dataset._warned = set()
            key = (dataset_name, tuple(missing))
            if key not in _load_dataset._warned:
                _load_dataset._warned.add(key)
                print(f' [warn] {Path(p).name}: missing {missing}')
        sub = df[resolved + ['binary_label', 'cls_label']].copy()
        # resolved is in feature_set order minus the missing names, so zip realigns them.
        rename = {r: n for (r, n) in zip(resolved, [f for f in feature_set if f not in missing])}
        sub = sub.rename(columns=rename)
        dfs.append(sub)
    if not dfs:
        raise RuntimeError(f'no usable CSVs for {dataset_name}')
    full = pd.concat(dfs, axis=0, ignore_index=True)
    for c in [c for c in feature_set if c in full.columns]:
        full[c] = pd.to_numeric(full[c], errors='coerce')
    full = full.replace([np.inf, -np.inf], np.nan)
    feat_cols = [c for c in feature_set if c in full.columns]
    full = full.dropna(subset=feat_cols).reset_index(drop=True)
    print(f' [csv] {dataset_name} concat: {len(full):,} rows benign={int((full.binary_label == 0).sum()):,} attack={int((full.binary_label == 1).sum()):,} features_kept={len(feat_cols)}')
    return (full, feat_cols)
|
||||
|
||||
def _sample_within(df: pd.DataFrame, caps: dict, seed: int):
|
||||
rng = np.random.default_rng(seed)
|
||||
benign = df[df.binary_label == 0]
|
||||
attack = df[df.binary_label == 1]
|
||||
n_train = caps['n_train']
|
||||
n_val = caps['n_val']
|
||||
n_atk = caps['n_attack']
|
||||
needed_b = n_train + n_val
|
||||
if len(benign) < needed_b:
|
||||
raise RuntimeError(f'only {len(benign)} benign rows, need {needed_b}')
|
||||
b_idx = rng.permutation(len(benign))
|
||||
train = benign.iloc[b_idx[:n_train]]
|
||||
val = benign.iloc[b_idx[n_train:n_train + n_val]]
|
||||
if n_atk is None:
|
||||
atk = attack
|
||||
else:
|
||||
atk_classes = sorted(attack['cls_label'].unique())
|
||||
per = max(1, n_atk // len(atk_classes))
|
||||
chunks = []
|
||||
for cls in atk_classes:
|
||||
pool = attack[attack['cls_label'] == cls]
|
||||
k = min(per, len(pool))
|
||||
if k:
|
||||
chunks.append(pool.sample(n=k, random_state=seed))
|
||||
atk = pd.concat(chunks, axis=0, ignore_index=True)
|
||||
if len(atk) > n_atk:
|
||||
atk = atk.sample(n=n_atk, random_state=seed)
|
||||
return (train, val, atk)
|
||||
|
||||
def _sample_cross(src_df, tgt_df, caps, seed):
|
||||
rng = np.random.default_rng(seed + 1000)
|
||||
src_benign = src_df[src_df.binary_label == 0]
|
||||
if len(src_benign) < caps['n_train']:
|
||||
raise RuntimeError(f"src benign only {len(src_benign)}, need {caps['n_train']}")
|
||||
sb_idx = rng.permutation(len(src_benign))
|
||||
train = src_benign.iloc[sb_idx[:caps['n_train']]]
|
||||
rng2 = np.random.default_rng(seed)
|
||||
tgt_benign = tgt_df[tgt_df.binary_label == 0]
|
||||
tgt_attack = tgt_df[tgt_df.binary_label == 1]
|
||||
if len(tgt_benign) < caps['n_val']:
|
||||
raise RuntimeError(f'tgt benign only {len(tgt_benign)}')
|
||||
tb_idx = rng2.permutation(len(tgt_benign))
|
||||
val = tgt_benign.iloc[tb_idx[:caps['n_val']]]
|
||||
atk_classes = sorted(tgt_attack['cls_label'].unique())
|
||||
per = max(1, caps['n_attack'] // len(atk_classes))
|
||||
chunks = []
|
||||
for cls in atk_classes:
|
||||
pool = tgt_attack[tgt_attack['cls_label'] == cls]
|
||||
k = min(per, len(pool))
|
||||
if k:
|
||||
chunks.append(pool.sample(n=k, random_state=seed))
|
||||
atk = pd.concat(chunks, axis=0, ignore_index=True)
|
||||
if len(atk) > caps['n_attack']:
|
||||
atk = atk.sample(n=caps['n_attack'], random_state=seed)
|
||||
return (train, val, atk)
|
||||
|
||||
def _safe_metric(fn, y, s) -> float:
|
||||
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
|
||||
try:
|
||||
return float(fn(y, s))
|
||||
except ValueError:
|
||||
return float('nan')
|
||||
|
||||
def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer):
    """Fit a pzflow ``Flow`` on the training features and score val/attack rows.

    Steps: drop feature columns with zero standard deviation in the training
    sample, standardise all three splits with a ``StandardScaler`` fitted on
    the training split, clip the standardised values to [-30, 30], train the
    flow, and evaluate its log-probability on the validation and attack rows.

    Args:
        train, val, atk: frames containing the ``feat_cols`` columns.
        feat_cols: feature column names to use.
        epochs: number of training epochs passed to ``Flow.train``.
        lr: learning rate for the optax optimiser.
        optimizer: ``'sgd'`` selects ``optax.sgd``; any other value selects
            ``optax.adam``.

    Returns:
        dict with ``score_val`` / ``score_atk`` (negated log-probabilities as
        float32), ``losses`` (float64 array of what ``Flow.train`` returned)
        and the wall-clock ``t_train`` / ``t_score`` in seconds.
    """
    train_matrix = train[feat_cols].astype(np.float64).values
    varying = train_matrix.std(axis=0) > 0
    if not varying.all():
        constant_cols = [name for (name, ok) in zip(feat_cols, varying) if not ok]
        print(f' [train] dropping {len(constant_cols)} zero-variance cols: {constant_cols}')
        feat_cols = [name for (name, ok) in zip(feat_cols, varying) if ok]
        train_matrix = train_matrix[:, varying]
    val_matrix = val[feat_cols].astype(np.float64).values
    atk_matrix = atk[feat_cols].astype(np.float64).values

    clip_lim = 30.0
    latent_names = [f'x{i}' for i in range(len(feat_cols))]

    def _as_frame(standardised):
        # Clip extreme z-scores, then hand the flow a float32 frame.
        clipped = np.clip(standardised, -clip_lim, clip_lim)
        return pd.DataFrame(clipped.astype(np.float32), columns=latent_names)

    scaler = StandardScaler()
    df_train = _as_frame(scaler.fit_transform(train_matrix))
    df_val = _as_frame(scaler.transform(val_matrix))
    df_atk = _as_frame(scaler.transform(atk_matrix))

    make_opt = optax.sgd if optimizer == 'sgd' else optax.adam
    opt = make_opt(learning_rate=lr)
    flow = Flow(df_train.columns.tolist())

    train_started = time.time()
    losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False)
    t_train = time.time() - train_started

    score_started = time.time()
    lp_val = np.asarray(flow.log_prob(df_val))
    lp_atk = np.asarray(flow.log_prob(df_atk))
    t_score = time.time() - score_started

    return {
        'score_val': (-lp_val).astype(np.float32),
        'score_atk': (-lp_atk).astype(np.float32),
        'losses': np.asarray(losses, dtype=np.float64),
        't_train': t_train,
        't_score': t_score,
    }
|
||||
|
||||
def _per_class(val_score, atk_score, atk_labels):
    """Compute a one-vs-benign AUROC for every attack class.

    For each distinct label in ``atk_labels`` the benign validation scores
    (target 0) are pitted against that class's attack scores (target 1).

    Returns:
        dict mapping class label to ``{'_n': <class size as float>,
        'auroc': <AUROC or NaN>}``, built in sorted label order.
    """

    def _entry(cls):
        selected = atk_labels == cls
        cls_scores = atk_score[selected]
        truth = np.r_[np.zeros(len(val_score)), np.ones(len(cls_scores))]
        scores = np.r_[val_score, cls_scores]
        return {
            '_n': float(int(selected.sum())),
            'auroc': _safe_metric(roc_auc_score, truth, scores),
        }

    return {cls: _entry(cls) for cls in sorted(set(atk_labels))}
|
||||
|
||||
def main():
    """CLI entry point: run one protocol/seed cell of the CSV-based baseline.

    Loads the source (and, for cross protocols, target) dataset using the
    source dataset's feature set, samples train/val/attack rows, trains and
    scores the flow, then writes ``<protocol>_seed<seed>.json`` (metrics and
    run metadata) and a sibling ``.npz`` (raw scores, attack labels, losses)
    into ``--out-dir``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(PROTOCOL_CONFIG))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--epochs', type=int, default=100)
    p.add_argument('--lr', type=float, default=0.001)
    p.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)
    (src_name, tgt_name, caps) = PROTOCOL_CONFIG[args.protocol]
    cross = src_name != tgt_name
    print(f'[run] shafir_nf_csv protocol={args.protocol} seed={args.seed}')
    print(f' src={src_name} tgt={tgt_name} cross={cross}')
    # The source dataset's feature set drives both sides of a cross protocol.
    feat_set = DATASETS[src_name]['feature_set']
    (src_df, src_feat_cols) = _load_dataset(src_name, feat_set)
    if cross:
        (tgt_df, tgt_feat_cols) = _load_dataset(tgt_name, feat_set)
        # Only features available in both datasets can be used.
        feat_cols = [c for c in feat_set if c in src_feat_cols and c in tgt_feat_cols]
        print(f' [features] cross intersection: {len(feat_cols)} cols')
        (train, val, atk) = _sample_cross(src_df, tgt_df, caps, args.seed)
    else:
        feat_cols = src_feat_cols
        print(f' [features] within: {len(feat_cols)} cols')
        (train, val, atk) = _sample_within(src_df, caps, args.seed)
    print(f' [data] train={len(train):,} val={len(val):,} attack={len(atk):,} D={len(feat_cols)}')
    res = _train_and_score(train, val, atk, feat_cols, epochs=args.epochs, lr=args.lr, optimizer=args.optimizer)
    (val_score, atk_score) = (res['score_val'], res['score_atk'])
    # Benign validation rows are the negative class, attacks the positive class.
    y = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
    s = np.r_[val_score, atk_score]
    overall = {'neg_log_prob': {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}}
    a_labels = atk['cls_label'].astype(str).to_numpy()
    per_cls = _per_class(val_score, atk_score, a_labels)
    out = {'method': 'shafir_nf_csv', 'protocol': args.protocol, 'seed': args.seed, 'src_dataset': src_name, 'tgt_dataset': tgt_name, 'feature_set': feat_cols, 'n_features': len(feat_cols), 'n_train': len(train), 'n_val': len(val), 'n_atk': len(atk), 'epochs': args.epochs, 'lr': args.lr, 'optimizer': args.optimizer, 't_train_sec': round(res['t_train'], 2), 't_score_sec': round(res['t_score'], 2), 'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])], 'overall': overall, 'per_class': per_cls}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    npz_path = out_json.with_suffix('.npz')
    np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=a_labels.astype(str), losses=res['losses'])
    print(f'[saved] {out_json}')
    # Report the score archive too, as the non-CSV runner does.
    print(f'[saved] {npz_path}')
    print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s")
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
37
scripts/baselines/run_shafir_nf_csv_all.sh
Executable file
37
scripts/baselines/run_shafir_nf_csv_all.sh
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
# Sweep driver for the CSV-based shafir_nf baseline.
#
# Invokes scripts/baselines/run_shafir_nf_csv.py once for every
# (protocol, seed) combination and mirrors all output to the terminal and to
# $OUT_DIR/master.log. Combinations whose result JSON already exists are
# skipped.
#
# Optional environment overrides (the first two are space-separated lists):
#   PROTOCOLS, SEEDS, EPOCHS, LR, OPTIMIZER
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/shafir_nf_csv_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
# Every invocation starts from an empty master log.
: > "$LOG"

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within ciciot_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
EPOCHS="${EPOCHS:-100}"
LR="${LR:-0.001}"
OPTIMIZER="${OPTIMIZER:-sgd}"

# Print one message to stdout and append it to the master log.
log() {
  echo "$1" | tee -a "$LOG"
}

# Run a single (protocol, seed) cell unless its result JSON already exists.
# Arguments: $1 - protocol name, $2 - seed
run_cell() {
  local protocol=$1
  local seed=$2
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    log "[skip] $out_json exists"
    return 0
  fi

  log "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ==="
  started=$(date +%s)

  local -a cmd=(
    uv run --no-sync python scripts/baselines/run_shafir_nf_csv.py
    --protocol "$protocol" --seed "$seed"
    --out-dir "$OUT_DIR"
    --epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER"
  )
  # With pipefail + errexit a failing run aborts the whole sweep here.
  "${cmd[@]}" 2>&1 | tee -a "$LOG"

  finished=$(date +%s)
  log "[done] elapsed=$((finished - started))s $out_json"
}

# PROTOCOLS and SEEDS are whitespace-separated lists; splitting is intended.
# shellcheck disable=SC2086
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_cell "$protocol" "$seed"
  done
done
echo "ALL DONE"
|
||||
87
scripts/compute_shafir5_features.py
Normal file
87
scripts/compute_shafir5_features.py
Normal file
@@ -0,0 +1,87 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
REPO = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(REPO / 'Packet_CFM'))
|
||||
from packet_store import PacketShardStore
|
||||
SHAFIR5_FEATURE_NAMES = ('HTTPS', 'Protocol_Type', 'Magnitude', 'Variance', 'fin_count')
|
||||
|
||||
def _compute_batch(tokens: np.ndarray, lens: np.ndarray, dst_ports: np.ndarray, protocols: np.ndarray) -> np.ndarray:
|
||||
(B, T, _) = tokens.shape
|
||||
out = np.zeros((B, 5), dtype=np.float32)
|
||||
arange = np.arange(T)[None, :]
|
||||
mask = arange < lens[:, None]
|
||||
log_size = tokens[:, :, 0]
|
||||
sizes = np.expm1(np.maximum(log_size, 0.0))
|
||||
sizes = np.where(mask, sizes, 0.0)
|
||||
n = lens.astype(np.float32)
|
||||
n_safe = np.maximum(n, 1.0)
|
||||
sum_sq = (sizes * sizes).sum(axis=1)
|
||||
mean = sizes.sum(axis=1) / n_safe
|
||||
mean_sq = sum_sq / n_safe
|
||||
magnitude = np.sqrt(np.maximum(mean_sq, 0.0))
|
||||
variance = np.maximum(mean_sq - mean * mean, 0.0)
|
||||
fin_flags = tokens[:, :, 4]
|
||||
fin_flags = np.where(mask, fin_flags, 0.0)
|
||||
fin_count = fin_flags.sum(axis=1)
|
||||
https = (dst_ports == 443).astype(np.float32)
|
||||
proto_type = protocols.astype(np.float32)
|
||||
out[:, 0] = https
|
||||
out[:, 1] = proto_type
|
||||
out[:, 2] = magnitude
|
||||
out[:, 3] = variance
|
||||
out[:, 4] = fin_count
|
||||
return out
|
||||
|
||||
def main():
    """CLI entry point: stream a packet store and write the 5-feature table.

    Reads flow metadata from ``--flows-parquet``, verifies that the store's
    ``flow_id`` sequence is identical, computes the features batch by batch
    with ``_compute_batch`` and writes one parquet row per flow (``flow_id``,
    ``label`` and the ``SHAFIR5_FEATURE_NAMES`` columns) to ``--out``.

    Raises:
        ValueError: if the store and the parquet disagree on flow ordering.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--source-store', type=Path, required=True)
    parser.add_argument('--flows-parquet', type=Path, required=True)
    parser.add_argument('--out', type=Path, required=True)
    parser.add_argument('--T', type=int, default=None, help='Truncate to first T packets (default = stored).')
    parser.add_argument('--batch', type=int, default=100000)
    args = parser.parse_args()

    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label', 'dst_port', 'protocol'])
    flow_id = flows['flow_id'].to_numpy(dtype=np.uint64)
    labels = flows['label'].astype(str).to_numpy()
    dst_ports = flows['dst_port'].to_numpy(dtype=np.uint32)
    protocols = flows['protocol'].to_numpy(dtype=np.uint8)

    store = PacketShardStore.open(args.source_store)
    store_fid = store.read_flows(columns=['flow_id'])['flow_id'].to_numpy(dtype=np.uint64)
    # Features are written positionally, so both sources must list flows identically.
    ids_match = len(store_fid) == len(flow_id) and np.array_equal(store_fid, flow_id)
    if not ids_match:
        raise ValueError('store flow_id ordering differs from flows.parquet')

    T_stored = int(store.manifest['packet_length'].max())
    T = T_stored if args.T is None else args.T
    n = len(flows)
    feats = np.zeros((n, 5), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (stored {T_stored}), batch={args.batch}')

    started = time.time()
    for (batch_no, start) in enumerate(range(0, n, args.batch)):
        end = min(start + args.batch, n)
        rows = np.arange(start, end, dtype=np.int64)
        (tok, pkt_lens) = store.read_packets(rows, T=T)
        # Lengths are capped at T so the mask never reaches past the token window.
        pkt_lens = np.minimum(pkt_lens, T).astype(np.int32)
        feats[start:end] = _compute_batch(tok.astype(np.float32), pkt_lens, dst_ports[start:end], protocols[start:end])
        # Progress line every 20th batch and after the final one.
        if batch_no % 20 == 0 or end == n:
            dt = time.time() - started
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    table = {'flow_id': flow_id, 'label': labels}
    table.update({name: feats[:, i] for (i, name) in enumerate(SHAFIR5_FEATURE_NAMES)})
    df = pd.DataFrame(table)
    df.to_parquet(args.out, compression='snappy', index=False)

    print(f'[write] {args.out} rows={len(df):,} cols={list(df.columns)}')
    print(f'[stats] HTTPS=1 fraction: {(feats[:, 0] > 0).mean():.4f}')
    print(f'[stats] Protocol_Type unique values: {np.unique(feats[:, 1].astype(int))[:10]}')
    print(f'[stats] Magnitude mean={feats[:, 2].mean():.1f} median={np.median(feats[:, 2]):.1f}')
    print(f'[stats] Variance mean={feats[:, 3].mean():.1f}')
    print(f'[stats] fin_count mean={feats[:, 4].mean():.3f}')
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
119
scripts/convert_npz_splits_to_store.py
Normal file
119
scripts/convert_npz_splits_to_store.py
Normal file
@@ -0,0 +1,119 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import BinaryIO
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from numpy.lib import format as npy_format
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from packet_store import PacketShardStore, PacketShardWriter
|
||||
|
||||
def _read_npy_header(fp: BinaryIO) -> tuple[tuple[int, ...], np.dtype, bool]:
|
||||
version = npy_format.read_magic(fp)
|
||||
if version == (1, 0):
|
||||
(shape, fortran_order, dtype) = npy_format.read_array_header_1_0(fp)
|
||||
elif version == (2, 0):
|
||||
(shape, fortran_order, dtype) = npy_format.read_array_header_2_0(fp)
|
||||
else:
|
||||
raise ValueError(f'unsupported npy version {version}')
|
||||
return (tuple((int(v) for v in shape)), np.dtype(dtype), bool(fortran_order))
|
||||
|
||||
def _read_exact(fp: BinaryIO, n_bytes: int) -> bytes:
|
||||
chunks: list[bytes] = []
|
||||
remaining = int(n_bytes)
|
||||
while remaining:
|
||||
chunk = fp.read(remaining)
|
||||
if not chunk:
|
||||
raise EOFError(f'expected {n_bytes} bytes, missing {remaining}')
|
||||
chunks.append(chunk)
|
||||
remaining -= len(chunk)
|
||||
return b''.join(chunks)
|
||||
|
||||
def _open_member(zf: zipfile.ZipFile, name: str) -> tuple[BinaryIO, tuple[int, ...], np.dtype]:
    """Open an ``.npy`` member of a zip archive and consume its header.

    Args:
        zf: open archive.
        name: member name, e.g. ``'packet_tokens.npy'``.

    Returns:
        ``(fp, shape, dtype)`` where ``fp`` is positioned at the first byte of
        array data. The caller owns ``fp`` and must close it.

    Raises:
        ValueError: if the array is stored in Fortran order or the header
            uses an unsupported version.
    """
    fp = zf.open(name)
    try:
        (shape, dtype, fortran_order) = _read_npy_header(fp)
        if fortran_order:
            raise ValueError(f'{name} uses Fortran order, expected C order')
    except BaseException:
        # Never leak the member handle when the header is unusable.
        fp.close()
        raise
    return (fp, shape, dtype)
|
||||
|
||||
def _iter_npz_rows(npz_path: Path, rows: int, chunk_rows: int):
    """Stream the first ``rows`` rows of an ``.npz`` in chunks.

    The ``packet_tokens.npy`` and ``packet_lengths.npy`` members are read
    sequentially through the zip file objects, so only one chunk of each is
    held in memory at a time.

    Args:
        npz_path: archive containing both members.
        rows: number of leading rows to emit.
        chunk_rows: maximum rows per yielded chunk; must be at least 1.

    Yields:
        ``(start, tokens, lengths)`` where ``start`` is the row offset of the
        chunk, ``tokens`` has shape ``(take, T, D)`` and ``lengths`` has shape
        ``(take,)``. Both arrays are built with ``np.frombuffer`` over
        immutable bytes and are therefore read-only.

    Raises:
        ValueError: if ``chunk_rows`` is below 1, the member shapes are
            inconsistent, or ``rows`` exceeds the rows available.
        EOFError: if a member is shorter than its header claims.
    """
    chunk_rows = int(chunk_rows)
    if chunk_rows < 1:
        # A zero chunk size would otherwise yield empty chunks forever.
        raise ValueError(f'chunk_rows must be >= 1, got {chunk_rows}')
    with zipfile.ZipFile(npz_path) as zf:
        (token_fp, token_shape, token_dtype) = _open_member(zf, 'packet_tokens.npy')
        try:
            (length_fp, length_shape, length_dtype) = _open_member(zf, 'packet_lengths.npy')
        except BaseException:
            # Do not leak the first handle when the second member is unusable.
            token_fp.close()
            raise
        try:
            if len(token_shape) != 3:
                raise ValueError(f'packet_tokens.npy must be 3-D, got {token_shape}')
            if length_shape != (token_shape[0],):
                raise ValueError(f'packet_lengths.npy shape {length_shape} does not match tokens {token_shape}')
            if rows > token_shape[0]:
                raise ValueError(f'requested {rows} rows, but {npz_path} has {token_shape[0]}')
            row_values = int(np.prod(token_shape[1:], dtype=np.int64))
            token_row_bytes = row_values * token_dtype.itemsize
            length_row_bytes = length_dtype.itemsize
            emitted = 0
            while emitted < rows:
                take = min(chunk_rows, rows - emitted)
                token_bytes = _read_exact(token_fp, take * token_row_bytes)
                length_bytes = _read_exact(length_fp, take * length_row_bytes)
                tokens = np.frombuffer(token_bytes, dtype=token_dtype).reshape(take, token_shape[1], token_shape[2])
                lengths = np.frombuffer(length_bytes, dtype=length_dtype).reshape(take)
                yield (emitted, tokens, lengths)
                emitted += take
        finally:
            token_fp.close()
            length_fp.close()
|
||||
|
||||
def _npz_token_shape(npz_path: Path) -> tuple[int, int, int]:
    """Return the shape of ``packet_tokens.npy`` without loading its data.

    Only the member's header is parsed.

    Raises:
        ValueError: if the array is not 3-dimensional.
    """
    with zipfile.ZipFile(npz_path) as archive:
        (member_fp, token_shape, _unused_dtype) = _open_member(archive, 'packet_tokens.npy')
        member_fp.close()
    if len(token_shape) == 3:
        return token_shape
    raise ValueError(f'packet_tokens.npy must be 3-D, got {token_shape}')
|
||||
|
||||
def convert(args: argparse.Namespace) -> None:
    """Stream paired ``packets.npz`` / ``flows.parquet`` splits into one store.

    Each split's token array is streamed chunk by chunk into a
    ``PacketShardWriter`` together with the matching slice of its flow table.
    All splits must share the per-flow token shape of the first split, and
    each flow table must have exactly as many rows as its token array. After
    writing, the store is reopened and its row/shard counts and label
    histogram are printed.

    Raises:
        ValueError: on a shape mismatch between splits, or a row-count
            mismatch between a flow table and its token array.
    """
    pairs = list(zip(args.packets_npz, args.flows_parquet, strict=True))
    first_shape = _npz_token_shape(pairs[0][0])
    total_rows = 0
    shard_writer = PacketShardWriter(
        args.out_store,
        shard_size=args.shard_size,
        T_full=first_shape[1],
        D=first_shape[2],
        overwrite=args.overwrite,
    )
    with shard_writer as writer:
        for (split_id, (npz_path, flows_path)) in enumerate(pairs):
            token_shape = _npz_token_shape(npz_path)
            if token_shape[1:] != first_shape[1:]:
                raise ValueError(f'{npz_path} shape {token_shape} does not match {first_shape}')
            flows = pd.read_parquet(flows_path)
            if len(flows) != token_shape[0]:
                raise ValueError(f'{flows_path} has {len(flows)} rows but {npz_path} has {token_shape[0]}')
            # Row counts agree here, so either one is the split size.
            rows = len(flows)
            if args.max_rows_per_split > 0:
                rows = min(rows, args.max_rows_per_split)
            print(f'[split {split_id}] npz={npz_path} flows={flows_path} rows={rows:,} shape={token_shape}', flush=True)
            for (start, tokens, lengths) in _iter_npz_rows(npz_path, rows, args.chunk_rows):
                n_chunk = len(lengths)
                end = start + n_chunk
                writer.add_batch(tokens, lengths, flows.iloc[start:end].reset_index(drop=True))
                total_rows += n_chunk
                # Report when the running total crosses a report_every multiple,
                # and always at the end of the split.
                crossed_mark = total_rows % args.report_every < n_chunk
                if crossed_mark or end == rows:
                    print(f'[split {split_id}] emitted={end:,}/{rows:,} total={total_rows:,}', flush=True)
    store = PacketShardStore.open(args.out_store)
    label_frame = store.read_flows(columns=['label'])
    print(f"[done] store={args.out_store} rows={store.n_flows:,} shards={store.metadata['n_shards']}")
    print(label_frame['label'].value_counts().to_string())
|
||||
|
||||
def main() -> None:
    """Parse the command line and hand the resulting namespace to ``convert``.

    Exits via ``SystemExit`` when the number of ``--packets-npz`` paths does
    not equal the number of ``--flows-parquet`` paths.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    for path_list_flag in ('--packets-npz', '--flows-parquet'):
        cli.add_argument(path_list_flag, type=Path, nargs='+', required=True)
    cli.add_argument('--out-store', type=Path, required=True)
    int_options = (
        ('--shard-size', 50000),
        ('--chunk-rows', 10000),
        ('--report-every', 250000),
        ('--max-rows-per-split', 0),
    )
    for (flag, default) in int_options:
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument('--overwrite', action='store_true')
    args = cli.parse_args()
    n_npz = len(args.packets_npz)
    n_parquet = len(args.flows_parquet)
    if n_npz != n_parquet:
        raise SystemExit('--packets-npz and --flows-parquet must have the same count')
    convert(args)


if __name__ == '__main__':
    main()
|
||||
114
scripts/csv_adapter.py
Normal file
114
scripts/csv_adapter.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from __future__ import annotations
|
||||
import csv
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
import numpy as np
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
from extract_lib import _canonical_key
|
||||
|
||||
@dataclass(frozen=True)
class CsvFlowAdapter:
    """Immutable description of how one dataset's flow CSV is laid out.

    ``parse_csv_rows`` looks up the logical keys ``src_ip``, ``src_port``,
    ``dst_ip``, ``dst_port``, ``protocol`` and ``timestamp`` in ``join_cols``
    to find the matching CSV header names.
    """

    # logical field name -> CSV header name
    join_cols: dict[str, str]
    # CSV header name of the label column
    label_col: str
    # strptime formats tried in order by parse_timestamp
    timestamp_formats: tuple[str, ...]
    # stripped raw labels that are mapped to benign_token
    benign_aliases: frozenset[str]
    benign_token: str = 'normal'
    # rows whose raw label contains any of these substrings are dropped
    drop_label_patterns: tuple[str, ...] = ()
    # stripped raw label -> replacement label (for non-benign labels)
    label_aliases: dict[str, str] = field(default_factory=dict)
    # when set, bypasses all of the built-in label normalisation
    label_normalizer: Callable[[str], str] | None = None

    def normalize_label(self, raw: str) -> str:
        """Map a raw CSV label to its canonical form.

        A custom ``label_normalizer`` receives the raw, unstripped string and
        its result is returned as-is. Otherwise the label is stripped, benign
        aliases collapse to ``benign_token``, and any other label goes through
        ``label_aliases`` (unknown labels are returned stripped).
        """
        custom = self.label_normalizer
        if custom is not None:
            return custom(raw)
        stripped = raw.strip()
        if stripped in self.benign_aliases:
            return self.benign_token
        return self.label_aliases.get(stripped, stripped)

    def parse_timestamp(self, raw: str) -> float | None:
        """Parse ``raw`` with the first matching format in ``timestamp_formats``.

        Returns the value of ``datetime.timestamp()`` for the parsed datetime,
        or ``None`` when the stripped input is empty or no format matches.
        """
        text = raw.strip()
        if text:
            for fmt in self.timestamp_formats:
                try:
                    epoch = datetime.strptime(text, fmt).timestamp()
                except ValueError:
                    continue
                return epoch
        return None
|
||||
|
||||
def parse_csv_rows(*, csv_path: Path, row_idx_start: int, time_offset_seconds: float, adapter: CsvFlowAdapter, max_per_class: int | None=None, max_benign: int | None=None, rng: np.random.Generator | None=None) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int, dict[str, int]]:
    """Read one flow CSV and group the kept rows by canonical flow key.

    Args:
        csv_path: CSV file to read; the first row is the header (cells are
            stripped before lookup).
        row_idx_start: index assigned to the first kept row; later kept rows
            get consecutive indices.
        time_offset_seconds: added to every successfully parsed timestamp.
        adapter: column mapping and label/timestamp rules for this dataset.
        max_per_class: optional cap on kept rows per non-benign label.
        max_benign: optional cap on kept rows carrying ``adapter.benign_token``.
        rng: generator used for sub-sampling; when a cap is given and ``rng``
            is ``None`` a generator seeded with 42 is created.

    Returns:
        ``(rows_by_key, labels_out, n_kept, n_skip, class_counts)`` where
        ``rows_by_key`` maps canonical key -> list of ``(row_idx, ts_epoch)``
        (``ts_epoch`` is NaN when the timestamp could not be parsed),
        ``labels_out`` holds the normalised label of each kept row in index
        order, ``n_skip`` counts rows rejected during parsing, and
        ``class_counts`` maps label -> number of kept rows.

    Raises:
        KeyError: if a required column is missing from the header.
    """
    if (max_per_class is not None or max_benign is not None) and rng is None:
        rng = np.random.default_rng(42)
    parsed: list[tuple[tuple, float, str]] = []
    n_skip = 0
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        header = [h.strip() for h in next(reader)]
        h2i = {h: i for (i, h) in enumerate(header)}
        needed = list(adapter.join_cols.values()) + [adapter.label_col]
        for col in needed:
            if col not in h2i:
                raise KeyError(f'{csv_path.name}: missing column {col!r}')
        i_src_ip = h2i[adapter.join_cols['src_ip']]
        i_src_port = h2i[adapter.join_cols['src_port']]
        i_dst_ip = h2i[adapter.join_cols['dst_ip']]
        i_dst_port = h2i[adapter.join_cols['dst_port']]
        i_proto = h2i[adapter.join_cols['protocol']]
        i_ts = h2i[adapter.join_cols['timestamp']]
        i_label = h2i[adapter.label_col]
        # A usable row must reach every column we index below. Checking this
        # once up front means a truncated row is counted as skipped no matter
        # which column is missing (previously a row lacking only an IP or the
        # timestamp column escaped the try blocks and raised IndexError).
        min_fields = 1 + max(i_src_ip, i_src_port, i_dst_ip, i_dst_port, i_proto, i_ts, i_label)
        for row in reader:
            if not row:
                # Completely blank lines are ignored and not counted as skips.
                continue
            if len(row) < min_fields:
                n_skip += 1
                continue
            raw_label = row[i_label]
            if any((pat in raw_label for pat in adapter.drop_label_patterns)):
                n_skip += 1
                continue
            try:
                # Numeric cells may be written as floats ("80.0"); blank -> 0.
                sp = int(float(row[i_src_port])) if row[i_src_port].strip() else 0
                dp = int(float(row[i_dst_port])) if row[i_dst_port].strip() else 0
                proto = int(float(row[i_proto])) if row[i_proto].strip() else 0
            except (ValueError, OverflowError):
                # ValueError: non-numeric text or NaN; OverflowError: +/-inf.
                n_skip += 1
                continue
            sip = row[i_src_ip].strip()
            dip = row[i_dst_ip].strip()
            ck = _canonical_key(sip, dip, sp, dp, proto)
            ts_parsed = adapter.parse_timestamp(row[i_ts])
            ts_epoch = float('nan') if ts_parsed is None else ts_parsed + time_offset_seconds
            parsed.append((ck, ts_epoch, adapter.normalize_label(raw_label)))
    keep_idx = _select_indices(labels=[p[2] for p in parsed], benign_token=adapter.benign_token, max_per_class=max_per_class, max_benign=max_benign, rng=rng)
    rows_by_key: dict[tuple, list[tuple[int, float]]] = {}
    labels_out: list[str] = []
    class_counts: dict[str, int] = {}
    row_idx = row_idx_start
    for i in keep_idx:
        (ck, ts_epoch, label) = parsed[i]
        rows_by_key.setdefault(ck, []).append((row_idx, ts_epoch))
        labels_out.append(label)
        class_counts[label] = class_counts.get(label, 0) + 1
        row_idx += 1
    return (rows_by_key, labels_out, row_idx - row_idx_start, n_skip, class_counts)
|
||||
|
||||
def _select_indices(*, labels: list[str], benign_token: str, max_per_class: int | None, max_benign: int | None, rng: np.random.Generator | None) -> list[int]:
|
||||
if max_per_class is None and max_benign is None:
|
||||
return list(range(len(labels)))
|
||||
assert rng is not None
|
||||
buckets: dict[str, list[int]] = {}
|
||||
for (i, label) in enumerate(labels):
|
||||
buckets.setdefault(label, []).append(i)
|
||||
keep: list[int] = []
|
||||
for (label, idxs) in buckets.items():
|
||||
cap = max_benign if label == benign_token else max_per_class
|
||||
if cap is not None and len(idxs) > cap:
|
||||
pick = rng.choice(len(idxs), size=cap, replace=False)
|
||||
idxs = [idxs[j] for j in sorted(pick)]
|
||||
keep.extend(idxs)
|
||||
keep.sort()
|
||||
return keep
|
||||
112
scripts/download/README.md
Normal file
112
scripts/download/README.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# Dataset download scripts
|
||||
|
||||
Target layout (mirrors `datasets/cicids2017/`):
|
||||
|
||||
```
|
||||
datasets/
|
||||
ciciot2023/raw/{pcap,csv}
|
||||
iscxtor2016/raw/{pcap,csv}
|
||||
cicapt_iiot2024/raw/{pcap,csv}
|
||||
ustc_tfc2016/raw/pcap
|
||||
datacon2020/raw/pcap
|
||||
```
|
||||
|
||||
## CICIoT2023 / ISCXTor2016 (automated)
|
||||
|
||||
UNB/CIC gates downloads behind a consent form. After submission the site issues
|
||||
a `Token` cookie (domain `.cicresearch.ca`) that unlocks two endpoints:
|
||||
|
||||
- `browse.php?p=<path>` — HTML directory listing
|
||||
- `download.php?file=<path>` — raw file bytes
|
||||
|
||||
`cic_download.py` is a stdlib-only recursive crawler that walks `browse.php`
|
||||
and fetches each leaf via `download.php`. Already-downloaded files are
|
||||
skipped (presence-based; the PHP endpoint does not advertise sizes).
|
||||
|
||||
### Workflow
|
||||
|
||||
1. Open the dataset page in a browser, fill and submit the form:
|
||||
- CICIoT2023 : <https://www.unb.ca/cic/datasets/iotdataset-2023.html>
|
||||
- ISCXTor2016: <https://www.unb.ca/cic/datasets/tor.html>
|
||||
2. After submit, click through to `cicresearch.ca/.../browse.php`. The page
|
||||
must load successfully in your browser — this proves the Token is set.
|
||||
3. Export the cookie in **Netscape format** (tab-separated). One line is
|
||||
sufficient:
|
||||
|
||||
```
|
||||
# Netscape HTTP Cookie File
|
||||
.cicresearch.ca TRUE / TRUE <expiry> Token <value>
|
||||
```
|
||||
|
||||
Save as:
|
||||
- `scripts/download/cookies_ciciot2023.txt`
|
||||
- `scripts/download/cookies_iscxtor2016.txt`
|
||||
|
||||
Tokens are per-dataset — a CICIoT2023 cookie will not work for ISCXTor.
|
||||
4. Run:
|
||||
|
||||
```bash
|
||||
bash scripts/download/download_ciciot2023.sh
|
||||
bash scripts/download/download_iscxtor2016.sh
|
||||
```
|
||||
|
||||
Env vars: `WHAT=pcap|csv|both`, `DEST=`, `COOKIES=`, `DRY_RUN=1`, `LIMIT=N`, `SKIP_EXT=zip,7z`.
|
||||
For ISCXTor, if the remote subdir names differ from the defaults
|
||||
(`PCAPs` / `CSVs`), set `PCAP_ROOT=` / `CSV_ROOT=`.
|
||||
|
||||
### Known remote tree sizes
|
||||
|
||||
- **CICIoT2023** — `CSV/` 328 files (includes `CSV.zip`, `MERGED_CSV.zip`,
|
||||
`MERGED_CSV/`, and per-attack CSVs), `PCAP/` 311 files across 36 attack
|
||||
categories. Full dataset is ~12 GB.
|
||||
|
||||
### Quick commands
|
||||
|
||||
```bash
|
||||
# Dry-run (enumerate only, no downloads)
|
||||
DRY_RUN=1 bash scripts/download/download_ciciot2023.sh
|
||||
|
||||
# Download first 5 files as a smoke test
|
||||
LIMIT=5 WHAT=csv bash scripts/download/download_ciciot2023.sh
|
||||
|
||||
# Full download
|
||||
bash scripts/download/download_ciciot2023.sh
|
||||
```
|
||||
|
||||
## CICAPT-IIoT2024 (automated)
|
||||
|
||||
Same UNB/CIC pipeline as CICIoT2023, but crawled in a single pass — the
|
||||
entire `CICAPT-IIoT Dataset/` top-level folder is mirrored (pcap, csv, and
|
||||
anything else) under `datasets/cicapt_iiot2024/raw/`.
|
||||
|
||||
Cookie file: `scripts/download/cookies_cicapt_iiot2024.txt` (Token for
|
||||
`.cicresearch.ca`).
|
||||
|
||||
```bash
|
||||
# Smoke test first
|
||||
DRY_RUN=1 LIMIT=5 bash scripts/download/download_cicapt_iiot2024.sh
|
||||
|
||||
# Full download
|
||||
bash scripts/download/download_cicapt_iiot2024.sh
|
||||
|
||||
# Skip heavy archives if they duplicate a per-file tree
|
||||
SKIP_EXT=zip,7z bash scripts/download/download_cicapt_iiot2024.sh
|
||||
```
|
||||
|
||||
Reference URL (browser, with Token cookie live):
|
||||
<https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset>
|
||||
|
||||
## USTC-TFC2016 (manual)
|
||||
|
||||
```bash
|
||||
cd datasets/ustc_tfc2016/raw/pcap
|
||||
git clone --depth=1 https://github.com/yungshenglu/USTC-TFC2016.git .
|
||||
```
|
||||
|
||||
No official CSV — extract features yourself (CICFlowMeter, USTC-TK2016).
|
||||
|
||||
## DataCon2020 (manual)
|
||||
|
||||
Register at <https://datacon.qianxin.com/opendata/maliciousstream> and place
|
||||
the `black/` `white/` `test/` pcap bundles under
|
||||
`datasets/datacon2020/raw/pcap/`. No official CSV.
|
||||
54
scripts/download/_run_ciciot2023_pcap_loop.sh
Executable file
54
scripts/download/_run_ciciot2023_pcap_loop.sh
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env bash
# Background wrapper: retry the CICIoT2023 PCAP download until it reports
# a clean "Done." with n_files > 0. Each attempt is delimited in the log
# so the monitor can grep for progress.
#
# Invoked detached (nohup ... &). The inner script is resumable via
# the .part-file convention in cic_download.py.

# No -e on purpose: a failing attempt must fall through to the retry logic.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
LOG="${REPO_ROOT}/logs/ciciot2023_pcap.log"

# The log directory may not exist on a fresh checkout. Without it every
# >>"$LOG" redirect below fails, the inner script never starts, and the loop
# would spin forever without leaving any trace. Create it or bail out loudly.
if ! mkdir -p -- "$(dirname -- "${LOG}")"; then
  printf 'ERROR: cannot create log directory for %s\n' "${LOG}" >&2
  exit 1
fi

# nohup strips the interactive PATH; re-expose the project venv so
# `python` resolves inside download_ciciot2023.sh.
if [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
  export PATH="${REPO_ROOT}/.venv/bin:${PATH:-/usr/local/bin:/usr/bin:/bin}"
fi

# Route through the local proxy; detached bash does not inherit the
# interactive shell's proxy env, and cicresearch.ca's WAF rate-limits
# bare-IP traffic much more aggressively than the proxy exit.
export HTTP_PROXY="http://127.0.0.1:7093"
export HTTPS_PROXY="http://127.0.0.1:7093"
export ALL_PROXY="socks5h://127.0.0.1:7093"
export NO_PROXY="localhost,127.0.0.1,::1"
export http_proxy="${HTTP_PROXY}"
export https_proxy="${HTTPS_PROXY}"
export all_proxy="${ALL_PROXY}"
export no_proxy="${NO_PROXY}"

i=0
while :; do
  i=$((i + 1))
  ts=$(date +%F\ %T)
  printf '\n=== attempt %d %s ===\n' "$i" "$ts" >>"$LOG"
  # Skip bundle zips (e.g. PCAP.zip) — we want per-attack-class .pcap files,
  # not the whole dataset as one archive.
  WHAT=pcap SKIP_EXT="zip,7z" bash "${SCRIPT_DIR}/download_ciciot2023.sh" >>"$LOG" 2>&1
  rc=$?
  # If inner script exited with 0 AND last "Done." line reports >0 files,
  # we consider the listing+walk to have succeeded at least once. Otherwise
  # keep retrying on network/SSL failures.
  last_done=$(grep -E '^Done\. [0-9]+ files processed' "$LOG" | tail -1 || true)
  # Second whitespace-separated field of "Done. <n> files processed, ...".
  n=$(printf '%s' "$last_done" | awk '{print $2}')
  if [[ "$rc" -eq 0 && -n "$n" && "$n" -gt 0 ]]; then
    printf '=== loop finished clean %s (files=%s) ===\n' "$(date +%F\ %T)" "$n" >>"$LOG"
    break
  fi
  printf '=== attempt %d ended rc=%s last_done=%q; sleep 60 ===\n' \
    "$i" "$rc" "$last_done" >>"$LOG"
  sleep 60
done
|
||||
185
scripts/download/cic_download.py
Normal file
185
scripts/download/cic_download.py
Normal file
@@ -0,0 +1,185 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import http.cookiejar
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
UA = 'Mozilla/5.0 (cic-downloader)'
|
||||
LINK_RE = re.compile('href="(browse\\.php\\?p=[^"]+|download\\.php\\?file=[^"]+)"')
|
||||
|
||||
def build_opener(cookies_path: Path) -> urllib.request.OpenerDirector:
    """Build a urllib opener that sends the cookies stored in ``cookies_path``.

    The file is read as a Mozilla/Netscape cookie file; session cookies and
    cookies whose expiry has passed are loaded as well.
    """
    cookie_jar = http.cookiejar.MozillaCookieJar()
    cookie_jar.load(str(cookies_path), ignore_discard=True, ignore_expires=True)
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    return urllib.request.build_opener(cookie_handler)
|
||||
|
||||
def http_get(opener, url: str, timeout: int=60, retries: int=5) -> bytes:
    """GET ``url`` through ``opener`` and return the response body.

    Args:
        opener: object with an ``open(request, timeout=...)`` method that
            returns a context-manager response.
        url: address to fetch.
        timeout: per-request timeout in seconds passed to ``opener.open``.
        retries: maximum number of attempts.

    Returns:
        The raw response bytes.

    Raises:
        RuntimeError: immediately (no retry) when the final URL points at the
            UNB dataset form page, and after ``retries`` failed attempts for
            any other error.
    """
    last: Exception | None = None
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={'User-Agent': UA})
            with opener.open(req, timeout=timeout) as resp:
                final = resp.geturl()
                if 'unb.ca/cic/datasets' in final:
                    raise RuntimeError(f'Got redirected to UNB form page ({final}). Token cookie is missing/expired or wrong dataset scope.')
                return resp.read()
        except RuntimeError:
            # An auth redirect cannot be cured by retrying; surface it as-is.
            raise
        except Exception as e:
            last = e
            if attempt + 1 >= retries:
                # Final attempt: no retry follows, so do not announce one and
                # do not waste the back-off delay before raising.
                break
            wait = min(30, 2 ** attempt)  # exponential back-off, capped at 30 s
            print(f'  WARN GET {url} failed ({e!r}); retry in {wait}s ({attempt + 1}/{retries})', file=sys.stderr)
            time.sleep(wait)
    raise RuntimeError(f'GET {url} failed after {retries} attempts: {last!r}')
|
||||
|
||||
def list_dir(opener, base: str, p: str) -> list[tuple[str, str]]:
    """Fetch the ``browse.php`` listing for ``p`` and return its links.

    Each match of ``LINK_RE`` in the page becomes one ``(kind, value)`` pair in
    page order: ``('dir', <p parameter>)`` for ``browse.php`` links and
    ``('file', <file parameter>)`` for ``download.php`` links.
    """
    listing_url = urllib.parse.urljoin(base, 'browse.php') + '?p=' + urllib.parse.quote(p, safe='/')
    page = http_get(opener, listing_url).decode('utf-8', 'replace')
    entries: list[tuple[str, str]] = []
    for href in (match.group(1) for match in LINK_RE.finditer(page)):
        params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
        is_directory = href.startswith('browse.php')
        entry = ('dir', params['p'][0]) if is_directory else ('file', params['file'][0])
        entries.append(entry)
    return entries
|
||||
|
||||
def walk(opener, base: str, root: str):
    """Yield the remote path of every file reachable from ``root``.

    Directories are explored via an explicit stack, and each directory path is
    listed at most once. A directory whose listing fails is reported on stderr
    and skipped; the walk continues with the remaining directories.
    """
    pending = [root]
    visited: set[str] = set()
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        try:
            listing = list_dir(opener, base, current)
        except Exception as exc:
            print(f'  WARN list_dir({current}) failed permanently: {exc!r}', file=sys.stderr)
            continue
        for (kind, target) in sorted(listing):
            if kind != 'dir':
                yield target
                continue
            pending.append(target)
|
||||
|
||||
def download_file(opener, base: str, remote: str, dest_root: Path, *, root_prefix: str) -> None:
    """Download one remote file into ``dest_root``, resuming partial transfers.

    Args:
        opener: object with an ``open(request, timeout=...)`` method.
        base: dataset base URL; ``download.php`` is resolved against it.
        remote: remote path as reported by the directory listing.
        dest_root: local directory that mirrors the remote tree.
        root_prefix: leading part of ``remote`` to strip when building the
            local relative path.

    Data is streamed to ``<name>.part`` and renamed to the final name only
    after the transfer completed. An existing non-empty final file is skipped.
    An existing ``.part`` file is continued with an HTTP ``Range`` request.

    Raises:
        RuntimeError: immediately when the response URL points at the UNB
            dataset form page, or after five failed attempts.
    """
    url = urllib.parse.urljoin(base, 'download.php') + '?file=' + urllib.parse.quote(remote, safe='')
    rel = remote[len(root_prefix):].lstrip('/') if remote.startswith(root_prefix) else remote
    local = dest_root / rel
    local.parent.mkdir(parents=True, exist_ok=True)
    if local.exists() and local.stat().st_size > 0:
        print(f'  SKIP {rel}  ({local.stat().st_size} bytes, already present)')
        return
    tmp = local.with_suffix(local.suffix + '.part')
    max_attempts = 5
    last: Exception | None = None
    for attempt in range(max_attempts):
        resume_from = tmp.stat().st_size if tmp.exists() else 0
        try:
            headers = {'User-Agent': UA}
            if resume_from > 0:
                headers['Range'] = f'bytes={resume_from}-'
            req = urllib.request.Request(url, headers=headers)
            t0 = time.monotonic()
            bytes_read = 0
            declared_len: int | None = None
            with opener.open(req, timeout=1800) as resp:
                final = resp.geturl()
                if 'unb.ca/cic/datasets' in final:
                    raise RuntimeError('Token cookie invalid mid-download.')
                status = getattr(resp, 'status', None)
                mode = 'ab'
                if resume_from <= 0:
                    mode = 'wb'
                elif status != 206:
                    # Server answered the Range request with a full body.
                    print(f'  INFO {rel} resume request ignored (status={status}); restarting from zero')
                    resume_from = 0
                    mode = 'wb'
                # Body length announced for THIS response, if the server sent one.
                resp_headers = getattr(resp, 'headers', None)
                if resp_headers is not None:
                    raw_len = resp_headers.get('Content-Length')
                    if raw_len is not None and raw_len.strip().isdigit():
                        declared_len = int(raw_len)
                with open(tmp, mode) as fh:
                    while True:
                        buf = resp.read(1 << 20)
                        if not buf:
                            break
                        fh.write(buf)
                        bytes_read += len(buf)
            # Sized reads on an http.client response return short data without
            # raising when the connection drops early. Without this check a
            # truncated file would be renamed to its final name and then be
            # skipped as "already present" on every later run. Keep the .part
            # file so the next attempt can resume from it.
            if declared_len is not None and bytes_read != declared_len:
                raise OSError(f'short read: got {bytes_read} of {declared_len} bytes')
            tmp.replace(local)
            dt = time.monotonic() - t0
            total_bytes = local.stat().st_size
            mb = total_bytes / (1 << 20)
            delta_mb = bytes_read / (1 << 20)
            rate = mb / dt if dt > 0 else 0
            if resume_from > 0:
                resumed_mb = resume_from / (1 << 20)
                # Rate reflects only the bytes fetched in this attempt.
                rate = delta_mb / dt if dt > 0 else 0
                print(f'  GOT  {rel}  {mb:.1f} MB  +{delta_mb:.1f} MB from {resumed_mb:.1f} MB  {rate:.1f} MB/s')
            else:
                print(f'  GOT  {rel}  {mb:.1f} MB  {rate:.1f} MB/s')
            return
        except urllib.error.HTTPError as e:
            last = e
            if e.code == 416 and resume_from > 0:
                # The server rejected our resume offset; discard the partial
                # file and start over on the next attempt.
                print(f'  WARN {rel} resume rejected with 416; restarting from zero', file=sys.stderr)
                try:
                    tmp.unlink(missing_ok=True)
                except OSError:
                    pass
                time.sleep(1)
                continue
        except RuntimeError:
            # Auth redirect: retrying with the same cookie cannot succeed.
            raise
        except Exception as e:
            last = e
        if attempt + 1 < max_attempts:
            wait = min(30, 2 ** attempt)  # exponential back-off, capped at 30 s
            print(f'  WARN {rel} failed ({last!r}); retry in {wait}s ({attempt + 1}/{max_attempts})', file=sys.stderr)
            time.sleep(wait)
        else:
            # Last attempt: nothing to wait for, fall through to the raise.
            print(f'  WARN {rel} failed ({last!r}); giving up ({attempt + 1}/{max_attempts})', file=sys.stderr)
    raise RuntimeError(f'download failed after {max_attempts} attempts: {last!r}')
|
||||
|
||||
def main() -> int:
    """Command-line entry point: mirror one remote sub-tree into ``--dest``.

    Returns:
        Process exit status: ``2`` when the cookies file is missing, ``1``
        when at least one file could not be downloaded, ``0`` otherwise.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--cookies', required=True, type=Path)
    ap.add_argument('--base', required=True, help='dataset URL ending with /, e.g. https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/')
    ap.add_argument('--root', required=True, help='sub-path to crawl (e.g. PCAP or CSV)')
    ap.add_argument('--dest', required=True, type=Path, help='local directory to mirror into')
    ap.add_argument('--dry-run', action='store_true', help='enumerate only; do not download')
    ap.add_argument('--limit', type=int, default=0, help='stop after N files (0 = no limit)')
    ap.add_argument('--skip-ext', default='', help="comma-separated file extensions to skip (e.g. 'zip,7z'); case-insensitive, no dots")
    args = ap.parse_args()
    # Normalise "--skip-ext .ZIP, 7z" style input to {'zip', '7z'}.
    skip_exts = {e.strip().lower().lstrip('.') for e in args.skip_ext.split(',') if e.strip()}
    if not args.cookies.is_file():
        print(f'ERROR: cookies file not found: {args.cookies}', file=sys.stderr)
        return 2
    opener = build_opener(args.cookies)
    args.dest.mkdir(parents=True, exist_ok=True)
    print(f'Base : {args.base}')
    print(f'Root : {args.root}')
    print(f'Dest : {args.dest}')
    print('Walking tree...')
    n_files = 0
    n_skipped = 0
    n_failed = 0
    for remote in walk(opener, args.base, args.root):
        ext = remote.rsplit('.', 1)[-1].lower() if '.' in remote else ''
        if ext in skip_exts:
            n_skipped += 1
            print(f"  SKIP {remote}  (extension '.{ext}' excluded)")
            continue
        n_files += 1
        if args.dry_run:
            print(f'  FILE {remote}')
        else:
            try:
                download_file(opener, args.base, remote, args.dest, root_prefix=args.root.rstrip('/'))
            except Exception as e:
                # Keep going so one bad file does not block the rest, but
                # remember the failure for the exit status.
                n_failed += 1
                print(f'  FAIL {remote}: {e}', file=sys.stderr)
        if args.limit and n_files >= args.limit:
            print(f'-- stopped after {args.limit} (--limit) --')
            break
    # The "Done." line format is parsed by the retry wrapper; keep it stable.
    print(f'Done. {n_files} files processed, {n_skipped} skipped by --skip-ext.')
    if n_failed:
        # Signal the partial mirror to callers (e.g. a retry loop) instead of
        # reporting success while files are missing.
        print(f'{n_failed} file(s) failed to download.', file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
5
scripts/download/cookies_cicapt_iiot2024.txt
Normal file
5
scripts/download/cookies_cicapt_iiot2024.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# Netscape HTTP Cookie File
|
||||
# https://curl.haxx.se/rfc/cookie_spec.html
|
||||
# This is a generated file! Do not edit.
|
||||
|
||||
.cicresearch.ca TRUE / TRUE 1777047525 Token ef8ooumh5qdh42r0k410mjoq0c
|
||||
4
scripts/download/cookies_cicddos2019.txt
Normal file
4
scripts/download/cookies_cicddos2019.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
# Netscape HTTP Cookie File
|
||||
# https://curl.haxx.se/rfc/cookie_spec.html
|
||||
|
||||
.cicresearch.ca TRUE / TRUE 1776910223 Token 8kfh51fj8u46lum8kvu6safonr
|
||||
5
scripts/download/cookies_ciciot2023.txt
Normal file
5
scripts/download/cookies_ciciot2023.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# Netscape HTTP Cookie File
|
||||
# https://curl.haxx.se/rfc/cookie_spec.html
|
||||
# This is a generated file! Do not edit.
|
||||
|
||||
.cicresearch.ca TRUE / TRUE 1777518468 Token qn181atofvua6sn8ouv1hlcoo8
|
||||
5
scripts/download/cookies_iscxtor2016.txt
Normal file
5
scripts/download/cookies_iscxtor2016.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# Netscape HTTP Cookie File
|
||||
# https://curl.haxx.se/rfc/cookie_spec.html
|
||||
# This is a generated file! Do not edit.
|
||||
|
||||
.cicresearch.ca TRUE / TRUE 1776990463 Token t4sfffhk5mnttgkh300buhg0it
|
||||
38
scripts/download/download_cicapt_iiot2024.sh
Executable file
38
scripts/download/download_cicapt_iiot2024.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
# Download CICAPT-IIoT2024 (entire dataset tree) from UNB CIC via cic_download.py.
#
# Prereq: Token cookie for .cicresearch.ca saved as
#   scripts/download/cookies_cicapt_iiot2024.txt
#
# Remote tree is crawled in a single pass under ROOT="CICAPT-IIoT Dataset"
# (the top-level folder at
#  https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset ).
# Every leaf file — pcap, csv, whatever — is mirrored under
#   datasets/cicapt_iiot2024/raw/
# preserving the remote subdirectory layout.
#
# Usage:
#   bash download_cicapt_iiot2024.sh                   # full download
#   DRY_RUN=1 bash download_cicapt_iiot2024.sh         # enumerate only
#   LIMIT=5 bash download_cicapt_iiot2024.sh           # smoke test (first 5 files)
#   SKIP_EXT=zip,7z bash download_cicapt_iiot2024.sh   # skip archives
#
# Overridable via environment: DEST, COOKIES, BASE, ROOT.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicapt_iiot2024/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicapt_iiot2024.txt}"
BASE="${BASE:-https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/}"
ROOT="${ROOT:-CICAPT-IIoT Dataset}"

# Optional flags forwarded to cic_download.py.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

echo "=== ${ROOT} -> ${DEST_ROOT} ==="
# ${EXTRA[@]+...}: bash older than 4.4 treats "${EXTRA[@]}" on an empty array
# as an unbound variable under `set -u`, which would abort the default
# invocation (no DRY_RUN/LIMIT/SKIP_EXT). This form expands to nothing instead.
python3 -u "${SCRIPT_DIR}/cic_download.py" \
  --cookies "${COOKIES}" --base "${BASE}" \
  --root "${ROOT}" --dest "${DEST_ROOT}" ${EXTRA[@]+"${EXTRA[@]}"}
|
||||
58
scripts/download/download_cicddos2019.sh
Executable file
58
scripts/download/download_cicddos2019.sh
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env bash
|
||||
# Download CICDDoS2019 (CSV, optionally PCAP) from UNB CIC via cic_download.py.
|
||||
#
|
||||
# Prereq: submit the form at
|
||||
# https://www.unb.ca/cic/datasets/ddos-2019.html
|
||||
# in a browser, then save the issued Token cookie (Netscape format) as
|
||||
# scripts/download/cookies_cicddos2019.txt
|
||||
# Tokens are scoped per-dataset — the CICIoT2023 / ISCXTor cookies will NOT
|
||||
# work here.
|
||||
#
|
||||
# PCAPs for this dataset are already downloaded (see datasets/cicddos2019/raw/
|
||||
# pcap/). Default WHAT=csv reflects that. Switch to WHAT=pcap or WHAT=both if
|
||||
# you need to re-fetch.
|
||||
#
|
||||
# Usage:
|
||||
# bash download_cicddos2019.sh # CSVs only (default)
|
||||
# WHAT=pcap bash download_cicddos2019.sh # PCAPs only
|
||||
# WHAT=both bash download_cicddos2019.sh # everything
|
||||
# DRY_RUN=1 bash download_cicddos2019.sh # enumerate without downloading
|
||||
# CSV_ROOT=CSV bash download_cicddos2019.sh # override root if server uses a different name
|
||||
#
|
||||
# First-time tip: run with DRY_RUN=1 to discover the exact remote root names.
|
||||
# The CIC site is inconsistent across datasets (CSV / CSVs / CSV-01-12 ...).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
|
||||
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicddos2019/raw}"
|
||||
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicddos2019.txt}"
|
||||
BASE="https://cicresearch.ca/CICDataset/CICDDoS2019/"
|
||||
WHAT="${WHAT:-csv}"
|
||||
|
||||
# Default root names. Override via env if dry-run shows a different layout.
|
||||
PCAP_ROOT="${PCAP_ROOT:-PCAPs}"
|
||||
CSV_ROOT="${CSV_ROOT:-CSVs}"
|
||||
|
||||
EXTRA=()
|
||||
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
|
||||
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
|
||||
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")
|
||||
|
||||
#######################################
# Mirror one remote root into a local directory via cic_download.py.
# Globals:   SCRIPT_DIR, COOKIES, BASE, EXTRA (read);
#            PYTHON, REPO_ROOT (read, optional)
# Arguments: $1 - remote root to crawl
#            $2 - local destination directory
# Outputs:   a "=== root -> dest ===" banner, then the downloader's output
# Returns:   the downloader's exit status; 127 if no interpreter is found
#######################################
run() {
  local root="$1" dest="$2"
  # Interpreter lookup: explicit PYTHON override, then `python` on PATH (the
  # historical behaviour), then the repo venv, then python3. Many systems
  # ship only python3, where a bare `python` call fails.
  local py="${PYTHON:-}"
  if [[ -z "${py}" ]]; then
    if command -v python >/dev/null 2>&1; then
      py=python
    elif [[ -x "${REPO_ROOT:-}/.venv/bin/python" ]]; then
      py="${REPO_ROOT}/.venv/bin/python"
    elif command -v python3 >/dev/null 2>&1; then
      py=python3
    else
      echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
      return 127
    fi
  fi
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: an empty array expanded as "${EXTRA[@]}" is an unbound
  # variable under `set -u` on bash older than 4.4.
  "${py}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
|
||||
|
||||
case "${WHAT}" in
|
||||
pcap) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap" ;;
|
||||
csv) run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
|
||||
both) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
|
||||
run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
|
||||
*) echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
|
||||
esac
|
||||
45
scripts/download/download_ciciot2023.sh
Executable file
45
scripts/download/download_ciciot2023.sh
Executable file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
|
||||
# Download CICIoT2023 (PCAP + CSV) from UNB CIC via cic_download.py.
|
||||
#
|
||||
# Prereq: submit the form at
|
||||
# https://www.unb.ca/cic/datasets/iotdataset-2023.html
|
||||
# in a browser, then save the issued Token cookie in Netscape format as
|
||||
# scripts/download/cookies_ciciot2023.txt
|
||||
# The cookie domain must be .cicresearch.ca and the name must be "Token".
|
||||
#
|
||||
# Usage:
|
||||
# bash download_ciciot2023.sh # both PCAP and CSV
|
||||
# WHAT=pcap bash download_ciciot2023.sh # PCAP only
|
||||
# WHAT=csv bash download_ciciot2023.sh # CSV only
|
||||
# DRY_RUN=1 bash download_ciciot2023.sh # enumerate without downloading
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
|
||||
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/ciciot2023/raw}"
|
||||
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_ciciot2023.txt}"
|
||||
BASE="https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/"
|
||||
WHAT="${WHAT:-both}"
|
||||
|
||||
EXTRA=()
|
||||
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
|
||||
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
|
||||
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")
|
||||
|
||||
#######################################
# Mirror one remote root into a local directory via cic_download.py.
# Globals:   SCRIPT_DIR, COOKIES, BASE, EXTRA (read);
#            PYTHON, REPO_ROOT (read, optional)
# Arguments: $1 - remote root to crawl (PCAP or CSV)
#            $2 - local destination directory
# Outputs:   a "=== root -> dest ===" banner, then the downloader's output
# Returns:   the downloader's exit status; 127 if no interpreter is found
#######################################
run() {
  local root="$1" dest="$2"
  # Interpreter lookup: explicit PYTHON override, then `python` on PATH (the
  # historical behaviour), then the repo venv, then python3. Many systems
  # ship only python3, where a bare `python` call fails.
  local py="${PYTHON:-}"
  if [[ -z "${py}" ]]; then
    if command -v python >/dev/null 2>&1; then
      py=python
    elif [[ -x "${REPO_ROOT:-}/.venv/bin/python" ]]; then
      py="${REPO_ROOT}/.venv/bin/python"
    elif command -v python3 >/dev/null 2>&1; then
      py=python3
    else
      echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
      return 127
    fi
  fi
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: an empty array expanded as "${EXTRA[@]}" is an unbound
  # variable under `set -u` on bash older than 4.4.
  "${py}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
|
||||
|
||||
case "${WHAT}" in
|
||||
pcap) run PCAP "${DEST_ROOT}/pcap" ;;
|
||||
csv) run CSV "${DEST_ROOT}/csv" ;;
|
||||
both) run PCAP "${DEST_ROOT}/pcap"
|
||||
run CSV "${DEST_ROOT}/csv" ;;
|
||||
*) echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
|
||||
esac
|
||||
75
scripts/download/download_iscxtor2016.sh
Executable file
75
scripts/download/download_iscxtor2016.sh
Executable file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env bash
|
||||
# Download ISCXTor2016 (PCAP + CSV) from UNB CIC via cic_download.py.
|
||||
#
|
||||
# Prereq: submit the form at
|
||||
# https://www.unb.ca/cic/datasets/tor.html
|
||||
# in a browser, then save the issued Token cookie (Netscape format) as
|
||||
# scripts/download/cookies_iscxtor2016.txt
|
||||
# Tokens are scoped per-dataset — the CICIoT2023 cookie will NOT work here.
|
||||
#
|
||||
# Usage:
|
||||
# bash download_iscxtor2016.sh
|
||||
# WHAT=pcap|csv|both DEST=... COOKIES=... DRY_RUN=1 LIMIT=N
|
||||
# PCAP_ROOT=... CSV_ROOT=... SKIP_EXT=zip,7z
|
||||
#
|
||||
# Note: the remote sub-path names ("Pcaps" / "CSVs" or similar) are only
|
||||
# visible after authenticating. Run with DRY_RUN=1 first to confirm the
|
||||
# tree; if the roots differ, set PCAP_ROOT=... and/or CSV_ROOT=....
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
|
||||
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/iscxtor2016/raw}"
|
||||
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_iscxtor2016.txt}"
|
||||
BASE="https://cicresearch.ca/CICDataset/ISCX-Tor-NonTor-2017/"
|
||||
WHAT="${WHAT:-both}"
|
||||
|
||||
# Default root names (override via env if the server uses different casing)
|
||||
PCAP_ROOT="${PCAP_ROOT:-PCAPs}"
|
||||
CSV_ROOT="${CSV_ROOT:-CSVs}"
|
||||
|
||||
EXTRA=()
|
||||
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
|
||||
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
|
||||
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")
|
||||
|
||||
#######################################
# Print the Python interpreter to use.
# Precedence: $PYTHON, the repo venv, `python` on PATH, `python3` on PATH.
# Globals:   PYTHON (read, optional), REPO_ROOT (read)
# Outputs:   interpreter path on stdout; error message on stderr
# Returns:   exits 127 when no interpreter can be found
#######################################
resolve_python() {
  local candidate
  if [[ -n "${PYTHON:-}" ]]; then
    candidate="${PYTHON}"
  elif [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
    candidate="${REPO_ROOT}/.venv/bin/python"
  elif candidate="$(command -v python 2>/dev/null)"; then
    :
  elif candidate="$(command -v python3 2>/dev/null)"; then
    :
  else
    echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
    exit 127
  fi
  printf '%s\n' "${candidate}"
}
|
||||
|
||||
PYTHON_BIN="$(resolve_python)"
|
||||
|
||||
#######################################
# Mirror one remote root into a local directory via cic_download.py.
# Globals:   PYTHON_BIN, SCRIPT_DIR, COOKIES, BASE, EXTRA (read)
# Arguments: $1 - remote root to crawl
#            $2 - local destination directory
# Outputs:   a "=== root -> dest ===" banner, then the downloader's output
# Returns:   the downloader's exit status
#######################################
run() {
  local root="$1" dest="$2"
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: an empty array expanded as "${EXTRA[@]}" is an unbound
  # variable under `set -u` on bash older than 4.4, which would abort the
  # default invocation (no DRY_RUN/LIMIT/SKIP_EXT).
  "${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
|
||||
|
||||
case "${WHAT}" in
|
||||
pcap) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap" ;;
|
||||
csv) run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
|
||||
both) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
|
||||
run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
|
||||
*) echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
|
||||
esac
|
||||
114
scripts/eval_cross_dataset_protocol.py
Normal file
114
scripts/eval_cross_dataset_protocol.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.metrics import average_precision_score, roc_auc_score
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from data import _preprocess_packet_batch
|
||||
from detect import _load_model
|
||||
from packet_store import PacketShardStore
|
||||
|
||||
@torch.no_grad()
|
||||
def _score_indices(*, store: PacketShardStore, indices: np.ndarray, model, device: torch.device, preprocess: str, mean: np.ndarray, std: np.ndarray, clip_lo: np.ndarray | None, clip_hi: np.ndarray | None, split_tag: str, split_seed: int, batch: int, materialize_batch: int, n_steps: int) -> dict[str, np.ndarray]:
|
||||
out = {'terminal_norm': [], 'arc_length': [], 'kinetic_energy': [], 'velocity_score': []}
|
||||
total = len(indices)
|
||||
report_every = max(1, total // 4)
|
||||
next_report = 0
|
||||
for start in range(0, total, materialize_batch):
|
||||
idx = indices[start:start + materialize_batch]
|
||||
(x_np, lens_np) = store.read_packets(idx, T=model.cfg.T)
|
||||
x_np = _preprocess_packet_batch(x_np, lens_np, preprocess=preprocess, mean=mean, std=std, clip_lo=clip_lo, clip_hi=clip_hi, split_tag=split_tag, split_seed=split_seed, flow_ids=idx)
|
||||
for pos in range(0, len(idx), batch):
|
||||
bx = torch.from_numpy(x_np[pos:pos + batch]).float().to(device)
|
||||
bl = torch.from_numpy(lens_np[pos:pos + batch]).long().to(device)
|
||||
m = model.trajectory_metrics(bx, lens=bl, cond=None, n_steps=n_steps)
|
||||
for key in ('terminal_norm', 'arc_length', 'kinetic_energy'):
|
||||
out[key].append(m[key].cpu().numpy())
|
||||
vs = model.velocity_score(bx, lens=bl, cond=None, t_eval=(0.5, 0.75, 1.0))
|
||||
out['velocity_score'].append(vs.cpu().numpy())
|
||||
done = min(start + len(idx), total)
|
||||
if done >= next_report or done == total:
|
||||
print(f'[{split_tag}] {done:,}/{total:,}', flush=True)
|
||||
next_report = done + report_every
|
||||
return {key: np.concatenate(parts) for (key, parts) in out.items()}
|
||||
|
||||
def run(args: argparse.Namespace) -> None:
    """Evaluate a trained checkpoint on a target packet store over several seeds.

    Loads the preprocessing statistics and the model from ``args.save_dir``.
    For every seed it samples ``args.n_benign`` benign and ``args.n_attack``
    non-benign flows (length >= ``args.min_len``) from ``args.target_store``,
    scores both groups with ``_score_indices`` and computes AUROC/AUPRC per
    score type. Per-seed values plus mean and population std are written as
    JSON to ``args.output``.

    Raises:
        ValueError: if the filtered benign or attack pool is smaller than the
            requested sample size.
    """
    if args.device == 'auto':
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device_name = args.device
    device = torch.device(device_name)

    save_dir = Path(args.save_dir)
    # NOTE(review): weights_only=False unpickles arbitrary objects; only use
    # this on checkpoints from a trusted source.
    ckpt = torch.load(save_dir / 'model.pt', map_location='cpu', weights_only=False)
    preprocess = str(ckpt.get('preprocess', 'zscore'))
    mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    clip_lo = None
    if 'clip_lo' in ckpt:
        clip_lo = np.asarray(ckpt['clip_lo'], dtype=np.float32)
    clip_hi = None
    if 'clip_hi' in ckpt:
        clip_hi = np.asarray(ckpt['clip_hi'], dtype=np.float32)

    model = _load_model(save_dir, device)
    store = PacketShardStore.open(Path(args.target_store))
    flows = store.read_flows(columns=['flow_id', 'label'])
    labels = flows['label'].to_numpy().astype(str)
    lens = store.manifest['packet_length'].to_numpy(dtype=np.int32)
    # assumes the manifest and the flows table are row-aligned — TODO confirm
    keep = lens >= int(args.min_len)
    benign_idx = flows.loc[keep & (labels == args.benign_label), 'flow_id'].to_numpy(dtype=np.int64)
    attack_df = flows.loc[keep & (labels != args.benign_label), ['flow_id', 'label']]
    attack_idx_all = attack_df['flow_id'].to_numpy(dtype=np.int64)
    attack_labels_all = attack_df['label'].to_numpy().astype(str)

    if len(benign_idx) < args.n_benign:
        raise ValueError(f'target has only {len(benign_idx)} benign rows, need {args.n_benign}')
    if len(attack_idx_all) < args.n_attack:
        raise ValueError(f'target has only {len(attack_idx_all)} attack rows, need {args.n_attack}')
    print(f'[target] store={args.target_store} benign_pool={len(benign_idx):,} attack_pool={len(attack_idx_all):,} T={model.cfg.T} preprocess={preprocess}', flush=True)

    results: dict[str, object] = {
        'save_dir': str(save_dir),
        'target_store': str(args.target_store),
        'n_benign': int(args.n_benign),
        'n_attack': int(args.n_attack),
        'seeds': [],
        'mean': {},
        'std': {},
    }
    metrics = ('terminal_norm', 'arc_length', 'kinetic_energy', 'velocity_score')
    per_metric_values = {}
    for suffix in ('auroc', 'auprc'):
        for metric in metrics:
            per_metric_values[f'{metric}_{suffix}'] = []

    # Arguments shared by the benign and the attack scoring pass.
    shared = dict(
        store=store, model=model, device=device, preprocess=preprocess,
        mean=mean, std=std, clip_lo=clip_lo, clip_hi=clip_hi,
        batch=args.batch, materialize_batch=args.materialize_batch,
        n_steps=args.n_steps,
    )

    for seed in args.seeds:
        rng = np.random.default_rng(int(seed))
        # Draw order (benign first, then attack) determines the samples.
        b_idx = np.sort(rng.choice(benign_idx, args.n_benign, replace=False))
        a_pos = np.sort(rng.choice(len(attack_idx_all), args.n_attack, replace=False))
        a_idx = attack_idx_all[a_pos]
        a_labels = attack_labels_all[a_pos]
        print(f'[seed={seed}] scoring benign={len(b_idx):,} attack={len(a_idx):,}', flush=True)
        b_scores = _score_indices(indices=b_idx, split_tag='val', split_seed=int(seed), **shared)
        a_scores = _score_indices(indices=a_idx, split_tag='attack', split_seed=int(seed), **shared)

        (label_names, label_counts) = np.unique(a_labels, return_counts=True)
        seed_result: dict[str, object] = {
            'seed': int(seed),
            'attack_label_counts': {str(k): int(v) for (k, v) in zip(label_names, label_counts)},
            'metrics': {},
        }
        for metric in metrics:
            benign_part = b_scores[metric]
            attack_part = a_scores[metric]
            # Benign flows are class 0, attacks class 1.
            y = np.r_[np.zeros(len(benign_part)), np.ones(len(attack_part))]
            s = np.r_[benign_part, attack_part]
            # Replace non-finite scores so the sklearn metrics accept them.
            s = np.nan_to_num(s, nan=0.0, posinf=1000000.0, neginf=-1000000.0)
            auroc = float(roc_auc_score(y, s))
            auprc = float(average_precision_score(y, s))
            seed_result['metrics'][metric] = {'auroc': auroc, 'auprc': auprc}
            per_metric_values[f'{metric}_auroc'].append(auroc)
            per_metric_values[f'{metric}_auprc'].append(auprc)
            print(f'[seed={seed}] {metric:<16s} AUROC={auroc:.4f} AUPRC={auprc:.4f}', flush=True)
        results['seeds'].append(seed_result)

    for (key, values) in per_metric_values.items():
        arr = np.asarray(values, dtype=np.float64)
        results['mean'][key] = float(arr.mean())
        results['std'][key] = float(arr.std(ddof=0))

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(results, indent=2, sort_keys=True) + '\n')
    print(f'[saved] {args.output}', flush=True)
    for metric in metrics:
        print(f"[mean] {metric:<16s} AUROC={results['mean'][metric + '_auroc']:.4f}±{results['std'][metric + '_auroc']:.4f} AUPRC={results['mean'][metric + '_auprc']:.4f}±{results['std'][metric + '_auprc']:.4f}", flush=True)
|
||||
|
||||
def main() -> None:
    """Parse the command line and hand the resulting namespace to ``run``."""
    parser = argparse.ArgumentParser(description=__doc__)
    # Required locations: checkpoint directory, target store, JSON output.
    for path_flag in ('--save-dir', '--target-store', '--output'):
        parser.add_argument(path_flag, type=Path, required=True)
    parser.add_argument('--n-benign', type=int, default=10000)
    parser.add_argument('--n-attack', type=int, default=10000)
    parser.add_argument('--seeds', type=int, nargs='+', default=[0, 1, 2, 3, 4])
    parser.add_argument('--benign-label', type=str, default='normal')
    for (int_flag, default) in (
        ('--min-len', 2),
        ('--n-steps', 16),
        ('--batch', 4096),
        ('--materialize-batch', 32768),
    ):
        parser.add_argument(int_flag, type=int, default=default)
    parser.add_argument('--device', type=str, default='auto')
    run(parser.parse_args())


if __name__ == '__main__':
    main()
|
||||
132
scripts/extract_cicddos2019.py
Normal file
132
scripts/extract_cicddos2019.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
from extract_lib import extract_dataset, _canonical_key
|
||||
from csv_adapter import CsvFlowAdapter, parse_csv_rows
|
||||
# Adapter join fields -> CSV column headers used by this dataset.
JOIN_COLS = {
    'src_ip': 'Source IP',
    'src_port': 'Source Port',
    'dst_ip': 'Destination IP',
    'dst_port': 'Destination Port',
    'protocol': 'Protocol',
    'timestamp': 'Timestamp',
}
LABEL_COL = 'Label'
# Tried in order when parsing the CSV timestamp column.
TIMESTAMP_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S',
)
# Raw spellings that all map to the single benign token.
BENIGN_ALIASES = {'BENIGN', 'Benign', 'benign'}
BENIGN_TOKEN = 'normal'
DROP_LABEL_PATTERNS: tuple[str, ...] = ()
# Raw label -> canonical label.
LABEL_ALIASES = {'UDP-lag': 'UDPLag'}
# Shard key (CSV sub-directory name) -> pcap file-name prefix.
SHARDS = {
    '01-12': 'SAT-01-12-2018',
    '03-11': 'SAT-03-11-2018',
}
# Seconds added to CSV timestamps per shard before matching; per the
# --time-offset help text these values assume a UTC+8 host.
SHARD_OFFSETS_DEFAULT = {
    '01-12': 43200.0,
    '03-11': 39600.0,
}
DEFAULT_CSV_DIR = Path('datasets/cicddos2019/raw/csv')
DEFAULT_PCAP_DIR = Path('datasets/cicddos2019/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/cicddos2019/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/cicddos2019/processed/flows.parquet')
# Bundles the settings above for csv_adapter.parse_csv_rows.
CICDDOS2019_ADAPTER = CsvFlowAdapter(
    join_cols=JOIN_COLS,
    label_col=LABEL_COL,
    timestamp_formats=TIMESTAMP_FORMATS,
    benign_aliases=frozenset(BENIGN_ALIASES),
    benign_token=BENIGN_TOKEN,
    drop_label_patterns=DROP_LABEL_PATTERNS,
    label_aliases=LABEL_ALIASES,
)
|
||||
|
||||
def _normalize_label(raw: str) -> str:
    """Map a raw CSV label to its canonical form.

    Surrounding whitespace is stripped; any benign alias becomes
    ``BENIGN_TOKEN``; other labels go through ``LABEL_ALIASES`` and are
    returned unchanged when no alias exists.
    """
    label = raw.strip()
    return BENIGN_TOKEN if label in BENIGN_ALIASES else LABEL_ALIASES.get(label, label)
|
||||
|
||||
def _parse_timestamp(ts: str) -> float | None:
|
||||
s = ts.strip()
|
||||
if not s:
|
||||
return None
|
||||
for fmt in TIMESTAMP_FORMATS:
|
||||
try:
|
||||
return datetime.strptime(s, fmt).timestamp()
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
def _find_pcaps_for_shard(pcap_dir: Path, prefix: str) -> list[Path]:
|
||||
found: list[Path] = []
|
||||
seen = set()
|
||||
for pat in (f'{prefix}*', f'{prefix}*.pcap', f'{prefix}*.pcapng'):
|
||||
for p in sorted(pcap_dir.glob(pat)):
|
||||
if p.is_file() and p not in seen:
|
||||
found.append(p)
|
||||
seen.add(p)
|
||||
return found
|
||||
|
||||
def _parse_csv(csv_path: Path, row_idx_start: int, time_offset_seconds: float, max_per_class: int | None, max_benign: int | None, rng: np.random.Generator) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int, dict[str, int]]:
    """Parse one CICDDoS2019 CSV through ``parse_csv_rows`` with the dataset adapter.

    All arguments are forwarded unchanged; the adapter is fixed to
    ``CICDDOS2019_ADAPTER``. The result of ``parse_csv_rows`` is returned as-is.
    """
    forwarded = dict(
        csv_path=csv_path,
        row_idx_start=row_idx_start,
        time_offset_seconds=time_offset_seconds,
        adapter=CICDDOS2019_ADAPTER,
        max_per_class=max_per_class,
        max_benign=max_benign,
        rng=rng,
    )
    return parse_csv_rows(**forwarded)
|
||||
|
||||
def main() -> None:
    """CLI entry point for the CICDDoS2019 extraction.

    Steps: parse the per-shard label CSVs (with optional per-class
    subsampling), locate the pcap chunks of each shard, then delegate the
    CSV<->pcap matching and packet extraction to ``extract_dataset``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--csv-dir', type=Path, default=DEFAULT_CSV_DIR)
    ap.add_argument('--pcap-dir', type=Path, default=DEFAULT_PCAP_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.')
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Matched flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument('--match-strategy', choices=('auto', 'hungarian', 'stream_nearest'), default='auto', help='CSV↔pcap matching strategy. auto uses stream_nearest for --out-store and hungarian for legacy npz output.')
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--time-tolerance', type=float, default=2.0)
    ap.add_argument('--time-offset', type=float, default=0.0, help='Extra seconds added to per-shard SHARD_OFFSETS_DEFAULT. Default 0 assumes a UTC+8 host (matches the SHARD_OFFSETS_DEFAULT values: 03-11=39600, 01-12=43200). If the per-shard time-delta diagnostic shows a non-zero median, add that to this flag.')
    ap.add_argument('--jobs', type=int, default=0, help='0=auto (min(n_shards, cpu_count)). 1=serial.')
    ap.add_argument('--shards', type=str, nargs='*', default=None, choices=sorted(SHARDS.keys()), help='Subset of shards to process (default: all).')
    ap.add_argument('--max-per-class', type=int, default=500000, help='Per-file, per-attack-class row cap (random subsample). Default 500k. Pass 0 to disable.')
    ap.add_argument('--max-benign', type=int, default=None, help='Per-file benign row cap. Default: uncapped (keep all).')
    ap.add_argument('--max-packets-per-pcap', type=int, default=None, help='Cap per-pcap packets (smoke only).')
    ap.add_argument('--max-pcap-files-per-shard', type=int, default=None, help='Only process the first N pcap chunks per shard (smoke only).')
    ap.add_argument('--sample-seed', type=int, default=42)
    args = ap.parse_args()

    # 0 (and None) both mean "no cap".
    max_per_class = args.max_per_class or None
    max_benign = args.max_benign or None
    rng = np.random.default_rng(args.sample_seed)
    shards = args.shards or sorted(SHARDS.keys())

    csv_rows_by_day: dict[str, dict] = {}
    all_labels: list[str] = []
    total_rows = 0
    total_skip = 0
    aggregate_counts: dict[str, int] = {}

    print(f'=== parsing CSVs in {args.csv_dir} ===')
    print(f' max_per_class={max_per_class} max_benign={max_benign}')
    print(f' additive time_offset={args.time_offset}s (on top of per-shard defaults)')
    for shard in shards:
        shard_offset = SHARD_OFFSETS_DEFAULT.get(shard, 0.0) + args.time_offset
        print(f'[{shard}] effective time_offset={shard_offset}s (= default {SHARD_OFFSETS_DEFAULT.get(shard, 0.0)} + CLI {args.time_offset})')
        shard_dir = args.csv_dir / shard
        if not shard_dir.is_dir():
            print(f'[{shard}] {shard_dir} not found — skipping')
            continue
        csvs = sorted(shard_dir.glob('*.csv'))
        if not csvs:
            print(f'[{shard}] no CSVs under {shard_dir}')
            continue
        shard_rows: dict[tuple, list[tuple[int, float]]] = {}
        for csv_path in csvs:
            # row_idx_start keeps row indices globally unique across files so
            # they index into the concatenated all_labels list.
            (day_rows, labels, n_emit, n_skip, cls_counts) = _parse_csv(csv_path, row_idx_start=total_rows, time_offset_seconds=shard_offset, max_per_class=max_per_class, max_benign=max_benign, rng=rng)
            for (ck, rs) in day_rows.items():
                shard_rows.setdefault(ck, []).extend(rs)
            all_labels.extend(labels)
            total_rows += n_emit
            total_skip += n_skip
            for (lbl, c) in cls_counts.items():
                aggregate_counts[lbl] = aggregate_counts.get(lbl, 0) + c
            print(f'[{shard}/{csv_path.name}] emitted {n_emit:,} skipped {n_skip:,} cls={dict(sorted(cls_counts.items()))}')
        csv_rows_by_day[shard] = shard_rows
        # The sum counts rows, not keys, so report both numbers.
        shard_row_total = sum(len(v) for v in shard_rows.values())
        print(f'[{shard}] shard total: {shard_row_total:,} rows across {len(shard_rows):,} canonical keys')

    labels_by_row = np.asarray(all_labels, dtype=object)
    print(f'\nTotal CSV rows emitted: {total_rows:,} skipped: {total_skip:,}')
    print(f'Aggregate label distribution (post-subsample):')
    for (lbl, cnt) in sorted(aggregate_counts.items(), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>12,}')

    print(f'\n=== locating pcap chunks in {args.pcap_dir} ===')
    pcap_files_by_day: dict[str, list[Path]] = {}
    for shard in shards:
        prefix = SHARDS[shard]
        files = _find_pcaps_for_shard(args.pcap_dir, prefix)
        if args.max_pcap_files_per_shard is not None:
            files = files[:args.max_pcap_files_per_shard]
        pcap_files_by_day[shard] = files
        print(f'[{shard}] prefix {prefix!r} → {len(files):,} pcap chunks')

    print(f'\n=== extracting packet sequences ===')
    extract_dataset(
        csv_rows_by_day=csv_rows_by_day,
        labels_by_row=labels_by_row,
        pcap_files_by_day=pcap_files_by_day,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        # 'auto' is passed as None so extract_dataset picks the strategy.
        match_strategy=None if args.match_strategy == 'auto' else args.match_strategy,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        time_tolerance_seconds=args.time_tolerance,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
    )


if __name__ == '__main__':
    main()
|
||||
104
scripts/extract_cicids2017.py
Normal file
104
scripts/extract_cicids2017.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
from extract_lib import extract_dataset, _canonical_key
|
||||
from csv_adapter import CsvFlowAdapter, parse_csv_rows
|
||||
# Adapter join fields -> CSV column headers used by this dataset.
JOIN_COLS = {
    'src_ip': 'Src IP',
    'src_port': 'Src Port',
    'dst_ip': 'Dst IP',
    'dst_port': 'Dst Port',
    'protocol': 'Protocol',
    'timestamp': 'Timestamp',
}
LABEL_COL = 'Label'
# Tried in order when parsing the CSV timestamp column.
TIMESTAMP_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S',
    '%d/%m/%Y %H:%M:%S',
    '%d/%m/%Y %H:%M',
)
# Raw spellings that all map to the single benign token.
BENIGN_ALIASES = {'BENIGN', 'Benign', 'benign'}
BENIGN_TOKEN = 'normal'
DROP_LABEL_PATTERNS = ('- Attempted',)
# One shard per capture day; main() looks for '<day>.csv' under --csv-dir.
SHARDS = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday')
DEFAULT_CSV_DIR = Path('datasets/cicids2017/raw/csv')
DEFAULT_PCAP_DIR = Path('datasets/cicids2017/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/cicids2017/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/cicids2017/processed/flows.parquet')
# Bundles the settings above for csv_adapter.parse_csv_rows.
CICIDS2017_ADAPTER = CsvFlowAdapter(
    join_cols=JOIN_COLS,
    label_col=LABEL_COL,
    timestamp_formats=TIMESTAMP_FORMATS,
    benign_aliases=frozenset(BENIGN_ALIASES),
    benign_token=BENIGN_TOKEN,
    drop_label_patterns=DROP_LABEL_PATTERNS,
)
|
||||
|
||||
def _normalize_label(raw: str) -> str:
    """Strip a raw CSV label and collapse every benign alias to ``BENIGN_TOKEN``."""
    label = raw.strip()
    if label in BENIGN_ALIASES:
        return BENIGN_TOKEN
    return label
|
||||
|
||||
def _parse_timestamp(ts: str) -> float | None:
|
||||
s = ts.strip()
|
||||
if not s:
|
||||
return None
|
||||
for fmt in TIMESTAMP_FORMATS:
|
||||
try:
|
||||
return datetime.strptime(s, fmt).timestamp()
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
def _find_pcaps_for_day(pcap_dir: Path, day: str) -> list[Path]:
|
||||
day_lc = day.lower()
|
||||
day_cap = day.capitalize()
|
||||
pats = [f'*{day_lc}*.pcap', f'*{day_lc}*.pcapng', f'*{day_cap}*.pcap', f'*{day_cap}*.pcapng']
|
||||
found: list[Path] = []
|
||||
seen = set()
|
||||
for pat in pats:
|
||||
for p in sorted(pcap_dir.glob(pat)):
|
||||
if p not in seen:
|
||||
found.append(p)
|
||||
seen.add(p)
|
||||
return found
|
||||
|
||||
def _parse_day_csv(csv_path: Path, row_idx_start: int, time_offset_seconds: float) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int]:
    """Parse one day's CSV with ``CICIDS2017_ADAPTER``.

    Returns the first four items of the ``parse_csv_rows`` result (rows
    grouped by key, labels, emitted count, skipped count); the fifth item is
    discarded.
    """
    parsed = parse_csv_rows(
        csv_path=csv_path,
        row_idx_start=row_idx_start,
        time_offset_seconds=time_offset_seconds,
        adapter=CICIDS2017_ADAPTER,
    )
    (day_rows, labels, n_emit, n_skip, _unused) = parsed
    return (day_rows, labels, n_emit, n_skip)
|
||||
|
||||
def main() -> None:
    """CLI entry point for the CICIDS2017 extraction.

    Parses one label CSV per selected day, locates that day's pcap files and
    hands everything to ``extract_dataset`` for matching and extraction.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--csv-dir', type=Path, default=DEFAULT_CSV_DIR)
    ap.add_argument('--pcap-dir', type=Path, default=DEFAULT_PCAP_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.')
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Matched flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument('--match-strategy', choices=('auto', 'hungarian', 'stream_nearest'), default='auto', help='CSV↔pcap matching strategy. auto uses stream_nearest for --out-store and hungarian for legacy npz output.')
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--time-tolerance', type=float, default=2.0, help='Max |t_csv - t_pcap| seconds for flow match.')
    ap.add_argument('--time-offset', type=float, default=0.0, help='Seconds added to CSV timestamps before matching.')
    ap.add_argument('--jobs', type=int, default=0, help='0 = auto (min(n_days, cpu_count)). 1 = serial.')
    ap.add_argument('--days', type=str, nargs='*', default=None, help='Subset of shards to process (default: all 5).')
    ap.add_argument('--max-packets-per-pcap', type=int, default=None, help='Cap per-pcap packets (smoke tests only).')
    args = ap.parse_args()

    if args.days:
        days = tuple(args.days)
    else:
        days = SHARDS

    csv_rows_by_day: dict[str, dict] = {}
    all_labels: list[str] = []
    total_rows = 0
    total_skip = 0

    print(f'=== parsing CSVs in {args.csv_dir} ===')
    for day in days:
        csv_path = args.csv_dir / f'{day}.csv'
        if not csv_path.exists():
            print(f'[{day}] {csv_path} not found, skipping')
            continue
        # row_idx_start keeps row indices unique across days so they index
        # into the concatenated all_labels list.
        (day_rows, labels, n_emit, n_skip) = _parse_day_csv(csv_path, row_idx_start=total_rows, time_offset_seconds=args.time_offset)
        csv_rows_by_day[day] = day_rows
        all_labels += labels
        total_rows += n_emit
        total_skip += n_skip
        print(f'[{day}] emitted {n_emit:,} rows skipped {n_skip:,} canonical keys {len(day_rows):,}')

    labels_by_row = np.asarray(all_labels, dtype=object)
    print(f'Total CSV rows emitted: {total_rows:,} (skipped {total_skip:,})')

    print(f'\n=== locating pcap files in {args.pcap_dir} ===')
    pcap_files_by_day: dict[str, list[Path]] = {}
    for day in days:
        files = _find_pcaps_for_day(args.pcap_dir, day)
        pcap_files_by_day[day] = files
        names = [p.name for p in files]
        print(f'[{day}] {len(files)} pcap(s): {names}')

    print(f'\n=== extracting packet sequences ===')
    # 'auto' is passed as None so extract_dataset picks the strategy.
    strategy = None if args.match_strategy == 'auto' else args.match_strategy
    extract_dataset(
        csv_rows_by_day=csv_rows_by_day,
        labels_by_row=labels_by_row,
        pcap_files_by_day=pcap_files_by_day,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        match_strategy=strategy,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        time_tolerance_seconds=args.time_tolerance,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
    )


if __name__ == '__main__':
    main()
|
||||
56
scripts/extract_ciciot2023.py
Normal file
56
scripts/extract_ciciot2023.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
from extract_lib import extract_labeled_pcaps
|
||||
# Default input/output locations, relative to the working directory.
DEFAULT_PCAP_ROOT = Path('datasets/ciciot2023/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/ciciot2023/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/ciciot2023/processed/flows.parquet')

# Name of the class folder holding benign traffic, and the label it maps to.
BENIGN_FOLDER = 'Benign_Final'
BENIGN_LABEL = 'normal'
|
||||
|
||||
def _label_for_folder(folder: str) -> str:
    """Derive a flow label from a class folder name.

    The benign folder maps to ``BENIGN_LABEL``; every other folder name is
    lower-cased and used as the label.
    """
    return BENIGN_LABEL if folder == BENIGN_FOLDER else folder.lower()
|
||||
|
||||
def _find_pcap_files(pcap_root: Path, *, max_pcaps_per_class: int | None) -> list[tuple[Path, str, dict]]:
|
||||
triples: list[tuple[Path, str, dict]] = []
|
||||
for class_dir in sorted((p for p in pcap_root.iterdir() if p.is_dir())):
|
||||
folder = class_dir.name
|
||||
label = _label_for_folder(folder)
|
||||
pcaps = sorted(class_dir.rglob('*.pcap')) + sorted(class_dir.rglob('*.pcapng'))
|
||||
if max_pcaps_per_class is not None and len(pcaps) > max_pcaps_per_class:
|
||||
pcaps = pcaps[:max_pcaps_per_class]
|
||||
for p in pcaps:
|
||||
triples.append((p, label, {'class_folder': folder}))
|
||||
return triples
|
||||
|
||||
def main() -> None:
    """CLI entry point for the CICIoT2023 extraction.

    Discovers labelled pcaps under ``--pcap-root`` (one class per folder),
    prints a per-label summary and passes them to ``extract_labeled_pcaps``.

    Raises:
        RuntimeError: if no pcap file is discovered.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--pcap-root', type=Path, default=DEFAULT_PCAP_ROOT)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Sharded PacketShardStore output. Recommended for CICIoT2023 since the raw set is large.')
    ap.add_argument('--shard-size', type=int, default=100000)
    ap.add_argument('--worker-flush-size', type=int, default=10000)
    ap.add_argument('--spool-dir', type=Path, default=None)
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--jobs', type=int, default=0)
    ap.add_argument('--max-pcaps-per-class', type=int, default=1, help='Cap pcap files per class folder. Default 1 (single pcap per class) keeps extraction tractable.')
    ap.add_argument('--max-packets-per-pcap', type=int, default=2000000, help='Cap packets per pcap to bound RAM/IO. Default 2M.')
    args = ap.parse_args()

    triples = _find_pcap_files(args.pcap_root, max_pcaps_per_class=args.max_pcaps_per_class)
    if not triples:
        raise RuntimeError(f'No pcap files found under {args.pcap_root}')

    # Number of pcap files per label, for the summary below.
    by_label: dict[str, int] = {}
    for triple in triples:
        lbl = triple[1]
        by_label[lbl] = by_label.get(lbl, 0) + 1
    print(f'[discover] {len(triples)} pcap files across {len(by_label)} labels')
    for lbl in sorted(by_label):
        n = by_label[lbl]
        print(f' {lbl:<28s} {n} pcap(s)')

    extract_labeled_pcaps(
        pcap_files_with_labels=triples,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
        extra_column_names=('class_folder',),
    )


if __name__ == '__main__':
    main()
|
||||
96
scripts/extract_iscxtor2016.py
Normal file
96
scripts/extract_iscxtor2016.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
||||
from extract_lib import extract_labeled_pcaps
|
||||
# Default input/output locations, relative to the working directory.
DEFAULT_PCAP_ARCHIVE_DIR = Path('datasets/iscxtor2016/raw/pcap')
DEFAULT_DECOMPRESS_DIR = Path('datasets/iscxtor2016/raw/pcap_extracted')
DEFAULT_OUT_PACKETS = Path('datasets/iscxtor2016/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/iscxtor2016/processed/flows.parquet')

# Archive file names expected inside the archive directory.
NONTOR_ARCHIVE = 'NonTor.tar.xz'
TOR_ARCHIVE = 'Tor.zip'

# (activity, regex) pairs checked in order against the lower-cased pcap file
# name; the first match wins, so the order matters.
ACTIVITY_PATTERNS = (
    ('mail', re.compile('mail|email|imap|pop_|smtp|thunderbird')),
    ('voip', re.compile('voip|voice|call|facebook_voice|hangouts_voice')),
    ('audio', re.compile('audio|spotify|skype_audio|hangout_audio|facebook_audio')),
    ('browsing', re.compile('browsing|browser|ssl_browsing|gate_ssl')),
    ('chat', re.compile('chat|aim|icq|skypechat')),
    ('file', re.compile('file[-_]?transfer|ftp|sftp|tftp')),
    ('p2p', re.compile('p2p|multispeed|multiple[sS]peed|bittor|utor')),
    ('video', re.compile('video|youtube|vimeo')),
)
|
||||
|
||||
def _infer_activity(pcap_name: str) -> str:
    """Guess the activity class of a capture from its file name.

    The name is lower-cased and tested against ``ACTIVITY_PATTERNS`` in
    order; the first matching activity is returned, ``'other'`` if none match.
    """
    lowered = pcap_name.lower()
    matches = (activity for (activity, regex) in ACTIVITY_PATTERNS if regex.search(lowered))
    return next(matches, 'other')
|
||||
|
||||
def _decompress_archives(archive_dir: Path, out_dir: Path) -> None:
    """Unpack the NonTor and Tor archives into ``out_dir`` unless already done.

    An archive is skipped when its target folder (``out_dir/NonTor`` or
    ``out_dir/Tor``) already exists. Unpacking shells out to ``tar`` and
    ``unzip``; with ``check=True`` a failing command raises
    ``subprocess.CalledProcessError``.
    """
    # (folder name, archive path, tool note for the log, command, create out_dir first?)
    jobs = (
        (
            'NonTor',
            archive_dir / NONTOR_ARCHIVE,
            'tar xf',
            ['tar', '-xf', str(archive_dir / NONTOR_ARCHIVE), '-C', str(out_dir)],
            True,
        ),
        (
            'Tor',
            archive_dir / TOR_ARCHIVE,
            'unzip',
            ['unzip', '-q', '-o', str(archive_dir / TOR_ARCHIVE), '-d', str(out_dir)],
            False,
        ),
    )
    for (folder, archive, tool_note, command, ensure_out_dir) in jobs:
        target = out_dir / folder
        if target.exists():
            print(f'[decompress] {target} already exists — skipping {folder} unpack')
            continue
        if ensure_out_dir:
            target.parent.mkdir(parents=True, exist_ok=True)
        print(f'[decompress] {archive} → {out_dir}/ ({tool_note})')
        started = time.time()
        subprocess.run(command, check=True)
        print(f'[decompress] {folder} done in {time.time() - started:.1f}s')
|
||||
|
||||
def _find_pcap_files(decompressed_root: Path) -> list[tuple[Path, str, dict]]:
|
||||
triples: list[tuple[Path, str, dict]] = []
|
||||
for (sub, coarse) in (('NonTor', 'nontor'), ('Tor', 'tor')):
|
||||
sub_dir = decompressed_root / sub
|
||||
if not sub_dir.exists():
|
||||
print(f'[warn] {sub_dir} not found — skipping')
|
||||
continue
|
||||
pcaps = sorted(sub_dir.rglob('*.pcap')) + sorted(sub_dir.rglob('*.pcapng'))
|
||||
for p in pcaps:
|
||||
activity = _infer_activity(p.name)
|
||||
triples.append((p, coarse, {'activity': activity}))
|
||||
return triples
|
||||
|
||||
def main() -> None:
    """CLI entry point for the ISCXTor2016 extraction.

    Optionally unpacks the raw archives, discovers the Tor/NonTor pcaps,
    prints a summary by label and activity, and passes the files to
    ``extract_labeled_pcaps``.

    Raises:
        RuntimeError: if no pcap file is found under the decompressed directory.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--archive-dir', type=Path, default=DEFAULT_PCAP_ARCHIVE_DIR)
    ap.add_argument('--decompressed-dir', type=Path, default=DEFAULT_DECOMPRESS_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.')
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--jobs', type=int, default=0)
    ap.add_argument('--max-packets-per-pcap', type=int, default=None)
    ap.add_argument('--decompress-only', action='store_true', help='Extract the archives then stop (for staged runs).')
    ap.add_argument('--skip-decompress', action='store_true', help='Assume decompressed-dir is already populated.')
    args = ap.parse_args()

    if not args.skip_decompress:
        _decompress_archives(args.archive_dir, args.decompressed_dir)
    if args.decompress_only:
        print('[decompress-only] exiting as requested.')
        return

    triples = _find_pcap_files(args.decompressed_dir)
    if not triples:
        raise RuntimeError(f'No pcap files found under {args.decompressed_dir}')
    print(f'\n[discover] found {len(triples)} pcap file(s)')
    by_coarse: dict[str, int] = {}
    by_act: dict[str, int] = {}
    for (_, lbl, extra) in triples:
        by_coarse[lbl] = by_coarse.get(lbl, 0) + 1
        by_act[extra['activity']] = by_act.get(extra['activity'], 0) + 1
    print(f' by label: {by_coarse}')
    print(f' by activity: {by_act}')

    # Per the --out-store help text the sharded store replaces the
    # packets.npz/flows.parquet pair, so name the destination actually used.
    if args.out_store is not None:
        print(f'\n[extract] writing to store {args.out_store}')
    else:
        print(f'\n[extract] writing to {args.out_packets} + {args.out_flows}')
    extract_labeled_pcaps(
        pcap_files_with_labels=triples,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
        extra_column_names=('activity',),
    )


if __name__ == '__main__':
    main()
|
||||
774
scripts/extract_lib.py
Normal file
774
scripts/extract_lib.py
Normal file
@@ -0,0 +1,774 @@
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import shutil
|
||||
import socket
|
||||
import sys
|
||||
import tempfile
|
||||
import time as _time
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
import dpkt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
_SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
_REPO_ROOT = _SCRIPT_DIR.parent
|
||||
sys.path.insert(0, str(_REPO_ROOT / 'Packet_CFM'))
|
||||
from packet_store import PacketShardWriter
|
||||
# Names of the per-packet token features, in the column order produced by
# ``_packet_token``.
PACKET_FEATURE_NAMES = (
    'log_size',
    'log_dt_ms',
    'direction',
    'tcp_syn',
    'tcp_fin',
    'tcp_rst',
    'tcp_psh',
    'tcp_ack',
    'log_win',
)

# Width of one packet token (number of features per packet).
PACKET_D = len(PACKET_FEATURE_NAMES)

# Bit masks tested against ``PacketRecord.tcp_flags``.
FIN, SYN, RST, PSH, ACK = 0x01, 0x02, 0x04, 0x08, 0x10
|
||||
|
||||
@dataclass(slots=True)
|
||||
class PacketRecord:
|
||||
timestamp: float
|
||||
src_ip: str
|
||||
dst_ip: str
|
||||
src_port: int
|
||||
dst_port: int
|
||||
protocol: int
|
||||
tcp_flags: int
|
||||
payload_len: int
|
||||
header_len: int
|
||||
total_len: int
|
||||
window_size: int
|
||||
|
||||
def _try_open_pcap(f):
    """Return a dpkt reader for ``f``: classic pcap first, pcapng as fallback.

    A ``ValueError`` from ``dpkt.pcap.Reader`` is taken to mean "not a classic
    pcap file"; the handle is rewound and passed to ``dpkt.pcapng.Reader``.
    Any error raised by the pcapng reader propagates to the caller.
    """
    try:
        reader = dpkt.pcap.Reader(f)
    except ValueError:
        # The failed attempt consumed part of the header; start over.
        f.seek(0)
        reader = dpkt.pcapng.Reader(f)
    return reader
|
||||
|
||||
def iter_packets(pcap_path: Path, max_packets: int | None=None) -> Iterator[PacketRecord]:
    """Yield a ``PacketRecord`` for every IPv4 TCP/UDP packet in a capture file.

    Args:
        pcap_path: Capture file to read (pcap or pcapng, see ``_try_open_pcap``).
        max_packets: If not ``None``, stop once this many records have been
            yielded. The limit is checked after each yield.

    Frames are skipped when the link type is not Ethernet, raw IP or Linux
    cooked capture, when the payload is not IPv4, when the transport is
    neither TCP nor UDP, or when dpkt fails to decode them.
    """
    emitted = 0
    with open(pcap_path, 'rb') as handle:
        reader = _try_open_pcap(handle)
        datalink = reader.datalink()
        for (ts, frame) in reader:
            try:
                # Peel off the link layer to reach the IP packet.
                if datalink == dpkt.pcap.DLT_EN10MB:
                    ether = dpkt.ethernet.Ethernet(frame)
                    if ether.type != dpkt.ethernet.ETH_TYPE_IP:
                        continue
                    ip = ether.data
                elif datalink == dpkt.pcap.DLT_RAW:
                    ip = dpkt.ip.IP(frame)
                elif datalink == dpkt.pcap.DLT_LINUX_SLL:
                    cooked = dpkt.sll.SLL(frame)
                    if cooked.ethtype != dpkt.ethernet.ETH_TYPE_IP:
                        continue
                    ip = cooked.data
                else:
                    continue
                if not isinstance(ip, dpkt.ip.IP):
                    continue
                src_ip = socket.inet_ntoa(ip.src)
                dst_ip = socket.inet_ntoa(ip.dst)
                l4 = ip.data
                if isinstance(l4, dpkt.tcp.TCP):
                    yield PacketRecord(
                        timestamp=ts,
                        src_ip=src_ip,
                        dst_ip=dst_ip,
                        src_port=l4.sport,
                        dst_port=l4.dport,
                        protocol=6,
                        tcp_flags=l4.flags,
                        payload_len=len(l4.data),
                        header_len=l4.off * 4,
                        total_len=ip.len,
                        window_size=l4.win,
                    )
                elif isinstance(l4, dpkt.udp.UDP):
                    yield PacketRecord(
                        timestamp=ts,
                        src_ip=src_ip,
                        dst_ip=dst_ip,
                        src_port=l4.sport,
                        dst_port=l4.dport,
                        protocol=17,
                        tcp_flags=0,
                        payload_len=len(l4.data),
                        header_len=8,
                        total_len=ip.len,
                        window_size=0,
                    )
                else:
                    continue
            except (dpkt.NeedData, dpkt.UnpackError, AttributeError):
                # Truncated or malformed frame: drop it and keep reading.
                continue
            # Only reached after a record was yielded.
            emitted += 1
            if max_packets is not None and emitted >= max_packets:
                return
|
||||
|
||||
def _packet_token(pkt: PacketRecord, prev_ts: float | None, direction: int) -> np.ndarray:
    """Encode one packet as a float32 feature vector.

    The vector holds, in order: log1p(total_len), log1p(inter-arrival time in
    ms), direction, the SYN/FIN/RST/PSH/ACK flags as 0/1, and
    log1p(window_size). Negative lengths, windows and time deltas are clamped
    to zero before the log. ``prev_ts is None`` gives a time delta of zero.
    """
    if prev_ts is None:
        dt_ms = 0.0
    else:
        dt_ms = max(0.0, (pkt.timestamp - prev_ts) * 1000.0)
    flags = pkt.tcp_flags
    flag_bits = [int(bool(flags & mask)) for mask in (SYN, FIN, RST, PSH, ACK)]
    features = [
        float(np.log1p(max(pkt.total_len, 0))),
        float(np.log1p(dt_ms)),
        float(direction),
        *flag_bits,
        float(np.log1p(max(pkt.window_size, 0))),
    ]
    return np.array(features, dtype=np.float32)
|
||||
|
||||
class _TokenFlow:
|
||||
__slots__ = ('key_fwd', 'start_ts', 'last_ts', 'fin_count', 'tokens', 'prev_ts', 'n_pkts')
|
||||
|
||||
def __init__(self, key_fwd: tuple, start_ts: float) -> None:
|
||||
self.key_fwd = key_fwd
|
||||
self.start_ts = start_ts
|
||||
self.last_ts = start_ts
|
||||
self.fin_count = 0
|
||||
self.tokens: list[np.ndarray] = []
|
||||
self.prev_ts: float | None = None
|
||||
self.n_pkts: int = 0
|
||||
|
||||
def add(self, pkt: PacketRecord, is_forward: bool, max_len: int) -> None:
|
||||
direction = 0 if is_forward else 1
|
||||
if len(self.tokens) < max_len:
|
||||
self.tokens.append(_packet_token(pkt, self.prev_ts, direction))
|
||||
self.prev_ts = pkt.timestamp
|
||||
self.last_ts = pkt.timestamp
|
||||
self.n_pkts += 1
|
||||
|
||||
def stream_token_flows(packet_iter: Iterator[PacketRecord], idle_timeout: float, max_len: int, gc_every: int=200000) -> Iterator[_TokenFlow]:
    """Group a packet stream into bidirectional flows and yield each finished flow.

    A flow is keyed by the 5-tuple of its first packet; packets whose reversed
    5-tuple matches an open flow are added to it as backward packets. A flow
    is emitted when:

    * a packet for it arrives more than ``idle_timeout`` after its last packet
      (the old flow is emitted and that packet opens a new one),
    * a TCP RST is seen, or the second TCP FIN is seen,
    * the periodic sweep (every ``gc_every`` packets) finds it idle for more
      than ``idle_timeout`` relative to the latest packet timestamp,
    * the packet stream ends (all remaining flows are emitted).

    Args:
        packet_iter: Source of ``PacketRecord`` objects.
        idle_timeout: Idle gap after which a flow counts as finished; compared
            against differences of packet timestamps.
        max_len: Maximum number of tokens kept per flow.
        gc_every: Number of packets between idle sweeps.
    """
    open_flows: dict[tuple, _TokenFlow] = {}
    newest_ts = 0.0
    processed = 0
    for pkt in packet_iter:
        newest_ts = pkt.timestamp
        fwd_key = (pkt.src_ip, pkt.dst_ip, pkt.src_port, pkt.dst_port, pkt.protocol)
        bwd_key = (pkt.dst_ip, pkt.src_ip, pkt.dst_port, pkt.src_port, pkt.protocol)

        # Find the open flow this packet belongs to, if any.
        if fwd_key in open_flows:
            key, is_forward = fwd_key, True
            flow = open_flows[key]
        elif bwd_key in open_flows:
            key, is_forward = bwd_key, False
            flow = open_flows[key]
        else:
            key, is_forward = fwd_key, True
            flow = None

        # An overly long gap closes the old flow; this packet starts a new one.
        if flow is not None and pkt.timestamp - flow.last_ts > idle_timeout:
            yield open_flows.pop(key)
            flow = None

        if flow is None:
            key, is_forward = fwd_key, True
            flow = _TokenFlow(key_fwd=fwd_key, start_ts=pkt.timestamp)
            open_flows[key] = flow

        flow.add(pkt, is_forward, max_len)

        # TCP teardown: RST ends the flow immediately, FIN on the second one.
        if pkt.protocol == 6:
            if pkt.tcp_flags & RST:
                yield open_flows.pop(key)
            elif pkt.tcp_flags & FIN:
                flow.fin_count += 1
                if flow.fin_count >= 2:
                    yield open_flows.pop(key)

        processed += 1
        if processed % gc_every == 0:
            # Periodic sweep so idle flows do not accumulate in memory.
            expired = [k for (k, fl) in open_flows.items() if newest_ts - fl.last_ts > idle_timeout]
            for k in expired:
                yield open_flows.pop(k)

    # End of stream: flush whatever is still open.
    yield from list(open_flows.values())
    open_flows.clear()
|
||||
|
||||
def _canonical_key(src_ip: str, dst_ip: str, src_port: int, dst_port: int, proto: int) -> tuple:
|
||||
a = (src_ip, src_port)
|
||||
b = (dst_ip, dst_port)
|
||||
if a <= b:
|
||||
return (a[0], a[1], b[0], b[1], proto)
|
||||
return (b[0], b[1], a[0], a[1], proto)
|
||||
|
||||
def _to_fixed_tensor(flow_tokens: list[np.ndarray], max_len: int) -> np.ndarray:
    """Stack a flow's tokens into a zero-padded ``(max_len, PACKET_D)`` float32 array.

    Tokens beyond ``max_len`` are dropped; rows after the last token stay zero.
    """
    padded = np.zeros((max_len, PACKET_D), dtype=np.float32)
    kept = flow_tokens[:max_len]
    if kept:
        padded[:len(kept)] = np.stack(kept, axis=0)
    return padded
|
||||
|
||||
class _WorkerChunkWriter:
|
||||
|
||||
def __init__(self, root: Path, *, prefix: str, T_full: int, chunk_size: int) -> None:
|
||||
self.root = Path(root)
|
||||
self.root.mkdir(parents=True, exist_ok=True)
|
||||
self.prefix = prefix
|
||||
self.T_full = T_full
|
||||
self.chunk_size = max(1, int(chunk_size))
|
||||
self._tokens: list[np.ndarray] = []
|
||||
self._records: list[dict] = []
|
||||
self._next_chunk = 0
|
||||
self.chunks: list[dict[str, str]] = []
|
||||
|
||||
def add_csv_match(self, row_i: int, tok: np.ndarray, ln: int, meta: dict) -> None:
|
||||
rec = dict(meta)
|
||||
rec['csv_row_idx'] = int(row_i)
|
||||
rec['packet_length'] = int(ln)
|
||||
self._add(tok, rec)
|
||||
|
||||
def add_labeled(self, tok: np.ndarray, ln: int, meta: dict, label: str, extra: dict) -> None:
|
||||
rec = dict(meta)
|
||||
rec['packet_length'] = int(ln)
|
||||
rec['label'] = str(label)
|
||||
for (k, v) in extra.items():
|
||||
rec[str(k)] = v
|
||||
self._add(tok, rec)
|
||||
|
||||
def close(self) -> list[dict[str, str]]:
|
||||
if self._tokens:
|
||||
self._flush()
|
||||
return self.chunks
|
||||
|
||||
def _add(self, tok: np.ndarray, rec: dict) -> None:
|
||||
self._tokens.append(tok.astype(np.float32, copy=False))
|
||||
self._records.append(rec)
|
||||
if len(self._tokens) >= self.chunk_size:
|
||||
self._flush()
|
||||
|
||||
def _flush(self) -> None:
|
||||
n = len(self._tokens)
|
||||
tokens = np.empty((n, self.T_full, PACKET_D), dtype=np.float32)
|
||||
for (i, tok) in enumerate(self._tokens):
|
||||
tokens[i] = tok
|
||||
stem = f'{self.prefix}-chunk-{self._next_chunk:06d}'
|
||||
token_path = self.root / f'{stem}.npy'
|
||||
meta_path = self.root / f'{stem}.parquet'
|
||||
np.save(token_path, tokens, allow_pickle=False)
|
||||
pd.DataFrame(self._records).to_parquet(meta_path, compression='snappy', index=False)
|
||||
self.chunks.append({'tokens': str(token_path), 'meta': str(meta_path)})
|
||||
self._tokens.clear()
|
||||
self._records.clear()
|
||||
self._next_chunk += 1
|
||||
|
||||
def _flow_meta(fl: _TokenFlow) -> dict:
|
||||
(sip, dip, sp, dp, proto) = fl.key_fwd
|
||||
return {'start_ts': float(fl.start_ts), 'src_ip': str(sip), 'dst_ip': str(dip), 'src_port': int(sp), 'dst_port': int(dp), 'protocol': int(proto), 'n_pkts': int(fl.n_pkts)}
|
||||
|
||||
def _build_stream_csv_index(csv_rows_for_day: dict[tuple, list[tuple[int, float]]]) -> dict[tuple, dict[str, np.ndarray]]:
|
||||
out: dict[tuple, dict[str, np.ndarray]] = {}
|
||||
for (ck, rows) in csv_rows_for_day.items():
|
||||
finite = [(int(row_i), float(ts)) for (row_i, ts) in rows if not np.isnan(ts)]
|
||||
if not finite:
|
||||
continue
|
||||
finite.sort(key=lambda x: (x[1], x[0]))
|
||||
row_idx = np.asarray([r for (r, _) in finite], dtype=np.int64)
|
||||
ts = np.asarray([t for (_, t) in finite], dtype=np.float64)
|
||||
used = np.zeros(len(finite), dtype=bool)
|
||||
out[ck] = {'row_idx': row_idx, 'ts': ts, 'used': used}
|
||||
return out
|
||||
|
||||
def _nearest_unused_row(entry: dict[str, np.ndarray], ts: float, tolerance: float) -> tuple[int | None, float | None]:
|
||||
csv_ts = entry['ts']
|
||||
used = entry['used']
|
||||
pos = int(np.searchsorted(csv_ts, ts, side='left'))
|
||||
best_i: int | None = None
|
||||
best_abs = float('inf')
|
||||
j = pos - 1
|
||||
while j >= 0:
|
||||
diff = abs(float(csv_ts[j]) - ts)
|
||||
if diff > tolerance:
|
||||
break
|
||||
if not bool(used[j]) and diff < best_abs:
|
||||
best_i = j
|
||||
best_abs = diff
|
||||
j -= 1
|
||||
j = pos
|
||||
n = len(csv_ts)
|
||||
while j < n:
|
||||
diff = abs(float(csv_ts[j]) - ts)
|
||||
if diff > tolerance:
|
||||
break
|
||||
if not bool(used[j]) and diff < best_abs:
|
||||
best_i = j
|
||||
best_abs = diff
|
||||
j += 1
|
||||
if best_i is None:
|
||||
return (None, None)
|
||||
used[best_i] = True
|
||||
return (int(entry['row_idx'][best_i]), ts - float(csv_ts[best_i]))
|
||||
|
||||
def _extract_day_worker(day: str, pcap_files_str: list[str], csv_rows_for_day: dict[tuple, list[tuple[int, float]]], max_len: int, idle_timeout: float, time_tolerance_seconds: float, max_packets_per_pcap: int | None, spool_dir: str | None=None, worker_flush_size: int=10000, match_strategy: str='hungarian') -> dict:
    """Extract flows from one day's pcaps and join them to that day's CSV rows.

    With ``match_strategy == 'stream_nearest'`` the work is delegated to
    ``_extract_day_worker_stream_nearest`` (which requires ``spool_dir``).
    Otherwise all flows are first collected per canonical key, then matched to
    CSV rows sharing that key: a lone CSV row and lone flow are paired if their
    timestamps differ by at most ``time_tolerance_seconds``; larger groups are
    solved with ``linear_sum_assignment`` on the absolute time differences,
    with out-of-tolerance and NaN pairs priced at a large constant and
    rejected afterwards.

    Args:
        day: Label for this task, used in the spool prefix and in the result.
        pcap_files_str: Paths of the capture files to read, in order.
        csv_rows_for_day: Canonical key -> list of ``(csv_row_index, timestamp)``.
        max_len: Maximum tokens per flow and the padded tensor length.
        idle_timeout: Flow idle timeout passed to ``stream_token_flows``.
        time_tolerance_seconds: Maximum accepted ``|pcap_start - csv_ts|``.
        max_packets_per_pcap: Optional per-file packet cap.
        spool_dir: If given, matched flows are written there in chunks instead
            of being returned in ``results``.
        worker_flush_size: Flows per spool chunk.
        match_strategy: ``'stream_nearest'`` or anything else for Hungarian.

    Returns:
        A dict with the matched ``results`` (or the spool ``chunks``), counters
        and up to 10000 sampled ``deltas`` (``pcap_start - csv_ts`` for the
        first CSV row and first flow of each shared key). ``elapsed`` covers
        flow extraction only, not the matching phase.
    """
    if match_strategy == 'stream_nearest':
        if spool_dir is None:
            raise ValueError('stream_nearest requires spool_dir')
        return _extract_day_worker_stream_nearest(day=day, pcap_files_str=pcap_files_str, csv_rows_for_day=csv_rows_for_day, max_len=max_len, idle_timeout=idle_timeout, time_tolerance_seconds=time_tolerance_seconds, max_packets_per_pcap=max_packets_per_pcap, spool_dir=spool_dir, worker_flush_size=worker_flush_size)

    pcap_by_key: dict[tuple, list[_TokenFlow]] = defaultdict(list)
    n_pkts = 0
    t_start = _time.time()

    def _counting_iter(packets):
        """Pass packets through unchanged while counting them."""
        nonlocal n_pkts
        for pkt in packets:
            n_pkts += 1
            yield pkt

    # Phase 1: assemble every flow and bucket it under its canonical key.
    for pcap_path_str in pcap_files_str:
        packets = iter_packets(Path(pcap_path_str), max_packets=max_packets_per_pcap)
        for fl in stream_token_flows(_counting_iter(packets), idle_timeout=idle_timeout, max_len=max_len):
            pcap_by_key[_canonical_key(*fl.key_fwd)].append(fl)

    n_flows = sum((len(bucket) for bucket in pcap_by_key.values()))
    elapsed = _time.time() - t_start

    # Cost assigned to pairs that must not be matched.
    BIG = time_tolerance_seconds * 1000.0
    results: list[tuple[int, np.ndarray, int, dict]] = []
    chunk_writer = None
    if spool_dir is not None:
        chunk_writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'day-{day}', T_full=max_len, chunk_size=worker_flush_size)
    n_joined = 0
    n_collision = 0
    n_csv_keys = len(csv_rows_for_day)
    n_intersection = 0

    def _emit(row_i: int, fl: _TokenFlow) -> None:
        """Store one accepted (CSV row, flow) pair in the spool or in ``results``."""
        nonlocal n_joined
        tok = _to_fixed_tensor(fl.tokens, max_len)
        ln = min(len(fl.tokens), max_len)
        meta = _flow_meta(fl)
        if chunk_writer is None:
            results.append((row_i, tok, ln, meta))
        else:
            chunk_writer.add_csv_match(row_i, tok, ln, meta)
        n_joined += 1

    # Phase 2: match per key, visiting keys in order of their first CSV row index.
    for (ck, rows) in sorted(csv_rows_for_day.items(), key=lambda kv: kv[1][0][0]):
        # .get() avoids inserting empty buckets into the defaultdict.
        pcap_flows = pcap_by_key.get(ck)
        if pcap_flows is None:
            continue
        n_intersection += 1
        csv_ts = np.array([r[1] for r in rows], dtype=np.float64)
        pcap_ts = np.array([fl.start_ts for fl in pcap_flows], dtype=np.float64)

        if len(csv_ts) == 1 and len(pcap_ts) == 1:
            # Trivial 1:1 case: accept if the timestamps agree within tolerance.
            ts = csv_ts[0]
            fl = pcap_flows[0]
            if not np.isnan(ts) and abs(fl.start_ts - ts) <= time_tolerance_seconds:
                _emit(rows[0][0], fl)
            else:
                n_collision += 1
            continue

        cost = np.abs(csv_ts[:, None] - pcap_ts[None, :])
        cost[np.isnan(cost)] = BIG
        cost[cost > time_tolerance_seconds] = BIG
        (row_ind, col_ind) = linear_sum_assignment(cost)
        for (r, c) in zip(row_ind, col_ind):
            if cost[r, c] >= BIG:
                n_collision += 1
                continue
            _emit(rows[r][0], pcap_flows[c])

    # Sample raw time offsets so the caller can detect a clock shift.
    deltas: list[float] = []
    sampled = 0
    for (ck, rows) in csv_rows_for_day.items():
        if sampled >= 10000:
            break
        if ck not in pcap_by_key:
            continue
        (_, ts) = rows[0]
        if np.isnan(ts):
            continue
        deltas.append(pcap_by_key[ck][0].start_ts - ts)
        sampled += 1

    return {'day': day, 'results': results, 'chunks': [] if chunk_writer is None else chunk_writer.close(), 'n_joined': n_joined, 'n_pkts': n_pkts, 'n_flows': n_flows, 'elapsed': elapsed, 'n_pcap_keys': len(pcap_by_key), 'n_csv_keys': n_csv_keys, 'n_intersection': n_intersection, 'n_collision': n_collision, 'deltas': deltas, 'match_strategy': match_strategy}
|
||||
|
||||
def _extract_day_worker_stream_nearest(*, day: str, pcap_files_str: list[str], csv_rows_for_day: dict[tuple, list[tuple[int, float]]], max_len: int, idle_timeout: float, time_tolerance_seconds: float, max_packets_per_pcap: int | None, spool_dir: str, worker_flush_size: int) -> dict:
    """Streaming variant of the day worker: match each flow as soon as it is emitted.

    Every finished flow is looked up by canonical key in the CSV index and
    greedily paired with the nearest still-unused CSV row within
    ``time_tolerance_seconds`` (see ``_nearest_unused_row``). Matched flows go
    straight to the spool writer, so flows are not accumulated in memory.

    Returns:
        A dict with the same keys as ``_extract_day_worker``; ``results`` is
        always empty and ``chunks`` lists the spool files. ``deltas`` holds up
        to 10000 ``pcap_start - csv_ts`` values of accepted matches.
    """
    t_start = _time.time()
    n_pkts = 0
    n_flows = 0
    n_joined = 0
    n_collision = 0
    seen_pcap_keys: set[tuple] = set()
    intersected_keys: set[tuple] = set()
    deltas: list[float] = []
    csv_index = _build_stream_csv_index(csv_rows_for_day)
    chunk_writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'day-{day}', T_full=max_len, chunk_size=worker_flush_size)

    def _counting_iter(packets):
        """Pass packets through unchanged while counting them."""
        nonlocal n_pkts
        for pkt in packets:
            n_pkts += 1
            yield pkt

    for pcap_path_str in pcap_files_str:
        packets = iter_packets(Path(pcap_path_str), max_packets=max_packets_per_pcap)
        for fl in stream_token_flows(_counting_iter(packets), idle_timeout=idle_timeout, max_len=max_len):
            n_flows += 1
            ck = _canonical_key(*fl.key_fwd)
            seen_pcap_keys.add(ck)
            entry = csv_index.get(ck)
            if entry is None:
                continue
            intersected_keys.add(ck)
            (row_i, delta) = _nearest_unused_row(entry, float(fl.start_ts), time_tolerance_seconds)
            if row_i is None:
                # Key is known, but no free CSV row lies within tolerance.
                n_collision += 1
                continue
            tok = _to_fixed_tensor(fl.tokens, max_len)
            ln = min(len(fl.tokens), max_len)
            chunk_writer.add_csv_match(row_i, tok, ln, _flow_meta(fl))
            n_joined += 1
            if delta is not None and len(deltas) < 10000:
                deltas.append(float(delta))

    elapsed = _time.time() - t_start
    return {'day': day, 'results': [], 'chunks': chunk_writer.close(), 'n_joined': n_joined, 'n_pkts': n_pkts, 'n_flows': n_flows, 'elapsed': elapsed, 'n_pcap_keys': len(seen_pcap_keys), 'n_csv_keys': len(csv_rows_for_day), 'n_intersection': len(intersected_keys), 'n_collision': n_collision, 'deltas': deltas, 'match_strategy': 'stream_nearest'}
|
||||
|
||||
def _print_day_stats(res: dict) -> None:
|
||||
day = res['day']
|
||||
strategy = res.get('match_strategy', 'hungarian')
|
||||
print(f"[{day}] {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s match={strategy} ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
|
||||
print(f" pcap_keys={res['n_pcap_keys']:,} csv_keys={res['n_csv_keys']:,} intersection={res['n_intersection']:,} joined={int(res.get('n_joined', len(res.get('results', ())))):,} within-key-miss={res['n_collision']:,}")
|
||||
deltas = res.get('deltas') or []
|
||||
if deltas:
|
||||
arr = np.asarray(deltas, dtype=np.float64)
|
||||
print(f' time-delta (pcap_start - csv_ts), seconds: median={np.median(arr):+.2f} mean={arr.mean():+.2f} std={arr.std():.2f} p05={np.percentile(arr, 5):+.2f} p95={np.percentile(arr, 95):+.2f}')
|
||||
med = float(np.median(arr))
|
||||
if abs(med) > 2.0:
|
||||
print(f' -> median |{med:.1f}s| > 2s: rerun with --time-offset {med:.0f}')
|
||||
|
||||
def extract_dataset(*, csv_rows_by_day: dict[str, dict[tuple, list[tuple[int, float]]]], labels_by_row: np.ndarray, pcap_files_by_day: dict[str, list[Path]], out_packets: Path, out_flows: Path, out_store: Path | None=None, shard_size: int=100000, worker_flush_size: int=10000, spool_dir: Path | None=None, match_strategy: str | None=None, T_full: int=256, idle_timeout: float=120.0, time_tolerance_seconds: float=2.0, max_packets_per_pcap: int | None=None, n_jobs: int=0) -> None:
    """Join per-day pcap flows with labelled CSV rows and write the dataset.

    One task is created per day that has pcap files; tasks run in-process when
    ``n_jobs <= 1`` after defaulting, otherwise in a ``ProcessPoolExecutor``.

    Output modes:

    * ``out_store`` given: workers spill matched flows to a spool directory
      and the chunks are appended, day by day in task order, to a
      ``PacketShardWriter``. Nothing is written to ``out_packets`` /
      ``out_flows``.
    * ``out_store`` is ``None``: all matches are gathered in memory, sorted by
      CSV row index and written to ``out_packets`` (compressed npz),
      ``out_flows`` (parquet) and ``flow_features.parquet`` next to
      ``out_flows``.

    Args:
        csv_rows_by_day: day -> canonical key -> ``(csv_row_index, timestamp)`` list.
        labels_by_row: Label per CSV row, indexed by CSV row index.
        pcap_files_by_day: day -> capture files for that day.
        out_packets: Destination of the packet-token archive (non-store mode).
        out_flows: Destination of the flow metadata parquet (non-store mode).
        out_store: Destination of the sharded store, or ``None``.
        shard_size: Flows per shard of the sharded store.
        worker_flush_size: Flows per worker spool chunk.
        spool_dir: Spool directory for store mode. NOTE: an existing directory
            at this path is deleted before use, and the spool is removed again
            when extraction finishes. If ``None``, a temporary directory is
            created next to ``out_store``.
        match_strategy: ``'hungarian'`` or ``'stream_nearest'``; defaults to
            ``'stream_nearest'`` in store mode and ``'hungarian'`` otherwise.
        T_full: Maximum tokens per flow / padded tensor length.
        idle_timeout: Flow idle timeout in the units of the packet timestamps.
        time_tolerance_seconds: Maximum accepted pcap/CSV start-time difference.
        max_packets_per_pcap: Optional per-file packet cap.
        n_jobs: Worker count; ``<= 0`` means ``min(#tasks, cpu_count)``.

    Raises:
        ValueError: On an unknown ``match_strategy`` or ``'stream_nearest'``
            without ``out_store``.
        RuntimeError: If no day has pcap files or no flow could be matched.
    """
    N_csv = len(labels_by_row)
    print(f'[extract_dataset] N_csv={N_csv:,} T_full={T_full} days={sorted(csv_rows_by_day.keys())}')

    if match_strategy is None:
        match_strategy = 'stream_nearest' if out_store is not None else 'hungarian'
    if match_strategy not in ('hungarian', 'stream_nearest'):
        raise ValueError("match_strategy must be 'hungarian' or 'stream_nearest'")
    if match_strategy == 'stream_nearest' and out_store is None:
        raise ValueError('stream_nearest is only supported with --out-store')
    print(f'[extract_dataset] match_strategy={match_strategy}')

    tasks: list[tuple] = []
    for (day, rows_dict) in csv_rows_by_day.items():
        pcap_files = pcap_files_by_day.get(day, [])
        if not pcap_files:
            print(f'[{day}] NO pcap files — skipping ({len(rows_dict):,} CSV keys unmatched)')
            continue
        tasks.append((day, [str(p) for p in pcap_files], dict(rows_dict)))
    if not tasks:
        raise RuntimeError('No days with pcap files — nothing to extract.')

    if n_jobs <= 0:
        n_jobs = min(len(tasks), os.cpu_count() or 1)
    print(f'[extract_dataset] running {len(tasks)} day(s) with {n_jobs} worker(s)')

    store_writer: PacketShardWriter | None = None
    spool_root: Path | None = None
    if out_store is not None:
        print(f'[extract_dataset] sharded output enabled: {out_store} shard_size={shard_size:,}')
        store_writer = PacketShardWriter(out_store, shard_size=shard_size, T_full=T_full, D=PACKET_D, overwrite=True)
        if spool_dir is None:
            out_store_parent = Path(out_store).parent
            out_store_parent.mkdir(parents=True, exist_ok=True)
            spool_root = Path(tempfile.mkdtemp(prefix=f'.{Path(out_store).name}.spool.', dir=out_store_parent))
        else:
            # A caller-supplied spool directory is wiped to start clean.
            spool_root = Path(spool_dir)
            if spool_root.exists():
                shutil.rmtree(spool_root)
            spool_root.mkdir(parents=True, exist_ok=True)
        print(f'[extract_dataset] worker spool={spool_root} flush_size={worker_flush_size:,}')

    # In-memory accumulators, used only when no sharded store is requested.
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    row_chunks: list[np.ndarray] = []
    meta_chunks: list[list[dict]] = []
    total_joined = 0

    def _materialize_results(results: list[tuple[int, np.ndarray, int, dict]]) -> tuple[np.ndarray, np.ndarray, np.ndarray, list[dict]]:
        """Turn a worker's result tuples into arrays ordered by CSV row index."""
        results = sorted(results, key=lambda x: x[0])
        n = len(results)
        tok_arr = np.empty((n, T_full, PACKET_D), dtype=np.float32)
        len_arr = np.empty(n, dtype=np.int32)
        row_arr = np.empty(n, dtype=np.int64)
        meta_arr: list[dict] = [None] * n
        for (i, (row_i, tok, ln, meta)) in enumerate(results):
            tok_arr[i] = tok
            len_arr[i] = ln
            row_arr[i] = row_i
            meta_arr[i] = meta
        return (tok_arr, len_arr, row_arr, meta_arr)

    def _flows_from_meta(row_arr: np.ndarray, meta_arr: list[dict]) -> pd.DataFrame:
        """Build the flow table for a batch from its CSV rows and metadata dicts."""
        labels = labels_by_row[row_arr].astype(str)
        return pd.DataFrame({'label': labels, 'start_ts': np.asarray([m['start_ts'] for m in meta_arr], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_arr], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_arr], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_arr], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_arr], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_arr], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_arr], dtype=np.uint32)})

    def _append_spool_chunks(res: dict) -> None:
        """Feed every spool chunk of one worker result into the sharded store."""
        chunks = res.get('chunks') or []
        for chunk in chunks:
            tokens = np.load(chunk['tokens'], mmap_mode='r')
            meta_df = pd.read_parquet(chunk['meta'])
            if meta_df.empty:
                continue
            # Remember each row's position in the token file before sorting.
            meta_df = meta_df.assign(__token_row=np.arange(len(meta_df), dtype=np.int64))
            meta_df = meta_df.sort_values('csv_row_idx', kind='stable').reset_index(drop=True)
            row_arr = meta_df['csv_row_idx'].to_numpy(dtype=np.int64)
            lengths = meta_df['packet_length'].to_numpy(dtype=np.int32)
            order = meta_df['__token_row'].to_numpy(dtype=np.int64)
            labels = labels_by_row[row_arr].astype(str)
            flows = pd.DataFrame({'label': labels, 'start_ts': meta_df['start_ts'].to_numpy(dtype=np.float64), 'src_ip': meta_df['src_ip'].to_numpy(dtype=object), 'dst_ip': meta_df['dst_ip'].to_numpy(dtype=object), 'src_port': meta_df['src_port'].to_numpy(dtype=np.uint32), 'dst_port': meta_df['dst_port'].to_numpy(dtype=np.uint32), 'protocol': meta_df['protocol'].to_numpy(dtype=np.uint8), 'n_pkts': meta_df['n_pkts'].to_numpy(dtype=np.uint32)})
            assert store_writer is not None
            store_writer.add_batch(np.asarray(tokens[order]), lengths, flows)

    def _absorb(res: dict, *, print_stats: bool=True) -> None:
        """Consume a worker result that carries its matches in memory."""
        if print_stats:
            _print_day_stats(res)
        results = res['results']
        if not results:
            return
        (tok_arr, len_arr, row_arr, meta_arr) = _materialize_results(results)
        if store_writer is not None:
            store_writer.add_batch(tok_arr, len_arr, _flows_from_meta(row_arr, meta_arr))
        else:
            tok_chunks.append(tok_arr)
            len_chunks.append(len_arr)
            row_chunks.append(row_arr)
            meta_chunks.append(meta_arr)

    def _worker_args(i: int, day: str, pcaps: list[str], rows: dict) -> tuple:
        """Positional arguments for ``_extract_day_worker`` for task ``i``."""
        task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{day}')
        return (day, pcaps, rows, T_full, idle_timeout, time_tolerance_seconds, max_packets_per_pcap, task_spool, worker_flush_size, match_strategy)

    try:
        if n_jobs <= 1:
            for (i, (day, pcaps, rows)) in enumerate(tasks):
                res = _extract_day_worker(*_worker_args(i, day, pcaps, rows))
                _print_day_stats(res)
                total_joined += int(res.get('n_joined', 0))
                if store_writer is not None:
                    _append_spool_chunks(res)
                else:
                    _absorb(res, print_stats=False)
        else:
            with ProcessPoolExecutor(max_workers=n_jobs) as pool:
                futs = [pool.submit(_extract_day_worker, *_worker_args(i, day, pcaps, rows)) for (i, (day, pcaps, rows)) in enumerate(tasks)]
                if store_writer is not None:
                    completed: dict[str, dict] = {}
                    for fut in as_completed(futs):
                        res = fut.result()
                        _print_day_stats(res)
                        completed[res['day']] = res
                    # Append in task order so the store layout does not
                    # depend on which worker happened to finish first.
                    for (day, _, _) in tasks:
                        if day in completed:
                            total_joined += int(completed[day].get('n_joined', 0))
                            _append_spool_chunks(completed[day])
                else:
                    for fut in as_completed(futs):
                        _absorb(fut.result())
    finally:
        # The spool is temporary: remove it on success and on failure alike.
        if spool_root is not None:
            shutil.rmtree(spool_root, ignore_errors=True)

    if store_writer is not None:
        if total_joined == 0:
            raise RuntimeError('No matched flows — check timestamps (--time-offset) and pcap×CSV correspondence.')
        store_writer.close()
        print(f'[extract_dataset] wrote sharded store {out_store}')
        return

    if not tok_chunks:
        raise RuntimeError('No matched flows — check timestamps (--time-offset) and pcap×CSV correspondence.')

    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    csv_rows = np.concatenate(row_chunks, axis=0)
    meta_list: list[dict] = [m for chunk in meta_chunks for m in chunk]
    del tok_chunks, len_chunks, row_chunks, meta_chunks

    # Global order: ascending CSV row index across all days.
    order = np.argsort(csv_rows, kind='stable')
    tokens = tokens[order]
    lengths = lengths[order]
    csv_rows = csv_rows[order]
    meta_list = [meta_list[i] for i in order]

    N_matched = len(tokens)
    labels = labels_by_row[csv_rows].astype(str)
    flow_id = np.arange(N_matched, dtype=np.uint64)
    print(f'\n[extract_dataset] matched {N_matched:,}/{N_csv:,} ({100.0 * N_matched / max(N_csv, 1):.2f}%)')
    print(f'[extract_dataset] label distribution (matched rows):')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')

    out_packets.parent.mkdir(parents=True, exist_ok=True)
    # Write through an open handle: given a path, numpy appends ".npz" when
    # the suffix is missing, and the stat() below would then look at a file
    # that does not exist.
    with open(out_packets, 'wb') as packets_fh:
        np.savez_compressed(packets_fh, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'[extract_dataset] wrote {out_packets} ({out_packets.stat().st_size / 1000000000.0:.2f} GB)')

    out_flows.parent.mkdir(parents=True, exist_ok=True)
    flow_df = pd.DataFrame({'flow_id': flow_id, 'label': labels, 'start_ts': np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_list], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_list], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_list], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_list], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_list], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_list], dtype=np.uint32)})
    flow_df.to_parquet(out_flows, compression='snappy', index=False)
    print(f'[extract_dataset] wrote {out_flows} ({out_flows.stat().st_size / 1000000.0:.2f} MB)')

    _write_canonical_flow_features(tokens=tokens, lengths=lengths, flow_id=flow_id, labels=labels, out_path=out_flows.parent / 'flow_features.parquet')
|
||||
|
||||
def _write_canonical_flow_features(*, tokens: np.ndarray, lengths: np.ndarray, flow_id: np.ndarray, labels: np.ndarray, out_path: Path) -> None:
    """Compute the canonical flow-level features from packet tokens and save them.

    The output parquet has ``flow_id`` and ``label`` columns followed by one
    column per name in ``CANONICAL_FLOW_FEATURE_NAMES``; column ``i`` of the
    computed feature matrix is stored under the ``i``-th name.
    """
    # The parent of this file's directory is put on sys.path so the
    # ``common`` package can be imported.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES, compute_flow_features_from_packets

    print(f'[extract_dataset] computing canonical {len(CANONICAL_FLOW_FEATURE_NAMES)}-d flow features from packet tokens ...')
    feature_matrix = compute_flow_features_from_packets(tokens, lengths)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    table = pd.DataFrame({'flow_id': flow_id, 'label': labels})
    for (col_idx, col_name) in enumerate(CANONICAL_FLOW_FEATURE_NAMES):
        table[col_name] = feature_matrix[:, col_idx]
    table.to_parquet(out_path, compression='snappy', index=False)
    size_mb = out_path.stat().st_size / 1000000.0
    print(f'[extract_dataset] wrote {out_path} ({size_mb:.2f} MB)')
|
||||
|
||||
def _extract_single_pcap_worker(pcap_path_str: str, label: str, extra: dict, max_len: int, idle_timeout: float, max_packets_per_pcap: int | None, spool_dir: str | None=None, worker_flush_size: int=10000) -> dict:
    """Extract every flow from one capture file whose label is known up front.

    Args:
        pcap_path_str: Path of the capture file.
        label: Label applied to every flow of this file.
        extra: Additional per-file values; stored with each flow in spool mode
            and returned unchanged in the result.
        max_len: Maximum tokens per flow and the padded tensor length.
        idle_timeout: Flow idle timeout passed to ``stream_token_flows``.
        max_packets_per_pcap: Optional packet cap for this file.
        spool_dir: If given, flows are written there in chunks instead of
            being returned in ``results``.
        worker_flush_size: Flows per spool chunk.

    Returns:
        A dict with ``pcap``, ``label``, ``extra``, the in-memory ``results``
        (``(tokens, length, meta)`` tuples; empty in spool mode), the spool
        ``chunks``, and the ``n_pkts`` / ``n_flows`` / ``elapsed`` counters.
    """
    t_start = _time.time()
    n_pkts = 0
    n_flows = 0
    results: list[tuple[np.ndarray, int, dict]] = []
    chunk_writer = None
    if spool_dir is not None:
        chunk_writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'pcap-{Path(pcap_path_str).stem}', T_full=max_len, chunk_size=worker_flush_size)

    def _counting_iter(packets):
        """Pass packets through unchanged while counting them."""
        nonlocal n_pkts
        for pkt in packets:
            n_pkts += 1
            yield pkt

    packets = iter_packets(Path(pcap_path_str), max_packets=max_packets_per_pcap)
    for fl in stream_token_flows(_counting_iter(packets), idle_timeout=idle_timeout, max_len=max_len):
        # Shared helper keeps the metadata schema identical to the day workers.
        meta = _flow_meta(fl)
        tok = _to_fixed_tensor(fl.tokens, max_len)
        ln = min(len(fl.tokens), max_len)
        if chunk_writer is not None:
            chunk_writer.add_labeled(tok, ln, meta, label, extra)
        else:
            results.append((tok, ln, meta))
        n_flows += 1

    elapsed = _time.time() - t_start
    return {'pcap': pcap_path_str, 'label': label, 'extra': extra, 'results': results, 'chunks': [] if chunk_writer is None else chunk_writer.close(), 'n_pkts': n_pkts, 'n_flows': n_flows, 'elapsed': elapsed}
|
||||
|
||||
def extract_labeled_pcaps(*, pcap_files_with_labels: list[tuple[Path, str, dict]], out_packets: Path, out_flows: Path, out_store: Path | None=None, shard_size: int=100000, worker_flush_size: int=10000, spool_dir: Path | None=None, T_full: int=256, idle_timeout: float=120.0, max_packets_per_pcap: int | None=None, n_jobs: int=0, extra_column_names: tuple[str, ...]=()) -> None:
    """Extract flows from labeled pcaps into a sharded store or monolithic artifacts.

    Every ``(pcap, label, extra)`` entry is processed by
    ``_extract_single_pcap_worker`` -- in this process when ``n_jobs <= 1``,
    otherwise in a ``ProcessPoolExecutor``.

    * Sharded mode (``out_store`` given): results are appended to a
      ``PacketShardWriter`` and the function returns after closing it;
      ``out_packets`` / ``out_flows`` are not written.
    * Monolithic mode: all results are held in memory, sorted by
      (label, src_ip, dst_ip, src_port, dst_port, protocol, start_ts), given
      sequential ``flow_id`` values, and written to ``out_packets``
      (``np.savez_compressed``), ``out_flows`` (snappy parquet) and a
      ``flow_features.parquet`` next to ``out_flows``.

    Args:
        pcap_files_with_labels: ``(pcap path, label, extra dict)`` triples.
        out_packets: Destination ``.npz`` (monolithic mode only).
        out_flows: Destination flows parquet (monolithic mode only).
        out_store: Directory for the sharded store; enables sharded mode.
        shard_size: Passed to ``PacketShardWriter``.
        worker_flush_size: Forwarded to the worker.
        spool_dir: Worker spool directory for sharded mode. If it exists it is
            deleted and recreated. When omitted, a temp dir is created next to
            ``out_store``. Either way the spool is removed when extraction ends.
        T_full: Token sequence length per flow.
        idle_timeout: Forwarded to the worker.
        max_packets_per_pcap: Forwarded to the worker.
        n_jobs: Worker count; ``<= 0`` means ``min(n_pcaps, cpu_count)``.
        extra_column_names: Keys of each ``extra`` dict to emit as extra flow
            columns (missing keys become ``''``).

    Raises:
        RuntimeError: If no flows were produced.
    """
    N_pcap = len(pcap_files_with_labels)
    print(f'[extract_labeled_pcaps] n_pcaps={N_pcap} T_full={T_full} extra_cols={extra_column_names}')
    # Preview at most the first ten inputs.
    for (p, lbl, extra) in pcap_files_with_labels[:10]:
        print(f' {lbl:<20s} {Path(p).name:<60s} extra={extra}')
    if N_pcap > 10:
        print(f' ... ({N_pcap - 10} more)')
    if n_jobs <= 0:
        n_jobs = min(N_pcap, os.cpu_count() or 1)
    print(f'[extract_labeled_pcaps] running {N_pcap} pcap(s) with {n_jobs} worker(s)')
    store_writer: PacketShardWriter | None = None
    spool_root: Path | None = None
    if out_store is not None:
        print(f'[extract_labeled_pcaps] sharded output enabled: {out_store} shard_size={shard_size:,}')
        store_writer = PacketShardWriter(out_store, shard_size=shard_size, T_full=T_full, D=PACKET_D, overwrite=True)
        if spool_dir is None:
            # Spool next to the store, in a hidden temp directory.
            out_store_parent = Path(out_store).parent
            out_store_parent.mkdir(parents=True, exist_ok=True)
            spool_root = Path(tempfile.mkdtemp(prefix=f'.{Path(out_store).name}.spool.', dir=out_store_parent))
        else:
            # A caller-supplied spool directory is wiped before use.
            spool_root = Path(spool_dir)
            if spool_root.exists():
                shutil.rmtree(spool_root)
            spool_root.mkdir(parents=True, exist_ok=True)
        print(f'[extract_labeled_pcaps] worker spool={spool_root} flush_size={worker_flush_size:,}')
    # In-memory accumulators, used only in monolithic mode.
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    meta_chunks: list[list[dict]] = []
    label_chunks: list[np.ndarray] = []
    extra_chunks: list[dict[str, list]] = []
    total_flows = 0

    def _flows_for_labeled_chunk(res: dict, meta_arr: list[dict], n: int) -> pd.DataFrame:
        """Build the per-flow metadata frame for ``n`` in-memory results of one pcap."""
        cols = {'label': np.full(n, res['label'], dtype=object), 'start_ts': np.asarray([m['start_ts'] for m in meta_arr], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_arr], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_arr], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_arr], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_arr], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_arr], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_arr], dtype=np.uint32)}
        for col in extra_column_names:
            cols[col] = np.full(n, res['extra'].get(col, ''), dtype=object)
        return pd.DataFrame(cols)

    def _append_labeled_spool_chunks(res: dict) -> None:
        """Feed every spooled chunk listed in ``res['chunks']`` into the shard writer."""
        chunks = res.get('chunks') or []
        for chunk in chunks:
            tokens = np.load(chunk['tokens'], mmap_mode='r')
            flows = pd.read_parquet(chunk['meta'])
            if flows.empty:
                continue
            # Remember each row's position in the token file so tokens can be
            # reordered to match the sorted metadata.
            flows = flows.assign(__token_row=np.arange(len(flows), dtype=np.int64))
            sort_keys = ['label', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'start_ts']
            flows = flows.sort_values(sort_keys, kind='stable').reset_index(drop=True)
            order = flows['__token_row'].to_numpy(dtype=np.int64)
            lengths = flows['packet_length'].to_numpy(dtype=np.int32)
            flows = flows.drop(columns=['packet_length', '__token_row'])
            assert store_writer is not None
            store_writer.add_batch(np.asarray(tokens[order]), lengths, flows)

    def _absorb(res: dict, *, print_stats: bool=True) -> None:
        """Consume one worker result's in-memory ``results`` list.

        Sharded mode: sort and append to the shard writer. Monolithic mode:
        stash arrays in the accumulators for the final concatenate.
        """
        pcap_name = Path(res['pcap']).name
        lbl = res['label']
        extra = res['extra']
        if print_stats:
            print(f"[pcap:{pcap_name}] label={lbl} {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
        if not res['results']:
            return
        n = len(res['results'])
        tok_arr = np.empty((n, T_full, PACKET_D), dtype=np.float32)
        len_arr = np.empty(n, dtype=np.int32)
        meta_arr: list[dict] = [None] * n
        for (i, (tok, ln, meta)) in enumerate(res['results']):
            tok_arr[i] = tok
            len_arr[i] = ln
            meta_arr[i] = meta
        if store_writer is not None:
            flows = _flows_for_labeled_chunk(res, meta_arr, n)
            # np.lexsort treats the LAST key as primary, so label is the major key.
            order = np.lexsort((flows['start_ts'].to_numpy(dtype=np.float64), flows['protocol'].to_numpy(dtype=np.int64), flows['dst_port'].to_numpy(dtype=np.int64), flows['src_port'].to_numpy(dtype=np.int64), flows['dst_ip'].to_numpy(dtype=object), flows['src_ip'].to_numpy(dtype=object), flows['label'].to_numpy(dtype=object)))
            store_writer.add_batch(tok_arr[order], len_arr[order], flows.iloc[order].reset_index(drop=True))
        else:
            tok_chunks.append(tok_arr)
            len_chunks.append(len_arr)
            meta_chunks.append(meta_arr)
            label_chunks.append(np.full(n, lbl, dtype=object))
            ex: dict[str, list] = {}
            for col in extra_column_names:
                val = extra.get(col, '')
                ex[col] = [val] * n
            extra_chunks.append(ex)
    if n_jobs <= 1:
        # Sequential path: run each pcap in this process.
        try:
            for (i, (p, lbl, extra)) in enumerate(pcap_files_with_labels):
                task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{Path(p).stem}')
                res = _extract_single_pcap_worker(str(p), lbl, extra, T_full, idle_timeout, max_packets_per_pcap, task_spool, worker_flush_size)
                _absorb(res)
                total_flows += int(res.get('n_flows', 0))
                if store_writer is not None:
                    _append_labeled_spool_chunks(res)
        finally:
            # The spool is scratch space; remove it even on failure.
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    else:
        try:
            with ProcessPoolExecutor(max_workers=n_jobs) as pool:
                futs = []
                for (i, (p, lbl, extra)) in enumerate(pcap_files_with_labels):
                    task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{Path(p).stem}')
                    futs.append(pool.submit(_extract_single_pcap_worker, str(p), lbl, extra, T_full, idle_timeout, max_packets_per_pcap, task_spool, worker_flush_size))
                if store_writer is not None:
                    # Collect in completion order, then append in input order
                    # so the store layout does not depend on worker timing.
                    # NOTE(review): results are keyed by pcap path, so a path
                    # listed twice would be appended twice from the same
                    # result -- confirm inputs are unique.
                    # NOTE(review): unlike the sequential path, res['results']
                    # is not absorbed here -- presumably empty whenever a
                    # spool is used; verify against the worker.
                    completed: dict[str, dict] = {}
                    for fut in as_completed(futs):
                        res = fut.result()
                        pcap_name = Path(res['pcap']).name
                        print(f"[pcap:{pcap_name}] label={res['label']} {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
                        completed[str(res['pcap'])] = res
                    for (p, _, _) in pcap_files_with_labels:
                        res = completed.get(str(p))
                        if res is not None:
                            total_flows += int(res.get('n_flows', 0))
                            _append_labeled_spool_chunks(res)
                else:
                    for fut in as_completed(futs):
                        _absorb(fut.result())
        finally:
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    if store_writer is not None:
        # Sharded mode ends here; the monolithic artifacts are not produced.
        if total_flows == 0:
            raise RuntimeError('No flows emitted — check pcap contents.')
        store_writer.close()
        print(f'[extract_labeled_pcaps] wrote sharded store {out_store}')
        return
    if not tok_chunks:
        raise RuntimeError('No flows emitted — check pcap contents.')
    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    meta_list: list[dict] = [m for chunk in meta_chunks for m in chunk]
    labels = np.concatenate(label_chunks, axis=0)
    extra_dict: dict[str, list] = {col: [] for col in extra_column_names}
    for chunk in extra_chunks:
        for col in extra_column_names:
            extra_dict[col].extend(chunk[col])
    # Drop the per-pcap chunk lists now that they are merged.
    del tok_chunks, len_chunks, meta_chunks, label_chunks, extra_chunks
    sip_arr = np.asarray([m['src_ip'] for m in meta_list], dtype=object)
    dip_arr = np.asarray([m['dst_ip'] for m in meta_list], dtype=object)
    sp_arr = np.asarray([m['src_port'] for m in meta_list], dtype=np.int64)
    dp_arr = np.asarray([m['dst_port'] for m in meta_list], dtype=np.int64)
    pr_arr = np.asarray([m['protocol'] for m in meta_list], dtype=np.int64)
    ts_arr = np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64)
    # Global ordering: label is the primary key (last lexsort key), start_ts the minor one.
    order = np.lexsort((ts_arr, pr_arr, dp_arr, sp_arr, dip_arr, sip_arr, labels))
    tokens = tokens[order]
    lengths = lengths[order]
    labels = labels[order]
    meta_list = [meta_list[i] for i in order]
    for col in extra_column_names:
        extra_dict[col] = [extra_dict[col][i] for i in order]
    N = len(tokens)
    # flow_id is simply the row position after sorting.
    flow_id = np.arange(N, dtype=np.uint64)
    print(f'\n[extract_labeled_pcaps] total flows: {N:,}')
    print(f'[extract_labeled_pcaps] label distribution:')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')
    out_packets.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): a script elsewhere in this repository reports that
    # np.savez_compressed produced a corrupt zip for a 50+ GB array -- confirm
    # the sizes reaching this call, or prefer sharded mode for large inputs.
    np.savez_compressed(out_packets, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'[extract_labeled_pcaps] wrote {out_packets} ({out_packets.stat().st_size / 1000000000.0:.2f} GB)')
    out_flows.parent.mkdir(parents=True, exist_ok=True)
    cols = {'flow_id': flow_id, 'label': labels.astype(str), 'start_ts': np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_list], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_list], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_list], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_list], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_list], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_list], dtype=np.uint32)}
    for col in extra_column_names:
        cols[col] = np.asarray(extra_dict[col], dtype=object)
    flow_df = pd.DataFrame(cols)
    flow_df.to_parquet(out_flows, compression='snappy', index=False)
    print(f'[extract_labeled_pcaps] wrote {out_flows} ({out_flows.stat().st_size / 1000000.0:.2f} MB) cols={list(flow_df.columns)}')
    _write_canonical_flow_features(tokens=tokens, lengths=lengths, flow_id=flow_id, labels=labels.astype(str), out_path=out_flows.parent / 'flow_features.parquet')
|
||||
97
scripts/generate_flow_features.py
Normal file
97
scripts/generate_flow_features.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES, compute_flow_features_from_packets
|
||||
|
||||
def _from_npz(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compute canonical flow features from a monolithic ``packets.npz``.

    Reads ``packet_tokens`` / ``packet_lengths`` (and ``flow_id`` when present)
    from ``args.packets_npz``, optionally truncates to the first ``args.T``
    positions, and checks row alignment against ``args.flows_parquet``.

    Returns:
        ``(features, flow_id, labels, T)`` where ``T`` is a one-element array
        holding the sequence length actually used.

    Raises:
        ValueError: If ``args.T`` is non-positive or exceeds the stored length,
            if row counts differ, or if the two ``flow_id`` columns disagree.
    """
    print(f'[read] {args.packets_npz}')
    # NpzFile keeps the archive handle open; the context manager releases it
    # once the arrays have been read into memory.
    with np.load(args.packets_npz) as pz:
        tokens = pz['packet_tokens']
        lens = pz['packet_lengths'].astype(np.int32)
        packet_flow_id = pz['flow_id'] if 'flow_id' in pz.files else None
    T_stored = tokens.shape[1]
    if args.T is not None:
        # A non-positive T would slice from the end and clamp lengths to a
        # negative value instead of failing.
        if args.T < 1:
            raise ValueError(f'requested T={args.T} must be >= 1')
        if args.T > T_stored:
            raise ValueError(f'requested T={args.T} > stored {T_stored}')
        tokens = tokens[:, :args.T, :]
        lens = np.minimum(lens, args.T).astype(np.int32)
    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label'])
    if len(flows) != len(tokens):
        raise ValueError(f'row count mismatch: packets={len(tokens):,} flows={len(flows):,}')
    flow_id = np.asarray(flows['flow_id'].to_numpy(), dtype=np.uint64)
    if packet_flow_id is not None:
        if not np.array_equal(flow_id, packet_flow_id.astype(np.uint64)):
            raise ValueError('packets.npz flow_id != flows.parquet flow_id')
    labels = flows['label'].astype(str).to_numpy()
    print(f'[compute] {len(tokens):,} flows × T={tokens.shape[1]} → {len(CANONICAL_FLOW_FEATURE_NAMES)} features ...')
    t0 = time.time()
    feats = compute_flow_features_from_packets(tokens, lens)
    dt = time.time() - t0
    print(f'[compute] {dt:.1f}s ({len(tokens) / max(dt, 1e-06):.0f} flows/s)')
    return (feats, flow_id, labels, np.array([T_stored if args.T is None else args.T]))
|
||||
|
||||
def _from_store(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compute canonical flow features by streaming batches from a PacketShardStore.

    The store at ``args.source_store`` must have the same rows, in the same
    ``flow_id`` order, as ``args.flows_parquet``. Packets are read
    ``args.batch`` flows at a time to bound memory.

    Returns:
        ``(features, flow_id, labels, T)`` where ``T`` is a one-element array
        holding the sequence length used.

    Raises:
        ValueError: If ``args.batch`` or ``args.T`` is non-positive, if ``T``
            exceeds the stored maximum, or if store and parquet rows disagree.
    """
    # A negative batch makes range() yield nothing, which would silently
    # return an all-zero feature matrix.
    if args.batch < 1:
        raise ValueError(f'--batch must be >= 1, got {args.batch}')
    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'Packet_CFM'))
    from packet_store import PacketShardStore
    store = PacketShardStore.open(args.source_store)
    # Longest packet_length recorded in the store manifest.
    T_stored = int(store.manifest['packet_length'].max())
    T = args.T if args.T is not None else T_stored
    if T < 1:
        raise ValueError(f'requested T={T} must be >= 1')
    if T > T_stored:
        raise ValueError(f'requested T={T} > stored max {T_stored}')
    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label'])
    n = len(flows)
    store_flows = store.read_flows(columns=['flow_id'])
    if len(store_flows) != n:
        raise ValueError(f'store has {len(store_flows):,} rows but flows.parquet has {n:,}')
    if not np.array_equal(store_flows['flow_id'].to_numpy(dtype=np.uint64), flows['flow_id'].to_numpy(dtype=np.uint64)):
        raise ValueError('store flow_id ordering differs from flows.parquet')
    flow_id = flows['flow_id'].to_numpy(dtype=np.uint64)
    labels = flows['label'].astype(str).to_numpy()
    feats = np.zeros((n, len(CANONICAL_FLOW_FEATURE_NAMES)), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (full={T_stored}), batch={args.batch} ...')
    t0 = time.time()
    all_idx = np.arange(n, dtype=np.int64)
    for start in range(0, n, args.batch):
        end = min(start + args.batch, n)
        idx = all_idx[start:end]
        (tok, lens) = store.read_packets(idx, T=T)
        lens = np.minimum(lens, T).astype(np.int32)
        feats[start:end] = compute_flow_features_from_packets(tok, lens)
        # Progress line every 20th batch and on the final one.
        if start // args.batch % 20 == 0 or end == n:
            dt = time.time() - t0
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)
    return (feats, flow_id, labels, np.array([T]))
|
||||
|
||||
def main() -> None:
    """CLI entry point: compute canonical flow features and write them as parquet.

    Exactly one packet source (``--packets-npz`` or ``--source-store``) must
    be supplied. The output has ``flow_id``, ``label`` and one column per
    canonical feature name.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--packets-npz', type=Path, default=None, help='Monolithic packets.npz path (mutually exclusive with --source-store).')
    parser.add_argument('--source-store', type=Path, default=None, help='PacketShardStore directory (mutually exclusive with --packets-npz).')
    parser.add_argument('--flows-parquet', type=Path, required=True)
    parser.add_argument('--out', type=Path, required=True)
    parser.add_argument('--T', type=int, default=None, help='Truncate packet sequences to first T positions (default: use stored T_full).')
    parser.add_argument('--batch', type=int, default=100000, help='Batch size when streaming from --source-store.')
    opts = parser.parse_args()

    have_npz = opts.packets_npz is not None
    have_store = opts.source_store is not None
    # Both given or both missing is a usage error.
    if have_npz == have_store:
        parser.error('pass exactly one of --packets-npz or --source-store')

    loader = _from_npz if have_npz else _from_store
    feats, flow_id, labels, _ = loader(opts)

    opts.out.parent.mkdir(parents=True, exist_ok=True)
    columns = {'flow_id': flow_id, 'label': labels}
    for position, feature_name in enumerate(CANONICAL_FLOW_FEATURE_NAMES):
        columns[feature_name] = feats[:, position]
    table = pd.DataFrame(columns)
    table.to_parquet(opts.out, compression='snappy', index=False)

    size_mb = opts.out.stat().st_size / 1000000.0
    print(f'[write] {opts.out} ({size_mb:.2f} MB, {len(table):,} rows × {len(table.columns)} cols)')


if __name__ == '__main__':
    main()
|
||||
122
scripts/generate_spectral_features.py
Normal file
122
scripts/generate_spectral_features.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES
|
||||
|
||||
def compute_spectral_features(packet_tokens: np.ndarray, packet_lengths: np.ndarray, n_bands: int=8) -> np.ndarray:
    """Return the lowest ``n_bands`` real-FFT coefficients of the first two token channels.

    Positions at or beyond each flow's ``packet_lengths`` entry are zeroed
    before the transform, so padding does not contribute.

    Args:
        packet_tokens: Array of shape ``(N, T, D)`` with ``D >= 2``. Channels 0
            and 1 are the ones transformed (named "size" and "iat" here --
            presumably packet size and inter-arrival time; confirm against the
            token layout).
        packet_lengths: Valid length per flow, shape ``(N,)``.
        n_bands: Number of low-frequency bins to keep, between 0 and
            ``T // 2 + 1`` inclusive.

    Returns:
        ``float32`` array of shape ``(N, 4 * n_bands)`` laid out as
        ``[size_re | size_im | iat_re | iat_im]``, each ``n_bands`` wide.

    Raises:
        ValueError: If ``n_bands`` is negative or exceeds the available bins.
    """
    # A negative count would be taken as "all but the last k bins" by the
    # slice below and yield a wrongly shaped result without any error.
    if n_bands < 0:
        raise ValueError(f'n_bands={n_bands} must be non-negative')
    (_, T, _) = packet_tokens.shape
    # rfft over T samples yields T // 2 + 1 bins; check before the transform.
    n_bins = T // 2 + 1
    if n_bands > n_bins:
        raise ValueError(f'n_bands={n_bands} > {n_bins} available bins')
    packet_lengths = np.asarray(packet_lengths)
    mask = (np.arange(T)[None, :] < packet_lengths[:, None]).astype(np.float32)
    sig = packet_tokens[..., :2].astype(np.float32) * mask[..., None]
    Z_K = np.fft.rfft(sig, axis=1)[:, :n_bands]
    size_re = Z_K[..., 0].real.astype(np.float32)
    size_im = Z_K[..., 0].imag.astype(np.float32)
    iat_re = Z_K[..., 1].real.astype(np.float32)
    iat_im = Z_K[..., 1].imag.astype(np.float32)
    return np.concatenate([size_re, size_im, iat_re, iat_im], axis=1)
|
||||
|
||||
def _spectral_column_names(n_bands: int) -> list[str]:
|
||||
cols: list[str] = []
|
||||
for prefix in ('spec_size_re', 'spec_size_im', 'spec_iat_re', 'spec_iat_im'):
|
||||
for k in range(n_bands):
|
||||
cols.append(f'{prefix}_K{k}')
|
||||
return cols
|
||||
|
||||
def _from_npz(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute spectral features from a monolithic ``packets.npz``.

    Reads ``packet_tokens`` / ``packet_lengths`` (and ``flow_id`` when present)
    from ``args.packets_npz`` and optionally truncates to the first ``args.T``
    positions before the FFT.

    Returns:
        ``(spectral_features, flow_id_or_None, lengths)``.

    Raises:
        ValueError: If ``args.T`` is non-positive or exceeds the stored length.
    """
    print(f'[read] {args.packets_npz}')
    # Close the archive handle as soon as the arrays are in memory.
    with np.load(args.packets_npz) as pz:
        tokens = pz['packet_tokens']
        lens = pz['packet_lengths'].astype(np.int32)
        flow_id = pz['flow_id'].astype(np.uint64) if 'flow_id' in pz.files else None
    if args.T is not None:
        # A non-positive T would slice from the end and produce negative lengths.
        if args.T < 1:
            raise ValueError(f'requested T={args.T} must be >= 1')
        if args.T > tokens.shape[1]:
            raise ValueError(f'requested T={args.T} > stored {tokens.shape[1]}')
        tokens = tokens[:, :args.T, :]
        lens = np.minimum(lens, args.T).astype(np.int32)
    print(f'[compute] {len(tokens):,} flows × T={tokens.shape[1]} → {4 * args.n_bands} spectral cols ...')
    t0 = time.time()
    spec = compute_spectral_features(tokens, lens, n_bands=args.n_bands)
    print(f'[compute] {time.time() - t0:.1f}s')
    return (spec, flow_id, lens)
|
||||
|
||||
def _from_store(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Compute spectral features by streaming batches from a PacketShardStore.

    Returns:
        ``(spectral_features, flow_id, None)``. The third slot mirrors the
        npz loader's signature; lengths are not collected here.

    Raises:
        ValueError: If ``args.batch`` or ``args.T`` is non-positive, or ``T``
            exceeds the stored maximum.
    """
    # A negative batch makes range() yield nothing, which would silently
    # return an all-zero feature matrix.
    if args.batch < 1:
        raise ValueError(f'--batch must be >= 1, got {args.batch}')
    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'Packet_CFM'))
    from packet_store import PacketShardStore
    store = PacketShardStore.open(args.source_store)
    # Longest packet_length recorded in the store manifest.
    T_stored = int(store.manifest['packet_length'].max())
    T = args.T if args.T is not None else T_stored
    if T < 1:
        raise ValueError(f'requested T={T} must be >= 1')
    if T > T_stored:
        raise ValueError(f'requested T={T} > stored max {T_stored}')
    store_flows = store.read_flows(columns=['flow_id'])
    n = len(store_flows)
    flow_id = store_flows['flow_id'].to_numpy(dtype=np.uint64)
    spec = np.zeros((n, 4 * args.n_bands), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (full={T_stored}), batch={args.batch} ...')
    t0 = time.time()
    all_idx = np.arange(n, dtype=np.int64)
    for start in range(0, n, args.batch):
        end = min(start + args.batch, n)
        idx = all_idx[start:end]
        (tok, lens) = store.read_packets(idx, T=T)
        lens = np.minimum(lens, T).astype(np.int32)
        spec[start:end] = compute_spectral_features(tok, lens, n_bands=args.n_bands)
        # Progress line every 20th batch and on the final one.
        if start // args.batch % 20 == 0 or end == n:
            dt = time.time() - t0
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)
    return (spec, flow_id, None)
|
||||
|
||||
def main() -> None:
    """CLI entry point: append spectral columns to an existing feature parquet.

    Loads ``flow_id`` / ``label`` from the flows parquet, checks the base
    feature parquet is row-aligned with it, computes spectral features from
    exactly one packet source, and writes ``flow_id``, ``label``, the
    canonical features and the spectral columns to ``--out``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--packets-npz', type=Path, default=None, help='Monolithic packets.npz path (mutually exclusive with --source-store).')
    parser.add_argument('--source-store', type=Path, default=None, help='PacketShardStore directory (mutually exclusive with --packets-npz).')
    parser.add_argument('--flows-parquet', type=Path, required=True, help='flows.parquet for flow_id + label.')
    parser.add_argument('--base-features', type=Path, required=True, help='Existing canonical flow_features.parquet (20-d).')
    parser.add_argument('--out', type=Path, required=True)
    parser.add_argument('--n-bands', type=int, default=8)
    parser.add_argument('--T', type=int, default=None, help='Truncate to first T packets before FFT (default: stored T).')
    parser.add_argument('--batch', type=int, default=100000)
    opts = parser.parse_args()

    use_npz = opts.packets_npz is not None
    # Both given or both missing is a usage error.
    if use_npz == (opts.source_store is not None):
        parser.error('pass exactly one of --packets-npz or --source-store')

    print(f'[read] {opts.flows_parquet}')
    flows = pd.read_parquet(opts.flows_parquet, columns=['flow_id', 'label'])
    n_rows = len(flows)
    reference_ids = flows['flow_id'].to_numpy(dtype=np.uint64)
    labels = flows['label'].astype(str).to_numpy()

    print(f'[read] {opts.base_features}')
    base = pd.read_parquet(opts.base_features)
    if len(base) != n_rows:
        raise ValueError(f'base features rows {len(base):,} != flows rows {n_rows:,}')
    base_has_ids = 'flow_id' in base.columns
    if base_has_ids and not np.array_equal(base['flow_id'].to_numpy(dtype=np.uint64), reference_ids):
        raise ValueError('base flow_id != flows flow_id (row alignment broken)')

    loader = _from_npz if use_npz else _from_store
    spec, packet_ids, _ = loader(opts)
    if packet_ids is not None and not np.array_equal(packet_ids, reference_ids):
        raise ValueError('packet flow_id != flows flow_id')

    # Collect columns in output order: ids, label, canonical, spectral.
    columns = {'flow_id': reference_ids, 'label': labels}
    for feature_name in CANONICAL_FLOW_FEATURE_NAMES:
        if feature_name not in base.columns:
            raise ValueError(f'base parquet missing canonical feature {feature_name!r}')
        columns[feature_name] = base[feature_name].to_numpy(dtype=np.float32)
    for position, spec_name in enumerate(_spectral_column_names(opts.n_bands)):
        columns[spec_name] = spec[:, position]
    table = pd.DataFrame(columns)

    opts.out.parent.mkdir(parents=True, exist_ok=True)
    table.to_parquet(opts.out, compression='snappy', index=False)
    size_mb = opts.out.stat().st_size / 1000000.0
    print(f'[write] {opts.out} ({size_mb:.2f} MB, {len(table):,} rows × {len(table.columns)} cols, +{4 * opts.n_bands} spectral)')


if __name__ == '__main__':
    main()
|
||||
132
scripts/iscxtor_companion.sh
Executable file
132
scripts/iscxtor_companion.sh
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/bin/bash
|
||||
# Companion to scripts/repr_experiment.sh.
|
||||
#
|
||||
# Timeline:
|
||||
# Phase I (parallel with main pipeline after S2b):
|
||||
# extract ISCXTor2016 pcaps into unified artifacts.
|
||||
# CPU-bound, coexists with GPU training of E0/E1/E2.
|
||||
# Phase II (after main pipeline DONE):
|
||||
# for each trained model (E0, E1, E2), run detect + per_class
|
||||
# against ISCXTor2016 (benign=nontor, attack=tor), emitting
|
||||
# `iscxtor_eval/` subdir per model.
|
||||
# Phase III : unified summary across both transfer targets.
|
||||
#
|
||||
# Log layout (reused from main repr_experiment):
|
||||
# $MAIN_DIR/companion.log — this script's orchestration log
|
||||
# $MAIN_DIR/companion_iscxtor_extract.log
|
||||
# $MAIN_DIR/<tag>/iscxtor_eval/ — per-model ISCXTor2016 results
|
||||
|
||||
# No `-e`: failures are handled explicitly (run_stage exits on a failed stage).
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Without `set -e` a failed cd would let every relative path below resolve
# against whatever directory the script was launched from.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

# Run directory of the main repr_experiment this companion attaches to.
# Defaults to the run it was written for; override by exporting MAIN_DIR.
MAIN_DIR="${MAIN_DIR:-runs/repr_experiment_20260423_092147}"
MAIN_LOG="$MAIN_DIR/orch.log"
COMP_LOG="$MAIN_DIR/companion.log"
mkdir -p "$MAIN_DIR" || { echo "ERROR: cannot create $MAIN_DIR" >&2; exit 1; }
# Mirror all further stdout/stderr into the companion log (append mode).
exec > >(tee -a "$COMP_LOG") 2>&1

N_VAL=20000
N_ATK=20000 # ISCXTor2016 has fewer attack flows than CICDDoS2019
SPLIT_SEED=42

echo "========================================================================"
echo "= $(date): iscxtor_companion START ="
echo "= main dir: $MAIN_DIR ="
echo "========================================================================"
|
||||
|
||||
#######################################
# Block until a regex appears in a log file, polling once a minute.
# Arguments:
#   $1 - grep basic-regex pattern to look for
#   $2 - log file to poll (may not exist yet)
#   $3 - human-readable description used in progress messages
# Outputs:
#   Start/finish lines and a heartbeat every 600 s, on stdout.
# Returns:
#   0 once the pattern is found; never returns otherwise.
#######################################
wait_for_pattern() {
  local pattern=$1 log=$2 desc=$3
  local waited=0

  echo ">>> $(date): waiting for '$desc' (pattern='$pattern' in $log)"
  # `-e` and `--` stop a pattern or path beginning with '-' from being parsed
  # as a grep option; that error would be hidden by 2>/dev/null and the loop
  # would never end. stderr is silenced because the log may not exist yet.
  while ! grep -q -e "$pattern" -- "$log" 2>/dev/null; do
    sleep 60
    waited=$((waited + 60))
    if (( waited % 600 == 0 )); then
      echo " [heartbeat $(date +%H:%M:%S)] waited ${waited}s for $desc"
    fi
  done
  echo "<<< $(date): '$desc' detected after ${waited}s wait"
}
|
||||
|
||||
#######################################
# Run one named stage, capturing its output in a per-stage log.
# Globals:
#   MAIN_DIR (read) - directory that receives <name>.log
# Arguments:
#   $1   - stage name (also the log file stem)
#   $2.. - command and arguments to execute
# Outputs:
#   START / OK / FAILED lines plus a tail of the stage log, on stdout.
# Returns:
#   Exits the whole script with status 1 if the command fails.
#######################################
run_stage() {
  local stage=$1
  shift
  local stage_log="$MAIN_DIR/${stage}.log"
  local began ended status=0

  echo ""
  echo ">>> $(date): [$stage] START"
  began=$(date +%s)
  "$@" > "$stage_log" 2>&1 || status=$?
  ended=$(date +%s)

  if (( status != 0 )); then
    echo "!!! $(date): [$stage] FAILED after $((ended-began))s — see $stage_log"
    tail -30 "$stage_log"
    exit 1
  fi

  echo "<<< $(date): [$stage] OK in $((ended-began))s (log: $stage_log)"
  # Show the last few log lines, prefixed so they stand out.
  tail -6 "$stage_log" | sed 's/^/ | /'
}
|
||||
|
||||
# =====================================================================
# Phase I — ISCXTor2016 extraction (after S2b, parallel with main training)
# =====================================================================

# Start only once the main pipeline has logged the S2b stage as OK.
s2b_done_pattern="s2b_extract_cicddos2019_01-12.*OK"
wait_for_pattern "$s2b_done_pattern" "$MAIN_LOG" \
  "S2b CICDDoS2019 01-12 extraction to complete"

# Run at reduced CPU priority and idle I/O class.
extract_cmd=(
  nice -n 10 ionice -c 3
  uv run python scripts/extract_iscxtor2016.py
  --skip-decompress --jobs 6
)
run_stage "companion_iscxtor_extract" "${extract_cmd[@]}"

# =====================================================================
# Phase II — wait for main pipeline DONE, then detect + per_class
# =====================================================================

main_done_pattern="repr_experiment DONE"
wait_for_pattern "$main_done_pattern" "$MAIN_LOG" \
  "main repr_experiment to finish (S7 summary)"
|
||||
|
||||
#######################################
# Evaluate one trained model on ISCXTor2016 (detect, then per-class).
# Globals:
#   MAIN_DIR, N_VAL, N_ATK, SPLIT_SEED (read)
# Arguments:
#   $1 - model tag; the model is expected at $MAIN_DIR/<tag>/model.pt
# Outputs:
#   Results under $MAIN_DIR/<tag>/iscxtor_eval; progress on stdout.
# Returns:
#   1 (after a message) when model.pt is missing; otherwise the status of
#   the last stage. A failing stage exits the script via run_stage.
#######################################
detect_and_per_class_iscxtor() {
  local tag=$1
  local model_dir="$MAIN_DIR/$tag"
  local eval_dir="$MAIN_DIR/$tag/iscxtor_eval"

  if [[ ! -f "$model_dir/model.pt" ]]; then
    echo "!!! $(date): [$tag] model.pt not found at $model_dir — skipping"
    return 1
  fi

  mkdir -p "$eval_dir"
  # detect.py loads model.pt from --save-dir, so expose the trained model
  # there through a relative symlink; the original $tag/ directory and its
  # CICDDoS2019 artifacts are left as they were.
  ln -sf "../model.pt" "$eval_dir/model.pt"

  local -a detect_args=(
    --save-dir "$eval_dir"
    --packets-npz datasets/iscxtor2016/processed/packets.npz
    --flows-parquet datasets/iscxtor2016/processed/flows.parquet
    --benign-label nontor
    --per-class-column activity
    --n-val "$N_VAL" --n-atk "$N_ATK" --seed "$SPLIT_SEED"
  )
  run_stage "${tag}_detect_iscxtor" \
    uv run python -m detect "${detect_args[@]}"

  run_stage "${tag}_per_class_iscxtor" \
    uv run python -m eval.per_class --save-dir "$eval_dir"
}
|
||||
|
||||
# Evaluate every trained variant; a missing model is reported and skipped.
for model_tag in "e0_baseline" "e1_relv2" "e2_relv2_ctx"; do
  detect_and_per_class_iscxtor "$model_tag"
done

# =====================================================================
# Phase III — unified summary across both transfer targets
# =====================================================================

summary_cmd=(uv run python scripts/summarize_repr_exp.py --root "$MAIN_DIR" --with-iscxtor)
run_stage "companion_summary" "${summary_cmd[@]}"

echo ""
echo "========================================================================"
echo "= $(date): iscxtor_companion DONE ="
echo "= results: $MAIN_DIR/{summary.txt, summary.json} ="
echo "========================================================================"
|
||||
54
scripts/merge_cicddos_shards.py
Normal file
54
scripts/merge_cicddos_shards.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# Default for --dir: where the shard files are read from and, unless
# --out-packets / --out-flows are given, where the merged files are written.
DEFAULT_DIR = Path('datasets/cicddos2019/processed')
|
||||
|
||||
def _load_shard(dir: Path, shard: str) -> tuple[dict, pd.DataFrame]:
|
||||
p = np.load(dir / f'packets.{shard}.npz')
|
||||
f = pd.read_parquet(dir / f'flows.{shard}.parquet')
|
||||
assert set(p.files) == {'packet_tokens', 'packet_lengths', 'flow_id'}, p.files
|
||||
assert set(f.columns) == {'flow_id', 'label'}, f.columns
|
||||
assert len(p['flow_id']) == len(f), f'row count mismatch in {shard}'
|
||||
assert np.array_equal(p['flow_id'], f['flow_id'].to_numpy()), f'flow_id mismatch in {shard}'
|
||||
return ({'packet_tokens': p['packet_tokens'], 'packet_lengths': p['packet_lengths'], 'flow_id': p['flow_id']}, f)
|
||||
|
||||
def main() -> None:
    """Merge the ``01-12`` and ``03-11`` CICDDoS2019 shards into one artifact pair.

    Tokens and lengths are concatenated in that shard order, ``flow_id`` is
    renumbered ``0..N-1``, and the result is written to ``--out-packets``
    (default ``<dir>/packets.npz``) and ``--out-flows`` (default
    ``<dir>/flows.parquet``).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--dir', type=Path, default=DEFAULT_DIR)
    ap.add_argument('--out-packets', type=Path, default=None)
    ap.add_argument('--out-flows', type=Path, default=None)
    args = ap.parse_args()
    out_p = args.out_packets or args.dir / 'packets.npz'
    out_f = args.out_flows or args.dir / 'flows.parquet'
    print(f'=== merging shards from {args.dir} ===')
    (p1, f1) = _load_shard(args.dir, '01-12')
    (p3, f3) = _load_shard(args.dir, '03-11')
    n1 = len(f1)
    n3 = len(f3)
    N = n1 + n3
    print(f'01-12 rows: {n1:,} 03-11 rows: {n3:,} total: {N:,}')
    tokens = np.concatenate([p1['packet_tokens'], p3['packet_tokens']], axis=0)
    lengths = np.concatenate([p1['packet_lengths'], p3['packet_lengths']], axis=0)
    # Per-shard ids are discarded; rows are renumbered in merged order.
    flow_id = np.arange(N, dtype=np.uint64)
    print(f' tokens shape={tokens.shape} dtype={tokens.dtype}')
    print(f' lengths shape={lengths.shape} dtype={lengths.dtype}')
    flows = pd.concat([f1.drop(columns=['flow_id']), f3.drop(columns=['flow_id'])], ignore_index=True)
    flows.insert(0, 'flow_id', flow_id)
    print(f" flows rows={len(flows):,} label unique={flows['label'].nunique()}")
    assert len(tokens) == N == len(flows)
    assert np.array_equal(flow_id, flows['flow_id'].to_numpy())
    print(f'\n=== writing {out_p} ===')
    out_p.parent.mkdir(parents=True, exist_ok=True)
    # Written uncompressed: the compressed writer produced a corrupt zip on
    # the merged CICDDoS2019 array (50+ GB). np.load reads both forms.
    np.savez(out_p, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    # numpy appends '.npz' when the name lacks it; stat the file it created.
    written = out_p if out_p.name.endswith('.npz') else out_p.with_name(out_p.name + '.npz')
    sz = written.stat().st_size / 1000000000.0
    print(f' wrote {sz:.2f} GB')
    print(f'\n=== writing {out_f} ===')
    flows.to_parquet(out_f, compression='snappy', index=False)
    sz = out_f.stat().st_size / 1000000.0
    print(f' wrote {sz:.2f} MB')
    print(f'\n=== summary ===')
    print(flows['label'].value_counts().to_string())


if __name__ == '__main__':
    main()
|
||||
70
scripts/merge_shard_artifacts.py
Normal file
70
scripts/merge_shard_artifacts.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# Label spellings rewritten during the merge: a label equal to a key is
# replaced by the mapped value; every other label passes through unchanged.
LABEL_ALIASES = {'UDP-lag': 'UDPLag'}
|
||||
|
||||
def _infer_flows_path(packets_path: Path) -> Path:
|
||||
name = packets_path.name
|
||||
if name.startswith('packets.'):
|
||||
flows_name = 'flows.' + name[len('packets.'):].removesuffix('.npz') + '.parquet'
|
||||
else:
|
||||
raise ValueError(f'Cannot infer flows path from {packets_path}')
|
||||
return packets_path.parent / flows_name
|
||||
|
||||
def main():
    """Merge any number of packet/flow shard pairs into one artifact pair.

    Each ``--in packets.<shard>.npz`` is paired with the inferred
    ``flows.<shard>.parquet``. Tokens, lengths and the available metadata
    columns are concatenated in argument order, labels are normalised through
    ``LABEL_ALIASES``, and ``flow_id`` is renumbered ``0..N-1``. The packets
    file is written uncompressed (``np.savez``).

    Raises:
        FileNotFoundError: If a packets or flows shard file is missing.
        ValueError: If a flows shard lacks ``flow_id``/``label``, a shard's
            two files disagree on rows or ``flow_id``, or shards differ in
            token shape.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--in', dest='inputs', action='append', type=Path, required=True, help='packets.<shard>.npz path. Pass multiple times (one per shard). flows.<shard>.parquet is inferred.')
    ap.add_argument('--out-packets', type=Path, required=True)
    ap.add_argument('--out-flows', type=Path, required=True)
    args = ap.parse_args()
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    flow_dfs: list[pd.DataFrame] = []
    META_COLS = ['flow_id', 'label', 'start_ts', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'n_pkts']
    for pkt_path in args.inputs:
        flow_path = _infer_flows_path(pkt_path)
        if not pkt_path.exists():
            raise FileNotFoundError(pkt_path)
        if not flow_path.exists():
            raise FileNotFoundError(flow_path)
        # Read the parquet once and keep the known metadata columns, rather
        # than reading the whole file just to list its columns and again for
        # the data.
        all_flows = pd.read_parquet(flow_path)
        cols = [c for c in META_COLS if c in all_flows.columns]
        missing = [c for c in ('flow_id', 'label') if c not in cols]
        if missing:
            raise ValueError(f'{flow_path.name}: missing required column(s) {missing}')
        f = all_flows[cols]
        del all_flows
        # NpzFile re-reads an array on every key access and keeps the archive
        # open: read flow_id once, validate before loading the large token
        # array, and close the handle when done.
        with np.load(pkt_path) as p:
            shard_flow_id = p['flow_id']
            if len(shard_flow_id) != len(f):
                raise ValueError(f'{pkt_path.name}: row count mismatch with {flow_path.name}')
            if not np.array_equal(shard_flow_id, f['flow_id'].to_numpy()):
                raise ValueError(f'{pkt_path.name}: flow_id mismatch with {flow_path.name}')
            tok_chunks.append(np.asarray(p['packet_tokens']))
            len_chunks.append(np.asarray(p['packet_lengths']))
        flow_dfs.append(f)
        print(f"[load] {pkt_path.name} : {len(shard_flow_id):>10,} rows cols={cols}")
    # All shards must share the per-flow token shape to be concatenated.
    T_full_set = {t.shape[1] for t in tok_chunks}
    D_set = {t.shape[2] for t in tok_chunks}
    if len(T_full_set) != 1 or len(D_set) != 1:
        raise ValueError(f'inconsistent T/D across shards: T={T_full_set} D={D_set}')
    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    flow_df = pd.concat(flow_dfs, ignore_index=True)
    # Release the per-shard copies before writing.
    del tok_chunks, len_chunks, flow_dfs
    if LABEL_ALIASES:
        flow_df['label'] = flow_df['label'].map(lambda s: LABEL_ALIASES.get(s, s)).astype(str)
    N = len(tokens)
    # Per-shard ids are replaced by the row position in the merged order.
    flow_id = np.arange(N, dtype=np.uint64)
    flow_df['flow_id'] = flow_id
    labels = flow_df['label'].to_numpy().astype(str)
    print(f'\n[merge] total rows: {N:,}')
    print(f'[merge] label distribution:')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')
    args.out_packets.parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.out_packets, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'\n[merge] wrote {args.out_packets} ({args.out_packets.stat().st_size / 1000000000.0:.2f} GB)')
    args.out_flows.parent.mkdir(parents=True, exist_ok=True)
    flow_df.to_parquet(args.out_flows, compression='snappy', index=False)
    print(f'[merge] wrote {args.out_flows} ({args.out_flows.stat().st_size / 1000000.0:.2f} MB) cols={list(flow_df.columns)}')


if __name__ == '__main__':
    main()
|
||||
124
scripts/repr_experiment.sh
Executable file
124
scripts/repr_experiment.sh
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
# End-to-end representation experiment: re-extract CICIDS2017 + CICDDoS2019
|
||||
# with metadata columns, then train E0/E1/E2 with fixed 10k benign and
|
||||
# evaluate on CICDDoS2019.
|
||||
#
|
||||
# Stages (each with wall-clock logging + per-stage log file):
|
||||
# S1 re-extract CICIDS2017 → datasets/cicids2017/processed/*
|
||||
# S2a re-extract CICDDoS2019 03-11 shard
|
||||
# S2b re-extract CICDDoS2019 01-12 shard
|
||||
# S2c merge CICDDoS2019 shards
|
||||
# S3 train E0 (mixed_dequant, no ctx) [configs/n10k_baseline.yaml]
|
||||
# S4 train E1 (relative_v2, no ctx) [configs/n10k_relv2.yaml]
|
||||
# S5 train E2 (relative_v2, with 8-d ctx) [configs/n10k_relv2_ctx.yaml]
|
||||
# S6 detect+per_class for each on CICDDoS2019
|
||||
# S7 summary table
|
||||
#
|
||||
# Any stage's failure aborts the rest and leaves the partial log intact.
|
||||
# `-e` is deliberately not set: stage failures are handled by run_stage,
# which prints the tail of the stage log before exiting. Steps outside
# run_stage therefore have to be checked explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# All relative paths below (runs/, datasets/, configs/, scripts/) assume the
# repository root as the working directory; abort instead of running the
# stages from whatever directory the script happened to be started in.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

STAMP=$(date +%Y%m%d_%H%M%S)
OUT_DIR="runs/repr_experiment_${STAMP}"
mkdir -p "$OUT_DIR" || { echo "ERROR: cannot create $OUT_DIR" >&2; exit 1; }
MAIN_LOG="$OUT_DIR/orch.log"
# From here on, stdout and stderr are both shown and appended to orch.log.
exec > >(tee -a "$MAIN_LOG") 2>&1

# Forwarded to `python -m detect` as --n-val / --n-atk / --seed.
N_VAL=20000
N_ATK=100000
SPLIT_SEED=42

echo "========================================================================"
echo "= $(date): repr_experiment start ="
echo "= output root: $OUT_DIR ="
echo "========================================================================"
|
||||
|
||||
#######################################
# Run one pipeline stage with timing and its own log file.
# Globals:   OUT_DIR (read) - directory that receives <name>.log
# Arguments: $1  - stage name (used for the log file name and messages)
#            $2+ - command and arguments to execute
# Outputs:   START/command/OK or FAILED lines on stdout, followed by the
#            tail of the stage log (10 lines on success, 30 on failure);
#            the command's own stdout+stderr go to $OUT_DIR/<name>.log
# Returns:   on success, the status of the final tail|sed pipeline;
#            on failure the whole script exits with status 1
#######################################
run_stage() {
  local stage=$1
  shift
  local stage_log="$OUT_DIR/${stage}.log"
  local began finished

  echo ""
  echo ">>> $(date): [$stage] START"
  echo ">>> $(date): [$stage] command: $*"
  began=$(date +%s)

  if "$@" > "$stage_log" 2>&1; then
    finished=$(date +%s)
    echo "<<< $(date): [$stage] OK in $((finished-began))s (log: $stage_log)"
    # Echo the end of the stage log so the orchestrator log shows progress.
    tail -10 "$stage_log" | sed 's/^/ | /'
    return
  fi

  # Failure path: report, show the end of the stage log, stop everything.
  finished=$(date +%s)
  echo "!!! $(date): [$stage] FAILED after $((finished-began))s — see $stage_log"
  tail -30 "$stage_log"
  exit 1
}
|
||||
|
||||
# ====================================================================
# S1 — re-extract CICIDS2017
# ====================================================================
run_stage "s1_extract_cicids2017" \
  uv run python scripts/extract_cicids2017.py --jobs 5 --time-offset 28800

# ====================================================================
# S2 — re-extract CICDDoS2019 (per-shard) + merge
# ====================================================================
ddos_out="datasets/cicddos2019/processed"

# Both shards use the same options; only the stage prefix and the shard id
# (which also names the per-shard output files) differ.
for shard_spec in "s2a:03-11" "s2b:01-12"; do
  stage_prefix=${shard_spec%%:*}
  shard=${shard_spec#*:}
  run_stage "${stage_prefix}_extract_cicddos2019_${shard}" \
    uv run python scripts/extract_cicddos2019.py \
      --shards "$shard" --jobs 1 \
      --out-packets "$ddos_out/packets.${shard}.npz" \
      --out-flows "$ddos_out/flows.${shard}.parquet"
done

# Combine the two per-shard artifacts into the single packets/flows pair
# that the detect stage reads.
run_stage "s2c_merge_cicddos2019" \
  uv run python scripts/merge_shard_artifacts.py \
    --in "$ddos_out/packets.03-11.npz" \
    --in "$ddos_out/packets.01-12.npz" \
    --out-packets "$ddos_out/packets.npz" \
    --out-flows "$ddos_out/flows.parquet"
|
||||
|
||||
# ====================================================================
|
||||
# S3..S5 — train E0 / E1 / E2 with the same 10k benign
|
||||
# ====================================================================
|
||||
#######################################
# Train one experiment and evaluate it on CICDDoS2019.
# Copies the config into a per-tag run directory, rewrites its top-level
# `save_dir:` line to that directory, then runs the train, detect and
# per_class stages through run_stage.
# Globals:   OUT_DIR, N_VAL, N_ATK, SPLIT_SEED (read)
# Arguments: $1 - experiment tag (sub-directory name and stage-name prefix)
#            $2 - path to the YAML config to copy
# Outputs:   run_stage progress lines; error messages on stderr
# Returns:   status of the last stage; exits 1 if the run directory or the
#            patched config cannot be prepared
#######################################
train_and_eval() {
  local tag=$1 cfg=$2
  local run_dir="$OUT_DIR/$tag"
  local run_cfg="$run_dir/config.yaml"

  # The script runs without `set -e`, so each preparation step is checked
  # here; otherwise a bad config path would only surface later as a
  # confusing training failure.
  if ! mkdir -p "$run_dir"; then
    echo "!!! $(date): [$tag] cannot create run directory $run_dir" >&2
    exit 1
  fi

  # Copy config and patch save_dir to our per-tag directory.
  if ! cp -- "$cfg" "$run_cfg"; then
    echo "!!! $(date): [$tag] cannot copy config $cfg to $run_cfg" >&2
    exit 1
  fi
  if ! sed -i "s#^save_dir:.*#save_dir: $run_dir#" "$run_cfg"; then
    echo "!!! $(date): [$tag] cannot patch save_dir in $run_cfg" >&2
    exit 1
  fi
  # sed succeeds even when nothing matched. Without a patched save_dir the
  # training output would not land in $run_dir, which is where the detect
  # and per_class stages below are pointed.
  if ! grep -qxF -- "save_dir: $run_dir" "$run_cfg"; then
    echo "!!! $(date): [$tag] no top-level 'save_dir:' line in $cfg" >&2
    exit 1
  fi

  run_stage "${tag}_train" \
    uv run python -m train --config "$run_cfg"

  run_stage "${tag}_detect_ddos" \
    uv run python -m detect \
      --save-dir "$run_dir" \
      --packets-npz datasets/cicddos2019/processed/packets.npz \
      --flows-parquet datasets/cicddos2019/processed/flows.parquet \
      --n-val "$N_VAL" --n-atk "$N_ATK" --seed "$SPLIT_SEED"

  run_stage "${tag}_per_class" \
    uv run python -m eval.per_class --save-dir "$run_dir"
}
|
||||
|
||||
# Experiments in run order, written as "<tag>:<config path>".
experiments=(
  "e0_baseline:configs/n10k_baseline.yaml"
  "e1_relv2:configs/n10k_relv2.yaml"
  "e2_relv2_ctx:configs/n10k_relv2_ctx.yaml"
)
for exp_spec in "${experiments[@]}"; do
  train_and_eval "${exp_spec%%:*}" "${exp_spec#*:}"
done

# ====================================================================
# S7 — summary table
# ====================================================================
run_stage "s7_summary" \
  uv run python scripts/summarize_repr_exp.py --root "$OUT_DIR"

# Closing banner (leading blank line included).
cat <<EOF

========================================================================
= $(date): repr_experiment DONE =
= results under: $OUT_DIR =
========================================================================
EOF
|
||||
93
scripts/summarize_repr_exp.py
Normal file
93
scripts/summarize_repr_exp.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
# Class names that main() passes to _render_block for the CICDDoS2019 table;
# _render_block lists them in its "focus classes" section.
HARD_CLASSES = ('Syn', 'UDPLag', 'DrDoS_NTP')
|
||||
|
||||
def _load_pc(run_dir: Path) -> dict | None:
|
||||
p = run_dir / 'per_class.json'
|
||||
if not p.exists():
|
||||
print(f'[warn] missing {p}')
|
||||
return None
|
||||
return json.loads(p.read_text())['terminal_norm']
|
||||
|
||||
def _render_block(title: str, data: list[dict], hard_classes: tuple[str, ...]) -> list[str]:
|
||||
lines: list[str] = []
|
||||
lines.append('')
|
||||
lines.append('=' * 96)
|
||||
lines.append(f'# {title}')
|
||||
lines.append('=' * 96)
|
||||
lines.append(f"{'experiment':<40s} {'overall AUROC':>14s} {'macro AUROC':>14s} {'TPR@1%FPR':>12s} {'FPR@95%TPR':>12s}")
|
||||
lines.append('-' * 96)
|
||||
if not data:
|
||||
lines.append('(no results)')
|
||||
return lines
|
||||
base = data[0]['pc']
|
||||
for d in data:
|
||||
pc = d['pc']
|
||||
delta_overall = pc['overall_auroc'] - base['overall_auroc']
|
||||
delta_macro = pc['macro_auroc'] - base['macro_auroc']
|
||||
delta_tpr = pc['tpr_at_1fpr'] - base['tpr_at_1fpr']
|
||||
lines.append(f"{d['label']:<40s} {pc['overall_auroc']:>8.4f} ({delta_overall:+.4f}) {pc['macro_auroc']:>8.4f} ({delta_macro:+.4f}) {pc['tpr_at_1fpr']:>6.4f} ({delta_tpr:+.4f}) {pc['fpr_at_95tpr']:>12.4f}")
|
||||
if hard_classes:
|
||||
lines.append('')
|
||||
lines.append(f"--- focus classes: {', '.join(hard_classes)} ---")
|
||||
for c in hard_classes:
|
||||
row = f'{c:<18s}'
|
||||
for d in data:
|
||||
pc = d['pc']
|
||||
match = next((r for r in pc['per_class'] if r['class'] == c), None)
|
||||
if match is None:
|
||||
row += f" {d['tag']}:n/a"
|
||||
else:
|
||||
row += f" {d['tag']}:{match['auroc']:.3f}(tpr={match['tpr_at_1fpr']:.3f})"
|
||||
lines.append(row)
|
||||
lines.append('')
|
||||
lines.append('--- all classes (sorted by E0 AUROC ascending) ---')
|
||||
base_pc = data[0]['pc']['per_class']
|
||||
ordered = sorted(base_pc, key=lambda r: r['auroc'])
|
||||
hdr2 = f"{'class':<22s} {'N':>8s}" + ''.join((f" {d['tag']:>14s}" for d in data))
|
||||
lines.append(hdr2)
|
||||
for row_b in ordered:
|
||||
cls = row_b['class']
|
||||
row = f"{cls:<22s} {row_b['n']:>8d}"
|
||||
for d in data:
|
||||
pc = d['pc']
|
||||
match = next((r for r in pc['per_class'] if r['class'] == cls), None)
|
||||
row += f" {match['auroc']:>14.4f}" if match else f" {'—':>14s}"
|
||||
lines.append(row)
|
||||
return lines
|
||||
|
||||
def main():
    """CLI entry point: collect per-experiment results and write a summary.

    Reads ``per_class.json`` for each known experiment tag under ``--root``
    (and, with ``--with-iscxtor``, under each tag's ``iscxtor_eval``
    sub-directory), renders the comparison tables, prints them, and writes
    ``summary.txt`` and ``summary.json`` into ``--root``.

    Raises:
        SystemExit: With status 1 when no results are found at all, so a
            calling script can tell an empty summary from a real one.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--root', type=Path, required=True)
    ap.add_argument('--with-iscxtor', action='store_true', help='Also load iscxtor_eval/per_class.json under each tag and render a second comparison block for the CICIDS2017 → ISCXTor2016 transfer target.')
    args = ap.parse_args()

    runs = [('E0 baseline (mixed_dequant)', 'e0_baseline'), ('E1 relative_v2 (channel rehab)', 'e1_relv2'), ('E2 relative_v2 + 8-d context', 'e2_relv2_ctx')]

    # Experiments whose per_class.json is missing are skipped (with a
    # warning from _load_pc), so the first entry that loads becomes the
    # baseline row of the rendered table.
    ddos_data: list[dict] = []
    for (label, tag) in runs:
        pc = _load_pc(args.root / tag)
        if pc is not None:
            ddos_data.append({'label': label, 'tag': tag, 'pc': pc})

    iscx_data: list[dict] = []
    if args.with_iscxtor:
        for (label, tag) in runs:
            pc = _load_pc(args.root / tag / 'iscxtor_eval')
            if pc is not None:
                iscx_data.append({'label': label, 'tag': tag, 'pc': pc})

    if not ddos_data and (not iscx_data):
        print('[err] no results found under', args.root)
        # Report the error through the exit status as well; returning
        # normally would make the process exit with 0.
        raise SystemExit(1)

    lines: list[str] = []
    if ddos_data:
        lines.extend(_render_block('CICIDS2017 → CICDDoS2019 (target=DDoS attacks; benign=normal)', ddos_data, HARD_CLASSES))
    if iscx_data:
        lines.extend(_render_block('CICIDS2017 → ISCXTor2016 (target=Tor flows; benign=nontor)', iscx_data, ()))

    txt = '\n'.join(lines)
    print(txt)
    # The rendered text contains non-ASCII characters (the arrow in the
    # titles, the dash placeholder), so the encoding is fixed to UTF-8
    # rather than left to the locale.
    (args.root / 'summary.txt').write_text(txt + '\n', encoding='utf-8')
    combined = {'cicddos2019': ddos_data, 'iscxtor2016': iscx_data}
    (args.root / 'summary.json').write_text(json.dumps(combined, indent=2), encoding='utf-8')
    print(f"\n[saved] {args.root / 'summary.txt'}")
|
||||
# Build the summary only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user