Initial commit: code, paper, small artifacts

This commit is contained in:
2026-05-07 20:47:30 +08:00
commit fae2db8cff
322 changed files with 33159 additions and 0 deletions

View File

@@ -0,0 +1,205 @@
#!/bin/bash
# Wait for user's merge_shard_artifacts.py to finish, then run transfer eval.
#
# The previous auto_transfer_ddos.sh aborted at STAGE 3 because
# np.savez_compressed on 50+ GB array produces a corrupt zip. User's
# merge_shard_artifacts.py uses uncompressed np.savez and is currently
# running. We just wait for it, validate, then do the 4-cell transfer.
# Abort on unset variables and propagate failures through pipelines.
# `-e` is not set: later stages inspect exit codes explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Every path below is relative to ROOT, so running from anywhere else would
# read/write the wrong files. Bail out if the directory is unavailable.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

SRC_SWEEP="runs/n10k_refactor_20260422_220351"   # sweep that provides model.pt per cell
DST="runs/transfer_ddos"                          # output root for this transfer eval
LOG="$DST/run.log"
DDOS_DIR="datasets/cicddos2019/processed"

mkdir -p "$DST" || { echo "ERROR: cannot create $DST" >&2; exit 1; }

# Append to existing log: mirror stdout+stderr to the terminal and to $LOG.
exec > >(tee -a "$LOG") 2>&1

# Arguments forwarded to `python -m detect` for every cell.
N_VAL=20000
N_ATK=100000
SPLIT_SEED=42
echo ""
echo "=== $(date): [after-merge script] starts ==="
echo "waiting for merge_shard_artifacts.py to finish..."

# Poll every 30 s until no process command line contains the pattern.
# NOTE: `pgrep -f` matches the full command line, so any unrelated process
# that mentions "merge_shard_artifacts" (editor, tail, ...) also keeps us waiting.
elapsed=0
while pgrep -f "merge_shard_artifacts" > /dev/null; do
    sleep 30
    elapsed=$((elapsed + 30))
    if (( elapsed % 180 == 0 )); then
        # Heartbeat every 3 min with the resident set size of the first match.
        # The process may exit between the two pgrep calls; report "?" then
        # instead of an empty value.
        pid=$(pgrep -f "merge_shard_artifacts" | head -n 1)
        rss=""
        if [[ -n "$pid" ]]; then
            rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d '[:space:]')
        fi
        echo "[heartbeat $(date +%H:%M:%S)] merge still running, waited ${elapsed}s rss=${rss:-?} kB"
    fi
done
echo "=== $(date): merge exited after ${elapsed}s ==="

# Short grace period after the merge process disappears before reading its output.
sleep 10

# Merged artifacts consumed by the validation and detect stages.
PACKETS="$DDOS_DIR/packets.npz"
FLOWS="$DDOS_DIR/flows.parquet"
# ---- validate ----
# Both merged artifacts must exist before the (slow) content checks run.
if [[ ! -f "$PACKETS" || ! -f "$FLOWS" ]]; then
    echo "ERROR: artifacts missing after merge"
    echo " $PACKETS : $([[ -f "$PACKETS" ]] && echo OK || echo MISSING)"
    echo " $FLOWS : $([[ -f "$FLOWS" ]] && echo OK || echo MISSING)"
    exit 1
fi
ls -lh "$PACKETS" "$FLOWS"

# The artifact paths are passed as argv so the check always inspects exactly
# the files the later stages consume.
if ! uv run python - "$PACKETS" "$FLOWS" <<'EOF'
import sys

import numpy as np
import pandas as pd

packets_path, flows_path = sys.argv[1], sys.argv[2]
try:
    with np.load(packets_path) as p:
        f = pd.read_parquet(flows_path)
        members = list(p.files)
        assert set(members) == {'packet_tokens', 'packet_lengths', 'flow_id'}, members
        assert set(f.columns) == {'flow_id', 'label'}, f.columns
        # Each p[...] access re-reads the member from the archive, so bind
        # every array once instead of indexing the archive repeatedly.
        flow_id = p['flow_id']
        assert flow_id.shape[0] == len(f)
        assert np.array_equal(flow_id, f['flow_id'].to_numpy())
        n = len(f)
        n_benign = int((f['label'] == 'normal').sum())
        print(f'[validate] OK: N={n:,} benign={n_benign:,} attack={n-n_benign:,}')
        # Reading this member also proves the large array is readable end to end.
        tokens = p['packet_tokens']
        print(f'[validate] packet_tokens shape/dtype: {tokens.shape} {tokens.dtype}')
        del tokens
    print('[validate] label value_counts (top 20):')
    print(f['label'].value_counts().head(20).to_string())
except Exception as e:
    # Top-level boundary: report any failure (including AssertionError) and
    # signal it to the shell through the exit status.
    print(f'[validate] FAILED: {type(e).__name__}: {e}')
    sys.exit(2)
EOF
then
    echo "ERROR: validation failed"
    exit 2
fi
# ---- transfer detect + per_class per cell ----
echo ""
echo "=== $(date): transfer detect + per_class (4 cells) ==="
CELLS=( "seed42/s0" "seed42/s0.6" "seed43/s0" "seed43/s0.6" )

# run_cell CELL
#   Copy the source model for CELL into $DST/CELL, run `detect` on the merged
#   artifacts, then `eval.per_class` on the resulting scores.
#   Returns 0 on success, otherwise:
#     1 = model.pt missing (or could not be copied)
#     2 = detect failed
#     3 = detect produced no scores.npz
#     4 = eval.per_class failed
#     5 = per_class.json missing
run_cell() {
    local cell=$1
    local src_dir="$SRC_SWEEP/$cell"
    local out_dir="$DST/$cell"

    if [[ ! -f "$src_dir/model.pt" ]]; then
        echo "SKIP $cell : no model.pt"
        return 1
    fi
    mkdir -p "$out_dir"
    if ! cp "$src_dir/model.pt" "$out_dir/model.pt"; then
        echo "ERROR: could not copy model.pt for $cell"
        return 1
    fi
    # history.json is optional; copy it along when the sweep produced one.
    if [[ -f "$src_dir/history.json" ]]; then
        cp "$src_dir/history.json" "$out_dir/"
    fi

    echo ""
    echo "----- [$cell] $(date) -----"
    echo "[detect] starting"
    # pipefail makes the pipeline status reflect detect, not tail.
    if ! uv run python -m detect \
            --save-dir "$out_dir" \
            --packets-npz "$PACKETS" \
            --flows-parquet "$FLOWS" \
            --n-val "$N_VAL" --n-atk "$N_ATK" \
            --seed "$SPLIT_SEED" \
            2>&1 | tail -25
    then
        echo "ERROR: detect failed for $cell"
        return 2
    fi
    [[ -f "$out_dir/scores.npz" ]] || { echo "ERROR: no scores.npz for $cell"; return 3; }

    echo "[per_class] starting"
    # Check the exit status as well as the output file: a per_class.json left
    # over from an earlier run must not make a failed evaluation look OK.
    if ! uv run python -m eval.per_class --save-dir "$out_dir" 2>&1 | tail -60
    then
        echo "ERROR: per_class failed for $cell"
        return 4
    fi
    [[ -f "$out_dir/per_class.json" ]] || { echo "ERROR: no per_class.json for $cell"; return 5; }

    echo "[$cell] OK"
    return 0
}

# Run every cell; failures are collected rather than aborting the script.
OK_CELLS=()
FAIL_CELLS=()
for cell in "${CELLS[@]}"; do
    if run_cell "$cell"; then
        OK_CELLS+=("$cell")
    else
        FAIL_CELLS+=("$cell")
    fi
done
echo ""
echo "=== per-cell status ==="
echo "OK (${#OK_CELLS[@]}): ${OK_CELLS[*]:-none}"
echo "FAIL (${#FAIL_CELLS[@]}): ${FAIL_CELLS[*]:-none}"
# ---- summary ----
echo ""
echo "=== $(date): summary ==="
uv run python - "$DST" <<'EOF'
"""Print per-cell, per-sigma and per-class tables from <dst>/<cell>/per_class.json."""
import json
import sys
from pathlib import Path

import numpy as np

dst = Path(sys.argv[1])
cells = ["seed42/s0", "seed42/s0.6", "seed43/s0", "seed43/s0.6"]
ch = "terminal_norm"
keys = [("overall_auroc", "overall AUROC"), ("overall_auprc", "overall AUPRC"),
        ("macro_auroc", "macro AUROC"), ("macro_auprc", "macro AUPRC"),
        ("tpr_at_1fpr", "TPR@1%FPR"), ("fpr_at_95tpr", "FPR@95%TPR")]
PREFERRED_REF = "seed42/s0.6"


def is_num(v):
    """Return True for a real number that is not NaN (None/str/bool are rejected)."""
    return isinstance(v, (int, float)) and not isinstance(v, bool) and v == v


def fmt_metric(v):
    """Format one headline metric; anything non-numeric or NaN is shown as NaN."""
    return f" {v:>14.4f}" if is_num(v) else f" {'NaN':>14s}"


def fmt_pc(v):
    """Format one per-class metric; anything non-numeric or NaN is shown as a dash."""
    return f"{v:.4f}" if is_num(v) else "—"


header = f"{'cell':<15s}" + "".join(f" {t:>14s}" for _, t in keys) + f" {'flipped':>8s}"
print(header)
print("-" * len(header))

loaded = {}
for c in cells:
    jp = dst / c / "per_class.json"
    if not jp.exists():
        print(f"{c:<15s} (missing)")
        continue
    try:
        tn = json.loads(jp.read_text())[ch]
    except Exception as e:
        # One unreadable cell must not take down the whole summary.
        print(f"{c:<15s} (parse error: {e})")
        continue
    loaded[c] = tn
    line = f"{c:<15s}" + "".join(fmt_metric(tn.get(k)) for k, _ in keys)
    line += f" {str(tn.get('flipped')):>8s}"
    print(line)

if not loaded:
    print("\n(no cells loaded — nothing to aggregate)")
    sys.exit(0)

print("\n=== mean ± std across seeds (per σ) ===")
for sg in ["0", "0.6"]:
    pair = [(c, d) for c, d in loaded.items() if c.endswith(f"/s{sg}")]
    if len(pair) < 2:
        print(f"σ={sg}: {len(pair)} seed(s), skip")
        continue
    print(f"\n--- σ={sg} ---")
    for k, t in keys:
        vs = [d.get(k) for _, d in pair if is_num(d.get(k))]
        if vs:
            print(f" {t:<18s} {np.mean(vs):.4f} ± {np.std(vs, ddof=0):.4f}")

# Prefer the seed42/s0.6 cell; fall back to the first loaded cell and say so
# in the heading instead of always claiming the preferred one.
ref_name = PREFERRED_REF if loaded.get(PREFERRED_REF) else next(iter(loaded))
pc = loaded[ref_name].get("per_class", [])
print(f"\n=== per-class ({ref_name} reference) ===")
print(f"{'class':<30s} {'n':>8s} {'auroc':>8s} {'auprc':>8s} {'tpr@1%':>8s}")
print("-" * 70)
for r in pc:
    print(f" {r['class']:<28s} {r['n']:>8d} "
          f"{fmt_pc(r.get('auroc')):>8s} {fmt_pc(r.get('auprc')):>8s} "
          f"{fmt_pc(r.get('tpr_at_1fpr')):>8s}")

# Merged-label view: classes that differ only by a "DrDoS_" prefix share a bucket.
print("\n=== per-class after label merge (DrDoS_* → stripped) ===")


def norm(name):
    """Strip a leading "DrDoS_" from a class name."""
    return name[len("DrDoS_"):] if name.startswith("DrDoS_") else name


buckets = {}
for r in pc:
    if is_num(r.get("auroc")):
        buckets.setdefault(norm(r["class"]), []).append(r)
if buckets:
    print(f"{'merged class':<20s} {'#shards':>8s} {'n_total':>8s} "
          f"{'auroc_wtd':>10s} {'auprc_wtd':>10s}")
    print("-" * 68)
    # Largest merged class first; metrics are sample-count weighted means.
    for k, rs in sorted(buckets.items(), key=lambda x: -sum(r["n"] for r in x[1])):
        n_tot = sum(r["n"] for r in rs)
        wtd_a = sum(r["auroc"] * r["n"] for r in rs) / max(n_tot, 1)
        wtd_ap = sum(r["auprc"] * r["n"] for r in rs) / max(n_tot, 1)
        print(f" {k:<18s} {len(rs):>8d} {n_tot:>8d} "
              f"{wtd_a:>10.4f} {wtd_ap:>10.4f}")
EOF
echo ""
echo "=== $(date): done ==="

293
scripts/auto_transfer_ddos.sh Executable file
View File

@@ -0,0 +1,293 @@
#!/bin/bash
# Unattended cross-dataset transfer eval (v2, with per-shard merge).
#
# Pipeline:
# STAGE 1 : wait for 01-12 re-extraction to finish
# STAGE 2 : merge packets.{01-12,03-11}.npz + flows.{01-12,03-11}.parquet
# → unified packets.npz + flows.parquet
# STAGE 3 : validate unified artifacts
# STAGE 4 : detect + per_class across 4 cells (seed × σ from CICIDS2017 sweep)
# STAGE 5 : summary table + merged-label view
#
# Log: runs/transfer_ddos/run.log
# Abort on unset variables and propagate failures through pipelines.
# `-e` is not set: each stage inspects exit codes explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Every path below is relative to ROOT, so running from anywhere else would
# read/write the wrong files. Bail out if the directory is unavailable.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

SRC_SWEEP="runs/n10k_refactor_20260422_220351"   # sweep that provides model.pt per cell
DST="runs/transfer_ddos"                          # output root for this transfer eval
LOG="$DST/run.log"
DDOS_DIR="datasets/cicddos2019/processed"

mkdir -p "$DST" || { echo "ERROR: cannot create $DST" >&2; exit 1; }

# Mirror stdout+stderr to the terminal and append them to $LOG.
exec > >(tee -a "$LOG") 2>&1

# Arguments forwarded to `python -m detect` for every cell.
N_VAL=20000
N_ATK=100000
SPLIT_SEED=42

echo "=== $(date): script starts (v2 with merge) ==="
echo "source sweep : $SRC_SWEEP"
echo "destination : $DST"
echo "scoring : n_val=$N_VAL n_atk=$N_ATK split_seed=$SPLIT_SEED"
# =====================================================================
# STAGE 1: wait for extraction_cicddos2019 (01-12 shard) to finish
# =====================================================================
echo ""
echo "=== $(date): STAGE 1 — waiting for 01-12 re-extraction ==="

# Poll once a minute until no process command line contains the pattern.
# NOTE: `pgrep -f` matches the full command line, so any process mentioning
# "scripts/extract_cicddos2019" (editor, tail, ...) also keeps us waiting.
elapsed=0
while pgrep -f "scripts/extract_cicddos2019" > /dev/null; do
    sleep 60
    elapsed=$((elapsed + 60))
    if (( elapsed % 600 == 0 )); then
        # Heartbeat every 10 min with the resident set size of the first match.
        # The process may exit between the two pgrep calls; report "?" then
        # instead of an empty value.
        pid=$(pgrep -f "scripts/extract_cicddos2019" | head -n 1)
        rss=""
        if [[ -n "$pid" ]]; then
            rss=$(ps -p "$pid" -o rss= 2>/dev/null | tr -d '[:space:]')
        fi
        echo "[heartbeat $(date +%H:%M:%S)] 01-12 extraction running, waited ${elapsed}s rss(parent)=${rss:-?} kB"
    fi
done
echo "=== $(date): extraction process exited after ${elapsed}s wait ==="

# Short grace period after the extraction process disappears.
sleep 15
# =====================================================================
# STAGE 2: merge per-shard artifacts
# =====================================================================
echo ""
echo "=== $(date): STAGE 2 — merge shards ==="

# Arrays (not space-joined strings) so each path stays one word regardless of
# its contents and is never subject to globbing.
SHARDS_PACKETS=( "$DDOS_DIR/packets.01-12.npz" "$DDOS_DIR/packets.03-11.npz" )
SHARDS_FLOWS=( "$DDOS_DIR/flows.01-12.parquet" "$DDOS_DIR/flows.03-11.parquet" )

# Report every missing shard before giving up, not just the first one.
missing=0
for f in "${SHARDS_PACKETS[@]}" "${SHARDS_FLOWS[@]}"; do
    if [[ ! -f "$f" ]]; then
        echo "ERROR: shard artifact missing: $f"
        missing=1
    fi
done
if (( missing )); then
    # Show the end of the extraction log to help diagnose the missing shard.
    echo "--- tail of 01-12 extraction log ---"
    tail -40 runs/extract_logs/extract_ddos_0112.log 2>&1 || true
    exit 1
fi

echo "all 4 shard artifacts present; running merge"
if ! uv run python scripts/merge_cicddos_shards.py 2>&1; then
    echo "ERROR: merge failed"
    exit 2
fi
# =====================================================================
# STAGE 3: validate unified artifacts
# =====================================================================
echo ""
echo "=== $(date): STAGE 3 — validate unified artifacts ==="
PACKETS="$DDOS_DIR/packets.npz"
FLOWS="$DDOS_DIR/flows.parquet"

# Both merged artifacts must exist before the (slow) content checks run.
if [[ ! -f "$PACKETS" || ! -f "$FLOWS" ]]; then
    echo "ERROR: merge output missing"
    echo " $PACKETS : $([[ -f "$PACKETS" ]] && echo OK || echo MISSING)"
    echo " $FLOWS : $([[ -f "$FLOWS" ]] && echo OK || echo MISSING)"
    exit 3
fi
ls -lh "$PACKETS" "$FLOWS"

# The artifact paths are passed as argv so the check always inspects exactly
# the files STAGE 4 consumes.
if ! uv run python - "$PACKETS" "$FLOWS" <<'EOF'
import sys

import numpy as np
import pandas as pd

packets_path, flows_path = sys.argv[1], sys.argv[2]
try:
    with np.load(packets_path) as p:
        f = pd.read_parquet(flows_path)
        members = list(p.files)
        assert set(members) == {'packet_tokens', 'packet_lengths', 'flow_id'}, members
        assert set(f.columns) == {'flow_id', 'label'}, f.columns
        # Each p[...] access re-reads the member from the archive, so bind
        # every array once instead of indexing the archive repeatedly.
        flow_id = p['flow_id']
        assert flow_id.shape[0] == len(f)
        assert np.array_equal(flow_id, f['flow_id'].to_numpy())
        n = len(f)
        n_benign = int((f['label'] == 'normal').sum())
        print(f'[validate] OK: N={n:,} benign={n_benign:,} attack={n-n_benign:,}')
        # Reading this member also proves the large array is readable end to end.
        tokens = p['packet_tokens']
        print(f'[validate] packet_tokens shape/dtype: {tokens.shape} {tokens.dtype}')
        del tokens
    print('[validate] label value_counts (top 20):')
    print(f['label'].value_counts().head(20).to_string())
except Exception as e:
    # Top-level boundary: report any failure (including AssertionError) and
    # signal it to the shell through the exit status.
    print(f'[validate] FAILED: {type(e).__name__}: {e}')
    sys.exit(2)
EOF
then
    echo "ERROR: unified artifact validation failed"
    exit 4
fi
# =====================================================================
# STAGE 4: transfer detect + per_class
# =====================================================================
echo ""
echo "=== $(date): STAGE 4 — transfer detect + per_class (4 cells) ==="
CELLS=( "seed42/s0" "seed42/s0.6" "seed43/s0" "seed43/s0.6" )

# run_cell CELL
#   Stage the source model for CELL under $DST/CELL, score it with `detect`,
#   then break the scores down with `eval.per_class`.
#   Exit status: 0 ok, 1 no model.pt, 2 detect failed, 3 no scores.npz,
#                4 per_class failed, 5 no per_class.json.
run_cell() {
    local cell=$1
    local src_dir="$SRC_SWEEP/$cell"
    local out_dir="$DST/$cell"

    [[ -f "$src_dir/model.pt" ]] || {
        echo "SKIP $cell : no model.pt at $src_dir"
        return 1
    }

    mkdir -p "$out_dir"
    cp "$src_dir/model.pt" "$out_dir/model.pt"
    # history.json is optional and only copied when present.
    if [[ -f "$src_dir/history.json" ]]; then
        cp "$src_dir/history.json" "$out_dir/"
    fi

    echo ""
    echo "----- [$cell] $(date) -----"
    echo "[detect] starting"
    local detect_args=(
        --save-dir "$out_dir"
        --packets-npz "$PACKETS"
        --flows-parquet "$FLOWS"
        --n-val "$N_VAL" --n-atk "$N_ATK"
        --seed "$SPLIT_SEED"
    )
    # pipefail makes the pipeline status reflect detect, not tail.
    if ! uv run python -m detect "${detect_args[@]}" 2>&1 | tail -25; then
        echo "ERROR: detect failed for $cell"
        return 2
    fi
    [[ -f "$out_dir/scores.npz" ]] || {
        echo "ERROR: detect produced no scores.npz for $cell"
        return 3
    }

    echo "[per_class] starting"
    if ! uv run python -m eval.per_class --save-dir "$out_dir" 2>&1 | tail -80; then
        echo "ERROR: per_class failed for $cell"
        return 4
    fi
    [[ -f "$out_dir/per_class.json" ]] || {
        echo "ERROR: per_class.json missing for $cell"
        return 5
    }

    echo "[$cell] OK"
    return 0
}

# Run every cell; a failing cell is recorded and the loop moves on.
OK_CELLS=()
FAIL_CELLS=()
for cell in "${CELLS[@]}"; do
    if ! run_cell "$cell"; then
        FAIL_CELLS+=("$cell")
        echo "[$cell] continuing despite failure"
        continue
    fi
    OK_CELLS+=("$cell")
done
echo ""
echo "=== per-cell status ==="
echo "OK (${#OK_CELLS[@]}): ${OK_CELLS[*]:-none}"
echo "FAIL (${#FAIL_CELLS[@]}): ${FAIL_CELLS[*]:-none}"
# =====================================================================
# STAGE 5: summary
# =====================================================================
echo ""
echo "=== $(date): STAGE 5 — summary ==="
uv run python - "$DST" <<'EOF'
"""Print per-cell, per-sigma and per-class tables from <dst>/<cell>/per_class.json."""
import json
import sys
from pathlib import Path

import numpy as np

dst = Path(sys.argv[1])
cells = ["seed42/s0", "seed42/s0.6", "seed43/s0", "seed43/s0.6"]
ch = "terminal_norm"
keys = [("overall_auroc", "overall AUROC"),
        ("overall_auprc", "overall AUPRC"),
        ("macro_auroc", "macro AUROC"),
        ("macro_auprc", "macro AUPRC"),
        ("tpr_at_1fpr", "TPR@1%FPR"),
        ("fpr_at_95tpr", "FPR@95%TPR")]
PREFERRED_REF = "seed42/s0.6"


def is_num(v):
    """Return True for a real number that is not NaN (None/str/bool are rejected)."""
    return isinstance(v, (int, float)) and not isinstance(v, bool) and v == v


def fmt_metric(v):
    """Format one headline metric; anything non-numeric or NaN is shown as NaN."""
    return f" {v:>14.4f}" if is_num(v) else f" {'NaN':>14s}"


def fmt_pc(v):
    """Format one per-class metric; anything non-numeric or NaN is shown as a dash."""
    return f"{v:.4f}" if is_num(v) else "—"


header = f"{'cell':<15s}" + "".join(f" {t:>14s}" for _, t in keys) + f" {'flipped':>8s}"
print(header)
print("-" * len(header))

loaded = {}
for c in cells:
    jp = dst / c / "per_class.json"
    if not jp.exists():
        print(f"{c:<15s} (missing per_class.json)")
        continue
    try:
        tn = json.loads(jp.read_text())[ch]
    except Exception as e:
        # One unreadable cell must not take down the whole summary.
        print(f"{c:<15s} (parse error: {e})")
        continue
    loaded[c] = tn
    # .get() keeps a cell with a missing metric on the table instead of crashing.
    line = f"{c:<15s}" + "".join(fmt_metric(tn.get(k)) for k, _ in keys)
    line += f" {str(tn.get('flipped')):>8s}"
    print(line)

if not loaded:
    print("\n(no cells loaded — nothing to aggregate)")
    sys.exit(0)

print("")
print("=== mean ± std across seeds (per σ) ===")
for sg in ["0", "0.6"]:
    pair = [(c, d) for c, d in loaded.items() if c.endswith(f"/s{sg}")]
    if len(pair) < 2:
        print(f"σ={sg}: only {len(pair)} seed(s), skip aggregate")
        continue
    print(f"\n--- σ={sg} ({len(pair)} seeds) ---")
    for k, t in keys:
        vals = [d.get(k) for _, d in pair if is_num(d.get(k))]
        if not vals:
            continue
        m = float(np.mean(vals))
        s = float(np.std(vals, ddof=0))
        print(f" {t:<18s} {m:.4f} ± {s:.4f}")

# Prefer the seed42/s0.6 cell; fall back to the first loaded cell and say so
# in the heading instead of always claiming the preferred one.
ref_name = PREFERRED_REF if loaded.get(PREFERRED_REF) else next(iter(loaded))
pc = loaded[ref_name].get("per_class", [])
print("")
print(f"=== per-class AUROC ({ref_name} reference) ===")
print(f"{'class':<30s} {'n':>8s} {'auroc':>8s} {'auprc':>8s} {'tpr@1%':>8s}")
print("-" * 70)
for r in pc:
    print(f" {r['class']:<28s} {r['n']:>8d} "
          f"{fmt_pc(r.get('auroc')):>8s} {fmt_pc(r.get('auprc')):>8s} "
          f"{fmt_pc(r.get('tpr_at_1fpr')):>8s}")

print("")
print("=== per-class after label merge (DrDoS_* → stripped) ===")


def norm(name):
    """Strip a leading "DrDoS_" from a class name."""
    if name.startswith("DrDoS_"):
        return name[len("DrDoS_"):]
    return name


# Classes that differ only by the "DrDoS_" prefix share one bucket.
buckets = {}
for r in pc:
    if is_num(r.get("auroc")):
        buckets.setdefault(norm(r["class"]), []).append(r)
if buckets:
    print(f"{'merged class':<20s} {'shards':>6s} {'n_total':>8s} "
          f"{'auroc_wtd':>10s} {'auroc_mean':>10s}")
    print("-" * 64)
    # Largest merged class first; show sample-weighted and plain mean AUROC.
    for k, rs in sorted(buckets.items(), key=lambda x: -sum(r["n"] for r in x[1])):
        n_tot = sum(r["n"] for r in rs)
        wtd = sum(r["auroc"] * r["n"] for r in rs) / max(n_tot, 1)
        mean = sum(r["auroc"] for r in rs) / len(rs)
        print(f" {k:<18s} {len(rs):>6d} {n_tot:>8d} "
              f"{wtd:>10.4f} {mean:>10.4f}")
EOF
echo ""
echo "=== $(date): script done ==="

View File

@@ -0,0 +1,106 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root, taken as the third ancestor of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# Directory holding the per-seed result JSONs read by _load() and the
# summary.md / summary.json files written by main().
ROOT = REPO / 'artifacts/baselines/anomaly_transformer_2026_04_29'
# Protocol keys; each one is a filename prefix for `<protocol>_seed<seed>.json`.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up; missing files are skipped by main().
SEEDS = (42, 43, 44)
# Aggregator names expected as keys of `overall_by_agg` in each result JSON.
AGGS = ('mean', 'max', 'median', 'p90')
# Reference (mean, std) AUROC per protocol shown in the "terminal_norm" column
# of the headline table; a std of None is rendered without the "± std" part.
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Human-readable protocol names used in the markdown tables and console output.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Return the parsed result JSON for one (protocol, seed) run.

    The file is looked up as ``ROOT / '<protocol>_seed<seed>.json'``; ``None``
    is returned when that file does not exist.
    """
    result_path = ROOT / f'{protocol}_seed{seed}.json'
    if result_path.exists():
        return json.loads(result_path.read_text())
    return None
def _ms(vals):
arr = np.asarray([v for v in vals if v is not None and (not np.isnan(v))], dtype=np.float64)
if len(arr) == 0:
return (float('nan'), float('nan'))
return (float(arr.mean()), float(arr.std(ddof=1)) if len(arr) > 1 else 0.0)
def _abs_auroc(v):
return max(v, 1.0 - v)
def main():
    """Aggregate per-seed Anomaly-Transformer results into summary.md and summary.json.

    For every protocol and seed the result JSON is loaded via ``_load``; seeds
    without a file are skipped. Overall AUROC (raw and direction-agnostic) is
    averaged per aggregator, the aggregator with the highest direction-agnostic
    mean becomes the headline one, and per-class AUROCs from the ``mean``
    aggregator are collected for the per-attack tables. Both output files are
    written into ``ROOT`` and a short recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_abs_aurocs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                ov = r['overall_by_agg'][agg]
                agg_aurocs[agg].append(ov['auroc'])
                agg_abs_aurocs[agg].append(_abs_auroc(ov['auroc']))
            # Per-class numbers are taken once per seed, from the 'mean' aggregator only.
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                entry = per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                entry['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (am, asd) = _ms(agg_abs_aurocs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'abs_auroc_mean': am, 'abs_auroc_std': asd}
        # Headline aggregator = best direction-agnostic mean AUROC.
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['abs_auroc_mean'])
        best = agg_summary[best_agg]
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': best['auroc_mean'], 'auroc_std': best['auroc_std'], 'abs_auroc_mean': best['abs_auroc_mean'], 'abs_auroc_std': best['abs_auroc_std'], 'all_aggs': agg_summary})
    lines = [
        '# Anomaly-Transformer (ICLR 2022) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: ICLR 2022 Anomaly-Transformer (association-discrepancy minimax). Vendored model class from `baselines/Anomaly-Transformer/model/AnomalyTransformer.py`; training + scoring loop reimplemented to match our protocol (input shape [B, T=64, D=9] = our z-scored packet sequences, same train/val/attack splits as eval_new_scores.py).',
        'Hyperparams: d_model=128, n_heads=4, e_layers=3, batch=128, lr=1e-4, k_disc=3.0, temperature=50.0, epochs=15.',
        'Score: per-position softmax(-association_KL · T) · MSE(rec, x), then aggregated per flow (mean / max / median / p90).',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (Unified_CFM) | **AT (ours)** | abs AUROC | best agg | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        (am, asd) = (row['abs_auroc_mean'], row['abs_auroc_std'])
        if np.isnan(m):
            # Protocol without any completed seed: no headline row.
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        d_terminal = m - tn_m
        lines.append(f"| {PRETTY[p]} | {tn_str} | **{m:.4f} ± {sd:.4f}** | {am:.4f} ± {asd:.4f} | `{row['best_agg']}` | {d_terminal:+.4f} |")
    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')
    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            # Avoid a heading with nothing under it.
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | AT AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')
    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (—, ±, →, σ, Δ); pin UTF-8 so the
    # write does not depend on the platform's locale encoding.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} raw={row['auroc_mean']:.4f}±{row['auroc_std']:.4f} abs={row['abs_auroc_mean']:.4f}")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,109 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root, taken as the third ancestor of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# Directory holding the per-seed result JSONs read by _load() and the
# summary.md / summary.json files written by main().
ROOT = REPO / 'artifacts/baselines/kitsune_2026_04_29'
# Protocol keys; each one is a filename prefix for `<protocol>_seed<seed>.json`.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up; missing files are skipped by main().
SEEDS = (42, 43, 44)
# Aggregator names expected as keys of `overall_by_agg` in each result JSON.
AGGS = ('mean', 'max', 'median', 'p90')
# Reference (mean, std) AUROC per protocol shown in the "terminal_norm" column
# of the headline table; a std of None is rendered without the "± std" part.
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Values for the "Kitsune paper (Shafir reproduction)" column; only the first
# tuple element is read by main(), and None leaves the cell (and its delta) blank.
KITSUNE_PAPER = {'iscxtor_within': (0.78, None), 'cicids_within': (0.85, None), 'cicddos_within': (None, None), 'forward_cross': (None, None), 'reverse_cross': (None, None)}
# Human-readable protocol names used in the markdown tables and console output.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Load ``ROOT / '<protocol>_seed<seed>.json'`` and return its parsed content.

    Returns ``None`` when no such file exists.
    """
    candidate = ROOT / f'{protocol}_seed{seed}.json'
    return json.loads(candidate.read_text()) if candidate.exists() else None
def _ms(vals):
arr = np.asarray([v for v in vals if v is not None and (not np.isnan(v))], dtype=np.float64)
if len(arr) == 0:
return (float('nan'), float('nan'))
return (float(arr.mean()), float(arr.std(ddof=1)) if len(arr) > 1 else 0.0)
def main():
    """Aggregate per-seed Kitsune (Path B) results into summary.md and summary.json.

    For every protocol and seed the result JSON is loaded via ``_load``; seeds
    without a file are skipped. Overall AUROC and AUPRC are averaged per
    aggregator, the aggregator with the highest mean AUROC becomes the headline
    one, and per-class AUROCs from the ``mean`` aggregator are collected for
    the per-attack tables. Both output files are written into ``ROOT`` and a
    short recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        agg_aurocs = {agg: [] for agg in AGGS}
        agg_auprcs = {agg: [] for agg in AGGS}
        seeds_run = []
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            seeds_run.append(s)
            for agg in AGGS:
                ov = r['overall_by_agg'][agg]
                agg_aurocs[agg].append(ov['auroc'])
                agg_auprcs[agg].append(ov['auprc'])
            # Per-class numbers are taken once per seed, from the 'mean' aggregator only.
            for (cls, info) in r.get('per_class_by_agg', {}).get('mean', {}).items():
                entry = per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                entry['aurocs'].append(info['auroc'])
        agg_summary = {}
        for agg in AGGS:
            (m, sd) = _ms(agg_aurocs[agg])
            (ma, sda) = _ms(agg_auprcs[agg])
            agg_summary[agg] = {'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda}
        # Headline aggregator = best mean AUROC.
        best_agg = max(agg_summary, key=lambda a: agg_summary[a]['auroc_mean'])
        rows.append({'protocol': protocol, 'n_seeds': len(seeds_run), 'best_agg': best_agg, 'auroc_mean': agg_summary[best_agg]['auroc_mean'], 'auroc_std': agg_summary[best_agg]['auroc_std'], 'all_aggs': agg_summary})
    lines = [
        '# Kitsune (Path B) Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        'Method: KitNET ensemble autoencoder (the ML core of Kitsune).',
        "**Path B**: feeds our **z-scored 9-d packet features** directly through `KitNET.process()` for the FM+AD grace, then `KitNET.execute()` per packet during eval. **AfterImage's 100-d host/session statistics are skipped** (they require sequential pcap streams which our (B,T,9) tensor abstraction discards). This keeps data usage unified with `eval_new_scores.py`.",
        'Train: 5000 source-benign flows → ~75-320k packets (≥ FM+AD=55k grace).',
        'Score: per-flow aggregate of per-packet RMSE (mean / max / median / p90).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (best aggregator per protocol, 3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm | Kitsune paper (Shafir reproduction) | **Kitsune Path B (ours)** | best agg | Δ vs paper | Δ vs terminal |',
        '|---|---:|---:|---:|---|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (kp_m, _) = KITSUNE_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            # Protocol without any completed seed: placeholder row.
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | — | (no runs) | — | — | — |')
            continue
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        # Paper value (and its delta) stay blank when no published number is recorded.
        kp_str = f'{kp_m:.4f}' if kp_m is not None else ''
        d_terminal = m - tn_m
        d_paper = m - kp_m if kp_m is not None else None
        d_paper_str = f'{d_paper:+.4f}' if d_paper is not None else ''
        lines.append(f"| {PRETTY[p]} | {tn_str} | {kp_str} | **{m:.4f} ± {sd:.4f}** | `{row['best_agg']}` | {d_paper_str} | {d_terminal:+.4f} |")
    lines.append('')
    lines.append('## All aggregators (3-seed mean ± std)')
    lines.append('')
    lines.append('| Protocol | mean | max | median | p90 |')
    lines.append('|---|---:|---:|---:|---:|')
    for row in rows:
        cells = [PRETTY[row['protocol']]]
        for agg in AGGS:
            a = row['all_aggs'][agg]
            m = a['auroc_mean']
            if np.isnan(m):
                cells.append('')
            else:
                cells.append(f"{m:.4f} ± {a['auroc_std']:.4f}")
        lines.append('| ' + ' | '.join(cells) + ' |')
    lines.append('')
    lines.append('## Per-attack (forward + reverse, mean aggregator)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Kitsune AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')
    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (—, ±, →, ≥, σ, Δ); pin UTF-8 so
    # the write does not depend on the platform's locale encoding.
    out.write_text('\n'.join(lines), encoding='utf-8')
    summary_json = {'rows': rows, 'per_class': per_class_collect, 'baselines': {'terminal_norm': TERMINAL_NORM, 'kitsune_paper': KITSUNE_PAPER}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} best={row['best_agg']:<6s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,93 @@
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
# Repository root, taken as the third ancestor of this file's resolved path.
REPO = Path(__file__).resolve().parents[2]
# Directory holding the per-seed result JSONs read by _load() and the
# summary.md / summary.json files written by main().
ROOT = REPO / 'artifacts/baselines/shafir_nf_2026_04_29'
# Protocol keys; each one is a filename prefix for `<protocol>_seed<seed>.json`.
PROTOCOLS = ('iscxtor_within', 'cicids_within', 'cicddos_within', 'forward_cross', 'reverse_cross')
# Seeds whose result files are looked up; missing files are skipped by main().
SEEDS = (42, 43, 44)
# Reference (mean, std) AUROC per protocol shown in the "terminal_norm (ours)"
# column; a std of None is rendered without the "± std" part.
TERMINAL_NORM = {'iscxtor_within': (0.9945, 0.0011), 'cicids_within': (0.9858, 0.0021), 'cicddos_within': (0.996, 0.001), 'forward_cross': (0.9109, 0.0032), 'reverse_cross': (0.5999, None)}
# Values for the "Shafir NF — paper" column; only the first tuple element is
# read by main(), which formats it unconditionally (so it must not be None).
SHAFIR_PAPER = {'iscxtor_within': (0.8731, None), 'cicids_within': (0.9303, None), 'cicddos_within': (0.93, None), 'forward_cross': (0.89, None), 'reverse_cross': (0.93, None)}
# Human-readable protocol names used in the markdown tables and console output.
PRETTY = {'iscxtor_within': 'ISCXTor2016 within', 'cicids_within': 'CICIDS2017 within (σ=0.6)', 'cicddos_within': 'CICDDoS2019 within', 'forward_cross': 'IDS2017→DDoS2019 forward', 'reverse_cross': 'DDoS2019→IDS2017 reverse'}
def _load(protocol, seed):
    """Read one run's result file, ``ROOT / '<protocol>_seed<seed>.json'``.

    Returns the decoded JSON, or ``None`` if the file is not present.
    """
    src = ROOT / f'{protocol}_seed{seed}.json'
    if src.exists():
        raw = src.read_text()
        return json.loads(raw)
    return None
def _ms(vals):
arr = np.asarray([v for v in vals if v is not None and (not np.isnan(v))], dtype=np.float64)
if len(arr) == 0:
return (float('nan'), float('nan'))
return (float(arr.mean()), float(arr.std(ddof=1)) if len(arr) > 1 else 0.0)
def main():
    """Aggregate per-seed Shafir NF results into summary.md and summary.json.

    For every protocol and seed the result JSON is loaded via ``_load``; seeds
    without a file are skipped. The ``neg_log_prob`` AUROC/AUPRC and the
    training time are averaged across seeds, and per-class AUROCs are collected
    for the per-attack tables. Both output files are written into ``ROOT`` and
    a short recap is printed.
    """
    rows = []
    per_class_collect = {p: {} for p in PROTOCOLS}
    for protocol in PROTOCOLS:
        (aurocs, auprcs, t_train) = ([], [], [])
        for s in SEEDS:
            r = _load(protocol, s)
            if r is None:
                continue
            score = r['overall']['neg_log_prob']
            aurocs.append(score['auroc'])
            auprcs.append(score['auprc'])
            # Runs that did not record a training time count as 0.0 s.
            t_train.append(r.get('t_train_sec', 0.0))
            for (cls, info) in r.get('per_class', {}).items():
                entry = per_class_collect[protocol].setdefault(cls, {'n': int(info['_n']), 'aurocs': []})
                entry['aurocs'].append(info['auroc'])
        (m, sd) = _ms(aurocs)
        (ma, sda) = _ms(auprcs)
        (tt, _) = _ms(t_train)
        rows.append({'protocol': protocol, 'n_seeds': len(aurocs), 'auroc_mean': m, 'auroc_std': sd, 'auprc_mean': ma, 'auprc_std': sda, 't_train_sec_mean': tt})
    lines = [
        '# Shafir 2026 NF Baseline — On Our 5-Protocol Layout',
        '',
        'Date: 2026-04-29',
        '',
        "Method: Shafir's official `pzflow.Flow` (single basic NF).",
        'Features: our **20-d canonical packet-derived flow features** (`common.data_contract.CANONICAL_FLOW_FEATURE_NAMES`), z-scored with the **same source training stats** that the Unified_CFM checkpoint uses.',
        'Train cap: 10,000 source-benign samples (Shafir paper protocol).',
        'Optimizer: SGD lr=1e-3, 100 epochs (Shafir paper defaults).',
        'Sampling: same seeds & stratification as `eval_new_scores.py`.',
        '',
        '## Headline AUROC (3-seed mean ± std)',
        '',
        '| Protocol | terminal_norm (ours) | Shafir NF — paper | **Shafir NF — our features** | Δ vs paper | Δ vs terminal_norm |',
        '|---|---:|---:|---:|---:|---:|',
    ]
    for row in rows:
        p = row['protocol']
        (tn_m, tn_sd) = TERMINAL_NORM[p]
        (sp_m, _) = SHAFIR_PAPER[p]
        (m, sd) = (row['auroc_mean'], row['auroc_std'])
        if np.isnan(m):
            # Protocol without any completed seed: placeholder row.
            lines.append(f'| {PRETTY[p]} | {tn_m:.4f} | {sp_m:.4f} | (no runs yet) | — | — |')
            continue
        d_paper = m - sp_m
        d_terminal = m - tn_m
        tn_str = f'{tn_m:.4f} ± {tn_sd:.4f}' if tn_sd is not None else f'{tn_m:.4f}'
        lines.append(f'| {PRETTY[p]} | {tn_str} | {sp_m:.4f} | **{m:.4f} ± {sd:.4f}** | {d_paper:+.4f} | {d_terminal:+.4f} |')
    lines.append('')
    lines.append('## Per-protocol stats')
    lines.append('')
    lines.append('| Protocol | n_seeds | AUPRC mean ± std | Train time (s, mean) |')
    lines.append('|---|---:|---:|---:|')
    for row in rows:
        p = row['protocol']
        (m, sd) = (row['auprc_mean'], row['auprc_std'])
        if np.isnan(m):
            continue
        lines.append(f"| {PRETTY[p]} | {row['n_seeds']} | {m:.4f} ± {sd:.4f} | {row['t_train_sec_mean']:.1f} |")
    lines.append('')
    lines.append('## Per-attack (forward + reverse)')
    for protocol in ('forward_cross', 'reverse_cross'):
        lines.append(f'\n### {PRETTY[protocol]}')
        d = per_class_collect[protocol]
        if not d:
            lines.append('(no runs)')
            continue
        lines.append('| attack | n | Shafir NF AUROC mean ± std |')
        lines.append('|---|---:|---:|')
        for cls in sorted(d):
            n = d[cls]['n']
            (m, sd) = _ms(d[cls]['aurocs'])
            lines.append(f'| `{cls}` | {n} | {m:.4f} ± {sd:.4f} |')
    out = ROOT / 'summary.md'
    # The report contains non-ASCII characters (—, ±, →, σ, Δ); pin UTF-8 so the
    # write does not depend on the platform's locale encoding.
    out.write_text('\n'.join(lines), encoding='utf-8')
    # The JSON stores per-class mean/std rather than the raw per-seed lists.
    per_class_summary = {
        p: {cls: {'n': v['n'], **dict(zip(['mean', 'std'], _ms(v['aurocs'])))} for (cls, v) in dd.items()}
        for (p, dd) in per_class_collect.items()
    }
    summary_json = {'rows': rows, 'per_class': per_class_summary, 'baselines': {'terminal_norm': TERMINAL_NORM, 'shafir_paper': SHAFIR_PAPER}}
    (ROOT / 'summary.json').write_text(json.dumps(summary_json, indent=2), encoding='utf-8')
    print(f'[saved] {out}')
    print(f"[saved] {ROOT / 'summary.json'}")
    print()
    for row in rows:
        if not np.isnan(row['auroc_mean']):
            print(f" {PRETTY[row['protocol']]:<34s} {row['auroc_mean']:.4f} ± {row['auroc_std']:.4f}")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,267 @@
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import yaml
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
sys.path.insert(0, str(REPO / 'baselines/Anomaly-Transformer'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from packet_store import PacketShardStore
from model.AnomalyTransformer import AnomalyTransformer
# Within-dataset protocols: protocol key -> (run-directory template containing a
# `{seed}` placeholder, dict of `n_val` / `n_atk` caps; None appears to mean
# "no cap" — matches the `is not None` checks in _load_within, TODO confirm at the call site).
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': 20000})}
# Cross-dataset protocols: protocol key -> spec with the source model run-directory
# template (`{seed}` placeholder), the target dataset's packet store and flows
# parquet paths, and benign/attack sample counts.
# NOTE(review): how each field is consumed is not visible in this chunk — see _load_cross.
CROSS_DIRS = {'forward_cross': {'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'target_store': 'datasets/cicddos2019/processed/full_store', 'target_flows': 'datasets/cicddos2019/processed/flows.parquet', 'n_benign': 10000, 'n_attack': 10000}, 'reverse_cross': {'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'target_store': 'datasets/cicids2017/processed/full_store', 'target_flows': 'datasets/cicids2017/processed/flows.parquet', 'n_benign': 10000, 'n_attack': 10000}}
def _load_within(model_dir, n_val, n_atk, n_train_cap, seed):
    """Load and sub-sample the splits of a within-dataset protocol.

    Reads ``config.yaml`` from ``model_dir``, forwards its settings to
    ``load_unified_data`` and then thins the train / val / attack splits with
    a single generator seeded by ``seed``.

    ``n_val`` and ``n_atk`` are used twice: as the fallback ``val_cap`` /
    ``attack_cap`` given to the loader when the config leaves them unset, and
    as the caps of the sub-sampling done here (``None`` skips that step).
    ``n_train_cap`` caps the number of training rows.

    Returns a dict with the keys ``train_packets``, ``train_len``,
    ``val_packets``, ``val_len``, ``atk_packets``, ``atk_len`` and
    ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _path_or_none(key):
        # Falsy (missing / empty) config entries are handed over as None.
        raw = cfg.get(key)
        return Path(raw) if raw else None

    data = load_unified_data(
        packets_npz=_path_or_none('packets_npz'),
        source_store=_path_or_none('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_path_or_none('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    )

    rng = np.random.default_rng(seed)

    def _sorted_subset(pool_size, keep):
        # Draw without replacement; sorting keeps the rows in their original order.
        return np.sort(rng.choice(pool_size, size=keep, replace=False))

    # All draws share one generator, so the train -> val -> attack order is
    # part of the reproducible behaviour.
    train_packets, train_len = data.train_packets, data.train_len
    if len(train_packets) > n_train_cap:
        rows = _sorted_subset(len(train_packets), n_train_cap)
        train_packets, train_len = train_packets[rows], train_len[rows]

    val_packets, val_len = data.val_packets, data.val_len
    if n_val is not None and len(val_packets) > n_val:
        rows = _sorted_subset(len(val_packets), n_val)
        val_packets, val_len = val_packets[rows], val_len[rows]

    atk_packets = data.attack_packets
    atk_len = data.attack_len
    atk_labels = data.attack_labels
    if n_atk is not None and len(atk_packets) > n_atk:
        rows = _sorted_subset(len(atk_packets), n_atk)
        atk_packets, atk_len, atk_labels = atk_packets[rows], atk_len[rows], atk_labels[rows]

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Assemble the arrays of a cross-dataset protocol.

    Training rows come from the *source* run (its ``config.yaml`` is located
    through ``spec['model_template']``); benign and attack evaluation rows are
    sampled from the *target* store and normalised with the packet statistics
    stored in ``ckpt`` (``packet_mean`` / ``packet_std``).

    Target rows whose label equals the literal ``'normal'`` are treated as
    benign; every other label is an attack class. Attack rows are drawn per
    class, ``n_attack // n_classes`` each (at least one), then thinned to
    ``n_attack`` if the total still exceeds it.

    Returns the same dict layout as ``_load_within``.
    """
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))

    # ---- source side: benign training rows ---------------------------------
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())

    def _src_path(key):
        # Falsy (missing / empty) config entries are handed over as None.
        raw = src_cfg.get(key)
        return Path(raw) if raw else None

    src_data = load_unified_data(
        packets_npz=_src_path('packets_npz'),
        source_store=_src_path('source_store'),
        flows_parquet=Path(src_cfg['flows_parquet']),
        flow_features_path=_src_path('flow_features_path'),
        flow_feature_columns=src_cfg.get('flow_feature_columns'),
        flow_features_align=str(src_cfg.get('flow_features_align', 'auto')),
        T=int(src_cfg['T']),
        split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))),
        train_ratio=float(src_cfg.get('train_ratio', 0.8)),
        benign_label=str(src_cfg.get('benign_label', 'normal')),
        min_len=int(src_cfg.get('min_len', 2)),
        packet_preprocess=packet_preprocess,
        attack_cap=None,
        val_cap=None,
    )
    # The train sub-sample uses its own stream (seed + 1000), separate from
    # the generator that samples the target rows below.
    train_rng = np.random.default_rng(seed + 1000)
    train_packets, train_len = src_data.train_packets, src_data.train_len
    if len(train_packets) > n_train_cap:
        keep = np.sort(train_rng.choice(len(train_packets), size=n_train_cap, replace=False))
        train_packets, train_len = train_packets[keep], train_len[keep]

    # ---- target side: pick benign / attack rows ----------------------------
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    n_benign = int(spec['n_benign'])
    n_attack = int(spec['n_attack'])
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()

    target_rng = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(target_rng.choice(benign_idx, size=n_benign, replace=False))

    attack_labels = labels[attack_idx]
    atk_classes = sorted(set(attack_labels))
    quota = max(1, n_attack // len(atk_classes))
    picked = []
    for cls in atk_classes:
        pool = attack_idx[attack_labels == cls]
        take = min(quota, len(pool))
        if take:
            picked.append(target_rng.choice(pool, size=take, replace=False))
    a_sel = np.sort(np.concatenate(picked))
    if len(a_sel) > n_attack:
        a_sel = np.sort(target_rng.choice(a_sel, size=n_attack, replace=False))

    # ---- target side: read and normalise packets ---------------------------
    # NOTE(review): positions in the flows parquet are used directly as store
    # indices; presumably both share one row order — verify.
    store = PacketShardStore.open(target_store)

    def _read(rows):
        tokens, lengths = store.read_packets(rows, T=T)
        return tokens.astype(np.float32), np.minimum(lengths, T).astype(np.int32)

    def _normalise(tokens, lengths, split_tag):
        if packet_preprocess == 'mixed_dequant':
            scaled = _apply_mixed_dequant(tokens, lengths, packet_mean, packet_std, split_tag=split_tag, seed=seed)
        else:
            scaled = _zscore(tokens, packet_mean, packet_std)
        # Zero out every position at or beyond the flow's length.
        inside = np.arange(T)[None, :] < lengths[:, None]
        return (scaled * inside[:, :, None]).astype(np.float32)

    b_tok, b_len = _read(b_sel)
    a_tok, a_len = _read(a_sel)
    val_packets = _normalise(b_tok, b_len, 'val')
    atk_packets = _normalise(a_tok, a_len, 'attack')

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': b_len,
        'atk_packets': atk_packets,
        'atk_len': a_len,
        'atk_labels': labels[a_sel],
    }
def _kl(p, q):
return torch.sum(p * (torch.log(p + 0.0001) - torch.log(q + 0.0001)), dim=-1)
def _norm_prior(prior, win_size: int) -> torch.Tensor:
return prior / torch.unsqueeze(torch.sum(prior, dim=-1), dim=-1).repeat(1, 1, 1, win_size)
def _train(model: AnomalyTransformer, train_packets: np.ndarray, train_len: np.ndarray, *, batch_size: int, epochs: int, lr: float, k_disc: float, win_size: int, device: torch.device) -> dict:
    """Train ``model`` with the two-phase association-discrepancy objective.

    Every batch is back-propagated twice before a single optimizer step:
    ``rec_loss - k_disc * series_loss`` first (graph retained), then
    ``rec_loss + k_disc * prior_loss``. Batches are reshuffled each epoch with
    a generator seeded by the epoch index. ``train_len`` is accepted but not
    used: the reconstruction loss covers every position of each row.

    Returns ``{'losses': [mean reconstruction loss per epoch],
    't_train_sec': wall-clock seconds}``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    n_rows = len(train_packets)
    history = []
    started = time.time()
    for epoch in range(epochs):
        model.train()
        order = np.random.default_rng(epoch).permutation(n_rows)
        batch_rec = []
        for start in range(0, n_rows, batch_size):
            rows = order[start:start + batch_size]
            x = torch.from_numpy(train_packets[rows]).float().to(device)
            optimizer.zero_grad()
            output, series, prior, _ = model(x)

            n_layers = len(prior)
            series_loss = 0.0
            prior_loss = 0.0
            for layer, layer_prior in enumerate(prior):
                layer_series = series[layer]
                norm_p = _norm_prior(layer_prior, win_size)
                # Each term detaches one side so it only updates the other.
                series_loss += torch.mean(_kl(layer_series, norm_p.detach())) + torch.mean(_kl(norm_p.detach(), layer_series))
                prior_loss += torch.mean(_kl(norm_p, layer_series.detach())) + torch.mean(_kl(layer_series.detach(), norm_p))
            series_loss /= n_layers
            prior_loss /= n_layers

            rec_loss = criterion(output, x)
            maximise_phase = rec_loss - k_disc * series_loss
            minimise_phase = rec_loss + k_disc * prior_loss
            # Both losses share the forward graph, hence retain_graph on the first pass.
            maximise_phase.backward(retain_graph=True)
            minimise_phase.backward()
            optimizer.step()
            batch_rec.append(rec_loss.item())

        history.append(float(np.mean(batch_rec)))
        is_last_epoch = epoch == epochs - 1
        if (epoch + 1) % 5 == 0 or is_last_epoch:
            print(f' [epoch {epoch + 1}/{epochs}] rec_loss={history[-1]:.4f} ({time.time() - started:.1f}s elapsed)', flush=True)
    return {'losses': history, 't_train_sec': time.time() - started}
@torch.no_grad()
def _score(model: AnomalyTransformer, packets: np.ndarray, lens: np.ndarray, *, batch_size: int, win_size: int, temperature: float, device: torch.device) -> dict[str, np.ndarray]:
    """Score every flow with the Anomaly-Transformer criterion.

    The per-position score is ``softmax(-discrepancy * temperature) * rec``,
    where ``rec`` is the feature-averaged squared reconstruction error and
    ``discrepancy`` is the symmetric KL between series and normalised prior
    associations, summed over layers and averaged over axis 1. When the model
    returns no prior layers the score is ``rec`` alone. Only the first
    ``lens[i]`` positions of flow ``i`` are aggregated; flows of length 0 keep
    a score of 0.

    Returns a dict of float32 arrays of length ``len(packets)`` keyed
    ``'mean'``, ``'max'``, ``'median'`` and ``'p90'``.
    """
    model.eval()
    n = len(packets)
    means = np.zeros(n, dtype=np.float32)
    maxes = np.zeros(n, dtype=np.float32)
    medians = np.zeros(n, dtype=np.float32)
    p90s = np.zeros(n, dtype=np.float32)
    crit = nn.MSELoss(reduction='none')
    for s in range(0, n, batch_size):
        x = torch.from_numpy(packets[s:s + batch_size]).float().to(device)
        output, series, prior, _ = model(x)
        rec = crit(output, x).mean(dim=-1)

        series_loss = 0.0
        for u in range(len(prior)):
            norm_p = _norm_prior(prior[u], win_size)
            series_loss = series_loss + (_kl(series[u], norm_p) + _kl(norm_p, series[u]))
        if isinstance(series_loss, torch.Tensor):
            metric = torch.softmax(-series_loss.mean(dim=1) * temperature, dim=-1) * rec
        else:
            # No prior layers were returned: fall back to reconstruction error.
            metric = rec

        # One device-to-host copy per batch; the lengths never leave the host,
        # so the per-flow loop below causes no further synchronisation.
        metric_np = metric.cpu().numpy()
        batch_lens = np.asarray(lens[s:s + batch_size])
        for i in range(metric_np.shape[0]):
            li = int(batch_lens[i])
            if li == 0:
                continue
            row = metric_np[i, :li]
            means[s + i] = row.mean()
            maxes[s + i] = row.max()
            medians[s + i] = float(np.median(row))
            p90s[s + i] = float(np.percentile(row, 90))
    return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def _per_class(val_score, atk_score, atk_labels):
    """Compute one AUROC per attack class against the full benign score set.

    Returns ``{class: {'_n': number of attack rows (as float), 'auroc': value}}``
    with the classes visited in sorted order.
    """
    results = {}
    for cls in sorted(set(atk_labels)):
        in_class = atk_labels == cls
        class_scores = atk_score[in_class]
        # Benign rows are the negatives (0), this class's rows the positives (1).
        targets = np.concatenate([np.zeros(len(val_score)), np.ones(len(class_scores))])
        scores = np.concatenate([val_score, class_scores])
        results[cls] = {
            '_n': float(int(in_class.sum())),
            'auroc': _safe_metric(roc_auc_score, targets, scores),
        }
    return results
def main():
    """Train and evaluate the Anomaly-Transformer baseline for one (protocol, seed).

    Loads the arrays of the chosen protocol, trains a fresh model on the
    training rows, scores benign and attack rows, and writes
    ``<protocol>_seed<seed>.json`` (metrics) plus a ``.npz`` with the raw
    per-flow scores into ``--out-dir``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--n-train-cap', type=int, default=10000)
    p.add_argument('--epochs', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.0001)
    p.add_argument('--k-disc', type=float, default=3.0, help='weight on association-discrepancy KL term')
    p.add_argument('--temperature', type=float, default=50.0)
    p.add_argument('--batch-size', type=int, default=64)
    p.add_argument('--d-model', type=int, default=128)
    p.add_argument('--n-heads', type=int, default=4)
    p.add_argument('--e-layers', type=int, default=3)
    p.add_argument('--T', type=int, default=64)
    p.add_argument('--device', type=str, default='auto')
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    # 'auto' picks CUDA when available; any other value is used verbatim.
    if args.device == 'auto':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device(args.device)

    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        template, caps = WITHIN_DIRS[args.protocol]
    else:
        spec = CROSS_DIRS[args.protocol]
        template = spec['model_template']
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] anomaly_transformer protocol={args.protocol} seed={args.seed}')

    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        # Only the cross protocols read the checkpoint (packet statistics), so
        # it is loaded here rather than for every run.
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # point this at checkpoints produced by this project.
        ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)

    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D} device={device}')

    torch.manual_seed(args.seed)
    model = AnomalyTransformer(win_size=args.T, enc_in=D, c_out=D, d_model=args.d_model, n_heads=args.n_heads, e_layers=args.e_layers, d_ff=args.d_model, dropout=0.0, output_attention=True).to(device)
    n_params = sum(param.numel() for param in model.parameters())
    print(f'[model] params={n_params:,}')
    train_meta = _train(model, arrays['train_packets'], arrays['train_len'], batch_size=args.batch_size, epochs=args.epochs, lr=args.lr, k_disc=args.k_disc, win_size=args.T, device=device)
    print(f"[train] {train_meta['t_train_sec']:.1f}s, final rec_loss={train_meta['losses'][-1]:.4f}")

    t0 = time.time()
    val_aggs = _score(model, arrays['val_packets'], arrays['val_len'], batch_size=args.batch_size, win_size=args.T, temperature=args.temperature, device=device)
    print(f'[score] benign in {time.time() - t0:.1f}s')
    t0 = time.time()
    atk_aggs = _score(model, arrays['atk_packets'], arrays['atk_len'], batch_size=args.batch_size, win_size=args.T, temperature=args.temperature, device=device)
    print(f'[score] attack in {time.time() - t0:.1f}s')

    aggregators = ('mean', 'max', 'median', 'p90')
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    overall = {}
    per_class_by_agg = {}
    for agg in aggregators:
        v = val_aggs[agg]
        a = atk_aggs[agg]
        y = np.r_[np.zeros(len(v)), np.ones(len(a))]
        s = np.r_[v, a]
        overall[agg] = {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}
        per_class_by_agg[agg] = _per_class(v, a, atk_labels)

    out = {'method': 'anomaly_transformer', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train': n_train, 'n_val': n_val, 'n_atk': n_atk, 'D': int(D), 'epochs': args.epochs, 'lr': args.lr, 'k_disc': args.k_disc, 'temperature': args.temperature, 'd_model': args.d_model, 't_train_sec': round(train_meta['t_train_sec'], 2), 'loss_first_last': [train_meta['losses'][0], train_meta['losses'][-1]], 'overall_by_agg': overall, 'per_class_by_agg': per_class_by_agg}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))

    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': atk_labels}
    for agg in aggregators:
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')

    best = max(overall, key=lambda k: overall[k]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    for k in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {k:<8s} AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Sweep driver for the Anomaly-Transformer baseline.
#
# Runs scripts/baselines/run_anomaly_transformer.py once per (protocol, seed)
# pair and skips pairs whose result JSON already exists. Every message and all
# runner output is mirrored into $OUT_DIR/master.log.
#
# Overridable through the environment: PROTOCOLS, SEEDS, EPOCHS, BATCH, D_MODEL.
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/anomaly_transformer_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
: > "$LOG"  # every invocation starts with an empty master log

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
: "${PROTOCOLS:=$PROTOCOLS_DEFAULT}"
: "${SEEDS:=$SEEDS_DEFAULT}"
: "${EPOCHS:=15}"
: "${BATCH:=128}"
: "${D_MODEL:=128}"

# Print a message and append it to the master log.
say() {
  echo "$1" | tee -a "$LOG"
}

# Run one (protocol, seed) pair unless its result file is already present.
run_one() {
  local protocol="$1"
  local seed="$2"
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    say "[skip] $out_json exists"
    return 0
  fi

  say "=== protocol=$protocol seed=$seed epochs=$EPOCHS batch=$BATCH ==="
  started=$(date +%s)
  # With pipefail + errexit a failing run aborts the whole sweep.
  uv run --no-sync python scripts/baselines/run_anomaly_transformer.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --epochs "$EPOCHS" --batch-size "$BATCH" --d-model "$D_MODEL" \
    2>&1 | tee -a "$LOG"
  finished=$(date +%s)
  say "[done] elapsed=$((finished - started))s $out_json"
}

for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done
echo "ALL DONE"

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import yaml
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from packet_store import PacketShardStore
from KitNET.KitNET import KitNET
# Within-dataset protocols: protocol name -> (artifact sub-directory template, caps).
# main() formats the template with the run seed under REPO / 'artifacts' and passes
# caps['n_val'] / caps['n_atk'] to _load_within, where a cap of None skips that
# sub-sampling step.
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', {'n_val': 10000, 'n_atk': 20000})}
# Cross-dataset protocols: protocol name -> spec dict read by _load_cross.
# 'model_template' locates the source run (config.yaml / model.pt) under
# REPO / 'artifacts'; 'target_store' and 'target_flows' are REPO-relative paths of
# the dataset that is scored; 'n_benign' / 'n_attack' are the target sample sizes.
# NOTE(review): 'target_flow_features' is not read by _load_cross in this file —
# confirm whether it is still needed.
CROSS_DIRS = {'forward_cross': {'model_template': 'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'target_store': 'datasets/cicddos2019/processed/full_store', 'target_flows': 'datasets/cicddos2019/processed/flows.parquet', 'target_flow_features': 'datasets/cicddos2019/processed/flow_features.parquet', 'n_benign': 10000, 'n_attack': 10000}, 'reverse_cross': {'model_template': 'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'target_store': 'datasets/cicids2017/processed/full_store', 'target_flows': 'datasets/cicids2017/processed/flows.parquet', 'target_flow_features': 'datasets/cicids2017/processed/flow_features.parquet', 'n_benign': 10000, 'n_attack': 10000}}
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def _load_within(model_dir: Path, n_val, n_atk, n_train_cap, seed):
    """Load and sub-sample the splits of a within-dataset protocol.

    The run's ``config.yaml`` (inside ``model_dir``) supplies the arguments of
    ``load_unified_data``; the resulting train / val / attack splits are then
    thinned with one generator seeded by ``seed``.

    ``n_val`` / ``n_atk`` are the fallback ``val_cap`` / ``attack_cap`` for
    the loader when the config does not set them, and also the caps of the
    sub-sampling done here (``None`` skips it). ``n_train_cap`` caps the
    number of training rows.

    Returns a dict keyed ``train_packets``, ``train_len``, ``val_packets``,
    ``val_len``, ``atk_packets``, ``atk_len``, ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _maybe_path(name):
        # Falsy (missing / empty) config entries are handed over as None.
        entry = cfg.get(name)
        return Path(entry) if entry else None

    loader_args = {
        'packets_npz': _maybe_path('packets_npz'),
        'source_store': _maybe_path('source_store'),
        'flows_parquet': Path(cfg['flows_parquet']),
        'flow_features_path': _maybe_path('flow_features_path'),
        'flow_feature_columns': cfg.get('flow_feature_columns'),
        'flow_features_align': str(cfg.get('flow_features_align', 'auto')),
        'T': int(cfg['T']),
        'split_seed': int(cfg.get('data_seed', cfg.get('seed', 42))),
        'train_ratio': float(cfg.get('train_ratio', 0.8)),
        'benign_label': str(cfg.get('benign_label', 'normal')),
        'min_len': int(cfg.get('min_len', 2)),
        'packet_preprocess': str(cfg.get('packet_preprocess', 'mixed_dequant')),
        'attack_cap': int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        'val_cap': int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    }
    data = load_unified_data(**loader_args)

    rng = np.random.default_rng(seed)

    def _choose(total, keep):
        # Sample without replacement, then sort to preserve the row order.
        return np.sort(rng.choice(total, size=keep, replace=False))

    # One shared generator: the train -> val -> attack draw order is part of
    # the reproducible behaviour.
    train_packets, train_len = data.train_packets, data.train_len
    if len(train_packets) > n_train_cap:
        sel = _choose(len(train_packets), n_train_cap)
        train_packets, train_len = train_packets[sel], train_len[sel]

    val_packets, val_len = data.val_packets, data.val_len
    if n_val is not None and len(val_packets) > n_val:
        sel = _choose(len(val_packets), n_val)
        val_packets, val_len = val_packets[sel], val_len[sel]

    atk_packets, atk_len, atk_labels = data.attack_packets, data.attack_len, data.attack_labels
    if n_atk is not None and len(atk_packets) > n_atk:
        sel = _choose(len(atk_packets), n_atk)
        atk_packets, atk_len, atk_labels = atk_packets[sel], atk_len[sel], atk_labels[sel]

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': val_len,
        'atk_packets': atk_packets,
        'atk_len': atk_len,
        'atk_labels': atk_labels,
    }
def _load_cross(spec, ckpt, seed, n_train_cap, T):
    """Assemble the arrays of a cross-dataset protocol.

    Training rows are taken from the *source* run (its ``config.yaml`` is found
    via ``spec['model_template']``). Benign and attack evaluation rows are
    sampled from the *target* store and normalised with ``ckpt['packet_mean']``
    / ``ckpt['packet_std']``.

    Target rows labelled with the literal ``'normal'`` are benign; all other
    labels are attack classes. Each class contributes
    ``n_attack // n_classes`` rows (at least one, at most what it has), and
    the union is thinned to ``n_attack`` if it is still larger.

    Returns the same dict layout as ``_load_within``.
    """
    packet_mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    packet_std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    packet_preprocess = str(ckpt.get('packet_preprocess', 'mixed_dequant'))

    # ---- source side: benign training rows ---------------------------------
    src_cfg_path = REPO / 'artifacts' / spec['model_template'].format(seed=seed) / 'config.yaml'
    src_cfg = yaml.safe_load(src_cfg_path.read_text())

    def _maybe_path(name):
        # Falsy (missing / empty) config entries are handed over as None.
        entry = src_cfg.get(name)
        return Path(entry) if entry else None

    src_data = load_unified_data(
        packets_npz=_maybe_path('packets_npz'),
        source_store=_maybe_path('source_store'),
        flows_parquet=Path(src_cfg['flows_parquet']),
        flow_features_path=_maybe_path('flow_features_path'),
        flow_feature_columns=src_cfg.get('flow_feature_columns'),
        flow_features_align=str(src_cfg.get('flow_features_align', 'auto')),
        T=int(src_cfg['T']),
        split_seed=int(src_cfg.get('data_seed', src_cfg.get('seed', 42))),
        train_ratio=float(src_cfg.get('train_ratio', 0.8)),
        benign_label=str(src_cfg.get('benign_label', 'normal')),
        min_len=int(src_cfg.get('min_len', 2)),
        packet_preprocess=packet_preprocess,
        attack_cap=None,
        val_cap=None,
    )
    # Separate stream (seed + 1000) for the train sub-sample, so it does not
    # interact with the target sampling below.
    source_rng = np.random.default_rng(seed + 1000)
    train_packets, train_len = src_data.train_packets, src_data.train_len
    if len(train_packets) > n_train_cap:
        sel = np.sort(source_rng.choice(len(train_packets), size=n_train_cap, replace=False))
        train_packets, train_len = train_packets[sel], train_len[sel]

    # ---- target side: choose rows ------------------------------------------
    target_store = REPO / spec['target_store']
    target_flows = REPO / spec['target_flows']
    n_benign, n_attack = int(spec['n_benign']), int(spec['n_attack'])
    flows = pd.read_parquet(target_flows, columns=['flow_id', 'label'])
    labels = flows['label'].astype(str).to_numpy()

    sampler = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(sampler.choice(benign_idx, size=n_benign, replace=False))

    labels_of_attacks = labels[attack_idx]
    atk_classes = sorted(set(labels_of_attacks))
    share = max(1, n_attack // len(atk_classes))
    per_class_rows = []
    for cls in atk_classes:
        candidates = attack_idx[labels_of_attacks == cls]
        n_take = min(share, len(candidates))
        if n_take:
            per_class_rows.append(sampler.choice(candidates, size=n_take, replace=False))
    a_sel = np.sort(np.concatenate(per_class_rows))
    if len(a_sel) > n_attack:
        a_sel = np.sort(sampler.choice(a_sel, size=n_attack, replace=False))

    # ---- target side: read and normalise packets ---------------------------
    # NOTE(review): positions in the flows parquet are used directly as store
    # indices; presumably both share one row order — verify.
    store = PacketShardStore.open(target_store)

    def _fetch(rows):
        tokens, lengths = store.read_packets(rows, T=T)
        lengths = np.minimum(lengths, T).astype(np.int32)
        return tokens.astype(np.float32), lengths

    def _prepare(tokens, lengths, split_tag):
        if packet_preprocess == 'mixed_dequant':
            scaled = _apply_mixed_dequant(tokens, lengths, packet_mean, packet_std, split_tag=split_tag, seed=seed)
        else:
            scaled = _zscore(tokens, packet_mean, packet_std)
        # Zero out every position at or beyond the flow's length.
        keep = np.arange(T)[None, :] < lengths[:, None]
        return (scaled * keep[:, :, None]).astype(np.float32)

    b_tok, b_len = _fetch(b_sel)
    a_tok, a_len = _fetch(a_sel)
    val_packets = _prepare(b_tok, b_len, 'val')
    atk_packets = _prepare(a_tok, a_len, 'attack')

    return {
        'train_packets': train_packets,
        'train_len': train_len,
        'val_packets': val_packets,
        'val_len': b_len,
        'atk_packets': atk_packets,
        'atk_len': a_len,
        'atk_labels': labels[a_sel],
    }
def _flatten_packets(packets: np.ndarray, lens: np.ndarray) -> np.ndarray:
out_chunks = []
for i in range(len(packets)):
L = int(lens[i])
if L > 0:
out_chunks.append(packets[i, :L])
if not out_chunks:
return np.empty((0, packets.shape[-1]), dtype=np.float32)
return np.concatenate(out_chunks, axis=0).astype(np.float32)
def _train_kitnet(kit: KitNET, train_flat: np.ndarray) -> dict[str, float]:
t0 = time.time()
last_rmse = 0.0
for i in range(len(train_flat)):
last_rmse = kit.process(train_flat[i])
if (i + 1) % 50000 == 0:
print(f' [train] processed {i + 1:,}/{len(train_flat):,} last_rmse={last_rmse:.4f}', flush=True)
return {'t_train_sec': round(time.time() - t0, 2), 'n_trained_packets': len(train_flat)}
def _score_flows(kit: KitNET, packets: np.ndarray, lens: np.ndarray) -> dict[str, np.ndarray]:
N = len(packets)
means = np.zeros(N, dtype=np.float32)
maxes = np.zeros(N, dtype=np.float32)
medians = np.zeros(N, dtype=np.float32)
p90s = np.zeros(N, dtype=np.float32)
for i in range(N):
L = int(lens[i])
if L == 0:
continue
rmses = np.zeros(L, dtype=np.float32)
for t in range(L):
rmses[t] = kit.execute(packets[i, t])
means[i] = rmses.mean()
maxes[i] = rmses.max()
medians[i] = np.median(rmses)
p90s[i] = np.percentile(rmses, 90)
return {'mean': means, 'max': maxes, 'median': medians, 'p90': p90s}
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
    """Compute one AUROC per attack class against the full benign score set.

    Returns ``{class: {'_n': number of attack rows (as float), 'auroc': value}}``
    with the classes visited in sorted order.
    """
    summary = {}
    for cls in sorted(set(atk_labels)):
        selector = atk_labels == cls
        cls_scores = atk_score[selector]
        # Benign rows are the negatives (0), this class's rows the positives (1).
        truth = np.concatenate([np.zeros(len(val_score)), np.ones(len(cls_scores))])
        merged = np.concatenate([val_score, cls_scores])
        auroc = _safe_metric(roc_auc_score, truth, merged)
        summary[cls] = {'_n': float(int(selector.sum())), 'auroc': auroc}
    return summary
def main():
    """Train and evaluate the KitNET baseline for one (protocol, seed).

    Loads the arrays of the chosen protocol, flattens the training flows into
    a packet stream for KitNET, scores benign and attack flows packet by
    packet, and writes ``<protocol>_seed<seed>.json`` (metrics) plus a
    ``.npz`` with the raw per-flow scores into ``--out-dir``.

    Raises ``ValueError`` when the training stream is shorter than the
    combined FM + AD grace periods.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    p.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    p.add_argument('--out-dir', type=Path, required=True)
    p.add_argument('--n-train-cap', type=int, default=2000, help='Cap source-benign train flows (each contributes ~T packets).')
    p.add_argument('--fm-grace', type=int, default=2000, help='Kitsune feature-mapper grace period (packets).')
    p.add_argument('--ad-grace', type=int, default=20000, help='Kitsune anomaly-detector grace period (packets).')
    p.add_argument('--max-ae-size', type=int, default=10)
    p.add_argument('--lr', type=float, default=0.1)
    p.add_argument('--hidden-ratio', type=float, default=0.75)
    p.add_argument('--T', type=int, default=64)
    args = p.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        template, caps = WITHIN_DIRS[args.protocol]
    else:
        spec = CROSS_DIRS[args.protocol]
        template = spec['model_template']
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] kitsune protocol={args.protocol} seed={args.seed}')

    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        # Only the cross protocols read the checkpoint (packet statistics), so
        # it is loaded — and announced — here rather than for every run.
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # point this at checkpoints produced by this project.
        print(f'[run] using packet stats from {model_dir}/model.pt')
        ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)
        arrays = _load_cross(spec, ckpt, args.seed, args.n_train_cap, args.T)

    n_train = len(arrays['train_packets'])
    n_val = len(arrays['val_packets'])
    n_atk = len(arrays['atk_packets'])
    D = arrays['train_packets'].shape[-1]
    print(f'[data] train_flows={n_train:,} val={n_val:,} attack={n_atk:,} D={D}')

    train_flat = _flatten_packets(arrays['train_packets'], arrays['train_len'])
    print(f'[data] train_flat packets={len(train_flat):,} FM_grace={args.fm_grace} AD_grace={args.ad_grace}')
    if len(train_flat) < args.fm_grace + args.ad_grace:
        raise ValueError(f'Need at least FM+AD={args.fm_grace + args.ad_grace} packets, have {len(train_flat)} (try increasing --n-train-cap).')

    kit = KitNET(n=D, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    train_meta = _train_kitnet(kit, train_flat)
    print(f'[train] {train_meta}')

    t0 = time.time()
    val_aggs = _score_flows(kit, arrays['val_packets'], arrays['val_len'])
    print(f'[score] benign in {time.time() - t0:.1f}s')
    t0 = time.time()
    atk_aggs = _score_flows(kit, arrays['atk_packets'], arrays['atk_len'])
    print(f'[score] attack in {time.time() - t0:.1f}s')

    aggregators = ('mean', 'max', 'median', 'p90')
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    overall = {}
    per_class_by_agg = {}
    for agg in aggregators:
        v = val_aggs[agg]
        a = atk_aggs[agg]
        y = np.r_[np.zeros(len(v)), np.ones(len(a))]
        s = np.r_[v, a]
        overall[agg] = {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}
        per_class_by_agg[agg] = _per_class(v, a, atk_labels)

    out = {'method': 'kitsune_path_b', 'protocol': args.protocol, 'seed': args.seed, 'model_dir': str(model_dir), 'n_train_flows': n_train, 'n_train_packets': int(len(train_flat)), 'n_val': n_val, 'n_atk': n_atk, 'D': int(D), 'fm_grace': args.fm_grace, 'ad_grace': args.ad_grace, 'max_ae_size': args.max_ae_size, 't_train_sec': train_meta['t_train_sec'], 'overall_by_agg': overall, 'per_class_by_agg': per_class_by_agg}
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))

    npz_path = out_json.with_suffix('.npz')
    save = {'a_labels': atk_labels}
    for agg in aggregators:
        save[f'b_{agg}'] = val_aggs[agg].astype(np.float32)
        save[f'a_{agg}'] = atk_aggs[agg].astype(np.float32)
    np.savez_compressed(npz_path, **save)
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')

    best = max(overall, key=lambda k: overall[k]['auroc'])
    print(f"[best agg={best}] AUROC={overall[best]['auroc']:.4f} AUPRC={overall[best]['auprc']:.4f}")
    print()
    print('=== overall AUROC by aggregator ===')
    for k in sorted(overall, key=lambda kk: -overall[kk]['auroc']):
        print(f" {k:<8s} AUROC={overall[k]['auroc']:.4f} AUPRC={overall[k]['auprc']:.4f}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Sweep driver for the Kitsune (KitNET) baseline.
#
# Runs scripts/baselines/run_kitsune.py once per (protocol, seed) pair and
# skips pairs whose result JSON already exists. Every message and all runner
# output is mirrored into $OUT_DIR/master.log.
#
# Overridable through the environment: PROTOCOLS, SEEDS, N_TRAIN_CAP.
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/kitsune_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
: > "$LOG"  # every invocation starts with an empty master log

PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
: "${PROTOCOLS:=$PROTOCOLS_DEFAULT}"
: "${SEEDS:=$SEEDS_DEFAULT}"
: "${N_TRAIN_CAP:=5000}"

# Print a message and append it to the master log.
say() {
  echo "$1" | tee -a "$LOG"
}

# Run one (protocol, seed) pair unless its result file is already present.
run_one() {
  local protocol="$1"
  local seed="$2"
  local out_json="$OUT_DIR/${protocol}_seed${seed}.json"
  local started finished

  if [[ -f "$out_json" ]]; then
    say "[skip] $out_json exists"
    return 0
  fi

  say "=== protocol=$protocol seed=$seed n_train_cap=$N_TRAIN_CAP ==="
  started=$(date +%s)
  # With pipefail + errexit a failing run aborts the whole sweep.
  uv run --no-sync python scripts/baselines/run_kitsune.py \
    --protocol "$protocol" --seed "$seed" \
    --out-dir "$OUT_DIR" \
    --n-train-cap "$N_TRAIN_CAP" \
    2>&1 | tee -a "$LOG"
  finished=$(date +%s)
  say "[done] elapsed=$((finished - started))s $out_json"
}

for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    run_one "$protocol" "$seed"
  done
done
echo "ALL DONE"

View File

@@ -0,0 +1,211 @@
from __future__ import annotations
import argparse
import json
import sys
import time
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
# Compatibility shim: restore the ``np.Inf`` alias when the installed NumPy
# does not provide it (NumPy 2.0 removed it). Presumably needed by the
# Kitsune-py modules imported below — verify.
if not hasattr(np, 'Inf'):
np.Inf = np.inf
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'baselines/Kitsune-py'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
from FeatureExtractor import FE
from KitNET.KitNET import KitNET
from data import load_unified_data
# Glob pattern of the raw pcap files of each dataset, keyed by dataset name.
PCAP_GLOBS = {'iscxtor': str(REPO / 'datasets/iscxtor2016/raw/pcap_extracted/**/*.pcap'), 'cicids2017': str(REPO / 'datasets/cicids2017/raw/pcap/*.pcap'), 'cicddos2019': str(REPO / 'datasets/cicddos2019/raw/pcap/*')}
# Within-dataset protocols: protocol name -> (artifact sub-directory template with a
# {seed} placeholder, dataset name, evaluation caps). The dataset name presumably
# indexes PCAP_GLOBS — verify against main().
WITHIN_DIRS = {'iscxtor_within': ('phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}', 'iscxtor', {'n_val': 10000, 'n_atk': None}), 'cicids_within': ('phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}', 'cicids2017', {'n_val': 10000, 'n_atk': 30000}), 'cicddos_within': ('phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}', 'cicddos2019', {'n_val': 10000, 'n_atk': 20000})}
def _canonical_key(src_ip, dst_ip, src_port, dst_port, protocol) -> tuple:
a = (src_ip, src_port)
b = (dst_ip, dst_port)
if a <= b:
return (a[0], b[0], a[1], b[1], int(protocol))
return (b[0], a[0], b[1], a[1], int(protocol))
def _proto_from_kitsune(srcproto: str, dstproto: str) -> int:
if srcproto == 'icmp':
return 1
if srcproto == 'arp':
return 0
return -1
# FE subclass whose get_next_vector() also remembers the timestamp, the
# (src IP, dst IP, src port, dst port) tuple and the frame length of the packet
# it parsed last, so a caller can attribute the returned vector to a flow.
class FEWithMeta(FE):
def __init__(self, path, limit=np.inf):
super().__init__(path, limit)
# Metadata of the most recently parsed packet; None until one is parsed.
self._last_ts = None
# Despite the name this holds four fields (no protocol number).
self._last_5tuple = None
self._last_framelen = None
# Parse the next TSV row, record its metadata on the instance and return the
# vector produced by self.nstat.updateGetStats. An empty list is returned once
# curPacketIndx reaches the limit, and also when updateGetStats raises.
def get_next_vector(self):
# Packet budget exhausted: close the TSV handle and signal end of stream.
if self.curPacketIndx == self.limit:
if self.parse_type == 'tsv':
self.tsvinf.close()
return []
if self.parse_type == 'tsv':
row = self.tsvin.__next__()
IPtype = np.nan
# row[0] and row[1] are read as the timestamp and the frame length.
timestamp = row[0]
framelen = row[1]
srcIP = ''
dstIP = ''
# Two alternative address column pairs, tagged IPtype 0 and 1
# (presumably IPv4 and IPv6 — verify against the TSV column layout).
if row[4] != '':
(srcIP, dstIP, IPtype) = (row[4], row[5], 0)
elif row[17] != '':
(srcIP, dstIP, IPtype) = (row[17], row[18], 1)
# Source / destination "proto" strings are each the concatenation of two
# columns (presumably the TCP and UDP port columns — TODO confirm).
srcproto = row[6] + row[8]
dstproto = row[7] + row[9]
(srcMAC, dstMAC) = (row[2], row[3])
# No port information: fall back to 'arp' / 'icmp' tags, or to the MAC
# addresses (row[2], row[3]) when every address field is empty.
if srcproto == '':
if row[12] != '':
(srcproto, dstproto) = ('arp', 'arp')
(srcIP, dstIP, IPtype) = (row[14], row[16], 0)
elif row[10] != '':
(srcproto, dstproto, IPtype) = ('icmp', 'icmp', 0)
elif srcIP + srcproto + dstIP + dstproto == '':
(srcIP, dstIP) = (row[2], row[3])
else:
return []
# Numeric ports for the flow key: non-digit tags such as 'arp' / 'icmp'
# (and any failure while converting) map to port 0.
try:
sp = int(srcproto) if srcproto.isdigit() else 0
dp = int(dstproto) if dstproto.isdigit() else 0
except Exception:
(sp, dp) = (0, 0)
# An unparsable timestamp is recorded as NaN.
try:
self._last_ts = float(timestamp)
except Exception:
self._last_ts = np.nan
self._last_5tuple = (srcIP, dstIP, sp, dp)
# An unparsable frame length is recorded as 0.
try:
self._last_framelen = int(framelen)
except Exception:
self._last_framelen = 0
self.curPacketIndx += 1
# NOTE(review): on a netStat error this returns [], the same value used for
# end of stream; _stream_pcap_kitsune stops reading at the first empty vector,
# so one bad packet ends the whole pcap early — confirm this is intended.
try:
return self.nstat.updateGetStats(IPtype, srcMAC, dstMAC, srcIP, srcproto, dstIP, dstproto, int(framelen), float(timestamp))
except Exception as e:
print(f' [warn] netStat error: {e}')
return []
def _stream_pcap_kitsune(pcap_path: Path, *, kit: KitNET, fm_grace: int, ad_grace: int, packet_limit: int, fivetuple_to_rmses: dict, n_packets_total: list) -> None:
    """Stream one pcap through KitNET and collect RMSE scores per flow key.

    Every packet's feature vector is passed to ``kit.process``. Non-zero
    scores are appended to ``fivetuple_to_rmses`` under a
    direction-independent ``(ip_a, ip_b, port_a, port_b)`` key.
    ``n_packets_total`` is a one-element list used as a mutable counter
    shared across pcaps. ``fm_grace`` and ``ad_grace`` are accepted but
    not used here.
    """
    print(f' [stream] {pcap_path.name}', flush=True)
    extractor = FEWithMeta(str(pcap_path), limit=packet_limit)
    started = time.time()
    seen = 0
    while True:
        features = extractor.get_next_vector()
        if len(features) == 0:
            # An empty vector marks the end of the stream.
            break
        seen += 1
        n_packets_total[0] += 1
        rmse = kit.process(features)
        # Skip packets without a usable score or without endpoint metadata.
        if rmse is None or rmse == 0 or extractor._last_5tuple is None:
            continue
        src_ip, dst_ip, sport, dport = extractor._last_5tuple
        if (src_ip, sport) <= (dst_ip, dport):
            flow_key = (src_ip, dst_ip, sport, dport)
        else:
            flow_key = (dst_ip, src_ip, dport, sport)
        fivetuple_to_rmses[flow_key].append(rmse)
        if seen % 200000 == 0:
            elapsed = time.time() - started
            print(f' [{seen:,}] elapsed {elapsed:.0f}s ({seen / max(elapsed, 0.001):.0f} pkt/s)', flush=True)
    print(f' [stream] {pcap_path.name} done: {seen:,} packets in {time.time() - started:.0f}s', flush=True)
def _flows_to_key(flows_df: pd.DataFrame) -> np.ndarray:
keys = []
for (src_ip, dst_ip, sp, dp) in zip(flows_df['src_ip'], flows_df['dst_ip'], flows_df['src_port'], flows_df['dst_port']):
if (str(src_ip), int(sp)) <= (str(dst_ip), int(dp)):
k = (str(src_ip), str(dst_ip), int(sp), int(dp))
else:
k = (str(dst_ip), str(src_ip), int(dp), int(sp))
keys.append(k)
return np.asarray(keys, dtype=object)
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def main():
    """Run the Kitsune baseline for one protocol/seed and save flow scores.

    Steps: parse CLI arguments, load the model config and dataset split,
    stream every discovered pcap through a single KitNET instance, aggregate
    the per-packet RMSE of each endpoint key into per-flow mean/max/median
    scores, compute AUROC/AUPRC over flows that received a score, and write
    a JSON summary plus a compressed ``.npz`` with the raw scores.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS))
    parser.add_argument('--seed', type=int, required=True)
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--fm-grace', type=int, default=5000)
    parser.add_argument('--ad-grace', type=int, default=50000)
    parser.add_argument('--max-ae-size', type=int, default=10)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--hidden-ratio', type=float, default=0.75)
    parser.add_argument('--packet-limit-per-pcap', type=int, default=2000000, help='Cap per-pcap packets to keep runtime tractable. None = full.')
    parser.add_argument('--max-pcaps', type=int, default=None, help='Cap number of pcap files processed (default: all).')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    template, ds_name, caps = WITHIN_DIRS[args.protocol]
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] kitsune_path_a protocol={args.protocol} seed={args.seed}')
    print(f'[run] dataset={ds_name} model_dir={model_dir}')

    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        # Config entries that are absent or empty mean "not provided".
        return Path(cfg[key]) if cfg.get(key) else None

    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else caps['n_atk'],
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else caps['n_val'],
    )
    flows_full = pd.read_parquet(cfg['flows_parquet'])
    n_flows = len(flows_full)
    print(f'[data] flows.parquet rows: {n_flows:,}; val={len(data.val_flow):,} attack={len(data.attack_flow):,}')

    # Discover the pcaps for this dataset, optionally truncated by --max-pcaps.
    from glob import glob
    pcaps = [Path(found) for found in sorted(glob(PCAP_GLOBS[ds_name], recursive=True))]
    if args.max_pcaps is not None:
        pcaps = pcaps[:args.max_pcaps]
    print(f'[pcap] discovered {len(pcaps)} pcap(s)')
    for shown in pcaps[:5]:
        print(f' {shown}')
    if len(pcaps) > 5:
        print(f' ...({len(pcaps) - 5} more)')

    # One KitNET instance is shared across all pcaps.
    kit = KitNET(n=100, max_autoencoder_size=args.max_ae_size, FM_grace_period=args.fm_grace, AD_grace_period=args.ad_grace, learning_rate=args.lr, hidden_ratio=args.hidden_ratio)
    fivetuple_to_rmses: dict = defaultdict(list)
    n_total = [0]
    stream_start = time.time()
    for pcap in pcaps:
        _stream_pcap_kitsune(pcap, kit=kit, fm_grace=args.fm_grace, ad_grace=args.ad_grace, packet_limit=args.packet_limit_per_pcap, fivetuple_to_rmses=fivetuple_to_rmses, n_packets_total=n_total)
    elapsed = time.time() - stream_start
    print(f'[stream] total {n_total[0]:,} packets in {elapsed:.0f}s ({n_total[0] / max(elapsed, 0.001):.0f} pkt/s)')
    print(f'[stream] unique 5-tuples seen: {len(fivetuple_to_rmses):,}')

    # Attach packet scores to flows via the shared endpoint key; flows whose
    # key never appeared in the stream keep NaN.
    keys_full = _flows_to_key(flows_full)
    print(f'[match] keying {len(keys_full):,} flows to 5-tuples')
    aggregators = {'mean': np.mean, 'max': np.max, 'median': np.median}
    score_arrs = {name: np.full(n_flows, np.nan, dtype=np.float64) for name in aggregators}
    n_matched = 0
    for row, key in enumerate(keys_full):
        rmses = fivetuple_to_rmses.get(tuple(key))
        if not rmses:
            continue
        for name, aggregate in aggregators.items():
            score_arrs[name][row] = float(aggregate(rmses))
        n_matched += 1
    print(f'[match] flows with RMSE coverage: {n_matched:,}/{n_flows:,} ({100 * n_matched / max(n_flows, 1):.1f}%)')

    # Computed as in the original script; not consumed further below.
    val_flow_ids = set((int(x) for x in data.val_flow_ids)) if hasattr(data, 'val_flow_ids') else None
    bin_labels = (flows_full['label'].astype(str) != cfg.get('benign_label', 'normal')).astype(int).to_numpy()

    overall = {}
    for name, scores in score_arrs.items():
        valid = ~np.isnan(scores)
        n_valid = int(valid.sum())
        if n_valid < 10:
            # Too few scored flows for a meaningful metric.
            overall[name] = {'auroc': float('nan'), 'auprc': float('nan'), 'n_valid': n_valid}
            continue
        y = bin_labels[valid]
        sv = scores[valid]
        overall[name] = {'auroc': _safe_metric(roc_auc_score, y, sv), 'auprc': _safe_metric(average_precision_score, y, sv), 'n_valid': n_valid}
        print(f" [{name}] AUROC={overall[name]['auroc']:.4f} AUPRC={overall[name]['auprc']:.4f} (n_valid={overall[name]['n_valid']:,})")

    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out = {
        'method': 'kitsune_path_a',
        'protocol': args.protocol,
        'seed': args.seed,
        'dataset': ds_name,
        'n_pcaps': len(pcaps),
        'n_total_packets': int(n_total[0]),
        'n_unique_5tuples': int(len(fivetuple_to_rmses)),
        'n_flows_total': int(n_flows),
        'n_flows_matched': int(n_matched),
        'fm_grace': args.fm_grace,
        'ad_grace': args.ad_grace,
        'packet_limit_per_pcap': args.packet_limit_per_pcap,
        'elapsed_sec': round(elapsed, 1),
        'overall_by_agg': overall,
    }
    out_json.write_text(json.dumps(out, indent=2))
    np.savez_compressed(
        out_json.with_suffix('.npz'),
        flow_score_mean=score_arrs['mean'].astype(np.float32),
        flow_score_max=score_arrs['max'].astype(np.float32),
        flow_score_median=score_arrs['median'].astype(np.float32),
        binary_label=bin_labels.astype(np.int8),
    )
    print(f'[saved] {out_json}')


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,227 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import torch
import yaml
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
import optax
from pzflow import Flow
from sklearn.metrics import average_precision_score, roc_auc_score
REPO = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
sys.path.insert(0, str(REPO / 'Unified_CFM'))
from data import _apply_mixed_dequant, _zscore, load_unified_data
from model import UnifiedCFMConfig, UnifiedTokenCFM
from packet_store import PacketShardStore
# Within-dataset protocols: (model-dir template resolved under artifacts/,
# fallback caps for the validation and attack sample sizes).
WITHIN_DIRS = {
    'iscxtor_within': (
        'phase25_multiseed_2026_04_25/iscxtor2016_lambda0p3_seed{seed}',
        dict(n_val=10000, n_atk=None),
    ),
    'cicids_within': (
        'phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        dict(n_val=10000, n_atk=30000),
    ),
    'cicddos_within': (
        'phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        dict(n_val=10000, n_atk=20000),
    ),
    'ciciot_within': (
        'runs/unified_cfm_ciciot2023_shafir5_2026_04_29',
        dict(n_val=10000, n_atk=30000),
    ),
}

# Cross-dataset protocols: the source model whose normalisation statistics
# are reused, the target dataset files (relative to the repository root) and
# the number of benign/attack target flows to sample.
CROSS_DIRS = {
    'forward_cross': dict(
        model_template='phase25_sigma06_multiseed_2026_04_25/cicids2017_lambda0p3_sigma0p6_seed{seed}',
        target_store='datasets/cicddos2019/processed/full_store',
        target_flows='datasets/cicddos2019/processed/flows.parquet',
        target_flow_features='datasets/cicddos2019/processed/flow_features.parquet',
        n_benign=10000,
        n_attack=10000,
    ),
    'reverse_cross': dict(
        model_template='phase25_multiseed_2026_04_25/cicddos2019_lambda0p3_seed{seed}',
        target_store='datasets/cicids2017/processed/full_store',
        target_flows='datasets/cicids2017/processed/flows.parquet',
        target_flow_features='datasets/cicids2017/processed/flow_features.parquet',
        n_benign=10000,
        n_attack=10000,
    ),
}
def _load_within(model_dir: Path, n_val: int | None, n_atk: int | None, n_train_cap: int, seed: int) -> dict[str, Any]:
    """Load the within-dataset split of a trained model and subsample it.

    The data is loaded with the settings stored in ``model_dir/config.yaml``.
    The train, validation and attack flow arrays are then capped at
    ``n_train_cap``, ``n_val`` and ``n_atk`` rows by sampling without
    replacement from one generator seeded with ``seed``; ``None`` disables
    the validation or attack cap. Sampled rows keep their original order.

    Returns a dict with ``train_flow``, ``val_flow``, ``atk_flow`` and
    ``atk_labels``.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())

    def _optional_path(key):
        return Path(cfg[key]) if cfg.get(key) else None

    data = load_unified_data(
        packets_npz=_optional_path('packets_npz'),
        source_store=_optional_path('source_store'),
        flows_parquet=Path(cfg['flows_parquet']),
        flow_features_path=_optional_path('flow_features_path'),
        flow_feature_columns=cfg.get('flow_feature_columns'),
        flow_features_align=str(cfg.get('flow_features_align', 'auto')),
        T=int(cfg['T']),
        split_seed=int(cfg.get('data_seed', cfg.get('seed', 42))),
        train_ratio=float(cfg.get('train_ratio', 0.8)),
        benign_label=str(cfg.get('benign_label', 'normal')),
        min_len=int(cfg.get('min_len', 2)),
        packet_preprocess=str(cfg.get('packet_preprocess', 'mixed_dequant')),
        attack_cap=int(cfg['attack_cap']) if cfg.get('attack_cap') else n_atk,
        val_cap=int(cfg['val_cap']) if cfg.get('val_cap') else n_val,
    )

    rng = np.random.default_rng(seed)

    def _draw(n_rows, cap):
        # Sorted indices of a random subset, or None when no cut is needed.
        if n_rows > cap:
            return np.sort(rng.choice(n_rows, size=cap, replace=False))
        return None

    # The generator is consumed in a fixed order: train, then val, then attack.
    train_flow = data.train_flow
    picked = _draw(len(train_flow), n_train_cap)
    if picked is not None:
        train_flow = train_flow[picked]

    val_flow = data.val_flow
    if n_val is not None:
        picked = _draw(len(val_flow), n_val)
        if picked is not None:
            val_flow = val_flow[picked]

    atk_flow, atk_labels = data.attack_flow, data.attack_labels
    if n_atk is not None:
        picked = _draw(len(atk_flow), n_atk)
        if picked is not None:
            atk_flow = atk_flow[picked]
            atk_labels = atk_labels[picked]

    return {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels}
def _load_cross(spec: dict[str, Any], ckpt_dict: dict[str, Any], seed: int, T: int, n_train_cap: int, *, src_flows_path: str | Path | None=None, src_flow_features_path: str | Path | None=None) -> dict[str, Any]:
    """Build train/val/attack flow-feature arrays for a cross-dataset run.

    Training rows are benign flows of the *source* dataset; validation and
    attack rows are sampled from the *target* dataset described by ``spec``.
    All rows are standardised with the ``flow_mean``/``flow_std`` stored in
    ``ckpt_dict`` and restricted to its ``flow_feature_names``.

    Args:
        spec: Entry with ``target_flows``, ``target_flow_features``,
            ``n_benign`` and ``n_attack`` (paths relative to ``REPO``).
        ckpt_dict: Checkpoint providing ``flow_mean``, ``flow_std`` and
            ``flow_feature_names``.
        seed: Seed for target sampling; ``seed + 1000`` seeds source sampling.
        T: Accepted for signature compatibility; not used.
        n_train_cap: Maximum number of benign source rows used for training.
        src_flows_path: Source ``flows`` parquet, joined onto ``REPO``.
        src_flow_features_path: Source flow-features parquet, joined onto
            ``REPO``.

    Raises:
        NotImplementedError: If a source path is omitted, because the
            fallback ``ckpt_dict_paths`` cannot derive it.
        ValueError: If flows and flow features are not row-aligned, or the
            target contains no attack flows.
    """
    # Resolve the source paths before any parquet is read, so that a missing
    # path fails immediately instead of after the expensive target loading.
    if src_flows_path is None or src_flow_features_path is None:
        fallback = ckpt_dict_paths(ckpt_dict)
        if src_flows_path is None:
            src_flows_path = fallback['flows']
        if src_flow_features_path is None:
            src_flow_features_path = fallback['flow_features']

    flow_mean = np.asarray(ckpt_dict['flow_mean'], dtype=np.float32)
    flow_std = np.asarray(ckpt_dict['flow_std'], dtype=np.float32)
    flow_names = [str(n) for n in ckpt_dict['flow_feature_names']]

    def _standardize(features, idx):
        # Select rows/columns, zero out non-finite values, then z-score with
        # the checkpoint statistics (std floored to avoid division by zero).
        f = features.iloc[idx][flow_names].to_numpy(dtype=np.float64)
        f = np.nan_to_num(f, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        return ((f - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)

    # ---- target dataset: validation (benign) and attack samples ----
    n_benign, n_attack = int(spec['n_benign']), int(spec['n_attack'])
    flows = pd.read_parquet(REPO / spec['target_flows'], columns=['flow_id', 'label'])
    ff = pd.read_parquet(REPO / spec['target_flow_features'])
    if not np.array_equal(flows['flow_id'].to_numpy(dtype=np.uint64), ff['flow_id'].to_numpy(dtype=np.uint64)):
        raise ValueError('target flows and flow_features not row-aligned')
    labels = flows['label'].astype(str).to_numpy()

    rng = np.random.default_rng(seed)
    benign_idx = np.flatnonzero(labels == 'normal')
    attack_idx = np.flatnonzero(labels != 'normal')
    b_sel = np.sort(rng.choice(benign_idx, size=n_benign, replace=False))

    atk_classes = sorted(set(labels[attack_idx]))
    if not atk_classes:
        raise ValueError('target dataset has no attack flows to sample')
    # Spread the attack budget evenly over classes, at least one row each.
    per_class = max(1, n_attack // len(atk_classes))
    a_sel_chunks = []
    for cls in atk_classes:
        pool = attack_idx[labels[attack_idx] == cls]
        k = min(per_class, len(pool))
        if k:
            a_sel_chunks.append(rng.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(a_sel_chunks))
    if len(a_sel) > n_attack:
        a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))

    val_flow = _standardize(ff, b_sel)
    atk_flow = _standardize(ff, a_sel)
    atk_labels = labels[a_sel]

    # ---- source dataset: benign training rows ----
    src_flows = pd.read_parquet(REPO / src_flows_path, columns=['flow_id', 'label'])
    src_ff = pd.read_parquet(REPO / src_flow_features_path)
    if not np.array_equal(src_flows['flow_id'].to_numpy(dtype=np.uint64), src_ff['flow_id'].to_numpy(dtype=np.uint64)):
        raise ValueError('source flows and flow_features not row-aligned')
    src_labels = src_flows['label'].astype(str).to_numpy()
    src_benign_idx = np.flatnonzero(src_labels == 'normal')
    rng2 = np.random.default_rng(seed + 1000)
    if len(src_benign_idx) > n_train_cap:
        src_benign_idx = np.sort(rng2.choice(src_benign_idx, size=n_train_cap, replace=False))
    train_flow = _standardize(src_ff, src_benign_idx)

    return {'train_flow': train_flow, 'val_flow': val_flow, 'atk_flow': atk_flow, 'atk_labels': atk_labels, 'flow_names': flow_names}
def ckpt_dict_paths(ckpt: dict[str, Any]) -> dict[str, str]:
    """Placeholder for deriving dataset paths from a checkpoint.

    Always raises ``NotImplementedError``: the checkpoint is not used to
    locate data files, callers have to supply the paths themselves.
    """
    message = 'paths must be passed via main()'
    raise NotImplementedError(message)
def _train_and_score(train_flow: np.ndarray, val_flow: np.ndarray, atk_flow: np.ndarray, *, epochs: int, lr: float, optimizer: str, verbose: bool):
cols = [f'x{i}' for i in range(train_flow.shape[1])]
df_train = pd.DataFrame(train_flow.astype(np.float32), columns=cols)
df_val = pd.DataFrame(val_flow.astype(np.float32), columns=cols)
df_atk = pd.DataFrame(atk_flow.astype(np.float32), columns=cols)
if optimizer == 'sgd':
opt = optax.sgd(learning_rate=lr)
elif optimizer == 'adam':
opt = optax.adam(learning_rate=lr)
else:
raise ValueError(f'unknown optimizer {optimizer!r}')
flow = Flow(df_train.columns.tolist())
t0 = time.time()
losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=verbose)
t_train = time.time() - t0
t0 = time.time()
lp_val = np.asarray(flow.log_prob(df_val))
lp_atk = np.asarray(flow.log_prob(df_atk))
t_score = time.time() - t0
return {'score_val': (-lp_val).astype(np.float32), 'score_atk': (-lp_atk).astype(np.float32), 'losses': np.asarray(losses, dtype=np.float64), 't_train': t_train, 't_score': t_score}
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def _per_class(val_score: np.ndarray, atk_score: np.ndarray, atk_labels: np.ndarray):
out = {}
for cls in sorted(set(atk_labels)):
m = atk_labels == cls
n_c = int(m.sum())
v_c = atk_score[m]
y = np.r_[np.zeros(len(val_score)), np.ones(len(v_c))]
s = np.r_[val_score, v_c]
out[cls] = {'_n': float(n_c), 'auroc': _safe_metric(roc_auc_score, y, s)}
return out
def _cross_arrays(spec: dict[str, Any], model_dir: Path, ckpt: dict[str, Any], seed: int, n_train_cap: int) -> dict[str, Any]:
    """Assemble train/val/attack arrays for a cross-dataset protocol.

    Training rows are benign flows of the source dataset named in
    ``model_dir/config.yaml``; validation (benign) and attack rows come from
    the target dataset in ``spec``. Every row is standardised with the
    normalisation statistics stored in the source checkpoint ``ckpt``.

    Raises:
        ValueError: If flows and flow features of either dataset are not
            row-aligned, or the target has no attack flows.
    """
    cfg = yaml.safe_load((model_dir / 'config.yaml').read_text())
    flows_parquet = Path(cfg['flows_parquet'])
    flow_features_path = Path(cfg['flow_features_path'])
    flow_mean = np.asarray(ckpt['flow_mean'], dtype=np.float32)
    flow_std = np.asarray(ckpt['flow_std'], dtype=np.float32)
    flow_names = [str(n) for n in ckpt['flow_feature_names']]

    def _standardize(features, idx):
        # Zero out non-finite values, then z-score with the checkpoint stats
        # (std floored to avoid division by zero).
        f = features.iloc[idx][flow_names].to_numpy(dtype=np.float64)
        f = np.nan_to_num(f, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        return ((f - flow_mean) / np.maximum(flow_std, 1e-06)).astype(np.float32)

    def _require_aligned(flow_frame, feature_frame, which):
        # Labels are taken from one frame and features from the other by
        # position, so both must list the same flow_id on every row.
        same = np.array_equal(flow_frame['flow_id'].to_numpy(dtype=np.uint64), feature_frame['flow_id'].to_numpy(dtype=np.uint64))
        if not same:
            raise ValueError(f'{which} flows and flow_features not row-aligned')

    # ---- source: benign training rows ----
    src_flows = pd.read_parquet(flows_parquet, columns=['flow_id', 'label'])
    src_ff = pd.read_parquet(flow_features_path)
    _require_aligned(src_flows, src_ff, 'source')
    src_labels = src_flows['label'].astype(str).to_numpy()
    src_benign_idx = np.flatnonzero(src_labels == 'normal')
    rng2 = np.random.default_rng(seed + 1000)
    if len(src_benign_idx) > n_train_cap:
        src_benign_idx = np.sort(rng2.choice(src_benign_idx, size=n_train_cap, replace=False))
    train_flow = _standardize(src_ff, src_benign_idx)

    # ---- target: benign validation rows and class-balanced attack rows ----
    n_benign, n_attack = int(spec['n_benign']), int(spec['n_attack'])
    flows = pd.read_parquet(REPO / spec['target_flows'], columns=['flow_id', 'label'])
    ff = pd.read_parquet(REPO / spec['target_flow_features'])
    # Without this check a mis-ordered target file would silently pair
    # features with the wrong labels.
    _require_aligned(flows, ff, 'target')
    labels = flows['label'].astype(str).to_numpy()
    rng = np.random.default_rng(seed)
    b_sel = np.sort(rng.choice(np.flatnonzero(labels == 'normal'), size=n_benign, replace=False))
    atk_idx = np.flatnonzero(labels != 'normal')
    atk_classes = sorted(set(labels[atk_idx]))
    if not atk_classes:
        raise ValueError('target dataset has no attack flows to sample')
    per_class_n = max(1, n_attack // len(atk_classes))
    chunks = []
    for cls in atk_classes:
        pool = atk_idx[labels[atk_idx] == cls]
        k = min(per_class_n, len(pool))
        if k:
            chunks.append(rng.choice(pool, size=k, replace=False))
    a_sel = np.sort(np.concatenate(chunks))
    if len(a_sel) > n_attack:
        a_sel = np.sort(rng.choice(a_sel, size=n_attack, replace=False))

    return {
        'train_flow': train_flow,
        'val_flow': _standardize(ff, b_sel),
        'atk_flow': _standardize(ff, a_sel),
        'atk_labels': labels[a_sel],
    }


def main() -> None:
    """Train and evaluate the normalizing-flow baseline for one protocol/seed.

    Loads the data for a within- or cross-dataset protocol, fits the flow on
    benign training rows, scores validation and attack rows by negative
    log-probability, and writes overall/per-class metrics to a JSON file and
    the raw scores to a compressed ``.npz`` in ``--out-dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(WITHIN_DIRS) + list(CROSS_DIRS))
    parser.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--n-train-cap', type=int, default=10000, help='Cap benign train (default 10k mirrors Shafir).')
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
    parser.add_argument('--T', type=int, default=64)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    is_within = args.protocol in WITHIN_DIRS
    if is_within:
        template, caps = WITHIN_DIRS[args.protocol]
    else:
        spec = CROSS_DIRS[args.protocol]
        template = spec['model_template']
    model_dir = REPO / 'artifacts' / template.format(seed=args.seed)
    print(f'[run] shafir_nf protocol={args.protocol} seed={args.seed}')
    print(f'[run] using normalization stats from {model_dir}/model.pt (source ckpt)')

    # NOTE(security): weights_only=False unpickles arbitrary objects; only
    # load checkpoints that come from a trusted source.
    ckpt = torch.load(model_dir / 'model.pt', map_location='cpu', weights_only=False)

    if is_within:
        arrays = _load_within(model_dir, n_val=caps['n_val'], n_atk=caps['n_atk'], n_train_cap=args.n_train_cap, seed=args.seed)
    else:
        arrays = _cross_arrays(spec, model_dir, ckpt, args.seed, args.n_train_cap)
    print(f"[data] train={len(arrays['train_flow']):,} val={len(arrays['val_flow']):,} attack={len(arrays['atk_flow']):,} D={arrays['train_flow'].shape[1]}")

    res = _train_and_score(arrays['train_flow'], arrays['val_flow'], arrays['atk_flow'], epochs=args.epochs, lr=args.lr, optimizer=args.optimizer, verbose=args.verbose)
    val_score, atk_score = res['score_val'], res['score_atk']

    # Benign validation rows are the negative class, attack rows the positive.
    y = np.r_[np.zeros(len(val_score)), np.ones(len(atk_score))]
    s = np.r_[val_score, atk_score]
    overall = {'neg_log_prob': {'auroc': _safe_metric(roc_auc_score, y, s), 'auprc': _safe_metric(average_precision_score, y, s)}}
    atk_labels = np.asarray(arrays['atk_labels']).astype(str)
    per_cls = _per_class(val_score, atk_score, atk_labels)

    out = {
        'method': 'shafir_nf',
        'protocol': args.protocol,
        'seed': args.seed,
        'model_dir': str(model_dir),
        'n_train': int(len(arrays['train_flow'])),
        'n_val': int(len(arrays['val_flow'])),
        'n_atk': int(len(arrays['atk_flow'])),
        'epochs': args.epochs,
        'lr': args.lr,
        'optimizer': args.optimizer,
        't_train_sec': round(res['t_train'], 2),
        't_score_sec': round(res['t_score'], 2),
        'loss_first_last': [float(res['losses'][0]), float(res['losses'][-1])],
        'overall': overall,
        'per_class': per_cls,
    }
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))
    npz_path = out_json.with_suffix('.npz')
    np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=atk_labels, losses=res['losses'])
    print(f'[saved] {out_json}')
    print(f'[saved] {npz_path}')
    print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s score={res['t_score']:.1f}s")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Run the Shafir normalizing-flow baseline for every (protocol, seed) pair.
#
# PROTOCOLS, SEEDS, EPOCHS, LR and OPTIMIZER can be overridden through the
# environment. A pair whose result JSON already exists is skipped. All
# output is mirrored into a master log that is emptied at start-up.
set -euo pipefail

REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"

OUT_DIR="artifacts/baselines/shafir_nf_2026_04_29"
LOG="$OUT_DIR/master.log"
mkdir -p "$OUT_DIR"
: > "$LOG"

PROTOCOLS="${PROTOCOLS:-iscxtor_within cicids_within cicddos_within forward_cross reverse_cross}"
SEEDS="${SEEDS:-42 43 44}"
EPOCHS="${EPOCHS:-100}"
LR="${LR:-0.001}"
OPTIMIZER="${OPTIMIZER:-sgd}"

# Print a message and append it to the master log.
log() {
  echo "$1" | tee -a "$LOG"
}

# PROTOCOLS and SEEDS are intentionally unquoted: they are space-separated lists.
for protocol in $PROTOCOLS; do
  for seed in $SEEDS; do
    out_json="$OUT_DIR/${protocol}_seed${seed}.json"
    if [[ -f "$out_json" ]]; then
      log "[skip] $out_json exists"
      continue
    fi
    log "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ==="
    started=$(date +%s)
    uv run --no-sync python scripts/baselines/run_shafir_nf.py \
      --protocol "$protocol" --seed "$seed" \
      --out-dir "$OUT_DIR" \
      --epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER" \
      2>&1 | tee -a "$LOG"
    finished=$(date +%s)
    log "[done] elapsed=$((finished - started))s $out_json"
  done
done
echo "ALL DONE"

View File

@@ -0,0 +1,265 @@
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import warnings
from glob import glob
from pathlib import Path
import numpy as np
import pandas as pd
os.environ.setdefault('JAX_PLATFORMS', 'cpu')
warnings.filterwarnings('ignore')
import optax
from pzflow import Flow
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
REPO = Path(__file__).resolve().parents[2]
# Full CIC-IDS2017-style flow feature names. Not referenced by DATASETS
# below, which selects the smaller *_BEST* / CICIOT5 subsets.
IDS2017_FEATURES = ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
# Full ISCX-Tor-2016-style feature names; likewise not referenced by DATASETS.
TOR2016_FEATURES = ['Protocol', 'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']
# Reduced feature subsets that DATASETS actually assigns per dataset.
CICIOT5_FEATURES = ['HTTPS', 'Protocol Type', 'Magnitude', 'Variance', 'fin_count']
CICIDS_BEST5_FEATURES = ['Bwd Packet Length Mean', 'Fwd Packets/s', 'ACK Flag Count', 'Total Length of Bwd Packets', 'Flow Duration']
TOR_BEST4_FEATURES = ['Flow IAT Std', 'Flow Bytes/s', 'Flow Packets/s', 'Bwd IAT Max']
# Alternative CSV header spellings tried when a canonical feature name is not
# present in a file (consulted by _resolve_columns).
COLUMN_ALIASES = {'Total Fwd Packets': ['Total Fwd Packet'], 'Total Backward Packets': ['Total Bwd packets'], 'Total Length of Fwd Packets': ['Total Length of Fwd Packet'], 'Total Length of Bwd Packets': ['Total Length of Bwd Packet'], 'Fwd Header Length': ['Fwd Header Length.1'], 'Init_Win_bytes_forward': ['FWD Init Win Bytes', 'Init Win Bytes Fwd'], 'Init_Win_bytes_backward': ['Bwd Init Win Bytes', 'Init Win Bytes Bwd'], 'act_data_pkt_fwd': ['Fwd Act Data Pkts'], 'min_seg_size_forward': ['Fwd Seg Size Min'], 'Avg Fwd Segment Size': ['Fwd Segment Size Avg'], 'Avg Bwd Segment Size': ['Bwd Segment Size Avg'], 'Min Packet Length': ['Packet Length Min'], 'Max Packet Length': ['Packet Length Max']}
# Per-dataset loading settings: CSV glob, label column, the label values that
# count as benign, label substrings whose rows are dropped, and the feature
# subset. ciciot2023 has no label column (label_col is None); its labels come
# from the CSV's parent folder name, with 'benign_folder' naming the benign one.
DATASETS = {'iscxtor': {'csv_glob': str(REPO / 'datasets/iscxtor2016/raw/csv/Scenario-*-merged_5s.csv'), 'label_col': 'label', 'benign_values': ['nonTOR'], 'drop_patterns': [], 'feature_set': TOR_BEST4_FEATURES}, 'cicids2017': {'csv_glob': str(REPO / 'datasets/cicids2017/raw/csv/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [' - Attempted', '- Attempted'], 'feature_set': CICIDS_BEST5_FEATURES}, 'cicddos2019': {'csv_glob': str(REPO / 'datasets/cicddos2019/raw/csv/**/*.csv'), 'label_col': 'Label', 'benign_values': ['BENIGN', 'Benign', 'benign'], 'drop_patterns': [], 'feature_set': CICIDS_BEST5_FEATURES}, 'ciciot2023': {'csv_glob': str(REPO / 'datasets/ciciot2023/raw/csv/CSV/*/*.pcap.csv'), 'label_col': None, 'benign_folder': 'Benign_Final', 'drop_patterns': [], 'feature_set': CICIOT5_FEATURES}}
# Protocol name -> (dataset, dataset, sample caps). NOTE(review): the two
# dataset entries are presumably (source/train, target/eval); they differ only
# for the *_cross protocols -- confirm against main().
PROTOCOL_CONFIG = {'iscxtor_within': ('iscxtor', 'iscxtor', {'n_train': 10000, 'n_val': 10000, 'n_attack': None}), 'cicids_within': ('cicids2017', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'cicddos_within': ('cicddos2019', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 20000}), 'ciciot_within': ('ciciot2023', 'ciciot2023', {'n_train': 10000, 'n_val': 10000, 'n_attack': 30000}), 'forward_cross': ('cicids2017', 'cicddos2019', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000}), 'reverse_cross': ('cicddos2019', 'cicids2017', {'n_train': 10000, 'n_val': 10000, 'n_attack': 10000})}
def _resolve_columns(df: pd.DataFrame, names: list[str]) -> tuple[list[str], list[str]]:
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
(resolved, missing) = ([], [])
for n in names:
if n in df.columns:
resolved.append(n)
continue
found = None
for alias in COLUMN_ALIASES.get(n, []):
if alias in df.columns:
found = alias
break
if found is None:
low = {c.lower(): c for c in df.columns}
if n.lower() in low:
found = low[n.lower()]
if found is None:
missing.append(n)
else:
resolved.append(found)
return (resolved, missing)
def _load_csvs(dataset_name: str, return_paths: bool=False):
    """Return the sorted CSV paths matching the dataset's glob pattern.

    ``return_paths`` is accepted for compatibility; the list of paths is
    returned regardless of its value.

    Raises:
        FileNotFoundError: If the glob matches no file.
    """
    pattern = DATASETS[dataset_name]['csv_glob']
    paths = sorted(glob(pattern, recursive=True))
    if not paths:
        raise FileNotFoundError(f"no CSVs match {pattern}")
    print(f' [csv] {dataset_name}: {len(paths)} files')
    return paths
def _attach_labels(df: pd.DataFrame, dataset_name: str, source_path: str | None=None) -> pd.DataFrame:
    """Return a copy of ``df`` with ``cls_label`` and ``binary_label`` columns.

    For datasets without a label column the class is the name of the CSV's
    parent folder and the row is benign (0) only if that folder equals the
    configured ``benign_folder``. Otherwise the class is the stripped value
    of the label column, rows whose class contains any configured drop
    pattern are removed, and ``binary_label`` is 0 for configured benign
    values and 1 for everything else.

    Raises:
        KeyError: If the configured label column is not present.
    """
    cfg = DATASETS[dataset_name]

    if cfg.get('label_col') is None:
        # Folder-labelled dataset: one class per directory of CSV files.
        folder = Path(source_path).parent.name
        labelled = df.copy()
        labelled['cls_label'] = folder
        labelled['binary_label'] = 0 if folder == cfg['benign_folder'] else 1
        return labelled

    lbl_col = cfg['label_col'].strip()
    # Header names may carry stray whitespace, so compare stripped names.
    match = next((col for col in df.columns if isinstance(col, str) and col.strip() == lbl_col), None)
    if match is None:
        raise KeyError(f'label column {lbl_col!r} not found in {source_path}')

    labelled = df.copy()
    labelled['cls_label'] = labelled[match].astype(str).str.strip()
    for pat in cfg['drop_patterns']:
        unwanted = labelled['cls_label'].str.contains(pat, na=False, regex=False)
        labelled = labelled[~unwanted]
    labelled['binary_label'] = labelled['cls_label'].apply(lambda x: 0 if x in cfg['benign_values'] else 1)
    return labelled
def _load_dataset(dataset_name: str, feature_set: list[str]) -> tuple[pd.DataFrame, list[str]]:
    """Load, label and clean every CSV of a dataset.

    Each CSV is read, labelled, and reduced to the requested features
    (renamed to their canonical names) plus ``binary_label`` and
    ``cls_label``. Unreadable files are skipped with a warning. Features
    are coerced to numeric, and rows with a missing or infinite feature
    value are dropped.

    Returns:
        ``(full, feat_cols)``: the concatenated frame and the canonical
        feature names that are present in it.

    Raises:
        RuntimeError: If no CSV could be read.
    """
    paths = _load_csvs(dataset_name)
    frames = []
    for path in paths:
        try:
            df = pd.read_csv(path, low_memory=False)
        except Exception as e:
            # Best effort: a single unreadable file must not abort the load.
            print(f' [csv-warn] skip {path}: {e}')
            continue
        df = _attach_labels(df, dataset_name, source_path=path)
        resolved, missing = _resolve_columns(df, feature_set)
        if missing:
            # Warn once per (dataset, missing-set) rather than once per file.
            if not hasattr(_load_dataset, '_warned'):
                _load_dataset._warned = set()
            key = (dataset_name, tuple(missing))
            if key not in _load_dataset._warned:
                _load_dataset._warned.add(key)
                print(f' [warn] {Path(path).name}: missing {missing}')
        sub = df[resolved + ['binary_label', 'cls_label']].copy()
        # ``resolved`` follows the order of the non-missing requested names,
        # so zipping the two restores the canonical column names.
        canonical = [f for f in feature_set if f not in missing]
        sub = sub.rename(columns=dict(zip(resolved, canonical)))
        frames.append(sub)
    if not frames:
        raise RuntimeError(f'no usable CSVs for {dataset_name}')

    full = pd.concat(frames, axis=0, ignore_index=True)
    feat_cols = [c for c in feature_set if c in full.columns]
    for c in feat_cols:
        full[c] = pd.to_numeric(full[c], errors='coerce')
    full = full.replace([np.inf, -np.inf], np.nan)
    full = full.dropna(subset=feat_cols).reset_index(drop=True)
    print(f' [csv] {dataset_name} concat: {len(full):,} rows benign={int((full.binary_label == 0).sum()):,} attack={int((full.binary_label == 1).sum()):,} features_kept={len(feat_cols)}')
    return (full, feat_cols)
def _sample_within(df: pd.DataFrame, caps: dict, seed: int):
rng = np.random.default_rng(seed)
benign = df[df.binary_label == 0]
attack = df[df.binary_label == 1]
n_train = caps['n_train']
n_val = caps['n_val']
n_atk = caps['n_attack']
needed_b = n_train + n_val
if len(benign) < needed_b:
raise RuntimeError(f'only {len(benign)} benign rows, need {needed_b}')
b_idx = rng.permutation(len(benign))
train = benign.iloc[b_idx[:n_train]]
val = benign.iloc[b_idx[n_train:n_train + n_val]]
if n_atk is None:
atk = attack
else:
atk_classes = sorted(attack['cls_label'].unique())
per = max(1, n_atk // len(atk_classes))
chunks = []
for cls in atk_classes:
pool = attack[attack['cls_label'] == cls]
k = min(per, len(pool))
if k:
chunks.append(pool.sample(n=k, random_state=seed))
atk = pd.concat(chunks, axis=0, ignore_index=True)
if len(atk) > n_atk:
atk = atk.sample(n=n_atk, random_state=seed)
return (train, val, atk)
def _sample_cross(src_df, tgt_df, caps, seed):
rng = np.random.default_rng(seed + 1000)
src_benign = src_df[src_df.binary_label == 0]
if len(src_benign) < caps['n_train']:
raise RuntimeError(f"src benign only {len(src_benign)}, need {caps['n_train']}")
sb_idx = rng.permutation(len(src_benign))
train = src_benign.iloc[sb_idx[:caps['n_train']]]
rng2 = np.random.default_rng(seed)
tgt_benign = tgt_df[tgt_df.binary_label == 0]
tgt_attack = tgt_df[tgt_df.binary_label == 1]
if len(tgt_benign) < caps['n_val']:
raise RuntimeError(f'tgt benign only {len(tgt_benign)}')
tb_idx = rng2.permutation(len(tgt_benign))
val = tgt_benign.iloc[tb_idx[:caps['n_val']]]
atk_classes = sorted(tgt_attack['cls_label'].unique())
per = max(1, caps['n_attack'] // len(atk_classes))
chunks = []
for cls in atk_classes:
pool = tgt_attack[tgt_attack['cls_label'] == cls]
k = min(per, len(pool))
if k:
chunks.append(pool.sample(n=k, random_state=seed))
atk = pd.concat(chunks, axis=0, ignore_index=True)
if len(atk) > caps['n_attack']:
atk = atk.sample(n=caps['n_attack'], random_state=seed)
return (train, val, atk)
def _safe_metric(fn, y, s) -> float:
s = np.nan_to_num(s, nan=0.0, posinf=1000000000000.0, neginf=-1000000000000.0)
try:
return float(fn(y, s))
except ValueError:
return float('nan')
def _train_and_score(train, val, atk, feat_cols, *, epochs, lr, optimizer):
    """Fit a ``Flow`` on the training features and score val / attack rows.

    Columns with zero standard deviation on the training split are dropped.
    The remaining features are standardised with a ``StandardScaler``
    fitted on the training split, clipped to [-30, 30] and cast to float32.

    Args:
        train, val, atk: DataFrames containing the ``feat_cols`` columns.
        feat_cols: feature column names.
        epochs: passed to ``Flow.train``.
        lr: learning rate for the optax optimizer.
        optimizer: ``'sgd'`` selects ``optax.sgd``; any other value selects
            ``optax.adam``.

    Returns:
        dict with ``score_val`` / ``score_atk`` (negated log-probabilities,
        float32), ``losses`` (float64 array), ``t_train`` and ``t_score``
        (wall-clock seconds).
    """
    train_raw = train[feat_cols].astype(np.float64).values
    has_variance = train_raw.std(axis=0) > 0
    if not has_variance.all():
        removed = [name for (name, ok) in zip(feat_cols, has_variance) if not ok]
        print(f' [train] dropping {len(removed)} zero-variance cols: {removed}')
        feat_cols = [name for (name, ok) in zip(feat_cols, has_variance) if ok]
        train_raw = train_raw[:, has_variance]
    val_raw = val[feat_cols].astype(np.float64).values
    atk_raw = atk[feat_cols].astype(np.float64).values

    # Standardise using training statistics only, then clip outliers.
    scaler = StandardScaler()
    clip_lim = 30.0
    train_std = np.clip(scaler.fit_transform(train_raw), -clip_lim, clip_lim)
    val_std = np.clip(scaler.transform(val_raw), -clip_lim, clip_lim)
    atk_std = np.clip(scaler.transform(atk_raw), -clip_lim, clip_lim)

    # Features get anonymous positional names x0..x{D-1}.
    df_train = pd.DataFrame(train_std.astype(np.float32), columns=[f'x{i}' for i in range(len(feat_cols))])
    df_val = pd.DataFrame(val_std.astype(np.float32), columns=df_train.columns)
    df_atk = pd.DataFrame(atk_std.astype(np.float32), columns=df_train.columns)

    opt = optax.sgd(learning_rate=lr) if optimizer == 'sgd' else optax.adam(learning_rate=lr)
    flow = Flow(df_train.columns.tolist())

    train_started = time.time()
    losses = flow.train(df_train, optimizer=opt, epochs=epochs, verbose=False)
    t_train = time.time() - train_started

    score_started = time.time()
    lp_val = np.asarray(flow.log_prob(df_val))
    lp_atk = np.asarray(flow.log_prob(df_atk))
    t_score = time.time() - score_started

    return {
        'score_val': (-lp_val).astype(np.float32),
        'score_atk': (-lp_atk).astype(np.float32),
        'losses': np.asarray(losses, dtype=np.float64),
        't_train': t_train,
        't_score': t_score,
    }
def _per_class(val_score, atk_score, atk_labels):
    """Compute one AUROC per attack class against the full benign set.

    For every distinct value in ``atk_labels`` the attack scores of that
    class (label 1) are compared with all ``val_score`` entries (label 0).

    Returns:
        dict mapping class label -> ``{'_n': <class size as float>,
        'auroc': <float, NaN when the metric raises ValueError>}``.
    """
    benign_targets = np.zeros(len(val_score))
    results = {}
    for label in sorted(set(atk_labels)):
        selector = atk_labels == label
        class_size = int(selector.sum())
        class_scores = atk_score[selector]
        targets = np.concatenate([benign_targets, np.ones(len(class_scores))])
        scores = np.concatenate([val_score, class_scores])
        results[label] = {'_n': float(class_size), 'auroc': _safe_metric(roc_auc_score, targets, scores)}
    return results
def main():
    """CLI entry point: run one (protocol, seed) cell of the shafir_nf_csv
    baseline and write ``<protocol>_seed<seed>.json`` / ``.npz`` to --out-dir.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--protocol', required=True, choices=list(PROTOCOL_CONFIG))
    parser.add_argument('--seed', type=int, required=True, choices=[42, 43, 44])
    parser.add_argument('--out-dir', type=Path, required=True)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--optimizer', choices=['sgd', 'adam'], default='sgd')
    args = parser.parse_args()
    args.out_dir.mkdir(parents=True, exist_ok=True)

    (src_name, tgt_name, caps) = PROTOCOL_CONFIG[args.protocol]
    is_cross = src_name != tgt_name
    print(f'[run] shafir_nf_csv protocol={args.protocol} seed={args.seed}')
    print(f' src={src_name} tgt={tgt_name} cross={is_cross}')

    # The feature set is always the one configured for the source dataset.
    feat_set = DATASETS[src_name]['feature_set']
    (src_df, src_feat_cols) = _load_dataset(src_name, feat_set)
    if not is_cross:
        feat_cols = src_feat_cols
        print(f' [features] within: {len(feat_cols)} cols')
        (train, val, atk) = _sample_within(src_df, caps, args.seed)
    else:
        (tgt_df, tgt_feat_cols) = _load_dataset(tgt_name, feat_set)
        # Keep only features available on both sides, in feat_set order.
        feat_cols = [c for c in feat_set if c in src_feat_cols and c in tgt_feat_cols]
        print(f' [features] cross intersection: {len(feat_cols)} cols')
        (train, val, atk) = _sample_cross(src_df, tgt_df, caps, args.seed)
    print(f' [data] train={len(train):,} val={len(val):,} attack={len(atk):,} D={len(feat_cols)}')

    res = _train_and_score(train, val, atk, feat_cols, epochs=args.epochs, lr=args.lr, optimizer=args.optimizer)
    val_score = res['score_val']
    atk_score = res['score_atk']

    # Benign validation rows are the negatives, attack rows the positives.
    targets = np.concatenate([np.zeros(len(val_score)), np.ones(len(atk_score))])
    scores = np.concatenate([val_score, atk_score])
    auroc = _safe_metric(roc_auc_score, targets, scores)
    auprc = _safe_metric(average_precision_score, targets, scores)
    overall = {'neg_log_prob': {'auroc': auroc, 'auprc': auprc}}

    a_labels = atk['cls_label'].astype(str).to_numpy()
    per_cls = _per_class(val_score, atk_score, a_labels)

    losses = res['losses']
    out = {
        'method': 'shafir_nf_csv',
        'protocol': args.protocol,
        'seed': args.seed,
        'src_dataset': src_name,
        'tgt_dataset': tgt_name,
        'feature_set': feat_cols,
        'n_features': len(feat_cols),
        'n_train': len(train),
        'n_val': len(val),
        'n_atk': len(atk),
        'epochs': args.epochs,
        'lr': args.lr,
        'optimizer': args.optimizer,
        't_train_sec': round(res['t_train'], 2),
        't_score_sec': round(res['t_score'], 2),
        'loss_first_last': [float(losses[0]), float(losses[-1])],
        'overall': overall,
        'per_class': per_cls,
    }
    out_json = args.out_dir / f'{args.protocol}_seed{args.seed}.json'
    out_json.write_text(json.dumps(out, indent=2))

    # Raw scores go next to the JSON summary under the same stem.
    npz_path = out_json.with_suffix('.npz')
    np.savez_compressed(npz_path, b_neg_log_prob=val_score, a_neg_log_prob=atk_score, a_labels=a_labels.astype(str), losses=losses)
    print(f'[saved] {out_json}')
    print(f"[result] AUROC={overall['neg_log_prob']['auroc']:.4f} AUPRC={overall['neg_log_prob']['auprc']:.4f} train={res['t_train']:.1f}s")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env bash
# Sweep driver for the shafir_nf_csv baseline: runs
# scripts/baselines/run_shafir_nf_csv.py once per (protocol, seed) pair and
# collects every result under one artifacts directory.
#
# Environment overrides (each falls back to the default assigned below):
#   PROTOCOLS   space-separated protocol names
#   SEEDS       space-separated integer seeds
#   EPOCHS, LR, OPTIMIZER   forwarded to the Python script
#
# NOTE: errexit + pipefail are on, so a failing Python run aborts the whole
# sweep at that pair.
set -euo pipefail
# Repository root = two directories above this script.
REPO=$(cd "$(dirname "$0")/../.." && pwd)
cd "$REPO"
OUT_DIR="artifacts/baselines/shafir_nf_csv_2026_04_29"
mkdir -p "$OUT_DIR"
LOG="$OUT_DIR/master.log"
# Truncate the master log at the start of every invocation.
: > "$LOG"
PROTOCOLS_DEFAULT="iscxtor_within cicids_within cicddos_within ciciot_within forward_cross reverse_cross"
SEEDS_DEFAULT="42 43 44"
PROTOCOLS="${PROTOCOLS:-$PROTOCOLS_DEFAULT}"
SEEDS="${SEEDS:-$SEEDS_DEFAULT}"
EPOCHS="${EPOCHS:-100}"
LR="${LR:-0.001}"
OPTIMIZER="${OPTIMIZER:-sgd}"
# $PROTOCOLS / $SEEDS are intentionally unquoted so they word-split into items.
for protocol in $PROTOCOLS; do
for seed in $SEEDS; do
out_json="$OUT_DIR/${protocol}_seed${seed}.json"
# Resume support: a pair whose JSON result already exists is not re-run.
if [[ -f "$out_json" ]]; then
echo "[skip] $out_json exists" | tee -a "$LOG"
continue
fi
echo "=== protocol=$protocol seed=$seed epochs=$EPOCHS opt=$OPTIMIZER lr=$LR ===" | tee -a "$LOG"
# Wall-clock timing of the run, in whole seconds.
ts=$(date +%s)
# stdout and stderr of the run are both shown and appended to the master log.
uv run --no-sync python scripts/baselines/run_shafir_nf_csv.py \
--protocol "$protocol" --seed "$seed" \
--out-dir "$OUT_DIR" \
--epochs "$EPOCHS" --lr "$LR" --optimizer "$OPTIMIZER" \
2>&1 | tee -a "$LOG"
te=$(date +%s)
echo "[done] elapsed=$((te-ts))s $out_json" | tee -a "$LOG"
done
done
echo "ALL DONE"

View File

@@ -0,0 +1,87 @@
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO / 'Packet_CFM'))
from packet_store import PacketShardStore
# Output column names, in the same order in which `_compute_batch` fills
# columns 0..4 of its result (https, protocol, magnitude, variance, fin count).
SHAFIR5_FEATURE_NAMES = ('HTTPS', 'Protocol_Type', 'Magnitude', 'Variance', 'fin_count')
def _compute_batch(tokens: np.ndarray, lens: np.ndarray, dst_ports: np.ndarray, protocols: np.ndarray) -> np.ndarray:
(B, T, _) = tokens.shape
out = np.zeros((B, 5), dtype=np.float32)
arange = np.arange(T)[None, :]
mask = arange < lens[:, None]
log_size = tokens[:, :, 0]
sizes = np.expm1(np.maximum(log_size, 0.0))
sizes = np.where(mask, sizes, 0.0)
n = lens.astype(np.float32)
n_safe = np.maximum(n, 1.0)
sum_sq = (sizes * sizes).sum(axis=1)
mean = sizes.sum(axis=1) / n_safe
mean_sq = sum_sq / n_safe
magnitude = np.sqrt(np.maximum(mean_sq, 0.0))
variance = np.maximum(mean_sq - mean * mean, 0.0)
fin_flags = tokens[:, :, 4]
fin_flags = np.where(mask, fin_flags, 0.0)
fin_count = fin_flags.sum(axis=1)
https = (dst_ports == 443).astype(np.float32)
proto_type = protocols.astype(np.float32)
out[:, 0] = https
out[:, 1] = proto_type
out[:, 2] = magnitude
out[:, 3] = variance
out[:, 4] = fin_count
return out
def main():
    """CLI entry point: stream packet tokens from a shard store, compute the
    five SHAFIR5 features per flow and write them to a parquet file.

    Raises:
        ValueError: if the store's flow_id column differs from the
            flow_id column of --flows-parquet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--source-store', type=Path, required=True)
    parser.add_argument('--flows-parquet', type=Path, required=True)
    parser.add_argument('--out', type=Path, required=True)
    parser.add_argument('--T', type=int, default=None, help='Truncate to first T packets (default = stored).')
    parser.add_argument('--batch', type=int, default=100000)
    args = parser.parse_args()

    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label', 'dst_port', 'protocol'])
    flow_id = flows['flow_id'].to_numpy(dtype=np.uint64)
    labels = flows['label'].astype(str).to_numpy()
    dst_ports = flows['dst_port'].to_numpy(dtype=np.uint32)
    protocols = flows['protocol'].to_numpy(dtype=np.uint8)

    # Rows are addressed positionally below, so both sources must list the
    # same flow ids in the same order.
    store = PacketShardStore.open(args.source_store)
    store_fid = store.read_flows(columns=['flow_id'])['flow_id'].to_numpy(dtype=np.uint64)
    same_order = len(store_fid) == len(flow_id) and np.array_equal(store_fid, flow_id)
    if not same_order:
        raise ValueError('store flow_id ordering differs from flows.parquet')

    T_stored = int(store.manifest['packet_length'].max())
    T = T_stored if args.T is None else args.T
    n = len(flows)
    feats = np.zeros((n, 5), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (stored {T_stored}), batch={args.batch}')

    started = time.time()
    for (batch_no, start) in enumerate(range(0, n, args.batch)):
        end = min(start + args.batch, n)
        row_ids = np.arange(start, end, dtype=np.int64)
        (tok, pkt_counts) = store.read_packets(row_ids, T=T)
        # Counts are capped at T before being used as the validity mask.
        pkt_counts = np.minimum(pkt_counts, T).astype(np.int32)
        feats[start:end] = _compute_batch(tok.astype(np.float32), pkt_counts, dst_ports[start:end], protocols[start:end])
        # Progress line every 20th batch and after the final one.
        if batch_no % 20 == 0 or end == n:
            dt = time.time() - started
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame({'flow_id': flow_id, 'label': labels})
    for (col, name) in enumerate(SHAFIR5_FEATURE_NAMES):
        df[name] = feats[:, col]
    df.to_parquet(args.out, compression='snappy', index=False)

    print(f'[write] {args.out} rows={len(df):,} cols={list(df.columns)}')
    print(f'[stats] HTTPS=1 fraction: {(feats[:, 0] > 0).mean():.4f}')
    print(f'[stats] Protocol_Type unique values: {np.unique(feats[:, 1].astype(int))[:10]}')
    print(f'[stats] Magnitude mean={feats[:, 2].mean():.1f} median={np.median(feats[:, 2]):.1f}')
    print(f'[stats] Variance mean={feats[:, 3].mean():.1f}')
    print(f'[stats] fin_count mean={feats[:, 4].mean():.3f}')


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,119 @@
from __future__ import annotations
import argparse
import sys
import zipfile
from pathlib import Path
from typing import BinaryIO
import numpy as np
import pandas as pd
from numpy.lib import format as npy_format
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from packet_store import PacketShardStore, PacketShardWriter
def _read_npy_header(fp: BinaryIO) -> tuple[tuple[int, ...], np.dtype, bool]:
version = npy_format.read_magic(fp)
if version == (1, 0):
(shape, fortran_order, dtype) = npy_format.read_array_header_1_0(fp)
elif version == (2, 0):
(shape, fortran_order, dtype) = npy_format.read_array_header_2_0(fp)
else:
raise ValueError(f'unsupported npy version {version}')
return (tuple((int(v) for v in shape)), np.dtype(dtype), bool(fortran_order))
def _read_exact(fp: BinaryIO, n_bytes: int) -> bytes:
chunks: list[bytes] = []
remaining = int(n_bytes)
while remaining:
chunk = fp.read(remaining)
if not chunk:
raise EOFError(f'expected {n_bytes} bytes, missing {remaining}')
chunks.append(chunk)
remaining -= len(chunk)
return b''.join(chunks)
def _open_member(zf: zipfile.ZipFile, name: str) -> tuple[BinaryIO, tuple[int, ...], np.dtype]:
    """Open archive member ``name`` and consume its .npy header.

    Returns:
        ``(fp, shape, dtype)`` where ``fp`` is positioned at the first byte
        of array data. The caller owns ``fp`` and must close it.

    Raises:
        ValueError: if the header is unsupported or the array is stored in
            Fortran order. The member handle is closed before any
            exception propagates, so a failed open never leaks it.
    """
    fp = zf.open(name)
    try:
        (shape, dtype, fortran_order) = _read_npy_header(fp)
        if fortran_order:
            raise ValueError(f'{name} uses Fortran order, expected C order')
    except BaseException:
        # Cleanup-and-reraise: ownership has not passed to the caller yet.
        fp.close()
        raise
    return (fp, shape, dtype)
def _iter_npz_rows(npz_path: Path, rows: int, chunk_rows: int):
    """Stream the first ``rows`` rows of an .npz archive in chunks.

    Reads the ``packet_tokens.npy`` and ``packet_lengths.npy`` members
    sequentially, so the full arrays are never materialised.

    Args:
        npz_path: archive containing both members.
        rows: number of leading rows to emit.
        chunk_rows: maximum rows per yielded chunk; must be >= 1.

    Yields:
        ``(start_row, tokens, lengths)``. Both arrays are built with
        ``np.frombuffer`` over immutable bytes and are therefore read-only.

    Raises:
        ValueError: if ``chunk_rows`` < 1, the member shapes are
            inconsistent, or ``rows`` exceeds the stored row count.
        EOFError: if a member ends before the expected number of bytes.
    """
    chunk_rows = int(chunk_rows)
    if chunk_rows < 1:
        # A non-positive chunk size would make the loop below emit empty
        # chunks forever without advancing.
        raise ValueError(f'chunk_rows must be >= 1, got {chunk_rows}')
    with zipfile.ZipFile(npz_path) as zf:
        (token_fp, token_shape, token_dtype) = _open_member(zf, 'packet_tokens.npy')
        try:
            # Opened inside the outer try so token_fp is released even if
            # this second open fails.
            (length_fp, length_shape, length_dtype) = _open_member(zf, 'packet_lengths.npy')
            try:
                if len(token_shape) != 3:
                    raise ValueError(f'packet_tokens.npy must be 3-D, got {token_shape}')
                if length_shape != (token_shape[0],):
                    raise ValueError(f'packet_lengths.npy shape {length_shape} does not match tokens {token_shape}')
                if rows > token_shape[0]:
                    raise ValueError(f'requested {rows} rows, but {npz_path} has {token_shape[0]}')
                row_values = int(np.prod(token_shape[1:], dtype=np.int64))
                token_row_bytes = row_values * token_dtype.itemsize
                length_row_bytes = length_dtype.itemsize
                emitted = 0
                while emitted < rows:
                    take = min(chunk_rows, rows - emitted)
                    token_bytes = _read_exact(token_fp, take * token_row_bytes)
                    length_bytes = _read_exact(length_fp, take * length_row_bytes)
                    tokens = np.frombuffer(token_bytes, dtype=token_dtype).reshape(take, token_shape[1], token_shape[2])
                    lengths = np.frombuffer(length_bytes, dtype=length_dtype).reshape(take)
                    yield (emitted, tokens, lengths)
                    emitted += take
            finally:
                length_fp.close()
        finally:
            token_fp.close()
def _npz_token_shape(npz_path: Path) -> tuple[int, int, int]:
    """Return the shape of ``packet_tokens.npy`` by reading only its header.

    Raises:
        ValueError: if the stored array is not 3-dimensional.
    """
    with zipfile.ZipFile(npz_path) as archive:
        (member_fp, token_shape, _unused_dtype) = _open_member(archive, 'packet_tokens.npy')
        member_fp.close()
    if len(token_shape) == 3:
        return token_shape
    raise ValueError(f'packet_tokens.npy must be 3-D, got {token_shape}')
def convert(args: argparse.Namespace) -> None:
    """Copy one or more (packets.npz, flows.parquet) pairs into a shard store.

    Every split must share the per-row token shape of the first split, and
    each parquet file must have exactly as many rows as its npz archive.
    A positive ``args.max_rows_per_split`` limits how many leading rows of
    each split are written.

    Raises:
        ValueError: on a shape mismatch between splits or a row-count
            mismatch between a parquet file and its npz archive.
    """
    split_pairs = list(zip(args.packets_npz, args.flows_parquet, strict=True))
    reference_shape = _npz_token_shape(split_pairs[0][0])
    total_rows = 0
    writer_ctx = PacketShardWriter(
        args.out_store,
        shard_size=args.shard_size,
        T_full=reference_shape[1],
        D=reference_shape[2],
        overwrite=args.overwrite,
    )
    with writer_ctx as writer:
        for (split_id, (npz_path, flows_path)) in enumerate(split_pairs):
            token_shape = _npz_token_shape(npz_path)
            if token_shape[1:] != reference_shape[1:]:
                raise ValueError(f'{npz_path} shape {token_shape} does not match {reference_shape}')
            flows = pd.read_parquet(flows_path)
            if len(flows) != token_shape[0]:
                raise ValueError(f'{flows_path} has {len(flows)} rows but {npz_path} has {token_shape[0]}')
            rows = min(len(flows), token_shape[0])
            if args.max_rows_per_split > 0:
                rows = min(rows, args.max_rows_per_split)
            print(f'[split {split_id}] npz={npz_path} flows={flows_path} rows={rows:,} shape={token_shape}', flush=True)
            for (start, tokens, lengths) in _iter_npz_rows(npz_path, rows, args.chunk_rows):
                chunk_len = len(lengths)
                end = start + chunk_len
                flow_slice = flows.iloc[start:end].reset_index(drop=True)
                writer.add_batch(tokens, lengths, flow_slice)
                total_rows += chunk_len
                # Report whenever the running total crosses a multiple of
                # report_every, and at the end of each split.
                crossed_mark = total_rows % args.report_every < chunk_len
                if crossed_mark or end == rows:
                    print(f'[split {split_id}] emitted={end:,}/{rows:,} total={total_rows:,}', flush=True)

    # Re-open the finished store to print a summary.
    store = PacketShardStore.open(args.out_store)
    label_frame = store.read_flows(columns=['label'])
    print(f"[done] store={args.out_store} rows={store.n_flows:,} shards={store.metadata['n_shards']}")
    print(label_frame['label'].value_counts().to_string())
def main() -> None:
    """Parse command-line options and run ``convert``.

    Exits with a message when the numbers of --packets-npz and
    --flows-parquet arguments differ.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('--packets-npz', type=Path, nargs='+', required=True)
    cli.add_argument('--flows-parquet', type=Path, nargs='+', required=True)
    cli.add_argument('--out-store', type=Path, required=True)
    int_options = (
        ('--shard-size', 50000),
        ('--chunk-rows', 10000),
        ('--report-every', 250000),
        ('--max-rows-per-split', 0),
    )
    for (flag, default_value) in int_options:
        cli.add_argument(flag, type=int, default=default_value)
    cli.add_argument('--overwrite', action='store_true')
    args = cli.parse_args()

    n_npz = len(args.packets_npz)
    n_parquet = len(args.flows_parquet)
    if n_npz != n_parquet:
        raise SystemExit('--packets-npz and --flows-parquet must have the same count')
    convert(args)


if __name__ == '__main__':
    main()

114
scripts/csv_adapter.py Normal file
View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import csv
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable
import numpy as np
sys.path.insert(0, str(Path(__file__).resolve().parent))
from extract_lib import _canonical_key
@dataclass(frozen=True)
class CsvFlowAdapter:
    """Per-dataset description of how to read a labelled flow CSV.

    ``join_cols`` maps logical field names to CSV header names,
    ``label_col`` names the label column, and the remaining fields
    control timestamp parsing and label normalisation.
    """

    join_cols: dict[str, str]
    label_col: str
    timestamp_formats: tuple[str, ...]
    benign_aliases: frozenset[str]
    benign_token: str = 'normal'
    drop_label_patterns: tuple[str, ...] = ()
    label_aliases: dict[str, str] = field(default_factory=dict)
    label_normalizer: Callable[[str], str] | None = None

    def normalize_label(self, raw: str) -> str:
        """Map a raw CSV label to its canonical form.

        A configured ``label_normalizer`` takes full control and receives
        the raw, unstripped value. Otherwise the value is stripped, benign
        aliases collapse to ``benign_token``, and ``label_aliases`` is
        applied with the stripped value as the fallback.
        """
        custom = self.label_normalizer
        if custom is not None:
            return custom(raw)
        cleaned = raw.strip()
        if cleaned in self.benign_aliases:
            return self.benign_token
        if cleaned in self.label_aliases:
            return self.label_aliases[cleaned]
        return cleaned

    def parse_timestamp(self, raw: str) -> float | None:
        """Parse ``raw`` with the first matching format in ``timestamp_formats``.

        Returns the POSIX timestamp of the parsed naive datetime, or
        ``None`` when the value is blank or matches no format.
        """
        text = raw.strip()
        if not text:
            return None
        for pattern in self.timestamp_formats:
            try:
                epoch = datetime.strptime(text, pattern).timestamp()
            except ValueError:
                continue
            return epoch
        return None
def parse_csv_rows(*, csv_path: Path, row_idx_start: int, time_offset_seconds: float, adapter: CsvFlowAdapter, max_per_class: int | None=None, max_benign: int | None=None, rng: np.random.Generator | None=None) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int, dict[str, int]]:
    """Read a labelled flow CSV and group its rows by canonical flow key.

    Args:
        csv_path: CSV file to read; header names are stripped before lookup.
        row_idx_start: index assigned to the first kept row; later kept
            rows are numbered consecutively.
        time_offset_seconds: added to every successfully parsed timestamp.
        adapter: column mapping and label/timestamp rules.
        max_per_class: optional cap on rows per non-benign label.
        max_benign: optional cap on rows carrying the benign token.
        rng: generator used for down-sampling; when a cap is set and no
            generator is given, ``np.random.default_rng(42)`` is used.

    Returns:
        ``(rows_by_key, labels, n_kept, n_skipped, class_counts)`` where
        ``rows_by_key`` maps a canonical key to ``(row_idx, ts_epoch)``
        pairs (``ts_epoch`` is NaN when the timestamp could not be
        parsed) and ``labels`` lists the normalised label of each kept row
        in row-index order.

    Raises:
        KeyError: if a required column is absent from the header.
    """
    if (max_per_class is not None or max_benign is not None) and rng is None:
        rng = np.random.default_rng(42)

    def _cell_to_int(cell: str) -> int:
        # Blank cells count as 0; values such as "80.0" are accepted.
        return int(float(cell)) if cell.strip() else 0

    parsed: list[tuple[tuple, float, str]] = []
    n_skip = 0
    with open(csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        header = [h.strip() for h in next(reader)]
        h2i = {h: i for (i, h) in enumerate(header)}
        needed = list(adapter.join_cols.values()) + [adapter.label_col]
        for col in needed:
            if col not in h2i:
                raise KeyError(f'{csv_path.name}: missing column {col!r}')
        i_src_ip = h2i[adapter.join_cols['src_ip']]
        i_src_port = h2i[adapter.join_cols['src_port']]
        i_dst_ip = h2i[adapter.join_cols['dst_ip']]
        i_dst_port = h2i[adapter.join_cols['dst_port']]
        i_proto = h2i[adapter.join_cols['protocol']]
        i_ts = h2i[adapter.join_cols['timestamp']]
        i_label = h2i[adapter.label_col]
        for row in reader:
            if not row:
                continue
            try:
                raw_label = row[i_label]
            except IndexError:
                n_skip += 1
                continue
            if any(pat in raw_label for pat in adapter.drop_label_patterns):
                n_skip += 1
                continue
            # Every remaining cell is read inside one guarded block so a
            # truncated row is skipped consistently, whichever column is
            # missing. OverflowError covers "inf" in a numeric cell.
            try:
                sp = _cell_to_int(row[i_src_port])
                dp = _cell_to_int(row[i_dst_port])
                proto = _cell_to_int(row[i_proto])
                sip = row[i_src_ip].strip()
                dip = row[i_dst_ip].strip()
                raw_ts = row[i_ts]
            except (ValueError, IndexError, OverflowError):
                n_skip += 1
                continue
            ck = _canonical_key(sip, dip, sp, dp, proto)
            ts_parsed = adapter.parse_timestamp(raw_ts)
            ts_epoch = float('nan') if ts_parsed is None else ts_parsed + time_offset_seconds
            parsed.append((ck, ts_epoch, adapter.normalize_label(raw_label)))

    keep_idx = _select_indices(labels=[p[2] for p in parsed], benign_token=adapter.benign_token, max_per_class=max_per_class, max_benign=max_benign, rng=rng)
    rows_by_key: dict[tuple, list[tuple[int, float]]] = {}
    labels_out: list[str] = []
    class_counts: dict[str, int] = {}
    row_idx = row_idx_start
    for i in keep_idx:
        (ck, ts_epoch, label) = parsed[i]
        rows_by_key.setdefault(ck, []).append((row_idx, ts_epoch))
        labels_out.append(label)
        class_counts[label] = class_counts.get(label, 0) + 1
        row_idx += 1
    return (rows_by_key, labels_out, row_idx - row_idx_start, n_skip, class_counts)
def _select_indices(*, labels: list[str], benign_token: str, max_per_class: int | None, max_benign: int | None, rng: np.random.Generator | None) -> list[int]:
if max_per_class is None and max_benign is None:
return list(range(len(labels)))
assert rng is not None
buckets: dict[str, list[int]] = {}
for (i, label) in enumerate(labels):
buckets.setdefault(label, []).append(i)
keep: list[int] = []
for (label, idxs) in buckets.items():
cap = max_benign if label == benign_token else max_per_class
if cap is not None and len(idxs) > cap:
pick = rng.choice(len(idxs), size=cap, replace=False)
idxs = [idxs[j] for j in sorted(pick)]
keep.extend(idxs)
keep.sort()
return keep

112
scripts/download/README.md Normal file
View File

@@ -0,0 +1,112 @@
# Dataset download scripts
Target layout (mirrors `datasets/cicids2017/`):
```
datasets/
ciciot2023/raw/{pcap,csv}
iscxtor2016/raw/{pcap,csv}
cicapt_iiot2024/raw/{pcap,csv}
ustc_tfc2016/raw/pcap
datacon2020/raw/pcap
```
## CICIoT2023 / ISCXTor2016 (automated)
UNB/CIC gates downloads behind a consent form. After submission the site issues
a `Token` cookie (domain `.cicresearch.ca`) that unlocks two endpoints:
- `browse.php?p=<path>` — HTML directory listing
- `download.php?file=<path>` — raw file bytes
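`cic_download.py` is a stdlib-only recursive crawler that walks `browse.php`
and fetches each leaf via `download.php`. A file that already exists locally
with a non-zero size is skipped (presence-based; the PHP endpoint does not
advertise sizes, so completeness is not verified). In-progress downloads are
written to a `<name>.part` file and resumed with an HTTP `Range` request.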
### Workflow
1. Open the dataset page in a browser, fill and submit the form:
- CICIoT2023 : <https://www.unb.ca/cic/datasets/iotdataset-2023.html>
- ISCXTor2016: <https://www.unb.ca/cic/datasets/tor.html>
2. After submit, click through to `cicresearch.ca/.../browse.php`. The page
must load successfully in your browser — this proves the Token is set.
3. Export the cookie in **Netscape format** (tab-separated). One line is
sufficient:
```
# Netscape HTTP Cookie File
.cicresearch.ca TRUE / TRUE <expiry> Token <value>
```
Save as:
- `scripts/download/cookies_ciciot2023.txt`
- `scripts/download/cookies_iscxtor2016.txt`
Tokens are per-dataset — a CICIoT2023 cookie will not work for ISCXTor.
4. Run:
```bash
bash scripts/download/download_ciciot2023.sh
bash scripts/download/download_iscxtor2016.sh
```
Env vars: `WHAT=pcap|csv|both`, `DEST=`, `COOKIES=`, `DRY_RUN=1`, `LIMIT=N`.
For ISCXTor, if the remote subdir names differ from the defaults
(`Pcaps` / `CSVs`), set `PCAP_ROOT=` / `CSV_ROOT=`.
### Known remote tree sizes
- **CICIoT2023** — `CSV/` 328 files (includes `CSV.zip`, `MERGED_CSV.zip`,
`MERGED_CSV/`, and per-attack CSVs), `PCAP/` 311 files across 36 attack
categories. Full dataset is ~12 GB.
### Quick commands
```bash
# Dry-run (enumerate only, no downloads)
DRY_RUN=1 bash scripts/download/download_ciciot2023.sh
# Download first 5 files as a smoke test
LIMIT=5 WHAT=csv bash scripts/download/download_ciciot2023.sh
# Full download
bash scripts/download/download_ciciot2023.sh
```
## CICAPT-IIoT2024 (automated)
Same UNB/CIC pipeline as CICIoT2023, but crawled in a single pass — the
entire `CICAPT-IIoT Dataset/` top-level folder is mirrored (pcap, csv, and
anything else) under `datasets/cicapt_iiot2024/raw/`.
Cookie file: `scripts/download/cookies_cicapt_iiot2024.txt` (Token for
`.cicresearch.ca`).
```bash
# Smoke test first
DRY_RUN=1 LIMIT=5 bash scripts/download/download_cicapt_iiot2024.sh
# Full download
bash scripts/download/download_cicapt_iiot2024.sh
# Skip heavy archives if they duplicate a per-file tree
SKIP_EXT=zip,7z bash scripts/download/download_cicapt_iiot2024.sh
```
Reference URL (browser, with Token cookie live):
<https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset>
## USTC-TFC2016 (manual)
```bash
cd datasets/ustc_tfc2016/raw/pcap
git clone --depth=1 https://github.com/yungshenglu/USTC-TFC2016.git .
```
No official CSV — extract features yourself (CICFlowMeter, USTC-TK2016).
## DataCon2020 (manual)
Register at <https://datacon.qianxin.com/opendata/maliciousstream> and place
the `black/` `white/` `test/` pcap bundles under
`datasets/datacon2020/raw/pcap/`. No official CSV.

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Background wrapper: retry CICIoT2023 PCAP download until it reports
# a clean "Done." with n_files > 0. Each attempt is delimited in the log
# so the monitor can grep for progress.
#
# Invoked detached (nohup ... &). The inner script is resumable via
# the .part-file convention in cic_download.py.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
LOG="${REPO_ROOT}/logs/ciciot2023_pcap.log"

# The log directory must exist before the first redirect. Without it every
# `>>"$LOG"` fails, the inner script is never started, and the loop below
# would sleep-and-retry forever without making progress.
if ! mkdir -p "$(dirname "$LOG")"; then
    echo "ERROR: cannot create log directory for $LOG" >&2
    exit 1
fi

# nohup strips the interactive PATH; re-expose the project venv so
# `python` resolves inside download_ciciot2023.sh.
if [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
    export PATH="${REPO_ROOT}/.venv/bin:${PATH:-/usr/local/bin:/usr/bin:/bin}"
fi

# Route through the local proxy; detached bash does not inherit the
# interactive shell's proxy env, and cicresearch.ca's WAF rate-limits
# bare-IP traffic much more aggressively than the proxy exit.
export HTTP_PROXY="http://127.0.0.1:7093"
export HTTPS_PROXY="http://127.0.0.1:7093"
export ALL_PROXY="socks5h://127.0.0.1:7093"
export NO_PROXY="localhost,127.0.0.1,::1"
export http_proxy="${HTTP_PROXY}"
export https_proxy="${HTTPS_PROXY}"
export all_proxy="${ALL_PROXY}"
export no_proxy="${NO_PROXY}"

i=0
while :; do
    i=$((i + 1))
    ts=$(date +%F\ %T)
    printf '\n=== attempt %d %s ===\n' "$i" "$ts" >>"$LOG"

    # Skip bundle zips (e.g. PCAP.zip) — we want per-attack-class .pcap files,
    # not the whole dataset as one archive.
    WHAT=pcap SKIP_EXT="zip,7z" bash "${SCRIPT_DIR}/download_ciciot2023.sh" >>"$LOG" 2>&1
    rc=$?

    # If inner script exited with 0 AND last "Done." line reports >0 files,
    # we consider the listing+walk to have succeeded at least once. Otherwise
    # keep retrying on network/SSL failures.
    last_done=$(grep -E '^Done\. [0-9]+ files processed' "$LOG" | tail -1 || true)
    n=$(printf '%s' "$last_done" | awk '{print $2}')
    if [[ "$rc" -eq 0 && -n "$n" && "$n" -gt 0 ]]; then
        printf '=== loop finished clean %s (files=%s) ===\n' "$(date +%F\ %T)" "$n" >>"$LOG"
        break
    fi

    printf '=== attempt %d ended rc=%s last_done=%q; sleep 60 ===\n' \
        "$i" "$rc" "$last_done" >>"$LOG"
    sleep 60
done

View File

@@ -0,0 +1,185 @@
from __future__ import annotations
import argparse
import http.cookiejar
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
# User-Agent header value sent with every request made by this module.
UA = 'Mozilla/5.0 (cic-downloader)'
# Matches href attributes that point at a `browse.php?p=...` or
# `download.php?file=...` link; group 1 is the href value.
LINK_RE = re.compile('href="(browse\\.php\\?p=[^"]+|download\\.php\\?file=[^"]+)"')
def build_opener(cookies_path: Path) -> urllib.request.OpenerDirector:
    """Create a urllib opener that sends the cookies stored in a
    Netscape-format cookie file.

    Session ("discard") cookies and already-expired cookies are loaded too.
    """
    cookie_jar = http.cookiejar.MozillaCookieJar()
    cookie_jar.load(str(cookies_path), ignore_discard=True, ignore_expires=True)
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    return urllib.request.build_opener(cookie_handler)
def http_get(opener, url: str, timeout: int=60, retries: int=5) -> bytes:
    """GET ``url`` and return the response body, retrying transient errors.

    Args:
        opener: object with an ``open(request, timeout=...)`` method, such
            as the result of ``build_opener``.
        url: absolute URL to fetch.
        timeout: per-attempt timeout in seconds.
        retries: maximum number of attempts.

    Returns:
        The raw response body.

    Raises:
        RuntimeError: immediately (no retry) when the final URL contains
            ``unb.ca/cic/datasets``, i.e. the request was redirected to the
            consent form; or after ``retries`` failed attempts, with the
            last underlying exception attached as ``__cause__``.
    """
    last: Exception | None = None
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={'User-Agent': UA})
            with opener.open(req, timeout=timeout) as resp:
                final = resp.geturl()
                if 'unb.ca/cic/datasets' in final:
                    raise RuntimeError(f'Got redirected to UNB form page ({final}). Token cookie is missing/expired or wrong dataset scope.')
                return resp.read()
        except RuntimeError:
            # A cookie problem will not fix itself; do not retry.
            raise
        except Exception as e:
            last = e
            if attempt + 1 >= retries:
                # Out of attempts: fail now instead of announcing (and
                # sleeping for) a retry that will never happen.
                break
            wait = min(30, 2 ** attempt)
            print(f' WARN GET {url} failed ({e!r}); retry in {wait}s ({attempt + 1}/{retries})', file=sys.stderr)
            time.sleep(wait)
    raise RuntimeError(f'GET {url} failed after {retries} attempts: {last!r}') from last
def list_dir(opener, base: str, p: str) -> list[tuple[str, str]]:
    """Fetch the ``browse.php`` listing for remote path ``p``.

    Returns:
        One ``('dir', path)`` or ``('file', path)`` tuple per matching
        link, in page order. The path is the decoded ``p`` / ``file``
        query parameter of the link.
    """
    listing_url = urllib.parse.urljoin(base, 'browse.php') + '?p=' + urllib.parse.quote(p, safe='/')
    page = http_get(opener, listing_url).decode('utf-8', 'replace')
    entries: list[tuple[str, str]] = []
    for href in LINK_RE.findall(page):
        params = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
        if href.startswith('browse.php'):
            entry = ('dir', params['p'][0])
        else:
            entry = ('file', params['file'][0])
        entries.append(entry)
    return entries
def walk(opener, base: str, root: str):
    """Yield every remote file path reachable from ``root``.

    Directories are explored depth-first using an explicit stack and each
    directory is listed at most once. A directory whose listing fails is
    reported on stderr and skipped; the walk continues.
    """
    pending = [root]
    visited: set[str] = set()
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        try:
            listing = list_dir(opener, base, current)
        except Exception as exc:
            print(f' WARN list_dir({current}) failed permanently: {exc!r}', file=sys.stderr)
            continue
        for (kind, target) in sorted(listing):
            if kind != 'dir':
                yield target
            else:
                pending.append(target)
def download_file(opener, base: str, remote: str, dest_root: Path, *, root_prefix: str) -> None:
    """Download one remote file into ``dest_root`` with resume support.

    The local path is ``remote`` with a leading ``root_prefix`` removed.
    A non-empty local file is treated as complete and skipped. Data is
    written to ``<name>.part`` and renamed on success; an existing
    ``.part`` file is continued with a ``Range`` request when the server
    honours it (HTTP 206).

    Raises:
        RuntimeError: when redirected to the consent form mid-download
            (not retried), or after 5 failed attempts.
    """
    url = urllib.parse.urljoin(base, 'download.php') + '?file=' + urllib.parse.quote(remote, safe='')
    if remote.startswith(root_prefix):
        rel = remote[len(root_prefix):].lstrip('/')
    else:
        rel = remote
    local = dest_root / rel
    local.parent.mkdir(parents=True, exist_ok=True)
    if local.exists() and local.stat().st_size > 0:
        print(f' SKIP {rel} ({local.stat().st_size} bytes, already present)')
        return

    mib = 1 << 20
    tmp = local.with_suffix(local.suffix + '.part')
    last: Exception | None = None
    for attempt in range(5):
        # Bytes already on disk from an earlier, interrupted attempt.
        resume_from = tmp.stat().st_size if tmp.exists() else 0
        try:
            headers = {'User-Agent': UA}
            if resume_from > 0:
                headers['Range'] = f'bytes={resume_from}-'
            req = urllib.request.Request(url, headers=headers)
            started = time.monotonic()
            bytes_read = 0
            with opener.open(req, timeout=1800) as resp:
                final = resp.geturl()
                if 'unb.ca/cic/datasets' in final:
                    raise RuntimeError('Token cookie invalid mid-download.')
                status = getattr(resp, 'status', None)
                if resume_from > 0 and status != 206:
                    # Server sent the whole file again: overwrite the part.
                    print(f' INFO {rel} resume request ignored (status={status}); restarting from zero')
                    resume_from = 0
                mode = 'ab' if resume_from > 0 else 'wb'
                with open(tmp, mode) as fh:
                    while (buf := resp.read(mib)):
                        fh.write(buf)
                        bytes_read += len(buf)
            tmp.replace(local)
            elapsed = time.monotonic() - started
            mb = local.stat().st_size / mib
            delta_mb = bytes_read / mib
            if resume_from > 0:
                # For resumed transfers the rate covers only the new bytes.
                rate = delta_mb / elapsed if elapsed > 0 else 0
                resumed_mb = resume_from / mib
                print(f' GOT {rel} {mb:.1f} MB +{delta_mb:.1f} MB from {resumed_mb:.1f} MB {rate:.1f} MB/s')
            else:
                rate = mb / elapsed if elapsed > 0 else 0
                print(f' GOT {rel} {mb:.1f} MB {rate:.1f} MB/s')
            return
        except RuntimeError:
            raise
        except Exception as exc:
            last = exc
            range_rejected = isinstance(exc, urllib.error.HTTPError) and exc.code == 416 and resume_from > 0
            if range_rejected:
                # The partial file no longer matches the remote; drop it.
                print(f' WARN {rel} resume rejected with 416; restarting from zero', file=sys.stderr)
                try:
                    tmp.unlink(missing_ok=True)
                except OSError:
                    pass
                time.sleep(1)
                continue
            wait = min(30, 2 ** attempt)
            print(f' WARN {rel} failed ({exc!r}); retry in {wait}s ({attempt + 1}/5)', file=sys.stderr)
            time.sleep(wait)
    raise RuntimeError(f'download failed after 5 attempts: {last!r}')
def main() -> int:
    """Crawl a remote dataset tree and mirror it into a local directory.

    Returns:
        Process exit code: 0 when every selected file was handled without
        error, 1 when at least one download failed, 2 when the cookies
        file is missing. The non-zero code on failed downloads lets
        callers that check the exit status (e.g. a retry wrapper) notice
        an incomplete mirror instead of treating it as a clean run.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--cookies', required=True, type=Path)
    ap.add_argument('--base', required=True, help='dataset URL ending with /, e.g. https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/')
    ap.add_argument('--root', required=True, help='sub-path to crawl (e.g. PCAP or CSV)')
    ap.add_argument('--dest', required=True, type=Path, help='local directory to mirror into')
    ap.add_argument('--dry-run', action='store_true', help='enumerate only; do not download')
    ap.add_argument('--limit', type=int, default=0, help='stop after N files (0 = no limit)')
    ap.add_argument('--skip-ext', default='', help="comma-separated file extensions to skip (e.g. 'zip,7z'); case-insensitive, no dots")
    args = ap.parse_args()

    skip_exts = {e.strip().lower().lstrip('.') for e in args.skip_ext.split(',') if e.strip()}
    if not args.cookies.is_file():
        print(f'ERROR: cookies file not found: {args.cookies}', file=sys.stderr)
        return 2
    opener = build_opener(args.cookies)
    args.dest.mkdir(parents=True, exist_ok=True)
    print(f'Base : {args.base}')
    print(f'Root : {args.root}')
    print(f'Dest : {args.dest}')
    print(f'Walking tree...')

    root_prefix = args.root.rstrip('/')
    n_files = 0
    n_skipped = 0
    n_failed = 0
    for remote in walk(opener, args.base, args.root):
        ext = remote.rsplit('.', 1)[-1].lower() if '.' in remote else ''
        if ext in skip_exts:
            n_skipped += 1
            print(f" SKIP {remote} (extension '.{ext}' excluded)")
            continue
        n_files += 1
        if args.dry_run:
            print(f' FILE {remote}')
        else:
            try:
                download_file(opener, args.base, remote, args.dest, root_prefix=root_prefix)
            except Exception as e:
                # Keep going so one bad file does not block the rest, but
                # remember the failure for the exit code.
                n_failed += 1
                print(f' FAIL {remote}: {e}', file=sys.stderr)
        if args.limit and n_files >= args.limit:
            print(f'-- stopped after {args.limit} (--limit) --')
            break

    # This summary line is parsed by log monitors; its format is unchanged.
    print(f'Done. {n_files} files processed, {n_skipped} skipped by --skip-ext.')
    if n_failed:
        print(f'ERROR: {n_failed} of {n_files} files failed to download.', file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1777047525 Token ef8ooumh5qdh42r0k410mjoq0c

View File

@@ -0,0 +1,4 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
.cicresearch.ca TRUE / TRUE 1776910223 Token 8kfh51fj8u46lum8kvu6safonr

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1777518468 Token qn181atofvua6sn8ouv1hlcoo8

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1776990463 Token t4sfffhk5mnttgkh300buhg0it

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Download CICAPT-IIoT2024 (entire dataset tree) from UNB CIC via cic_download.py.
#
# Prereq: Token cookie for .cicresearch.ca saved as
#     scripts/download/cookies_cicapt_iiot2024.txt
#
# Remote tree is crawled in a single pass under ROOT="CICAPT-IIoT Dataset"
# (the top-level folder at
#  https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset ).
# Every leaf file — pcap, csv, whatever — is mirrored under
#     datasets/cicapt_iiot2024/raw/
# preserving the remote subdirectory layout.
#
# Usage:
#   bash download_cicapt_iiot2024.sh                    # full download
#   DRY_RUN=1 bash download_cicapt_iiot2024.sh          # enumerate only
#   LIMIT=5 bash download_cicapt_iiot2024.sh            # smoke test (first 5 files)
#   SKIP_EXT=zip,7z bash download_cicapt_iiot2024.sh    # skip archives
#
# Environment overrides: DEST, COOKIES, BASE, ROOT, and PYTHON (interpreter
# used to run cic_download.py; defaults to python3).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicapt_iiot2024/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicapt_iiot2024.txt}"
BASE="${BASE:-https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/}"
ROOT="${ROOT:-CICAPT-IIoT Dataset}"
PYTHON_BIN="${PYTHON:-python3}"

# Optional flags forwarded to cic_download.py; stays empty on a plain run.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

echo "=== ${ROOT} -> ${DEST_ROOT} ==="
# ${EXTRA[@]+...} expands to nothing when EXTRA is empty. A bare "${EXTRA[@]}"
# is treated as an unbound variable under `set -u` by bash < 4.4 (e.g. the
# bash 3.2 shipped with macOS) and would abort every run without options.
"${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${ROOT}" --dest "${DEST_ROOT}" ${EXTRA[@]+"${EXTRA[@]}"}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Download CICDDoS2019 (CSV, optionally PCAP) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
#     https://www.unb.ca/cic/datasets/ddos-2019.html
# in a browser, then save the issued Token cookie (Netscape format) as
#     scripts/download/cookies_cicddos2019.txt
# Tokens are scoped per-dataset — the CICIoT2023 / ISCXTor cookies will NOT
# work here.
#
# PCAPs for this dataset are already downloaded (see datasets/cicddos2019/raw/
# pcap/). Default WHAT=csv reflects that. Switch to WHAT=pcap or WHAT=both if
# you need to re-fetch.
#
# Usage:
#   bash download_cicddos2019.sh                 # CSVs only (default)
#   WHAT=pcap bash download_cicddos2019.sh       # PCAPs only
#   WHAT=both bash download_cicddos2019.sh       # everything
#   DRY_RUN=1 bash download_cicddos2019.sh       # enumerate without downloading
#   CSV_ROOT=CSV bash download_cicddos2019.sh    # override root if server uses a different name
#   PYTHON=/path/to/python bash download_cicddos2019.sh   # choose the interpreter (default: python)
#
# First-time tip: run with DRY_RUN=1 to discover the exact remote root names.
# The CIC site is inconsistent across datasets (CSV / CSVs / CSV-01-12 ...).
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicddos2019/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicddos2019.txt}"
BASE="https://cicresearch.ca/CICDataset/CICDDoS2019/"
WHAT="${WHAT:-csv}"
PYTHON_BIN="${PYTHON:-python}"

# Default root names. Override via env if dry-run shows a different layout.
PCAP_ROOT="${PCAP_ROOT:-PCAPs}"
CSV_ROOT="${CSV_ROOT:-CSVs}"

# Optional flags forwarded to cic_download.py; stays empty on a plain run.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

# run REMOTE_ROOT LOCAL_DEST: mirror one remote root into one local directory.
run() {
    local root="$1" dest="$2"
    echo "=== ${root} -> ${dest} ==="
    # ${EXTRA[@]+...} expands to nothing when EXTRA is empty. A bare
    # "${EXTRA[@]}" is an unbound-variable error under `set -u` on bash < 4.4.
    "${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
        --cookies "${COOKIES}" --base "${BASE}" \
        --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}

case "${WHAT}" in
    pcap) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap" ;;
    csv)  run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
    both) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
          run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
    *)    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
esac

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Download CICIoT2023 (PCAP + CSV) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
#     https://www.unb.ca/cic/datasets/iotdataset-2023.html
# in a browser, then save the issued Token cookie in Netscape format as
#     scripts/download/cookies_ciciot2023.txt
# The cookie domain must be .cicresearch.ca and the name must be "Token".
#
# Usage:
#   bash download_ciciot2023.sh                 # both PCAP and CSV
#   WHAT=pcap bash download_ciciot2023.sh       # PCAP only
#   WHAT=csv bash download_ciciot2023.sh        # CSV only
#   DRY_RUN=1 bash download_ciciot2023.sh       # enumerate without downloading
#   PYTHON=/path/to/python bash download_ciciot2023.sh   # choose the interpreter (default: python)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/ciciot2023/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_ciciot2023.txt}"
BASE="https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/"
WHAT="${WHAT:-both}"
PYTHON_BIN="${PYTHON:-python}"

# Optional flags forwarded to cic_download.py; stays empty on a plain run.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

# run REMOTE_ROOT LOCAL_DEST: mirror one remote root into one local directory.
run() {
    local root="$1" dest="$2"
    echo "=== ${root} -> ${dest} ==="
    # ${EXTRA[@]+...} expands to nothing when EXTRA is empty. A bare
    # "${EXTRA[@]}" is an unbound-variable error under `set -u` on bash < 4.4.
    "${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
        --cookies "${COOKIES}" --base "${BASE}" \
        --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}

case "${WHAT}" in
    pcap) run PCAP "${DEST_ROOT}/pcap" ;;
    csv)  run CSV "${DEST_ROOT}/csv" ;;
    both) run PCAP "${DEST_ROOT}/pcap"
          run CSV "${DEST_ROOT}/csv" ;;
    *)    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
esac

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env bash
# Download ISCXTor2016 (PCAP + CSV) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
#     https://www.unb.ca/cic/datasets/tor.html
# in a browser, then save the issued Token cookie (Netscape format) as
#     scripts/download/cookies_iscxtor2016.txt
# Tokens are scoped per-dataset — the CICIoT2023 cookie will NOT work here.
#
# Usage:
#   bash download_iscxtor2016.sh
#   WHAT=pcap|csv|both  DEST=...  COOKIES=...  DRY_RUN=1  LIMIT=N
#   PCAP_ROOT=...  CSV_ROOT=...  SKIP_EXT=zip,7z  PYTHON=/path/to/python
#
# Note: the remote sub-path names ("Pcaps" / "CSVs" or similar) are only
# visible after authenticating. Run with DRY_RUN=1 first to confirm the
# tree; if the roots differ, set PCAP_ROOT=... and/or CSV_ROOT=....
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/iscxtor2016/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_iscxtor2016.txt}"
BASE="https://cicresearch.ca/CICDataset/ISCX-Tor-NonTor-2017/"
WHAT="${WHAT:-both}"

# Default root names (override via env if the server uses different casing)
PCAP_ROOT="${PCAP_ROOT:-PCAPs}"
CSV_ROOT="${CSV_ROOT:-CSVs}"

# Optional flags forwarded to cic_download.py; stays empty on a plain run.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

# Print the interpreter to use. Precedence: $PYTHON, the repo's .venv,
# `python` on PATH, then `python3` on PATH. Exits 127 when none is found.
resolve_python() {
    if [[ -n "${PYTHON:-}" ]]; then
        printf '%s\n' "${PYTHON}"
        return
    fi
    if [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
        printf '%s\n' "${REPO_ROOT}/.venv/bin/python"
        return
    fi
    if command -v python >/dev/null 2>&1; then
        command -v python
        return
    fi
    if command -v python3 >/dev/null 2>&1; then
        command -v python3
        return
    fi
    echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
    exit 127
}

PYTHON_BIN="$(resolve_python)"

# run REMOTE_ROOT LOCAL_DEST: mirror one remote root into one local directory.
run() {
    local root="$1" dest="$2"
    echo "=== ${root} -> ${dest} ==="
    # ${EXTRA[@]+...} expands to nothing when EXTRA is empty. A bare
    # "${EXTRA[@]}" is an unbound-variable error under `set -u` on bash < 4.4.
    "${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
        --cookies "${COOKIES}" --base "${BASE}" \
        --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}

case "${WHAT}" in
    pcap) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap" ;;
    csv)  run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
    both) run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
          run "${CSV_ROOT}" "${DEST_ROOT}/csv" ;;
    *)    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2; exit 1 ;;
esac

View File

@@ -0,0 +1,114 @@
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
import numpy as np
import torch
from sklearn.metrics import average_precision_score, roc_auc_score
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from data import _preprocess_packet_batch
from detect import _load_model
from packet_store import PacketShardStore
@torch.no_grad()
def _score_indices(*, store: PacketShardStore, indices: np.ndarray, model, device: torch.device, preprocess: str, mean: np.ndarray, std: np.ndarray, clip_lo: np.ndarray | None, clip_hi: np.ndarray | None, split_tag: str, split_seed: int, batch: int, materialize_batch: int, n_steps: int) -> dict[str, np.ndarray]:
    """Score the flows selected by ``indices`` with every trajectory metric.

    Packets are read from ``store`` in chunks of ``materialize_batch`` flows,
    passed through ``_preprocess_packet_batch`` and then fed to ``model`` in
    mini-batches of ``batch`` flows, with gradients disabled.

    Returns:
        A dict keyed by ``terminal_norm``, ``arc_length``, ``kinetic_energy``
        and ``velocity_score``. Each value is the concatenation of the
        per-mini-batch model outputs, mini-batches taken in ``indices`` order.

    Raises:
        ValueError: from ``np.concatenate`` when ``indices`` is empty.
    """
    trajectory_keys = ('terminal_norm', 'arc_length', 'kinetic_energy')
    collected: dict[str, list[np.ndarray]] = {name: [] for name in trajectory_keys}
    collected['velocity_score'] = []

    n_total = len(indices)
    report_stride = max(1, n_total // 4)  # roughly four progress lines per call
    report_threshold = 0

    for chunk_start in range(0, n_total, materialize_batch):
        chunk_ids = indices[chunk_start:chunk_start + materialize_batch]
        packets, lengths = store.read_packets(chunk_ids, T=model.cfg.T)
        packets = _preprocess_packet_batch(
            packets,
            lengths,
            preprocess=preprocess,
            mean=mean,
            std=std,
            clip_lo=clip_lo,
            clip_hi=clip_hi,
            split_tag=split_tag,
            split_seed=split_seed,
            flow_ids=chunk_ids,
        )

        for lo in range(0, len(chunk_ids), batch):
            hi = lo + batch
            x = torch.from_numpy(packets[lo:hi]).float().to(device)
            x_lens = torch.from_numpy(lengths[lo:hi]).long().to(device)

            trajectory = model.trajectory_metrics(x, lens=x_lens, cond=None, n_steps=n_steps)
            for name in trajectory_keys:
                collected[name].append(trajectory[name].cpu().numpy())

            velocity = model.velocity_score(x, lens=x_lens, cond=None, t_eval=(0.5, 0.75, 1.0))
            collected['velocity_score'].append(velocity.cpu().numpy())

        # Progress is reported per materialised chunk, never per mini-batch.
        n_done = min(chunk_start + len(chunk_ids), n_total)
        if n_done == n_total or n_done >= report_threshold:
            print(f'[{split_tag}] {n_done:,}/{n_total:,}', flush=True)
            report_threshold = n_done + report_stride

    return {name: np.concatenate(chunks) for name, chunks in collected.items()}
def run(args: argparse.Namespace) -> None:
    """Evaluate a saved model on a target packet store and write AUROC/AUPRC.

    For each seed in ``args.seeds`` this samples ``args.n_benign`` benign and
    ``args.n_attack`` non-benign flows (without replacement) from the flows
    whose packet length is at least ``args.min_len``, scores both groups with
    ``_score_indices`` and computes AUROC / AUPRC per metric with attacks as
    the positive class. Per-seed results plus their mean and population
    standard deviation are written as JSON to ``args.output``.

    Raises:
        ValueError: if the target store has fewer eligible benign or attack
            flows than requested.
    """
    if args.device == 'auto':
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device_name = args.device
    device = torch.device(device_name)

    save_dir = Path(args.save_dir)
    # NOTE: weights_only=False lets torch.load unpickle arbitrary objects;
    # only point --save-dir at checkpoints you trust.
    ckpt = torch.load(save_dir / 'model.pt', map_location='cpu', weights_only=False)
    preprocess = str(ckpt.get('preprocess', 'zscore'))
    mean = np.asarray(ckpt['packet_mean'], dtype=np.float32)
    std = np.asarray(ckpt['packet_std'], dtype=np.float32)
    # Clip bounds are optional in the checkpoint.
    clip_lo = None
    if 'clip_lo' in ckpt:
        clip_lo = np.asarray(ckpt['clip_lo'], dtype=np.float32)
    clip_hi = None
    if 'clip_hi' in ckpt:
        clip_hi = np.asarray(ckpt['clip_hi'], dtype=np.float32)
    model = _load_model(save_dir, device)

    store = PacketShardStore.open(Path(args.target_store))
    flows = store.read_flows(columns=['flow_id', 'label'])
    labels = flows['label'].to_numpy().astype(str)
    # NOTE(review): this mask assumes store.manifest and flows are row-aligned.
    lens = store.manifest['packet_length'].to_numpy(dtype=np.int32)
    keep = lens >= int(args.min_len)

    benign_idx = flows.loc[keep & (labels == args.benign_label), 'flow_id'].to_numpy(dtype=np.int64)
    attack_df = flows.loc[keep & (labels != args.benign_label), ['flow_id', 'label']]
    attack_idx_all = attack_df['flow_id'].to_numpy(dtype=np.int64)
    attack_labels_all = attack_df['label'].to_numpy().astype(str)

    if len(benign_idx) < args.n_benign:
        raise ValueError(f'target has only {len(benign_idx)} benign rows, need {args.n_benign}')
    if len(attack_idx_all) < args.n_attack:
        raise ValueError(f'target has only {len(attack_idx_all)} attack rows, need {args.n_attack}')
    print(f'[target] store={args.target_store} benign_pool={len(benign_idx):,} attack_pool={len(attack_idx_all):,} T={model.cfg.T} preprocess={preprocess}', flush=True)

    results: dict[str, object] = {
        'save_dir': str(save_dir),
        'target_store': str(args.target_store),
        'n_benign': int(args.n_benign),
        'n_attack': int(args.n_attack),
        'seeds': [],
        'mean': {},
        'std': {},
    }
    metrics = ('terminal_norm', 'arc_length', 'kinetic_energy', 'velocity_score')
    per_metric_values = {f'{metric}_auroc': [] for metric in metrics}
    per_metric_values.update({f'{metric}_auprc': [] for metric in metrics})

    # Arguments shared by the benign and the attack scoring pass.
    shared = dict(
        store=store,
        model=model,
        device=device,
        preprocess=preprocess,
        mean=mean,
        std=std,
        clip_lo=clip_lo,
        clip_hi=clip_hi,
        batch=args.batch,
        materialize_batch=args.materialize_batch,
        n_steps=args.n_steps,
    )

    for seed in args.seeds:
        # Draw order (benign first, then attack) fixes the sample for a seed.
        rng = np.random.default_rng(int(seed))
        b_idx = np.sort(rng.choice(benign_idx, args.n_benign, replace=False))
        a_pos = np.sort(rng.choice(len(attack_idx_all), args.n_attack, replace=False))
        a_idx = attack_idx_all[a_pos]
        a_labels = attack_labels_all[a_pos]
        print(f'[seed={seed}] scoring benign={len(b_idx):,} attack={len(a_idx):,}', flush=True)

        b_scores = _score_indices(indices=b_idx, split_tag='val', split_seed=int(seed), **shared)
        a_scores = _score_indices(indices=a_idx, split_tag='attack', split_seed=int(seed), **shared)

        label_names, label_counts = np.unique(a_labels, return_counts=True)
        metric_results: dict[str, dict[str, float]] = {}
        seed_result: dict[str, object] = {
            'seed': int(seed),
            'attack_label_counts': {str(k): int(v) for k, v in zip(label_names, label_counts)},
            'metrics': metric_results,
        }
        for metric in metrics:
            benign_s = b_scores[metric]
            attack_s = a_scores[metric]
            # Benign = 0, attack = 1.
            y = np.concatenate((np.zeros(len(benign_s)), np.ones(len(attack_s))))
            s = np.concatenate((benign_s, attack_s))
            # Replace non-finite scores so the sklearn metrics accept them.
            s = np.nan_to_num(s, nan=0.0, posinf=1000000.0, neginf=-1000000.0)
            auroc = float(roc_auc_score(y, s))
            auprc = float(average_precision_score(y, s))
            metric_results[metric] = {'auroc': auroc, 'auprc': auprc}
            per_metric_values[f'{metric}_auroc'].append(auroc)
            per_metric_values[f'{metric}_auprc'].append(auprc)
            print(f'[seed={seed}] {metric:<16s} AUROC={auroc:.4f} AUPRC={auprc:.4f}', flush=True)
        results['seeds'].append(seed_result)

    for key, values in per_metric_values.items():
        arr = np.asarray(values, dtype=np.float64)
        results['mean'][key] = float(arr.mean())
        results['std'][key] = float(arr.std(ddof=0))

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(results, indent=2, sort_keys=True) + '\n')
    print(f'[saved] {args.output}', flush=True)

    for metric in metrics:
        m_auroc = results['mean'][metric + '_auroc']
        s_auroc = results['std'][metric + '_auroc']
        m_auprc = results['mean'][metric + '_auprc']
        s_auprc = results['std'][metric + '_auprc']
        print(f"[mean] {metric:<16s} AUROC={m_auroc:.4f}±{s_auroc:.4f} AUPRC={m_auprc:.4f}±{s_auprc:.4f}", flush=True)
def main() -> None:
    """Parse the command line and pass the resulting namespace to ``run``."""
    parser = argparse.ArgumentParser(description=__doc__)

    # Required locations: checkpoint directory, target store, JSON output file.
    for flag in ('--save-dir', '--target-store', '--output'):
        parser.add_argument(flag, type=Path, required=True)

    # Sample sizes per seed.
    for flag in ('--n-benign', '--n-attack'):
        parser.add_argument(flag, type=int, default=10000)

    parser.add_argument('--seeds', type=int, nargs='+', default=[0, 1, 2, 3, 4])
    parser.add_argument('--benign-label', type=str, default='normal')

    # Integer knobs, registered in the same order as before.
    int_options = (
        ('--min-len', 2),
        ('--n-steps', 16),
        ('--batch', 4096),
        ('--materialize-batch', 32768),
    )
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)

    parser.add_argument('--device', type=str, default='auto')
    run(parser.parse_args())


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,132 @@
from __future__ import annotations
import argparse
import csv
import sys
from datetime import datetime
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).resolve().parent))
from extract_lib import extract_dataset, _canonical_key
from csv_adapter import CsvFlowAdapter, parse_csv_rows
# Logical join field -> column name, handed to the adapter as `join_cols`
# (presumably the header names used in the CICDDoS2019 flow CSVs).
JOIN_COLS = {'src_ip': 'Source IP', 'src_port': 'Source Port', 'dst_ip': 'Destination IP', 'dst_port': 'Destination Port', 'protocol': 'Protocol', 'timestamp': 'Timestamp'}
LABEL_COL = 'Label'
# strptime formats tried in order by _parse_timestamp.
TIMESTAMP_FORMATS = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S')
# Every spelling in BENIGN_ALIASES is normalised to BENIGN_TOKEN.
BENIGN_ALIASES = {'BENIGN', 'Benign', 'benign'}
BENIGN_TOKEN = 'normal'
# No label patterns are dropped for this dataset.
DROP_LABEL_PATTERNS: tuple[str, ...] = ()
# Alternative label spelling -> canonical spelling.
LABEL_ALIASES = {'UDP-lag': 'UDPLag'}
# Shard key (also the CSV sub-directory name under --csv-dir) -> filename
# prefix of that shard's pcap chunks.
SHARDS = {'01-12': 'SAT-01-12-2018', '03-11': 'SAT-03-11-2018'}
# Per-shard seconds passed to the CSV parser as `time_offset_seconds`
# (43200 s = 12 h, 39600 s = 11 h). main() adds --time-offset on top; that
# flag's help text states these defaults assume a UTC+8 host.
SHARD_OFFSETS_DEFAULT = {'01-12': 43200.0, '03-11': 39600.0}
# Default input/output locations, relative to the current working directory.
DEFAULT_CSV_DIR = Path('datasets/cicddos2019/raw/csv')
DEFAULT_PCAP_DIR = Path('datasets/cicddos2019/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/cicddos2019/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/cicddos2019/processed/flows.parquet')
# Bundles the dataset-specific settings above for parse_csv_rows.
CICDDOS2019_ADAPTER = CsvFlowAdapter(join_cols=JOIN_COLS, label_col=LABEL_COL, timestamp_formats=TIMESTAMP_FORMATS, benign_aliases=frozenset(BENIGN_ALIASES), benign_token=BENIGN_TOKEN, drop_label_patterns=DROP_LABEL_PATTERNS, label_aliases=LABEL_ALIASES)
def _normalize_label(raw: str) -> str:
    """Return the canonical label for a raw CSV label.

    Surrounding whitespace is stripped. Any benign alias becomes
    ``BENIGN_TOKEN``; other labels are mapped through ``LABEL_ALIASES`` and
    returned unchanged when no alias is registered.
    """
    label = raw.strip()
    return BENIGN_TOKEN if label in BENIGN_ALIASES else LABEL_ALIASES.get(label, label)
def _parse_timestamp(ts: str) -> float | None:
    """Convert a CSV timestamp string to POSIX seconds.

    Each format in ``TIMESTAMP_FORMATS`` is tried in order. The parsed value
    is a naive ``datetime``, so ``timestamp()`` interprets it in the host's
    local timezone.

    Returns:
        The POSIX timestamp, or ``None`` when the string is blank or matches
        none of the formats.
    """
    text = ts.strip()
    if text:
        for fmt in TIMESTAMP_FORMATS:
            try:
                return datetime.strptime(text, fmt).timestamp()
            except ValueError:
                pass  # try the next format
    return None
def _find_pcaps_for_shard(pcap_dir: Path, prefix: str) -> list[Path]:
found: list[Path] = []
seen = set()
for pat in (f'{prefix}*', f'{prefix}*.pcap', f'{prefix}*.pcapng'):
for p in sorted(pcap_dir.glob(pat)):
if p.is_file() and p not in seen:
found.append(p)
seen.add(p)
return found
def _parse_csv(csv_path: Path, row_idx_start: int, time_offset_seconds: float, max_per_class: int | None, max_benign: int | None, rng: np.random.Generator) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int, dict[str, int]]:
    """Parse one CICDDoS2019 CSV via ``parse_csv_rows`` with the dataset adapter.

    Every argument is forwarded unchanged; only ``adapter`` is fixed to
    ``CICDDOS2019_ADAPTER``. The 5-tuple is returned as-is. main() unpacks it
    as (rows keyed by canonical key, labels, emitted count, skipped count,
    per-class counts).
    """
    subsampling = {'max_per_class': max_per_class, 'max_benign': max_benign, 'rng': rng}
    return parse_csv_rows(
        csv_path=csv_path,
        row_idx_start=row_idx_start,
        time_offset_seconds=time_offset_seconds,
        adapter=CICDDOS2019_ADAPTER,
        **subsampling,
    )
def main() -> None:
    """CLI entry point: join CICDDoS2019 flow CSVs with pcap chunks and extract.

    Steps:
      1. Parse every ``*.csv`` under ``<csv-dir>/<shard>/`` for each selected
         shard, applying the per-shard time offset plus ``--time-offset`` and
         the per-class / benign row caps.
      2. Locate the pcap chunks of each shard by filename prefix.
      3. Hand everything to ``extract_dataset``.

    Shards whose CSV directory is missing or empty are reported and skipped
    in step 1.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--csv-dir', type=Path, default=DEFAULT_CSV_DIR)
    ap.add_argument('--pcap-dir', type=Path, default=DEFAULT_PCAP_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.')
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Matched flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument('--match-strategy', choices=('auto', 'hungarian', 'stream_nearest'), default='auto', help='CSV↔pcap matching strategy. auto uses stream_nearest for --out-store and hungarian for legacy npz output.')
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--time-tolerance', type=float, default=2.0)
    ap.add_argument('--time-offset', type=float, default=0.0, help='Extra seconds added to per-shard SHARD_OFFSETS_DEFAULT. Default 0 assumes a UTC+8 host (matches the SHARD_OFFSETS_DEFAULT values: 03-11=39600, 01-12=43200). If the per-shard time-delta diagnostic shows a non-zero median, add that to this flag.')
    ap.add_argument('--jobs', type=int, default=0, help='0=auto (min(n_shards, cpu_count)). 1=serial.')
    ap.add_argument('--shards', type=str, nargs='*', default=None, choices=sorted(SHARDS.keys()), help='Subset of shards to process (default: all).')
    ap.add_argument('--max-per-class', type=int, default=500000, help='Per-file, per-attack-class row cap (random subsample). Default 500k. Pass 0 to disable.')
    ap.add_argument('--max-benign', type=int, default=None, help='Per-file benign row cap. Default: uncapped (keep all).')
    ap.add_argument('--max-packets-per-pcap', type=int, default=None, help='Cap per-pcap packets (smoke only).')
    ap.add_argument('--max-pcap-files-per-shard', type=int, default=None, help='Only process the first N pcap chunks per shard (smoke only).')
    ap.add_argument('--sample-seed', type=int, default=42)
    args = ap.parse_args()

    # 0 (and None) mean "no cap".
    max_per_class = args.max_per_class or None
    max_benign = args.max_benign or None
    # One generator is shared by all files, so subsampling depends on file order.
    rng = np.random.default_rng(args.sample_seed)
    shards = args.shards or sorted(SHARDS.keys())

    csv_rows_by_day: dict[str, dict] = {}
    all_labels: list[str] = []
    total_rows = 0
    total_skip = 0
    aggregate_counts: dict[str, int] = {}

    print(f'=== parsing CSVs in {args.csv_dir} ===')
    print(f' max_per_class={max_per_class} max_benign={max_benign}')
    print(f' additive time_offset={args.time_offset}s (on top of per-shard defaults)')
    for shard in shards:
        shard_offset = SHARD_OFFSETS_DEFAULT.get(shard, 0.0) + args.time_offset
        print(f'[{shard}] effective time_offset={shard_offset}s (= default {SHARD_OFFSETS_DEFAULT.get(shard, 0.0)} + CLI {args.time_offset})')
        shard_dir = args.csv_dir / shard
        if not shard_dir.is_dir():
            print(f'[{shard}] {shard_dir} not found — skipping')
            continue
        csvs = sorted(shard_dir.glob('*.csv'))
        if not csvs:
            print(f'[{shard}] no CSVs under {shard_dir}')
            continue

        shard_rows: dict[tuple, list[tuple[int, float]]] = {}
        for csv_path in csvs:
            # row_idx_start=total_rows continues the row numbering from the
            # rows emitted so far, in step with all_labels growing below.
            (day_rows, labels, n_emit, n_skip, cls_counts) = _parse_csv(csv_path, row_idx_start=total_rows, time_offset_seconds=shard_offset, max_per_class=max_per_class, max_benign=max_benign, rng=rng)
            for (ck, rs) in day_rows.items():
                shard_rows.setdefault(ck, []).extend(rs)
            all_labels.extend(labels)
            total_rows += n_emit
            total_skip += n_skip
            for (lbl, c) in cls_counts.items():
                aggregate_counts[lbl] = aggregate_counts.get(lbl, 0) + c
            print(f'[{shard}/{csv_path.name}] emitted {n_emit:,} skipped {n_skip:,} cls={dict(sorted(cls_counts.items()))}')
        csv_rows_by_day[shard] = shard_rows
        # Report rows and keys separately: the old message printed the row
        # count but labelled it "canonical keys".
        n_shard_rows = sum(len(v) for v in shard_rows.values())
        print(f'[{shard}] shard total: {n_shard_rows:,} rows across {len(shard_rows):,} canonical keys')

    labels_by_row = np.asarray(all_labels, dtype=object)
    print(f'\nTotal CSV rows emitted: {total_rows:,} skipped: {total_skip:,}')
    print('Aggregate label distribution (post-subsample):')
    for (lbl, cnt) in sorted(aggregate_counts.items(), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>12,}')

    print(f'\n=== locating pcap chunks in {args.pcap_dir} ===')
    pcap_files_by_day: dict[str, list[Path]] = {}
    for shard in shards:
        prefix = SHARDS[shard]
        files = _find_pcaps_for_shard(args.pcap_dir, prefix)
        if args.max_pcap_files_per_shard is not None:
            files = files[:args.max_pcap_files_per_shard]
        pcap_files_by_day[shard] = files
        # Separator added: the prefix and the count used to be printed fused.
        print(f'[{shard}] prefix {prefix!r} -> {len(files):,} pcap chunks')

    print('\n=== extracting packet sequences ===')
    extract_dataset(csv_rows_by_day=csv_rows_by_day, labels_by_row=labels_by_row, pcap_files_by_day=pcap_files_by_day, out_packets=args.out_packets, out_flows=args.out_flows, out_store=args.out_store, shard_size=args.shard_size, worker_flush_size=args.worker_flush_size, spool_dir=args.spool_dir, match_strategy=None if args.match_strategy == 'auto' else args.match_strategy, T_full=args.T_full, idle_timeout=args.idle_timeout, time_tolerance_seconds=args.time_tolerance, max_packets_per_pcap=args.max_packets_per_pcap, n_jobs=args.jobs)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,104 @@
from __future__ import annotations
import argparse
import csv
import sys
from datetime import datetime
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).resolve().parent))
from extract_lib import extract_dataset, _canonical_key
from csv_adapter import CsvFlowAdapter, parse_csv_rows
# Logical join field -> column name, handed to the adapter as `join_cols`
# (presumably the header names used in the CICIDS2017 flow CSVs).
JOIN_COLS = {'src_ip': 'Src IP', 'src_port': 'Src Port', 'dst_ip': 'Dst IP', 'dst_port': 'Dst Port', 'protocol': 'Protocol', 'timestamp': 'Timestamp'}
LABEL_COL = 'Label'
# strptime formats tried in order by _parse_timestamp.
TIMESTAMP_FORMATS = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%d/%m/%Y %H:%M:%S', '%d/%m/%Y %H:%M')
# Every spelling in BENIGN_ALIASES is normalised to BENIGN_TOKEN.
BENIGN_ALIASES = {'BENIGN', 'Benign', 'benign'}
BENIGN_TOKEN = 'normal'
# Handed to the adapter as `drop_label_patterns`; presumably rows whose label
# matches are discarded — confirm against csv_adapter.
DROP_LABEL_PATTERNS = ('- Attempted',)
# Day names; main() reads <csv-dir>/<day>.csv and globs pcaps containing the name.
SHARDS = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday')
# Default input/output locations, relative to the current working directory.
DEFAULT_CSV_DIR = Path('datasets/cicids2017/raw/csv')
DEFAULT_PCAP_DIR = Path('datasets/cicids2017/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/cicids2017/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/cicids2017/processed/flows.parquet')
# Bundles the dataset-specific settings above for parse_csv_rows.
CICIDS2017_ADAPTER = CsvFlowAdapter(join_cols=JOIN_COLS, label_col=LABEL_COL, timestamp_formats=TIMESTAMP_FORMATS, benign_aliases=frozenset(BENIGN_ALIASES), benign_token=BENIGN_TOKEN, drop_label_patterns=DROP_LABEL_PATTERNS)
def _normalize_label(raw: str) -> str:
    """Strip ``raw`` and collapse any benign alias to ``BENIGN_TOKEN``.

    Labels that are not benign aliases are returned stripped but otherwise
    unchanged.
    """
    label = raw.strip()
    if label in BENIGN_ALIASES:
        return BENIGN_TOKEN
    return label
def _parse_timestamp(ts: str) -> float | None:
    """Convert a CSV timestamp string to POSIX seconds.

    The formats in ``TIMESTAMP_FORMATS`` are tried in order and the first one
    that parses wins. The parsed ``datetime`` is naive, so ``timestamp()``
    interprets it in the host's local timezone.

    Returns:
        The POSIX timestamp, or ``None`` for a blank string or one that
        matches no known format.
    """
    candidate = ts.strip()
    if candidate == '':
        return None
    for layout in TIMESTAMP_FORMATS:
        try:
            seconds = datetime.strptime(candidate, layout).timestamp()
        except ValueError:
            continue
        return seconds
    return None
def _find_pcaps_for_day(pcap_dir: Path, day: str) -> list[Path]:
day_lc = day.lower()
day_cap = day.capitalize()
pats = [f'*{day_lc}*.pcap', f'*{day_lc}*.pcapng', f'*{day_cap}*.pcap', f'*{day_cap}*.pcapng']
found: list[Path] = []
seen = set()
for pat in pats:
for p in sorted(pcap_dir.glob(pat)):
if p not in seen:
found.append(p)
seen.add(p)
return found
def _parse_day_csv(csv_path: Path, row_idx_start: int, time_offset_seconds: float) -> tuple[dict[tuple, list[tuple[int, float]]], list[str], int, int]:
    """Parse one CICIDS2017 day CSV via ``parse_csv_rows`` with the dataset adapter.

    Returns the first four elements of the ``parse_csv_rows`` result; the
    fifth is discarded. main() unpacks them as (rows keyed by canonical key,
    labels, emitted count, skipped count).
    """
    parsed = parse_csv_rows(
        csv_path=csv_path,
        row_idx_start=row_idx_start,
        time_offset_seconds=time_offset_seconds,
        adapter=CICIDS2017_ADAPTER,
    )
    day_rows, labels, n_emit, n_skip, _unused = parsed
    return (day_rows, labels, n_emit, n_skip)
def main() -> None:
    """CLI entry point: join CICIDS2017 day CSVs with pcaps and extract.

    Parses ``<csv-dir>/<day>.csv`` for each selected day (missing files are
    reported and skipped), locates the pcaps of each day by name, and hands
    everything to ``extract_dataset``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--csv-dir', type=Path, default=DEFAULT_CSV_DIR)
    ap.add_argument('--pcap-dir', type=Path, default=DEFAULT_PCAP_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument(
        '--out-store', type=Path, default=None,
        help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.',
    )
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Matched flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument(
        '--match-strategy', choices=('auto', 'hungarian', 'stream_nearest'), default='auto',
        help='CSV↔pcap matching strategy. auto uses stream_nearest for --out-store and hungarian for legacy npz output.',
    )
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--time-tolerance', type=float, default=2.0, help='Max |t_csv - t_pcap| seconds for flow match.')
    ap.add_argument('--time-offset', type=float, default=0.0, help='Seconds added to CSV timestamps before matching.')
    ap.add_argument('--jobs', type=int, default=0, help='0 = auto (min(n_days, cpu_count)). 1 = serial.')
    ap.add_argument('--days', type=str, nargs='*', default=None, help='Subset of shards to process (default: all 5).')
    ap.add_argument('--max-packets-per-pcap', type=int, default=None, help='Cap per-pcap packets (smoke tests only).')
    args = ap.parse_args()

    days = SHARDS if not args.days else tuple(args.days)

    csv_rows_by_day: dict[str, dict] = {}
    all_labels: list[str] = []
    total_rows = 0
    total_skip = 0
    print(f'=== parsing CSVs in {args.csv_dir} ===')
    for day in days:
        csv_path = args.csv_dir / f'{day}.csv'
        if csv_path.exists():
            # row_idx_start continues the numbering from the rows emitted so far.
            day_rows, labels, n_emit, n_skip = _parse_day_csv(
                csv_path,
                row_idx_start=total_rows,
                time_offset_seconds=args.time_offset,
            )
            csv_rows_by_day[day] = day_rows
            all_labels += labels
            total_rows += n_emit
            total_skip += n_skip
            print(f'[{day}] emitted {n_emit:,} rows skipped {n_skip:,} canonical keys {len(day_rows):,}')
        else:
            print(f'[{day}] {csv_path} not found, skipping')
    labels_by_row = np.asarray(all_labels, dtype=object)
    print(f'Total CSV rows emitted: {total_rows:,} (skipped {total_skip:,})')

    print(f'\n=== locating pcap files in {args.pcap_dir} ===')
    pcap_files_by_day: dict[str, list[Path]] = {}
    for day in days:
        day_files = _find_pcaps_for_day(args.pcap_dir, day)
        pcap_files_by_day[day] = day_files
        names = [pcap.name for pcap in day_files]
        print(f'[{day}] {len(day_files)} pcap(s): {names}')

    print('\n=== extracting packet sequences ===')
    # 'auto' is passed as None so extract_dataset picks the strategy itself.
    strategy = args.match_strategy if args.match_strategy != 'auto' else None
    extract_dataset(
        csv_rows_by_day=csv_rows_by_day,
        labels_by_row=labels_by_row,
        pcap_files_by_day=pcap_files_by_day,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        match_strategy=strategy,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        time_tolerance_seconds=args.time_tolerance,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
    )


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,56 @@
from __future__ import annotations
import argparse
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from extract_lib import extract_labeled_pcaps
# Default input/output locations, relative to the current working directory.
DEFAULT_PCAP_ROOT = Path('datasets/ciciot2023/raw/pcap')
DEFAULT_OUT_PACKETS = Path('datasets/ciciot2023/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/ciciot2023/processed/flows.parquet')
# Class folder whose pcaps are labelled BENIGN_LABEL; every other folder is
# labelled with its lower-cased name (see _label_for_folder).
BENIGN_FOLDER = 'Benign_Final'
BENIGN_LABEL = 'normal'
def _label_for_folder(folder: str) -> str:
    """Map a class folder name to its label.

    ``BENIGN_FOLDER`` maps to ``BENIGN_LABEL``; any other folder maps to its
    lower-cased name.
    """
    return BENIGN_LABEL if folder == BENIGN_FOLDER else folder.lower()
def _find_pcap_files(pcap_root: Path, *, max_pcaps_per_class: int | None) -> list[tuple[Path, str, dict]]:
    """Collect ``(pcap_path, label, extras)`` triples from the class folders.

    Each immediate sub-directory of ``pcap_root`` is one class, visited in
    sorted order. Within a class the ``*.pcap`` files (recursive, sorted)
    come first, then the ``*.pcapng`` files, and the list is truncated to
    ``max_pcaps_per_class`` when that is not ``None``. ``extras`` is a fresh
    ``{'class_folder': <folder name>}`` dict per file.
    """
    triples: list[tuple[Path, str, dict]] = []
    class_dirs = sorted(entry for entry in pcap_root.iterdir() if entry.is_dir())
    for class_dir in class_dirs:
        folder = class_dir.name
        label = _label_for_folder(folder)
        captures = [*sorted(class_dir.rglob('*.pcap')), *sorted(class_dir.rglob('*.pcapng'))]
        if max_pcaps_per_class is not None:
            # Slicing is a no-op when the class has no more than the cap.
            captures = captures[:max_pcaps_per_class]
        triples.extend((capture, label, {'class_folder': folder}) for capture in captures)
    return triples
def main() -> None:
    """CLI entry point: discover labelled CICIoT2023 pcaps and extract them.

    Labels come from the class folder names under ``--pcap-root``. A summary
    of pcaps per label is printed before ``extract_labeled_pcaps`` is called.

    Raises:
        RuntimeError: if no pcap file is found under ``--pcap-root``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--pcap-root', type=Path, default=DEFAULT_PCAP_ROOT)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument(
        '--out-store', type=Path, default=None,
        help='Sharded PacketShardStore output. Recommended for CICIoT2023 since the raw set is large.',
    )
    ap.add_argument('--shard-size', type=int, default=100000)
    ap.add_argument('--worker-flush-size', type=int, default=10000)
    ap.add_argument('--spool-dir', type=Path, default=None)
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--jobs', type=int, default=0)
    ap.add_argument(
        '--max-pcaps-per-class', type=int, default=1,
        help='Cap pcap files per class folder. Default 1 (single pcap per class) keeps extraction tractable.',
    )
    ap.add_argument(
        '--max-packets-per-pcap', type=int, default=2000000,
        help='Cap packets per pcap to bound RAM/IO. Default 2M.',
    )
    args = ap.parse_args()

    triples = _find_pcap_files(args.pcap_root, max_pcaps_per_class=args.max_pcaps_per_class)
    if not triples:
        raise RuntimeError(f'No pcap files found under {args.pcap_root}')

    # Count pcaps per label; the number of keys is the number of distinct labels.
    by_label: dict[str, int] = {}
    for _path, label, _extras in triples:
        by_label[label] = by_label.get(label, 0) + 1
    print(f'[discover] {len(triples)} pcap files across {len(by_label)} labels')
    for label in sorted(by_label):
        print(f' {label:<28s} {by_label[label]} pcap(s)')

    extract_labeled_pcaps(
        pcap_files_with_labels=triples,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
        extra_column_names=('class_folder',),
    )


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,96 @@
from __future__ import annotations
import argparse
import re
import shutil
import subprocess
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
from extract_lib import extract_labeled_pcaps
# Default input/output locations. All paths are relative, so they resolve
# against the working directory the script is launched from.
DEFAULT_PCAP_ARCHIVE_DIR = Path('datasets/iscxtor2016/raw/pcap')
DEFAULT_DECOMPRESS_DIR = Path('datasets/iscxtor2016/raw/pcap_extracted')
DEFAULT_OUT_PACKETS = Path('datasets/iscxtor2016/processed/packets.npz')
DEFAULT_OUT_FLOWS = Path('datasets/iscxtor2016/processed/flows.parquet')
# File names of the two archives looked up inside the archive directory.
NONTOR_ARCHIVE = 'NonTor.tar.xz'
TOR_ARCHIVE = 'Tor.zip'
# Ordered (activity, pattern) pairs consumed by _infer_activity. Patterns are
# searched against the lower-cased pcap file name and the first hit wins, so
# the order of the entries is significant.
ACTIVITY_PATTERNS = (('mail', re.compile('mail|email|imap|pop_|smtp|thunderbird')), ('voip', re.compile('voip|voice|call|facebook_voice|hangouts_voice')), ('audio', re.compile('audio|spotify|skype_audio|hangout_audio|facebook_audio')), ('browsing', re.compile('browsing|browser|ssl_browsing|gate_ssl')), ('chat', re.compile('chat|aim|icq|skypechat')), ('file', re.compile('file[-_]?transfer|ftp|sftp|tftp')), ('p2p', re.compile('p2p|multispeed|multiple[sS]peed|bittor|utor')), ('video', re.compile('video|youtube|vimeo')))
def _infer_activity(pcap_name: str) -> str:
    """Map a pcap file name to a coarse activity tag.

    The name is lower-cased and tested against ``ACTIVITY_PATTERNS`` in
    declaration order; the first pattern found anywhere in the name decides
    the tag. Names that match no pattern are tagged ``'other'``.
    """
    needle = pcap_name.lower()
    hits = (tag for (tag, regex) in ACTIVITY_PATTERNS if regex.search(needle))
    return next(hits, 'other')
def _decompress_archives(archive_dir: Path, out_dir: Path) -> None:
    """Unpack the NonTor and Tor archives into ``out_dir``.

    Each archive is skipped when its target folder (``out_dir/NonTor`` or
    ``out_dir/Tor``) already exists. Existence of that folder is the only
    completeness check, so a folder left behind by an interrupted unpack has
    to be removed by hand before re-running.

    Args:
        archive_dir: Directory holding ``NONTOR_ARCHIVE`` and ``TOR_ARCHIVE``.
        out_dir: Directory the archives are unpacked into; created if needed.

    Raises:
        FileNotFoundError: An archive that still needs unpacking is missing.
        subprocess.CalledProcessError: ``tar`` or ``unzip`` exited non-zero.
    """
    plans = (
        ('NonTor', archive_dir / NONTOR_ARCHIVE, 'tar xf'),
        ('Tor', archive_dir / TOR_ARCHIVE, 'unzip'),
    )
    for (name, archive, tool) in plans:
        target = out_dir / name
        if target.exists():
            print(f'[decompress] {target} already exists — skipping {name} unpack')
            continue
        # Fail early with a readable message instead of an opaque tool error.
        if not archive.is_file():
            raise FileNotFoundError(f'archive not found: {archive}')
        # `tar -C` needs the destination to exist; do it for both tools.
        out_dir.mkdir(parents=True, exist_ok=True)
        if tool == 'tar xf':
            cmd = ['tar', '-xf', str(archive), '-C', str(out_dir)]
        else:
            cmd = ['unzip', '-q', '-o', str(archive), '-d', str(out_dir)]
        print(f'[decompress] {archive} -> {out_dir}/ ({tool})')
        t0 = time.time()
        subprocess.run(cmd, check=True)
        print(f'[decompress] {name} done in {time.time() - t0:.1f}s')
def _find_pcap_files(decompressed_root: Path) -> list[tuple[Path, str, dict]]:
triples: list[tuple[Path, str, dict]] = []
for (sub, coarse) in (('NonTor', 'nontor'), ('Tor', 'tor')):
sub_dir = decompressed_root / sub
if not sub_dir.exists():
print(f'[warn] {sub_dir} not found — skipping')
continue
pcaps = sorted(sub_dir.rglob('*.pcap')) + sorted(sub_dir.rglob('*.pcapng'))
for p in pcaps:
activity = _infer_activity(p.name)
triples.append((p, coarse, {'activity': activity}))
return triples
def main() -> None:
    """Command-line entry point: unpack archives, discover pcaps, extract flows.

    Steps, in order:
      1. Unless ``--skip-decompress`` is given, unpack the archives; with
         ``--decompress-only`` the function returns right after this step.
      2. Discover capture files below ``--decompressed-dir`` and print how
         many were found per coarse label and per inferred activity.
      3. Hand everything to ``extract_labeled_pcaps`` with ``activity`` as the
         extra metadata column.

    Raises:
        RuntimeError: No capture files were found under ``--decompressed-dir``.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--archive-dir', type=Path, default=DEFAULT_PCAP_ARCHIVE_DIR)
    ap.add_argument('--decompressed-dir', type=Path, default=DEFAULT_DECOMPRESS_DIR)
    ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    ap.add_argument('--out-store', type=Path, default=None, help='Optional sharded packet store output. When set, writes store_root/{metadata,manifest,flows,packets/*} instead of the monolithic packets.npz/flows.parquet pair.')
    ap.add_argument('--shard-size', type=int, default=100000, help='Rows per packet shard when --out-store is set.')
    ap.add_argument('--worker-flush-size', type=int, default=10000, help='Flows per temporary worker chunk when --out-store is set.')
    ap.add_argument('--spool-dir', type=Path, default=None, help='Optional temporary spool directory for worker chunks.')
    ap.add_argument('--T-full', type=int, default=256)
    ap.add_argument('--idle-timeout', type=float, default=120.0)
    ap.add_argument('--jobs', type=int, default=0)
    ap.add_argument('--max-packets-per-pcap', type=int, default=None)
    ap.add_argument('--decompress-only', action='store_true', help='Extract the archives then stop (for staged runs).')
    ap.add_argument('--skip-decompress', action='store_true', help='Assume decompressed-dir is already populated.')
    args = ap.parse_args()

    if not args.skip_decompress:
        _decompress_archives(args.archive_dir, args.decompressed_dir)
    if args.decompress_only:
        print('[decompress-only] exiting as requested.')
        return

    triples = _find_pcap_files(args.decompressed_dir)
    if not triples:
        raise RuntimeError(f'No pcap files found under {args.decompressed_dir}')
    print(f'\n[discover] found {len(triples)} pcap file(s)')
    by_coarse: dict[str, int] = {}
    by_act: dict[str, int] = {}
    for (_, lbl, extra) in triples:
        by_coarse[lbl] = by_coarse.get(lbl, 0) + 1
        by_act[extra['activity']] = by_act.get(extra['activity'], 0) + 1
    print(f'  by label: {by_coarse}')
    print(f'  by activity: {by_act}')

    # Report the destination that is actually used: per the --out-store help
    # text, the sharded store replaces the packets.npz/flows.parquet pair.
    if args.out_store is not None:
        print(f'\n[extract] writing sharded store to {args.out_store}')
    else:
        print(f'\n[extract] writing to {args.out_packets} + {args.out_flows}')
    extract_labeled_pcaps(pcap_files_with_labels=triples, out_packets=args.out_packets, out_flows=args.out_flows, out_store=args.out_store, shard_size=args.shard_size, worker_flush_size=args.worker_flush_size, spool_dir=args.spool_dir, T_full=args.T_full, idle_timeout=args.idle_timeout, max_packets_per_pcap=args.max_packets_per_pcap, n_jobs=args.jobs, extra_column_names=('activity',))
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()

774
scripts/extract_lib.py Normal file
View File

@@ -0,0 +1,774 @@
from __future__ import annotations
import os
import shutil
import socket
import sys
import tempfile
import time as _time
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Iterator
import dpkt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
_SCRIPT_DIR = Path(__file__).resolve().parent
_REPO_ROOT = _SCRIPT_DIR.parent
sys.path.insert(0, str(_REPO_ROOT / 'Packet_CFM'))
from packet_store import PacketShardWriter
# Names of the per-packet features, in the same order as the vector built by
# _packet_token.
PACKET_FEATURE_NAMES = ('log_size', 'log_dt_ms', 'direction', 'tcp_syn', 'tcp_fin', 'tcp_rst', 'tcp_psh', 'tcp_ack', 'log_win')
# Width of one packet token (number of features per packet).
PACKET_D = len(PACKET_FEATURE_NAMES)
# Bit masks tested against PacketRecord.tcp_flags.
(FIN, SYN, RST, PSH, ACK) = (1, 2, 4, 8, 16)
@dataclass(slots=True)
class PacketRecord:
    """One parsed IPv4 TCP or UDP packet, as yielded by ``iter_packets``."""
    # Timestamp reported by the pcap reader (presumably seconds; the token
    # builder multiplies differences by 1000 to obtain milliseconds).
    timestamp: float
    # Dotted-quad source / destination addresses.
    src_ip: str
    dst_ip: str
    # Transport-layer source / destination ports.
    src_port: int
    dst_port: int
    # 6 for TCP, 17 for UDP.
    protocol: int
    # Raw TCP flag bits; 0 for UDP.
    tcp_flags: int
    # Length of the transport payload in bytes.
    payload_len: int
    # Transport header length: TCP data offset * 4, or 8 for UDP.
    header_len: int
    # Value of the IP header's length field.
    total_len: int
    # TCP window field; 0 for UDP.
    window_size: int
def _try_open_pcap(f):
    """Return a dpkt reader for ``f``, trying classic pcap before pcapng.

    If the classic reader rejects the stream with ``ValueError`` the file is
    rewound to the start and opened as pcapng instead.
    """
    try:
        reader = dpkt.pcap.Reader(f)
    except ValueError:
        f.seek(0)
        reader = dpkt.pcapng.Reader(f)
    return reader
def iter_packets(pcap_path: Path, max_packets: int | None=None) -> Iterator[PacketRecord]:
    """Yield a ``PacketRecord`` for every IPv4 TCP/UDP packet in a capture.

    Ethernet, raw-IP and Linux cooked (SLL) link types are understood; frames
    of any other link type, non-IPv4 frames, other transports and frames that
    dpkt cannot parse are silently skipped.

    Args:
        pcap_path: Capture file (pcap or pcapng).
        max_packets: Stop after this many records have been yielded. The cap
            is checked after each yield, and skipped frames do not count.
    """
    emitted = 0
    with open(pcap_path, 'rb') as handle:
        reader = _try_open_pcap(handle)
        link_type = reader.datalink()
        for (ts, frame) in reader:
            try:
                # Peel off the link layer to reach the IP datagram.
                if link_type == dpkt.pcap.DLT_EN10MB:
                    ether = dpkt.ethernet.Ethernet(frame)
                    if ether.type != dpkt.ethernet.ETH_TYPE_IP:
                        continue
                    datagram = ether.data
                elif link_type == dpkt.pcap.DLT_RAW:
                    datagram = dpkt.ip.IP(frame)
                elif link_type == dpkt.pcap.DLT_LINUX_SLL:
                    cooked = dpkt.sll.SLL(frame)
                    if cooked.ethtype != dpkt.ethernet.ETH_TYPE_IP:
                        continue
                    datagram = cooked.data
                else:
                    continue
                if not isinstance(datagram, dpkt.ip.IP):
                    continue
                source = socket.inet_ntoa(datagram.src)
                destination = socket.inet_ntoa(datagram.dst)
                segment = datagram.data
                if isinstance(segment, dpkt.tcp.TCP):
                    record = PacketRecord(timestamp=ts, src_ip=source, dst_ip=destination, src_port=segment.sport, dst_port=segment.dport, protocol=6, tcp_flags=segment.flags, payload_len=len(segment.data), header_len=segment.off * 4, total_len=datagram.len, window_size=segment.win)
                elif isinstance(segment, dpkt.udp.UDP):
                    record = PacketRecord(timestamp=ts, src_ip=source, dst_ip=destination, src_port=segment.sport, dst_port=segment.dport, protocol=17, tcp_flags=0, payload_len=len(segment.data), header_len=8, total_len=datagram.len, window_size=0)
                else:
                    continue
            except (dpkt.NeedData, dpkt.UnpackError, AttributeError):
                # Truncated or malformed frame: drop it and move on.
                continue
            yield record
            emitted += 1
            if max_packets is not None and emitted >= max_packets:
                return
def _packet_token(pkt: PacketRecord, prev_ts: float | None, direction: int) -> np.ndarray:
    """Encode one packet as a float32 feature vector.

    Layout: log1p(total length), log1p(gap to the previous packet in ms,
    clamped at zero; 0 when there is no previous packet), direction, the
    SYN/FIN/RST/PSH/ACK flags as 0/1, and log1p(window size).
    """
    if prev_ts is None:
        gap_ms = 0.0
    else:
        gap_ms = max(0.0, (pkt.timestamp - prev_ts) * 1000.0)
    flag_bits = [int(bool(pkt.tcp_flags & mask)) for mask in (SYN, FIN, RST, PSH, ACK)]
    features = [
        float(np.log1p(max(pkt.total_len, 0))),
        float(np.log1p(gap_ms)),
        float(direction),
        *flag_bits,
        float(np.log1p(max(pkt.window_size, 0))),
    ]
    return np.array(features, dtype=np.float32)
class _TokenFlow:
    """Mutable accumulator for one bidirectional flow while it is streamed.

    ``key_fwd`` is the 5-tuple stored when the flow was opened. ``tokens``
    holds at most ``max_len`` packet tokens, whereas ``n_pkts`` counts every
    packet passed to ``add``.
    """
    __slots__ = ('key_fwd', 'start_ts', 'last_ts', 'fin_count', 'tokens', 'prev_ts', 'n_pkts')
    def __init__(self, key_fwd: tuple, start_ts: float) -> None:
        """Create an empty flow that starts at ``start_ts``."""
        self.key_fwd = key_fwd
        self.start_ts = start_ts
        self.last_ts = start_ts
        # Number of FIN-flagged packets seen; maintained by stream_token_flows.
        self.fin_count = 0
        self.tokens: list[np.ndarray] = []
        # None until the first packet is added, so the first token gets dt = 0.
        self.prev_ts: float | None = None
        self.n_pkts: int = 0
    def add(self, pkt: PacketRecord, is_forward: bool, max_len: int) -> None:
        """Record ``pkt``; tokenize it only while fewer than ``max_len`` tokens exist."""
        # Direction feature: 0 for forward packets, 1 for reverse packets.
        direction = 0 if is_forward else 1
        if len(self.tokens) < max_len:
            self.tokens.append(_packet_token(pkt, self.prev_ts, direction))
        self.prev_ts = pkt.timestamp
        self.last_ts = pkt.timestamp
        self.n_pkts += 1
def stream_token_flows(packet_iter: Iterator[PacketRecord], idle_timeout: float, max_len: int, gc_every: int=200000) -> Iterator[_TokenFlow]:
    """Group a packet stream into bidirectional flows and yield each when it ends.

    A flow ends when (a) a packet for the same 5-tuple arrives more than
    ``idle_timeout`` after the flow's last packet, (b) a TCP RST is seen,
    (c) the second TCP FIN is seen, (d) the periodic sweep finds it idle, or
    (e) the packet stream is exhausted.

    Args:
        packet_iter: Source of packets.
        idle_timeout: Maximum gap between packets of one flow, in the same
            unit as the packet timestamps.
        max_len: Maximum number of tokens kept per flow.
        gc_every: Sweep the table of active flows for idle entries every this
            many packets.
    """
    active: dict[tuple, _TokenFlow] = {}
    last_pkt_ts = 0.0
    n_seen = 0
    for pkt in packet_iter:
        last_pkt_ts = pkt.timestamp
        fwd_key = (pkt.src_ip, pkt.dst_ip, pkt.src_port, pkt.dst_port, pkt.protocol)
        bwd_key = (pkt.dst_ip, pkt.src_ip, pkt.dst_port, pkt.src_port, pkt.protocol)
        flow: _TokenFlow | None = None
        key = fwd_key
        is_forward = True
        # Look the packet up in both orientations; a hit on the reversed key
        # means this packet travels against the flow's forward direction.
        if fwd_key in active:
            (flow, key, is_forward) = (active[fwd_key], fwd_key, True)
        elif bwd_key in active:
            (flow, key, is_forward) = (active[bwd_key], bwd_key, False)
        # An existing flow that has been idle too long is closed, and this
        # packet opens a fresh one below.
        if flow is not None and pkt.timestamp - flow.last_ts > idle_timeout:
            old = active.pop(key)
            yield old
            flow = None
        if flow is None:
            # The packet that opens a flow defines its forward direction.
            flow = _TokenFlow(key_fwd=fwd_key, start_ts=pkt.timestamp)
            key = fwd_key
            is_forward = True
            active[key] = flow
        flow.add(pkt, is_forward, max_len)
        if pkt.protocol == 6:
            # TCP teardown: RST ends the flow immediately, FIN on the second one.
            if pkt.tcp_flags & RST:
                yield active.pop(key)
            elif pkt.tcp_flags & FIN:
                flow.fin_count += 1
                if flow.fin_count >= 2:
                    yield active.pop(key)
        n_seen += 1
        if n_seen % gc_every == 0:
            # Periodic sweep so idle flows do not accumulate in `active`.
            stale = [k for (k, fl) in active.items() if last_pkt_ts - fl.last_ts > idle_timeout]
            for k in stale:
                yield active.pop(k)
    # End of stream: flush whatever is still open.
    for fl in list(active.values()):
        yield fl
    active.clear()
def _canonical_key(src_ip: str, dst_ip: str, src_port: int, dst_port: int, proto: int) -> tuple:
a = (src_ip, src_port)
b = (dst_ip, dst_port)
if a <= b:
return (a[0], a[1], b[0], b[1], proto)
return (b[0], b[1], a[0], a[1], proto)
def _to_fixed_tensor(flow_tokens: list[np.ndarray], max_len: int) -> np.ndarray:
    """Stack packet tokens into a zero-padded ``(max_len, PACKET_D)`` float32 array.

    Tokens beyond ``max_len`` are dropped; unused trailing rows stay zero.
    """
    padded = np.zeros((max_len, PACKET_D), dtype=np.float32)
    kept = min(len(flow_tokens), max_len)
    if kept <= 0:
        return padded
    padded[:kept] = np.stack(flow_tokens[:kept], axis=0)
    return padded
class _WorkerChunkWriter:
def __init__(self, root: Path, *, prefix: str, T_full: int, chunk_size: int) -> None:
self.root = Path(root)
self.root.mkdir(parents=True, exist_ok=True)
self.prefix = prefix
self.T_full = T_full
self.chunk_size = max(1, int(chunk_size))
self._tokens: list[np.ndarray] = []
self._records: list[dict] = []
self._next_chunk = 0
self.chunks: list[dict[str, str]] = []
def add_csv_match(self, row_i: int, tok: np.ndarray, ln: int, meta: dict) -> None:
rec = dict(meta)
rec['csv_row_idx'] = int(row_i)
rec['packet_length'] = int(ln)
self._add(tok, rec)
def add_labeled(self, tok: np.ndarray, ln: int, meta: dict, label: str, extra: dict) -> None:
rec = dict(meta)
rec['packet_length'] = int(ln)
rec['label'] = str(label)
for (k, v) in extra.items():
rec[str(k)] = v
self._add(tok, rec)
def close(self) -> list[dict[str, str]]:
if self._tokens:
self._flush()
return self.chunks
def _add(self, tok: np.ndarray, rec: dict) -> None:
self._tokens.append(tok.astype(np.float32, copy=False))
self._records.append(rec)
if len(self._tokens) >= self.chunk_size:
self._flush()
def _flush(self) -> None:
n = len(self._tokens)
tokens = np.empty((n, self.T_full, PACKET_D), dtype=np.float32)
for (i, tok) in enumerate(self._tokens):
tokens[i] = tok
stem = f'{self.prefix}-chunk-{self._next_chunk:06d}'
token_path = self.root / f'{stem}.npy'
meta_path = self.root / f'{stem}.parquet'
np.save(token_path, tokens, allow_pickle=False)
pd.DataFrame(self._records).to_parquet(meta_path, compression='snappy', index=False)
self.chunks.append({'tokens': str(token_path), 'meta': str(meta_path)})
self._tokens.clear()
self._records.clear()
self._next_chunk += 1
def _flow_meta(fl: _TokenFlow) -> dict:
(sip, dip, sp, dp, proto) = fl.key_fwd
return {'start_ts': float(fl.start_ts), 'src_ip': str(sip), 'dst_ip': str(dip), 'src_port': int(sp), 'dst_port': int(dp), 'protocol': int(proto), 'n_pkts': int(fl.n_pkts)}
def _build_stream_csv_index(csv_rows_for_day: dict[tuple, list[tuple[int, float]]]) -> dict[tuple, dict[str, np.ndarray]]:
out: dict[tuple, dict[str, np.ndarray]] = {}
for (ck, rows) in csv_rows_for_day.items():
finite = [(int(row_i), float(ts)) for (row_i, ts) in rows if not np.isnan(ts)]
if not finite:
continue
finite.sort(key=lambda x: (x[1], x[0]))
row_idx = np.asarray([r for (r, _) in finite], dtype=np.int64)
ts = np.asarray([t for (_, t) in finite], dtype=np.float64)
used = np.zeros(len(finite), dtype=bool)
out[ck] = {'row_idx': row_idx, 'ts': ts, 'used': used}
return out
def _nearest_unused_row(entry: dict[str, np.ndarray], ts: float, tolerance: float) -> tuple[int | None, float | None]:
csv_ts = entry['ts']
used = entry['used']
pos = int(np.searchsorted(csv_ts, ts, side='left'))
best_i: int | None = None
best_abs = float('inf')
j = pos - 1
while j >= 0:
diff = abs(float(csv_ts[j]) - ts)
if diff > tolerance:
break
if not bool(used[j]) and diff < best_abs:
best_i = j
best_abs = diff
j -= 1
j = pos
n = len(csv_ts)
while j < n:
diff = abs(float(csv_ts[j]) - ts)
if diff > tolerance:
break
if not bool(used[j]) and diff < best_abs:
best_i = j
best_abs = diff
j += 1
if best_i is None:
return (None, None)
used[best_i] = True
return (int(entry['row_idx'][best_i]), ts - float(csv_ts[best_i]))
def _extract_day_worker(day: str, pcap_files_str: list[str], csv_rows_for_day: dict[tuple, list[tuple[int, float]]], max_len: int, idle_timeout: float, time_tolerance_seconds: float, max_packets_per_pcap: int | None, spool_dir: str | None=None, worker_flush_size: int=10000, match_strategy: str='hungarian') -> dict:
    """Reconstruct one day's flows from its pcaps and join them to CSV rows.

    With ``match_strategy='stream_nearest'`` the work is delegated to
    ``_extract_day_worker_stream_nearest`` (which requires ``spool_dir``).
    Otherwise every flow of the day is first collected per canonical
    5-tuple, then for each key present in both the CSV and the pcaps the CSV
    rows are assigned to pcap flows by minimum total start-time difference
    (``linear_sum_assignment``), rejecting pairs farther apart than
    ``time_tolerance_seconds``.

    Matched flows go to a ``_WorkerChunkWriter`` below ``spool_dir`` when one
    is given, else they are returned in memory under ``'results'`` as
    ``(csv_row_index, tokens, length, meta)`` tuples.

    Returns:
        A statistics dict with the matched data (``results`` / ``chunks``),
        counters, and a sample of start-time deltas for diagnostics.

    Raises:
        ValueError: ``stream_nearest`` was requested without ``spool_dir``.
    """
    if match_strategy == 'stream_nearest':
        if spool_dir is None:
            raise ValueError('stream_nearest requires spool_dir')
        return _extract_day_worker_stream_nearest(day=day, pcap_files_str=pcap_files_str, csv_rows_for_day=csv_rows_for_day, max_len=max_len, idle_timeout=idle_timeout, time_tolerance_seconds=time_tolerance_seconds, max_packets_per_pcap=max_packets_per_pcap, spool_dir=spool_dir, worker_flush_size=worker_flush_size)
    # Phase 1: rebuild every flow of the day, grouped by canonical 5-tuple.
    pcap_by_key: dict[tuple, list[_TokenFlow]] = defaultdict(list)
    n_pkts = 0
    t_start = _time.time()
    def _counting_iter(pkt_iter):
        # Pass-through generator that counts the packets it forwards.
        nonlocal n_pkts
        for pkt in pkt_iter:
            n_pkts += 1
            yield pkt
    for pcap_path_str in pcap_files_str:
        pkt_iter = iter_packets(Path(pcap_path_str), max_packets=max_packets_per_pcap)
        for fl in stream_token_flows(_counting_iter(pkt_iter), idle_timeout=idle_timeout, max_len=max_len):
            (sip, dip, sp, dp, proto) = fl.key_fwd
            ck = _canonical_key(sip, dip, sp, dp, proto)
            pcap_by_key[ck].append(fl)
    n_flows = sum((len(v) for v in pcap_by_key.values()))
    # `elapsed` covers flow reconstruction only; matching time is not included.
    elapsed = _time.time() - t_start
    # Sentinel cost for pairs that must not be matched (NaN or out of tolerance).
    BIG = time_tolerance_seconds * 1000.0
    results: list[tuple[int, np.ndarray, int, dict]] = []
    chunk_writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'day-{day}', T_full=max_len, chunk_size=worker_flush_size) if spool_dir is not None else None
    n_joined = 0
    n_collision = 0
    n_csv_keys = len(csv_rows_for_day)
    n_intersection = 0
    def _emit(row_i: int, fl: _TokenFlow) -> None:
        # Hand one matched (CSV row, flow) pair to the spool writer or the
        # in-memory result list.
        nonlocal n_joined
        tok = _to_fixed_tensor(fl.tokens, max_len)
        ln = min(len(fl.tokens), max_len)
        meta = _flow_meta(fl)
        if chunk_writer is not None:
            chunk_writer.add_csv_match(row_i, tok, ln, meta)
        else:
            results.append((row_i, tok, ln, meta))
        n_joined += 1
    # Phase 2: per-key matching, visiting keys in order of their first CSV row index.
    for (ck, rows) in sorted(csv_rows_for_day.items(), key=lambda kv: kv[1][0][0]):
        if ck not in pcap_by_key:
            continue
        n_intersection += 1
        pcap_flows = pcap_by_key[ck]
        csv_ts = np.array([r[1] for r in rows], dtype=np.float64)
        pcap_ts = np.array([fl.start_ts for fl in pcap_flows], dtype=np.float64)
        (n_csv, n_pcap) = (len(csv_ts), len(pcap_ts))
        if n_csv == 1 and n_pcap == 1:
            # Fast path: a single candidate on each side needs no assignment solve.
            row_i = rows[0][0]
            ts = csv_ts[0]
            fl = pcap_flows[0]
            if not np.isnan(ts) and abs(fl.start_ts - ts) <= time_tolerance_seconds:
                _emit(row_i, fl)
            else:
                n_collision += 1
            continue
        # General case: cost matrix of absolute start-time differences with
        # forbidden pairs replaced by BIG, solved as an assignment problem.
        cost = np.abs(csv_ts[:, None] - pcap_ts[None, :])
        cost[np.isnan(cost)] = BIG
        cost[cost > time_tolerance_seconds] = BIG
        (row_ind, col_ind) = linear_sum_assignment(cost)
        for (r, c) in zip(row_ind, col_ind):
            if cost[r, c] >= BIG:
                # The solver had to use a forbidden pair: count it as a miss.
                n_collision += 1
                continue
            row_i = rows[r][0]
            fl = pcap_flows[c]
            _emit(row_i, fl)
    # Diagnostics: sample up to 10000 (first pcap flow - first CSV row) start
    # time differences over keys present on both sides.
    deltas: list[float] = []
    sampled = 0
    for (ck, rows) in csv_rows_for_day.items():
        if sampled >= 10000 or ck not in pcap_by_key:
            if sampled >= 10000:
                break
            continue
        (row_i, ts) = rows[0]
        if np.isnan(ts):
            continue
        deltas.append(pcap_by_key[ck][0].start_ts - ts)
        sampled += 1
    return {'day': day, 'results': results, 'chunks': [] if chunk_writer is None else chunk_writer.close(), 'n_joined': n_joined, 'n_pkts': n_pkts, 'n_flows': n_flows, 'elapsed': elapsed, 'n_pcap_keys': len(pcap_by_key), 'n_csv_keys': n_csv_keys, 'n_intersection': n_intersection, 'n_collision': n_collision, 'deltas': deltas, 'match_strategy': match_strategy}
def _extract_day_worker_stream_nearest(*, day: str, pcap_files_str: list[str], csv_rows_for_day: dict[tuple, list[tuple[int, float]]], max_len: int, idle_timeout: float, time_tolerance_seconds: float, max_packets_per_pcap: int | None, spool_dir: str, worker_flush_size: int) -> dict:
    """Join one day's flows to CSV rows on the fly, one flow at a time.

    Each flow is matched, as soon as it is emitted, to the nearest unused CSV
    row with the same canonical 5-tuple whose timestamp lies within
    ``time_tolerance_seconds``. Matches are written through a
    ``_WorkerChunkWriter`` below ``spool_dir``; nothing is kept in memory, so
    the returned ``'results'`` list is always empty.

    Returns:
        A statistics dict with the same keys as ``_extract_day_worker``.
    """
    started = _time.time()
    n_pkts = 0
    flow_total = 0
    joined = 0
    misses = 0
    pcap_keys: set[tuple] = set()
    shared_keys: set[tuple] = set()
    delta_sample: list[float] = []
    index = _build_stream_csv_index(csv_rows_for_day)
    writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'day-{day}', T_full=max_len, chunk_size=worker_flush_size)

    def _tally(packets):
        # Pass-through generator that counts the packets it forwards.
        nonlocal n_pkts
        for packet in packets:
            n_pkts += 1
            yield packet

    for pcap_name in pcap_files_str:
        packets = iter_packets(Path(pcap_name), max_packets=max_packets_per_pcap)
        for flow in stream_token_flows(_tally(packets), idle_timeout=idle_timeout, max_len=max_len):
            flow_total += 1
            (sip, dip, sp, dp, proto) = flow.key_fwd
            canonical = _canonical_key(sip, dip, sp, dp, proto)
            pcap_keys.add(canonical)
            candidates = index.get(canonical)
            if candidates is None:
                continue
            shared_keys.add(canonical)
            (row_i, delta) = _nearest_unused_row(candidates, float(flow.start_ts), time_tolerance_seconds)
            if row_i is None:
                # Key known to the CSV, but no free row close enough in time.
                misses += 1
                continue
            tensor = _to_fixed_tensor(flow.tokens, max_len)
            length = min(len(flow.tokens), max_len)
            writer.add_csv_match(row_i, tensor, length, _flow_meta(flow))
            joined += 1
            # Keep at most 10000 deltas for the timing diagnostics.
            if delta is not None and len(delta_sample) < 10000:
                delta_sample.append(float(delta))

    elapsed = _time.time() - started
    return {'day': day, 'results': [], 'chunks': writer.close(), 'n_joined': joined, 'n_pkts': n_pkts, 'n_flows': flow_total, 'elapsed': elapsed, 'n_pcap_keys': len(pcap_keys), 'n_csv_keys': len(csv_rows_for_day), 'n_intersection': len(shared_keys), 'n_collision': misses, 'deltas': delta_sample, 'match_strategy': 'stream_nearest'}
def _print_day_stats(res: dict) -> None:
day = res['day']
strategy = res.get('match_strategy', 'hungarian')
print(f"[{day}] {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s match={strategy} ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
print(f" pcap_keys={res['n_pcap_keys']:,} csv_keys={res['n_csv_keys']:,} intersection={res['n_intersection']:,} joined={int(res.get('n_joined', len(res.get('results', ())))):,} within-key-miss={res['n_collision']:,}")
deltas = res.get('deltas') or []
if deltas:
arr = np.asarray(deltas, dtype=np.float64)
print(f' time-delta (pcap_start - csv_ts), seconds: median={np.median(arr):+.2f} mean={arr.mean():+.2f} std={arr.std():.2f} p05={np.percentile(arr, 5):+.2f} p95={np.percentile(arr, 95):+.2f}')
med = float(np.median(arr))
if abs(med) > 2.0:
print(f' -> median |{med:.1f}s| > 2s: rerun with --time-offset {med:.0f}')
def extract_dataset(*, csv_rows_by_day: dict[str, dict[tuple, list[tuple[int, float]]]], labels_by_row: np.ndarray, pcap_files_by_day: dict[str, list[Path]], out_packets: Path, out_flows: Path, out_store: Path | None=None, shard_size: int=100000, worker_flush_size: int=10000, spool_dir: Path | None=None, match_strategy: str | None=None, T_full: int=256, idle_timeout: float=120.0, time_tolerance_seconds: float=2.0, max_packets_per_pcap: int | None=None, n_jobs: int=0) -> None:
    """Join CSV-labelled flows with pcap packets and write the dataset.

    One task is created per day that has pcap files. Tasks run serially when
    ``n_jobs <= 1`` and in a process pool otherwise.

    Output modes:
      * ``out_store`` set: workers spill matched flows to a spool directory
        and the parent appends them to a ``PacketShardWriter``. The spool
        directory is removed afterwards and the function returns without
        writing ``out_packets`` / ``out_flows``.
      * ``out_store`` unset: all matches are gathered in memory, ordered by
        CSV row index, and written as ``out_packets`` (npz), ``out_flows``
        (parquet) plus ``flow_features.parquet`` next to ``out_flows``.

    Args:
        csv_rows_by_day: day -> canonical key -> list of (csv row index, timestamp).
        labels_by_row: Label per CSV row, indexed by csv row index.
        pcap_files_by_day: day -> pcap files; days without files are skipped.
        match_strategy: ``'hungarian'`` or ``'stream_nearest'``; defaults to
            ``'stream_nearest'`` when ``out_store`` is set, else ``'hungarian'``.
        n_jobs: Worker count; ``<= 0`` means min(number of tasks, CPU count).

    Raises:
        ValueError: Unknown ``match_strategy``, or ``stream_nearest`` without
            ``out_store``.
        RuntimeError: No day has pcap files, or no flow could be matched.
    """
    N_csv = len(labels_by_row)
    print(f'[extract_dataset] N_csv={N_csv:,} T_full={T_full} days={sorted(csv_rows_by_day.keys())}')
    if match_strategy is None:
        match_strategy = 'stream_nearest' if out_store is not None else 'hungarian'
    if match_strategy not in ('hungarian', 'stream_nearest'):
        raise ValueError("match_strategy must be 'hungarian' or 'stream_nearest'")
    if match_strategy == 'stream_nearest' and out_store is None:
        raise ValueError('stream_nearest is only supported with --out-store')
    print(f'[extract_dataset] match_strategy={match_strategy}')
    # One task per day that actually has pcap files.
    tasks: list[tuple] = []
    for (day, rows_dict) in csv_rows_by_day.items():
        pcap_files = pcap_files_by_day.get(day, [])
        if not pcap_files:
            print(f'[{day}] NO pcap files — skipping ({len(rows_dict):,} CSV keys unmatched)')
            continue
        tasks.append((day, [str(p) for p in pcap_files], dict(rows_dict)))
    if not tasks:
        raise RuntimeError('No days with pcap files — nothing to extract.')
    if n_jobs <= 0:
        n_jobs = min(len(tasks), os.cpu_count() or 1)
    print(f'[extract_dataset] running {len(tasks)} day(s) with {n_jobs} worker(s)')
    store_writer: PacketShardWriter | None = None
    spool_root: Path | None = None
    if out_store is not None:
        print(f'[extract_dataset] sharded output enabled: {out_store} shard_size={shard_size:,}')
        store_writer = PacketShardWriter(out_store, shard_size=shard_size, T_full=T_full, D=PACKET_D, overwrite=True)
        if spool_dir is None:
            # Default spool: a hidden temporary directory next to the store.
            out_store_parent = Path(out_store).parent
            out_store_parent.mkdir(parents=True, exist_ok=True)
            spool_root = Path(tempfile.mkdtemp(prefix=f'.{Path(out_store).name}.spool.', dir=out_store_parent))
        else:
            # NOTE: a caller-supplied spool directory is wiped before use.
            spool_root = Path(spool_dir)
            if spool_root.exists():
                shutil.rmtree(spool_root)
            spool_root.mkdir(parents=True, exist_ok=True)
        print(f'[extract_dataset] worker spool={spool_root} flush_size={worker_flush_size:,}')
    # In-memory accumulators, used only when no sharded store is written.
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    row_chunks: list[np.ndarray] = []
    meta_chunks: list[list[dict]] = []
    total_joined = 0
    def _materialize_results(results: list[tuple[int, np.ndarray, int, dict]]) -> tuple[np.ndarray, np.ndarray, np.ndarray, list[dict]]:
        # Turn a worker's result tuples into arrays ordered by CSV row index.
        results = sorted(results, key=lambda x: x[0])
        n = len(results)
        tok_arr = np.empty((n, T_full, PACKET_D), dtype=np.float32)
        len_arr = np.empty(n, dtype=np.int32)
        row_arr = np.empty(n, dtype=np.int64)
        meta_arr: list[dict] = [None] * n
        for (i, (row_i, tok, ln, meta)) in enumerate(results):
            tok_arr[i] = tok
            len_arr[i] = ln
            row_arr[i] = row_i
            meta_arr[i] = meta
        return (tok_arr, len_arr, row_arr, meta_arr)
    def _flows_from_meta(row_arr: np.ndarray, meta_arr: list[dict]) -> pd.DataFrame:
        # Build the per-flow table (label looked up by CSV row) for a batch.
        labels = labels_by_row[row_arr].astype(str)
        return pd.DataFrame({'label': labels, 'start_ts': np.asarray([m['start_ts'] for m in meta_arr], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_arr], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_arr], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_arr], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_arr], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_arr], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_arr], dtype=np.uint32)})
    def _append_spool_chunks(res: dict) -> None:
        # Feed a worker's spooled chunks into the shard writer. Rows are
        # ordered by CSV row index within each chunk (not across chunks).
        chunks = res.get('chunks') or []
        for chunk in chunks:
            tokens = np.load(chunk['tokens'], mmap_mode='r')
            meta_df = pd.read_parquet(chunk['meta'])
            if meta_df.empty:
                continue
            # Remember each row's position in the token file before sorting.
            meta_df = meta_df.assign(__token_row=np.arange(len(meta_df), dtype=np.int64))
            meta_df = meta_df.sort_values('csv_row_idx', kind='stable').reset_index(drop=True)
            row_arr = meta_df['csv_row_idx'].to_numpy(dtype=np.int64)
            lengths = meta_df['packet_length'].to_numpy(dtype=np.int32)
            order = meta_df['__token_row'].to_numpy(dtype=np.int64)
            labels = labels_by_row[row_arr].astype(str)
            flows = pd.DataFrame({'label': labels, 'start_ts': meta_df['start_ts'].to_numpy(dtype=np.float64), 'src_ip': meta_df['src_ip'].to_numpy(dtype=object), 'dst_ip': meta_df['dst_ip'].to_numpy(dtype=object), 'src_port': meta_df['src_port'].to_numpy(dtype=np.uint32), 'dst_port': meta_df['dst_port'].to_numpy(dtype=np.uint32), 'protocol': meta_df['protocol'].to_numpy(dtype=np.uint8), 'n_pkts': meta_df['n_pkts'].to_numpy(dtype=np.uint32)})
            assert store_writer is not None
            store_writer.add_batch(np.asarray(tokens[order]), lengths, flows)
    def _absorb(res: dict, *, print_stats: bool=True) -> None:
        # Consume a worker's in-memory results (optionally printing its stats).
        if print_stats:
            _print_day_stats(res)
        results = res['results']
        if not results:
            return
        (tok_arr, len_arr, row_arr, meta_arr) = _materialize_results(results)
        if store_writer is not None:
            store_writer.add_batch(tok_arr, len_arr, _flows_from_meta(row_arr, meta_arr))
        else:
            tok_chunks.append(tok_arr)
            len_chunks.append(len_arr)
            row_chunks.append(row_arr)
            meta_chunks.append(meta_arr)
    if n_jobs <= 1:
        # Serial path: run each day in this process.
        try:
            for (i, (day, pcaps, rows)) in enumerate(tasks):
                task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{day}')
                res = _extract_day_worker(day, pcaps, rows, T_full, idle_timeout, time_tolerance_seconds, max_packets_per_pcap, task_spool, worker_flush_size, match_strategy)
                _print_day_stats(res)
                total_joined += int(res.get('n_joined', 0))
                if store_writer is not None:
                    _append_spool_chunks(res)
                else:
                    _absorb(res, print_stats=False)
        finally:
            # The spool is temporary: remove it even when extraction fails.
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    else:
        try:
            with ProcessPoolExecutor(max_workers=n_jobs) as pool:
                futs = []
                for (i, (day, pcaps, rows)) in enumerate(tasks):
                    task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{day}')
                    futs.append(pool.submit(_extract_day_worker, day, pcaps, rows, T_full, idle_timeout, time_tolerance_seconds, max_packets_per_pcap, task_spool, worker_flush_size, match_strategy))
                if store_writer is not None:
                    # Wait for every day, then append in task order so the
                    # store layout does not depend on completion order.
                    completed: dict[str, dict] = {}
                    for fut in as_completed(futs):
                        res = fut.result()
                        _print_day_stats(res)
                        completed[res['day']] = res
                    for (day, _, _) in tasks:
                        if day in completed:
                            total_joined += int(completed[day].get('n_joined', 0))
                            _append_spool_chunks(completed[day])
                else:
                    for fut in as_completed(futs):
                        _absorb(fut.result())
        finally:
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    if store_writer is not None:
        # Sharded mode ends here; the monolithic outputs below are not written.
        if total_joined == 0:
            raise RuntimeError('No matched flows — check timestamps (--time-offset) and pcap×CSV correspondence.')
        store_writer.close()
        print(f'[extract_dataset] wrote sharded store {out_store}')
        return
    if not tok_chunks:
        raise RuntimeError('No matched flows — check timestamps (--time-offset) and pcap×CSV correspondence.')
    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    csv_rows = np.concatenate(row_chunks, axis=0)
    meta_list: list[dict] = [m for chunk in meta_chunks for m in chunk]
    # Drop the per-day pieces now that they have been concatenated.
    del tok_chunks, len_chunks, row_chunks, meta_chunks
    # Global ordering by CSV row index across all days.
    order = np.argsort(csv_rows, kind='stable')
    tokens = tokens[order]
    lengths = lengths[order]
    csv_rows = csv_rows[order]
    meta_list = [meta_list[i] for i in order]
    N_matched = len(tokens)
    labels = labels_by_row[csv_rows].astype(str)
    # flow_id is simply the position in the sorted output.
    flow_id = np.arange(N_matched, dtype=np.uint64)
    print(f'\n[extract_dataset] matched {N_matched:,}/{N_csv:,} ({100.0 * N_matched / max(N_csv, 1):.2f}%)')
    print(f'[extract_dataset] label distribution (matched rows):')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')
    out_packets.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): a transfer script in this repository reports that
    # np.savez_compressed produced a corrupt archive for a 50+ GB array.
    # Confirm whether this path can hit the same size; --out-store avoids
    # the monolithic npz entirely.
    np.savez_compressed(out_packets, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'[extract_dataset] wrote {out_packets} ({out_packets.stat().st_size / 1000000000.0:.2f} GB)')
    out_flows.parent.mkdir(parents=True, exist_ok=True)
    flow_df = pd.DataFrame({'flow_id': flow_id, 'label': labels, 'start_ts': np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_list], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_list], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_list], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_list], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_list], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_list], dtype=np.uint32)})
    flow_df.to_parquet(out_flows, compression='snappy', index=False)
    print(f'[extract_dataset] wrote {out_flows} ({out_flows.stat().st_size / 1000000.0:.2f} MB)')
    _write_canonical_flow_features(tokens=tokens, lengths=lengths, flow_id=flow_id, labels=labels, out_path=out_flows.parent / 'flow_features.parquet')
def _write_canonical_flow_features(*, tokens: np.ndarray, lengths: np.ndarray, flow_id: np.ndarray, labels: np.ndarray, out_path: Path) -> None:
    """Compute the canonical flow features and save them as a parquet table.

    The table has ``flow_id`` and ``label`` columns followed by one column
    per entry of ``CANONICAL_FLOW_FEATURE_NAMES``, taken from the matching
    column of the computed feature matrix. The repository root is put on
    ``sys.path`` so that ``common.data_contract`` can be imported.
    """
    repo_root = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(repo_root))
    from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES, compute_flow_features_from_packets

    print(f'[extract_dataset] computing canonical {len(CANONICAL_FLOW_FEATURE_NAMES)}-d flow features from packet tokens ...')
    feats = compute_flow_features_from_packets(tokens, lengths)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    columns = {'flow_id': flow_id, 'label': labels}
    columns.update((name, feats[:, idx]) for (idx, name) in enumerate(CANONICAL_FLOW_FEATURE_NAMES))
    table = pd.DataFrame(columns)
    table.to_parquet(out_path, compression='snappy', index=False)
    size_mb = out_path.stat().st_size / 1000000.0
    print(f'[extract_dataset] wrote {out_path} ({size_mb:.2f} MB)')
def _extract_single_pcap_worker(pcap_path_str: str, label: str, extra: dict, max_len: int, idle_timeout: float, max_packets_per_pcap: int | None, spool_dir: str | None=None, worker_flush_size: int=10000) -> dict:
    """Extract every flow of one pcap; all flows share the given label.

    Flows are written through a ``_WorkerChunkWriter`` below ``spool_dir``
    when one is given (tagged with ``label`` and ``extra``); otherwise they
    are returned in memory under ``'results'`` as ``(tokens, length, meta)``
    tuples.

    Args:
        pcap_path_str: Capture file to read.
        label: Label attached to every flow of this capture.
        extra: Extra metadata columns attached to every flow.
        max_len: Maximum number of packet tokens kept per flow.
        idle_timeout: Idle gap after which a flow is closed.
        max_packets_per_pcap: Optional cap on packets read from the capture.
        spool_dir: Directory for spooled chunks, or None for in-memory results.
        worker_flush_size: Flows per spooled chunk.

    Returns:
        A dict with the capture path, label, extra columns, ``results``,
        ``chunks`` and the packet/flow counters plus elapsed seconds.
    """
    t_start = _time.time()
    n_pkts = 0
    n_flows = 0
    results: list[tuple[np.ndarray, int, dict]] = []
    chunk_writer = None
    if spool_dir is not None:
        chunk_writer = _WorkerChunkWriter(Path(spool_dir), prefix=f'pcap-{Path(pcap_path_str).stem}', T_full=max_len, chunk_size=worker_flush_size)

    def _counting_iter(pkt_iter):
        # Pass-through generator that counts the packets it forwards.
        nonlocal n_pkts
        for pkt in pkt_iter:
            n_pkts += 1
            yield pkt

    pkt_iter = iter_packets(Path(pcap_path_str), max_packets=max_packets_per_pcap)
    for fl in stream_token_flows(_counting_iter(pkt_iter), idle_timeout=idle_timeout, max_len=max_len):
        # Use the shared helper so this path and the CSV-join workers always
        # emit the same metadata columns.
        meta = _flow_meta(fl)
        tok = _to_fixed_tensor(fl.tokens, max_len)
        ln = min(len(fl.tokens), max_len)
        if chunk_writer is not None:
            chunk_writer.add_labeled(tok, ln, meta, label, extra)
        else:
            results.append((tok, ln, meta))
        n_flows += 1
    elapsed = _time.time() - t_start
    chunks = [] if chunk_writer is None else chunk_writer.close()
    return {'pcap': pcap_path_str, 'label': label, 'extra': extra, 'results': results, 'chunks': chunks, 'n_pkts': n_pkts, 'n_flows': n_flows, 'elapsed': elapsed}
def extract_labeled_pcaps(*, pcap_files_with_labels: list[tuple[Path, str, dict]], out_packets: Path, out_flows: Path, out_store: Path | None=None, shard_size: int=100000, worker_flush_size: int=10000, spool_dir: Path | None=None, T_full: int=256, idle_timeout: float=120.0, max_packets_per_pcap: int | None=None, n_jobs: int=0, extra_column_names: tuple[str, ...]=()) -> None:
    """Extract flows from labeled pcaps and write packet/flow artifacts.

    Two output modes, selected by ``out_store``:

    * Sharded (``out_store`` given): each worker spools its flows to disk and
      the spooled chunks are appended to a ``PacketShardWriter``. The function
      returns right after closing the store; ``out_packets`` and ``out_flows``
      are not written in this mode.
    * Monolithic (``out_store`` is None): all flows are kept in memory, sorted
      globally, and written as ``out_packets`` (compressed npz with
      ``packet_tokens``, ``packet_lengths``, ``flow_id``), ``out_flows``
      (parquet) and ``flow_features.parquet`` next to ``out_flows``.

    Args:
        pcap_files_with_labels: ``(pcap path, label, extra dict)`` triples; the
            label and extras apply to every flow of that pcap.
        out_packets: Target npz path (monolithic mode only).
        out_flows: Target flows parquet path (monolithic mode only).
        out_store: Shard store location; opened with ``overwrite=True``.
        shard_size: Forwarded to ``PacketShardWriter``.
        worker_flush_size: Chunk size for the per-worker spool writer.
        spool_dir: Spool directory for sharded mode. An existing directory is
            deleted and recreated; when None a temp directory is created next
            to ``out_store``. It is removed when extraction ends. Unused when
            ``out_store`` is None.
        T_full: Maximum tokens per flow, passed to the workers as ``max_len``.
        idle_timeout: Forwarded to the workers.
        max_packets_per_pcap: Forwarded to the workers.
        n_jobs: ``<= 0`` means ``min(#pcaps, cpu count)``; ``<= 1`` after that
            runs sequentially in-process, otherwise a process pool is used.
        extra_column_names: Keys looked up in each pcap's ``extra`` dict
            (missing keys become ``''``) and emitted as extra flow columns.

    Raises:
        RuntimeError: If no flow was produced by any pcap.
    """
    N_pcap = len(pcap_files_with_labels)
    print(f'[extract_labeled_pcaps] n_pcaps={N_pcap} T_full={T_full} extra_cols={extra_column_names}')
    # Only the first ten inputs are echoed to keep the log short.
    for (p, lbl, extra) in pcap_files_with_labels[:10]:
        print(f' {lbl:<20s} {Path(p).name:<60s} extra={extra}')
    if N_pcap > 10:
        print(f' ... ({N_pcap - 10} more)')
    if n_jobs <= 0:
        n_jobs = min(N_pcap, os.cpu_count() or 1)
    print(f'[extract_labeled_pcaps] running {N_pcap} pcap(s) with {n_jobs} worker(s)')
    store_writer: PacketShardWriter | None = None
    spool_root: Path | None = None
    if out_store is not None:
        print(f'[extract_labeled_pcaps] sharded output enabled: {out_store} shard_size={shard_size:,}')
        store_writer = PacketShardWriter(out_store, shard_size=shard_size, T_full=T_full, D=PACKET_D, overwrite=True)
        if spool_dir is None:
            # Default spool: a hidden temp directory beside the store.
            out_store_parent = Path(out_store).parent
            out_store_parent.mkdir(parents=True, exist_ok=True)
            spool_root = Path(tempfile.mkdtemp(prefix=f'.{Path(out_store).name}.spool.', dir=out_store_parent))
        else:
            # A caller-supplied spool directory is wiped before use.
            spool_root = Path(spool_dir)
            if spool_root.exists():
                shutil.rmtree(spool_root)
            spool_root.mkdir(parents=True, exist_ok=True)
        print(f'[extract_labeled_pcaps] worker spool={spool_root} flush_size={worker_flush_size:,}')
    # In-memory accumulators, used only in monolithic mode.
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    meta_chunks: list[list[dict]] = []
    label_chunks: list[np.ndarray] = []
    extra_chunks: list[dict[str, list]] = []
    total_flows = 0
    def _flows_for_labeled_chunk(res: dict, meta_arr: list[dict], n: int) -> pd.DataFrame:
        # Build the per-flow metadata frame for one worker result; label and
        # extra columns are constant for the whole pcap.
        cols = {'label': np.full(n, res['label'], dtype=object), 'start_ts': np.asarray([m['start_ts'] for m in meta_arr], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_arr], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_arr], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_arr], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_arr], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_arr], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_arr], dtype=np.uint32)}
        for col in extra_column_names:
            cols[col] = np.full(n, res['extra'].get(col, ''), dtype=object)
        return pd.DataFrame(cols)
    def _append_labeled_spool_chunks(res: dict) -> None:
        # Feed every spooled chunk of one worker result into the shard store,
        # sorted within the chunk by (label, 5-tuple, start_ts).
        chunks = res.get('chunks') or []
        for chunk in chunks:
            # Memory-map the token file; only the reordered copy is materialised.
            tokens = np.load(chunk['tokens'], mmap_mode='r')
            flows = pd.read_parquet(chunk['meta'])
            if flows.empty:
                continue
            # Remember each row's original position so tokens can follow the sort.
            flows = flows.assign(__token_row=np.arange(len(flows), dtype=np.int64))
            sort_keys = ['label', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'start_ts']
            flows = flows.sort_values(sort_keys, kind='stable').reset_index(drop=True)
            order = flows['__token_row'].to_numpy(dtype=np.int64)
            lengths = flows['packet_length'].to_numpy(dtype=np.int32)
            flows = flows.drop(columns=['packet_length', '__token_row'])
            assert store_writer is not None
            store_writer.add_batch(np.asarray(tokens[order]), lengths, flows)
    def _absorb(res: dict, *, print_stats: bool=True) -> None:
        # Consume the in-memory results of one worker. When the worker spooled
        # to disk, res['results'] is empty and only the stats line is printed.
        pcap_name = Path(res['pcap']).name
        lbl = res['label']
        extra = res['extra']
        if print_stats:
            print(f"[pcap:{pcap_name}] label={lbl} {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
        if not res['results']:
            return
        n = len(res['results'])
        tok_arr = np.empty((n, T_full, PACKET_D), dtype=np.float32)
        len_arr = np.empty(n, dtype=np.int32)
        meta_arr: list[dict] = [None] * n
        for (i, (tok, ln, meta)) in enumerate(res['results']):
            tok_arr[i] = tok
            len_arr[i] = ln
            meta_arr[i] = meta
        if store_writer is not None:
            flows = _flows_for_labeled_chunk(res, meta_arr, n)
            # np.lexsort treats the LAST key as primary: label, then the
            # 5-tuple, then start_ts.
            order = np.lexsort((flows['start_ts'].to_numpy(dtype=np.float64), flows['protocol'].to_numpy(dtype=np.int64), flows['dst_port'].to_numpy(dtype=np.int64), flows['src_port'].to_numpy(dtype=np.int64), flows['dst_ip'].to_numpy(dtype=object), flows['src_ip'].to_numpy(dtype=object), flows['label'].to_numpy(dtype=object)))
            store_writer.add_batch(tok_arr[order], len_arr[order], flows.iloc[order].reset_index(drop=True))
        else:
            tok_chunks.append(tok_arr)
            len_chunks.append(len_arr)
            meta_chunks.append(meta_arr)
            label_chunks.append(np.full(n, lbl, dtype=object))
            ex: dict[str, list] = {}
            for col in extra_column_names:
                val = extra.get(col, '')
                ex[col] = [val] * n
            extra_chunks.append(ex)
    if n_jobs <= 1:
        # Sequential path: run the worker in-process, one pcap at a time.
        try:
            for (i, (p, lbl, extra)) in enumerate(pcap_files_with_labels):
                task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{Path(p).stem}')
                res = _extract_single_pcap_worker(str(p), lbl, extra, T_full, idle_timeout, max_packets_per_pcap, task_spool, worker_flush_size)
                _absorb(res)
                total_flows += int(res.get('n_flows', 0))
                if store_writer is not None:
                    _append_labeled_spool_chunks(res)
        finally:
            # The spool is scratch space; remove it even on failure.
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    else:
        try:
            with ProcessPoolExecutor(max_workers=n_jobs) as pool:
                futs = []
                for (i, (p, lbl, extra)) in enumerate(pcap_files_with_labels):
                    task_spool = None if spool_root is None else str(spool_root / f'task-{i:04d}-{Path(p).stem}')
                    futs.append(pool.submit(_extract_single_pcap_worker, str(p), lbl, extra, T_full, idle_timeout, max_packets_per_pcap, task_spool, worker_flush_size))
                if store_writer is not None:
                    # Collect results as they finish, then append them in the
                    # caller's input order so the store layout does not depend
                    # on worker scheduling.
                    # NOTE(review): results are keyed by pcap path, so the same
                    # path listed twice would collide here — confirm inputs are
                    # unique.
                    completed: dict[str, dict] = {}
                    for fut in as_completed(futs):
                        res = fut.result()
                        pcap_name = Path(res['pcap']).name
                        print(f"[pcap:{pcap_name}] label={res['label']} {res['n_pkts']:,} pkts → {res['n_flows']:,} flows in {res['elapsed']:.1f}s ({res['n_pkts'] / max(res['elapsed'], 0.001) / 1000000.0:.2f}M pkts/s)")
                        completed[str(res['pcap'])] = res
                    for (p, _, _) in pcap_files_with_labels:
                        res = completed.get(str(p))
                        if res is not None:
                            total_flows += int(res.get('n_flows', 0))
                            _append_labeled_spool_chunks(res)
                else:
                    # Monolithic mode: completion order is irrelevant because
                    # everything is sorted globally below.
                    for fut in as_completed(futs):
                        _absorb(fut.result())
        finally:
            if spool_root is not None:
                shutil.rmtree(spool_root, ignore_errors=True)
    if store_writer is not None:
        # Sharded mode ends here; the monolithic artifacts are not produced.
        if total_flows == 0:
            raise RuntimeError('No flows emitted — check pcap contents.')
        store_writer.close()
        print(f'[extract_labeled_pcaps] wrote sharded store {out_store}')
        return
    if not tok_chunks:
        raise RuntimeError('No flows emitted — check pcap contents.')
    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    meta_list: list[dict] = [m for chunk in meta_chunks for m in chunk]
    labels = np.concatenate(label_chunks, axis=0)
    extra_dict: dict[str, list] = {col: [] for col in extra_column_names}
    for chunk in extra_chunks:
        for col in extra_column_names:
            extra_dict[col].extend(chunk[col])
    # Drop the per-pcap pieces now that the concatenated copies exist.
    del tok_chunks, len_chunks, meta_chunks, label_chunks, extra_chunks
    sip_arr = np.asarray([m['src_ip'] for m in meta_list], dtype=object)
    dip_arr = np.asarray([m['dst_ip'] for m in meta_list], dtype=object)
    sp_arr = np.asarray([m['src_port'] for m in meta_list], dtype=np.int64)
    dp_arr = np.asarray([m['dst_port'] for m in meta_list], dtype=np.int64)
    pr_arr = np.asarray([m['protocol'] for m in meta_list], dtype=np.int64)
    ts_arr = np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64)
    # Global order: label first (last lexsort key), then 5-tuple, then start_ts.
    order = np.lexsort((ts_arr, pr_arr, dp_arr, sp_arr, dip_arr, sip_arr, labels))
    tokens = tokens[order]
    lengths = lengths[order]
    labels = labels[order]
    meta_list = [meta_list[i] for i in order]
    for col in extra_column_names:
        extra_dict[col] = [extra_dict[col][i] for i in order]
    N = len(tokens)
    # flow_id is simply the row position after sorting.
    flow_id = np.arange(N, dtype=np.uint64)
    print(f'\n[extract_labeled_pcaps] total flows: {N:,}')
    print(f'[extract_labeled_pcaps] label distribution:')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')
    out_packets.parent.mkdir(parents=True, exist_ok=True)
    np.savez_compressed(out_packets, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'[extract_labeled_pcaps] wrote {out_packets} ({out_packets.stat().st_size / 1000000000.0:.2f} GB)')
    out_flows.parent.mkdir(parents=True, exist_ok=True)
    cols = {'flow_id': flow_id, 'label': labels.astype(str), 'start_ts': np.asarray([m['start_ts'] for m in meta_list], dtype=np.float64), 'src_ip': np.asarray([m['src_ip'] for m in meta_list], dtype=object), 'dst_ip': np.asarray([m['dst_ip'] for m in meta_list], dtype=object), 'src_port': np.asarray([m['src_port'] for m in meta_list], dtype=np.uint32), 'dst_port': np.asarray([m['dst_port'] for m in meta_list], dtype=np.uint32), 'protocol': np.asarray([m['protocol'] for m in meta_list], dtype=np.uint8), 'n_pkts': np.asarray([m['n_pkts'] for m in meta_list], dtype=np.uint32)}
    for col in extra_column_names:
        cols[col] = np.asarray(extra_dict[col], dtype=object)
    flow_df = pd.DataFrame(cols)
    flow_df.to_parquet(out_flows, compression='snappy', index=False)
    print(f'[extract_labeled_pcaps] wrote {out_flows} ({out_flows.stat().st_size / 1000000.0:.2f} MB) cols={list(flow_df.columns)}')
    # Canonical flow features go into the same directory as out_flows.
    _write_canonical_flow_features(tokens=tokens, lengths=lengths, flow_id=flow_id, labels=labels.astype(str), out_path=out_flows.parent / 'flow_features.parquet')

View File

@@ -0,0 +1,97 @@
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES, compute_flow_features_from_packets
def _from_npz(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compute canonical flow features from a monolithic ``packets.npz``.

    Reads ``args.packets_npz`` and ``args.flows_parquet``, optionally truncates
    the packet axis to ``args.T``, verifies that both files describe the same
    rows, and runs ``compute_flow_features_from_packets`` over all flows.

    Returns:
        ``(feats, flow_id, labels, T)`` where ``T`` is a one-element array
        holding the sequence length that was used.

    Raises:
        ValueError: If ``args.T`` exceeds the stored length, or the npz and
            parquet disagree on row count or ``flow_id``.
    """
    print(f'[read] {args.packets_npz}')
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager releases the handle once the arrays have been materialised.
    with np.load(args.packets_npz) as pz:
        tokens = pz['packet_tokens']
        lens = pz['packet_lengths'].astype(np.int32)
        packet_flow_id = pz['flow_id'] if 'flow_id' in pz.files else None
    T_stored = tokens.shape[1]
    if args.T is not None:
        if args.T > T_stored:
            raise ValueError(f'requested T={args.T} > stored {T_stored}')
        tokens = tokens[:, :args.T, :]
        # Lengths must not point past the truncated sequence.
        lens = np.minimum(lens, args.T).astype(np.int32)
    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label'])
    if len(flows) != len(tokens):
        raise ValueError(f'row count mismatch: packets={len(tokens):,} flows={len(flows):,}')
    flow_id = np.asarray(flows['flow_id'].to_numpy(), dtype=np.uint64)
    if packet_flow_id is not None:
        if not np.array_equal(flow_id, packet_flow_id.astype(np.uint64)):
            raise ValueError('packets.npz flow_id != flows.parquet flow_id')
    labels = flows['label'].astype(str).to_numpy()
    # The separator keeps T and the feature count from printing as one number.
    print(f'[compute] {len(tokens):,} flows × T={tokens.shape[1]} → {len(CANONICAL_FLOW_FEATURE_NAMES)} features ...')
    t0 = time.time()
    feats = compute_flow_features_from_packets(tokens, lens)
    dt = time.time() - t0
    print(f'[compute] {dt:.1f}s ({len(tokens) / max(dt, 1e-06):.0f} flows/s)')
    return (feats, flow_id, labels, np.array([T_stored if args.T is None else args.T]))
def _from_store(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Stream packets from a ``PacketShardStore`` and compute flow features.

    Flows are read in batches of ``args.batch`` rows so the whole store never
    has to be resident at once. ``args.flows_parquet`` must list the same
    ``flow_id`` values in the same order as the store.

    Returns:
        ``(feats, flow_id, labels, T)`` where ``T`` is a one-element array
        holding the sequence length that was used.

    Raises:
        ValueError: If ``args.batch`` is not positive, ``args.T`` exceeds the
            largest ``packet_length`` in the store manifest, or the store and
            parquet disagree on row count or ``flow_id`` order.
    """
    # A negative step makes range() empty (all-zero features would be written
    # silently); zero makes range() raise an unrelated-looking error.
    if args.batch <= 0:
        raise ValueError(f'--batch must be a positive integer, got {args.batch}')
    # packet_store lives in the sibling Packet_CFM directory.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'Packet_CFM'))
    from packet_store import PacketShardStore
    store = PacketShardStore.open(args.source_store)
    T_stored = int(store.manifest['packet_length'].max())
    T = args.T if args.T is not None else T_stored
    if T > T_stored:
        raise ValueError(f'requested T={T} > stored max {T_stored}')
    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label'])
    n = len(flows)
    store_flows = store.read_flows(columns=['flow_id'])
    if len(store_flows) != n:
        raise ValueError(f'store has {len(store_flows):,} rows but flows.parquet has {n:,}')
    flow_id = flows['flow_id'].to_numpy(dtype=np.uint64)
    if not np.array_equal(store_flows['flow_id'].to_numpy(dtype=np.uint64), flow_id):
        raise ValueError('store flow_id ordering differs from flows.parquet')
    labels = flows['label'].astype(str).to_numpy()
    feats = np.zeros((n, len(CANONICAL_FLOW_FEATURE_NAMES)), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (full={T_stored}), batch={args.batch} ...')
    t0 = time.time()
    all_idx = np.arange(n, dtype=np.int64)
    for start in range(0, n, args.batch):
        end = min(start + args.batch, n)
        idx = all_idx[start:end]
        (tok, lens) = store.read_packets(idx, T=T)
        lens = np.minimum(lens, T).astype(np.int32)
        feats[start:end] = compute_flow_features_from_packets(tok, lens)
        # Progress line every 20th batch and after the final one.
        if start // args.batch % 20 == 0 or end == n:
            dt = time.time() - t0
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)
    return (feats, flow_id, labels, np.array([T]))
def main() -> None:
    """CLI entry point: compute canonical flow features and write a parquet.

    Exactly one packet source (``--packets-npz`` or ``--source-store``) must
    be given. The output has ``flow_id``, ``label`` and one column per name in
    ``CANONICAL_FLOW_FEATURE_NAMES``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--packets-npz', type=Path, default=None, help='Monolithic packets.npz path (mutually exclusive with --source-store).')
    parser.add_argument('--source-store', type=Path, default=None, help='PacketShardStore directory (mutually exclusive with --packets-npz).')
    parser.add_argument('--flows-parquet', type=Path, required=True)
    parser.add_argument('--out', type=Path, required=True)
    parser.add_argument('--T', type=int, default=None, help='Truncate packet sequences to first T positions (default: use stored T_full).')
    parser.add_argument('--batch', type=int, default=100000, help='Batch size when streaming from --source-store.')
    args = parser.parse_args()

    use_npz = args.packets_npz is not None
    use_store = args.source_store is not None
    # Both set or both missing is an error; exactly one source is required.
    if use_npz == use_store:
        parser.error('pass exactly one of --packets-npz or --source-store')

    loader = _from_npz if use_npz else _from_store
    (feats, flow_id, labels, _) = loader(args)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    columns = {'flow_id': flow_id, 'label': labels}
    for (col_idx, feature_name) in enumerate(CANONICAL_FLOW_FEATURE_NAMES):
        columns[feature_name] = feats[:, col_idx]
    table = pd.DataFrame(columns)
    table.to_parquet(args.out, compression='snappy', index=False)
    size_mb = args.out.stat().st_size / 1000000.0
    print(f'[write] {args.out} ({size_mb:.2f} MB, {len(table):,} rows × {len(table.columns)} cols)')


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,122 @@
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from common.data_contract import CANONICAL_FLOW_FEATURE_NAMES
def compute_spectral_features(packet_tokens: np.ndarray, packet_lengths: np.ndarray, n_bands: int=8) -> np.ndarray:
    """Return the lowest ``n_bands`` FFT coefficients of the first two channels.

    Positions at or beyond each flow's ``packet_lengths`` entry are zeroed
    before a real FFT is taken along the packet axis. Channel 0 is what this
    file names "size" and channel 1 what it names "iat".

    Args:
        packet_tokens: Array of shape ``(N, T, D)`` with ``D >= 2``.
        packet_lengths: Length-``N`` array of valid packet counts per flow.
        n_bands: Number of leading frequency bins to keep.

    Returns:
        ``float32`` array of shape ``(N, 4 * n_bands)`` laid out as
        ``[size_re | size_im | iat_re | iat_im]``, each ``n_bands`` wide.

    Raises:
        ValueError: If ``n_bands`` is negative or exceeds the ``T // 2 + 1``
            bins a real FFT of length ``T`` produces.
    """
    (_, T, _) = packet_tokens.shape
    # Validate before the FFT so bad arguments fail without doing the work.
    n_bins = T // 2 + 1
    if n_bands < 0:
        # A negative value would slice "all but the last k bins" silently.
        raise ValueError(f'n_bands={n_bands} must be non-negative')
    if n_bands > n_bins:
        raise ValueError(f'n_bands={n_bands} > {n_bins} available bins')
    mask = (np.arange(T)[None, :] < packet_lengths[:, None]).astype(np.float32)
    sig = packet_tokens[..., :2].astype(np.float32) * mask[..., None]
    Z_K = np.fft.rfft(sig, axis=1)[:, :n_bands]
    size_re = Z_K[..., 0].real.astype(np.float32)
    size_im = Z_K[..., 0].imag.astype(np.float32)
    iat_re = Z_K[..., 1].real.astype(np.float32)
    iat_im = Z_K[..., 1].imag.astype(np.float32)
    return np.concatenate([size_re, size_im, iat_re, iat_im], axis=1)
def _spectral_column_names(n_bands: int) -> list[str]:
cols: list[str] = []
for prefix in ('spec_size_re', 'spec_size_im', 'spec_iat_re', 'spec_iat_im'):
for k in range(n_bands):
cols.append(f'{prefix}_K{k}')
return cols
def _from_npz(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray | None, np.ndarray]:
    """Compute spectral features from a monolithic ``packets.npz``.

    Optionally truncates the packet axis to ``args.T`` before calling
    ``compute_spectral_features`` with ``args.n_bands``.

    Returns:
        ``(spec, flow_id, lens)``; ``flow_id`` is None when the archive has
        no ``flow_id`` array.

    Raises:
        ValueError: If ``args.T`` exceeds the stored sequence length.
    """
    print(f'[read] {args.packets_npz}')
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager releases the handle once the arrays have been materialised.
    with np.load(args.packets_npz) as pz:
        tokens = pz['packet_tokens']
        lens = pz['packet_lengths'].astype(np.int32)
        flow_id = pz['flow_id'].astype(np.uint64) if 'flow_id' in pz.files else None
    if args.T is not None:
        if args.T > tokens.shape[1]:
            raise ValueError(f'requested T={args.T} > stored {tokens.shape[1]}')
        tokens = tokens[:, :args.T, :]
        # Lengths must not point past the truncated sequence.
        lens = np.minimum(lens, args.T).astype(np.int32)
    # The separator keeps T and the column count from printing as one number.
    print(f'[compute] {len(tokens):,} flows × T={tokens.shape[1]} → {4 * args.n_bands} spectral cols ...')
    t0 = time.time()
    spec = compute_spectral_features(tokens, lens, n_bands=args.n_bands)
    print(f'[compute] {time.time() - t0:.1f}s')
    return (spec, flow_id, lens)
def _from_store(args: argparse.Namespace) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Stream packets from a ``PacketShardStore`` and compute spectral features.

    Flows are read in batches of ``args.batch`` rows and passed to
    ``compute_spectral_features`` with ``args.n_bands``.

    Returns:
        ``(spec, flow_id, None)``; the third slot is always None on this path.

    Raises:
        ValueError: If ``args.batch`` is not positive or ``args.T`` exceeds
            the largest ``packet_length`` in the store manifest.
    """
    # A negative step makes range() empty (all-zero features would be written
    # silently); zero makes range() raise an unrelated-looking error.
    if args.batch <= 0:
        raise ValueError(f'--batch must be a positive integer, got {args.batch}')
    # packet_store lives in the sibling Packet_CFM directory.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'Packet_CFM'))
    from packet_store import PacketShardStore
    store = PacketShardStore.open(args.source_store)
    T_stored = int(store.manifest['packet_length'].max())
    T = args.T if args.T is not None else T_stored
    if T > T_stored:
        raise ValueError(f'requested T={T} > stored max {T_stored}')
    store_flows = store.read_flows(columns=['flow_id'])
    n = len(store_flows)
    flow_id = store_flows['flow_id'].to_numpy(dtype=np.uint64)
    spec = np.zeros((n, 4 * args.n_bands), dtype=np.float32)
    print(f'[stream] {n:,} flows × T={T} (full={T_stored}), batch={args.batch} ...')
    t0 = time.time()
    all_idx = np.arange(n, dtype=np.int64)
    for start in range(0, n, args.batch):
        end = min(start + args.batch, n)
        idx = all_idx[start:end]
        (tok, lens) = store.read_packets(idx, T=T)
        lens = np.minimum(lens, T).astype(np.int32)
        spec[start:end] = compute_spectral_features(tok, lens, n_bands=args.n_bands)
        # Progress line every 20th batch and after the final one.
        if start // args.batch % 20 == 0 or end == n:
            dt = time.time() - t0
            rate = end / max(dt, 1e-06)
            eta = (n - end) / max(rate, 1.0)
            print(f'[stream] {end:,}/{n:,} dt={dt:.1f}s rate={rate:.0f} flows/s ETA={eta:.0f}s', flush=True)
    return (spec, flow_id, None)
def main() -> None:
    """CLI entry point: append spectral columns to the canonical flow features.

    Reads ``--flows-parquet`` and ``--base-features``, checks they are
    row-aligned, computes spectral features from exactly one packet source
    (``--packets-npz`` or ``--source-store``) and writes ``flow_id``,
    ``label``, the canonical features and ``4 * n_bands`` spectral columns to
    ``--out``.

    Raises:
        ValueError: On row-count or ``flow_id`` mismatches between inputs, or
            when the base parquet lacks a canonical feature column.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('--packets-npz', type=Path, default=None, help='Monolithic packets.npz path (mutually exclusive with --source-store).')
    p.add_argument('--source-store', type=Path, default=None, help='PacketShardStore directory (mutually exclusive with --packets-npz).')
    p.add_argument('--flows-parquet', type=Path, required=True, help='flows.parquet for flow_id + label.')
    p.add_argument('--base-features', type=Path, required=True, help='Existing canonical flow_features.parquet (20-d).')
    p.add_argument('--out', type=Path, required=True)
    p.add_argument('--n-bands', type=int, default=8)
    p.add_argument('--T', type=int, default=None, help='Truncate to first T packets before FFT (default: stored T).')
    p.add_argument('--batch', type=int, default=100000)
    args = p.parse_args()
    if (args.packets_npz is None) == (args.source_store is None):
        p.error('pass exactly one of --packets-npz or --source-store')
    print(f'[read] {args.flows_parquet}')
    flows = pd.read_parquet(args.flows_parquet, columns=['flow_id', 'label'])
    n = len(flows)
    flow_id_flows = flows['flow_id'].to_numpy(dtype=np.uint64)
    labels = flows['label'].astype(str).to_numpy()
    print(f'[read] {args.base_features}')
    base = pd.read_parquet(args.base_features)
    if len(base) != n:
        raise ValueError(f'base features rows {len(base):,} != flows rows {n:,}')
    if 'flow_id' in base.columns:
        if not np.array_equal(base['flow_id'].to_numpy(dtype=np.uint64), flow_id_flows):
            raise ValueError('base flow_id != flows flow_id (row alignment broken)')
    # Fail fast: check the base columns before the expensive spectral pass
    # rather than after it.
    for name in CANONICAL_FLOW_FEATURE_NAMES:
        if name not in base.columns:
            raise ValueError(f'base parquet missing canonical feature {name!r}')
    if args.packets_npz is not None:
        (spec, flow_id_pkt, _) = _from_npz(args)
    else:
        (spec, flow_id_pkt, _) = _from_store(args)
    if flow_id_pkt is not None and (not np.array_equal(flow_id_pkt, flow_id_flows)):
        raise ValueError('packet flow_id != flows flow_id')
    out_df = pd.DataFrame({'flow_id': flow_id_flows, 'label': labels})
    for name in CANONICAL_FLOW_FEATURE_NAMES:
        out_df[name] = base[name].to_numpy(dtype=np.float32)
    spec_cols = _spectral_column_names(args.n_bands)
    for (i, name) in enumerate(spec_cols):
        out_df[name] = spec[:, i]
    args.out.parent.mkdir(parents=True, exist_ok=True)
    out_df.to_parquet(args.out, compression='snappy', index=False)
    sz_mb = args.out.stat().st_size / 1000000.0
    print(f'[write] {args.out} ({sz_mb:.2f} MB, {len(out_df):,} rows × {len(out_df.columns)} cols, +{4 * args.n_bands} spectral)')


if __name__ == '__main__':
    main()

132
scripts/iscxtor_companion.sh Executable file
View File

@@ -0,0 +1,132 @@
#!/bin/bash
# Companion to scripts/repr_experiment.sh.
#
# Timeline:
# Phase I (parallel with main pipeline after S2b):
# extract ISCXTor2016 pcaps into unified artifacts.
# CPU-bound, coexists with GPU training of E0/E1/E2.
# Phase II (after main pipeline DONE):
# for each trained model (E0, E1, E2), run detect + per_class
# against ISCXTor2016 (benign=nontor, attack=tor), emitting
# `iscxtor_eval/` subdir per model.
# Phase III : unified summary across both transfer targets.
#
# Log layout (reused from main repr_experiment):
# $MAIN_DIR/companion.log — this script's orchestration log
# $MAIN_DIR/companion_iscxtor_extract.log
# $MAIN_DIR/<tag>/iscxtor_eval/ — per-model ISCXTor2016 results
# No `-e`: stage failures are handled explicitly by run_stage. `-u` rejects
# unset variables and `pipefail` propagates failures through pipelines.
set -uo pipefail
ROOT=/home/chy/mambafortrafficmodeling
cd "$ROOT"
# Output directory of the main repr_experiment run this companion attaches to.
MAIN_DIR="runs/repr_experiment_20260423_092147"
# Orchestration log of the main pipeline; polled by wait_for_pattern.
MAIN_LOG="$MAIN_DIR/orch.log"
COMP_LOG="$MAIN_DIR/companion.log"
mkdir -p "$MAIN_DIR"
# Mirror all stdout/stderr of this script into the companion log (append).
exec > >(tee -a "$COMP_LOG") 2>&1
# Split parameters passed to detect (--n-val / --n-atk / --seed).
N_VAL=20000
N_ATK=20000 # ISCXTor2016 has fewer attack flows than CICDDoS2019
SPLIT_SEED=42
echo "========================================================================"
echo "= $(date): iscxtor_companion START ="
echo "= main dir: $MAIN_DIR ="
echo "========================================================================"
wait_for_pattern() {
    # Usage: wait_for_pattern <grep-regex> <log-file> <description>
    #
    # Poll <log-file> once a minute until a line matches <grep-regex>, with a
    # heartbeat every ten minutes. A missing log file counts as "not yet".
    #
    # The main orchestrator logs "!!! <date>: [<stage>] FAILED after ..." and
    # exits when a stage fails, so the awaited pattern would never appear.
    # Abort in that case instead of polling forever.
    local pattern=$1 log=$2 desc=$3
    echo ">>> $(date): waiting for '$desc' (pattern='$pattern' in $log)"
    local waited=0
    while ! grep -q "$pattern" "$log" 2>/dev/null; do
        if grep -q '^!!! .*\] FAILED after' "$log" 2>/dev/null; then
            echo "!!! $(date): $log reports a FAILED stage — giving up waiting for '$desc'"
            exit 1
        fi
        sleep 60
        waited=$((waited + 60))
        if (( waited % 600 == 0 )); then
            echo " [heartbeat $(date +%H:%M:%S)] waited ${waited}s for $desc"
        fi
    done
    echo "<<< $(date): '$desc' detected after ${waited}s wait"
}
run_stage() {
    # Usage: run_stage <name> <command> [args...]
    #
    # Run the command with stdout+stderr captured in $MAIN_DIR/<name>.log and
    # report wall-clock time. On failure, print the last 30 log lines and exit
    # the whole script; on success, echo the last 6 log lines as a summary.
    local name=$1
    shift
    local log="$MAIN_DIR/${name}.log"
    local t0 t1
    echo ""
    echo ">>> $(date): [$name] START"
    t0=$(date +%s)
    if "$@" > "$log" 2>&1; then
        t1=$(date +%s)
        echo "<<< $(date): [$name] OK in $((t1-t0))s (log: $log)"
        tail -6 "$log" | sed 's/^/ | /'
    else
        t1=$(date +%s)
        echo "!!! $(date): [$name] FAILED after $((t1-t0))s — see $log"
        tail -30 "$log"
        exit 1
    fi
}
# =====================================================================
# Phase I — ISCXTor2016 extraction (after S2b, parallel with main training)
# =====================================================================
# Block until the main orchestrator log reports the S2b stage as OK.
wait_for_pattern "s2b_extract_cicddos2019_01-12.*OK" "$MAIN_LOG" \
    "S2b CICDDoS2019 01-12 extraction to complete"
# Lowered CPU priority (nice 10) and idle I/O class (ionice -c 3) so the
# extraction yields to the main pipeline running alongside it.
run_stage "companion_iscxtor_extract" \
    nice -n 10 ionice -c 3 uv run python scripts/extract_iscxtor2016.py \
    --skip-decompress --jobs 6
# =====================================================================
# Phase II — wait for main pipeline DONE, then detect + per_class
# =====================================================================
# The evaluations below need the trained models, so wait for the final marker.
wait_for_pattern "repr_experiment DONE" "$MAIN_LOG" \
    "main repr_experiment to finish (S7 summary)"
detect_and_per_class_iscxtor() {
    # Usage: detect_and_per_class_iscxtor <tag>
    #
    # Evaluate the model trained under $MAIN_DIR/<tag> on ISCXTor2016: run
    # detect, then eval.per_class, both writing into <tag>/iscxtor_eval.
    # Returns 1 without running anything when <tag>/model.pt is missing.
    local tag=$1
    local model_dir="$MAIN_DIR/$tag"
    local eval_dir="$model_dir/iscxtor_eval"
    if [[ ! -f "$model_dir/model.pt" ]]; then
        echo "!!! $(date): [$tag] model.pt not found at $model_dir — skipping"
        return 1
    fi
    mkdir -p "$eval_dir"
    # Symlink the trained model into the eval subdir — detect.py reads model.pt
    # from --save-dir. This keeps the original $tag/ directory pristine
    # (CICDDoS2019 artifacts stay where they were).
    ln -sf "../model.pt" "$eval_dir/model.pt"
    local -a detect_args=(
        --save-dir "$eval_dir"
        --packets-npz datasets/iscxtor2016/processed/packets.npz
        --flows-parquet datasets/iscxtor2016/processed/flows.parquet
        --benign-label nontor
        --per-class-column activity
        --n-val "$N_VAL" --n-atk "$N_ATK" --seed "$SPLIT_SEED"
    )
    run_stage "${tag}_detect_iscxtor" \
        uv run python -m detect "${detect_args[@]}"
    run_stage "${tag}_per_class_iscxtor" \
        uv run python -m eval.per_class --save-dir "$eval_dir"
}
# One evaluation per trained model. A missing model.pt makes the helper
# return 1 and, without `set -e`, the script moves on to the next tag; a
# failed stage exits the script from inside run_stage.
detect_and_per_class_iscxtor "e0_baseline"
detect_and_per_class_iscxtor "e1_relv2"
detect_and_per_class_iscxtor "e2_relv2_ctx"
# =====================================================================
# Phase III — unified summary across both transfer targets
# =====================================================================
run_stage "companion_summary" \
    uv run python scripts/summarize_repr_exp.py --root "$MAIN_DIR" --with-iscxtor
echo ""
echo "========================================================================"
echo "= $(date): iscxtor_companion DONE ="
echo "= results: $MAIN_DIR/{summary.txt, summary.json} ="
echo "========================================================================"

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
# Default --dir: where the per-shard inputs are read from and, unless
# overridden, where the merged outputs are written.
DEFAULT_DIR = Path('datasets/cicddos2019/processed')
def _load_shard(dir: Path, shard: str) -> tuple[dict, pd.DataFrame]:
p = np.load(dir / f'packets.{shard}.npz')
f = pd.read_parquet(dir / f'flows.{shard}.parquet')
assert set(p.files) == {'packet_tokens', 'packet_lengths', 'flow_id'}, p.files
assert set(f.columns) == {'flow_id', 'label'}, f.columns
assert len(p['flow_id']) == len(f), f'row count mismatch in {shard}'
assert np.array_equal(p['flow_id'], f['flow_id'].to_numpy()), f'flow_id mismatch in {shard}'
return ({'packet_tokens': p['packet_tokens'], 'packet_lengths': p['packet_lengths'], 'flow_id': p['flow_id']}, f)
def main() -> None:
    """Merge the ``01-12`` and ``03-11`` shards into one packets/flows pair.

    Rows of ``01-12`` come first, followed by ``03-11``; ``flow_id`` is
    reassigned as ``0..N-1`` over the merged rows. Outputs default to
    ``packets.npz`` and ``flows.parquet`` inside ``--dir``.

    Raises:
        RuntimeError: If the merged arrays and flow table end up misaligned.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--dir', type=Path, default=DEFAULT_DIR)
    ap.add_argument('--out-packets', type=Path, default=None)
    ap.add_argument('--out-flows', type=Path, default=None)
    args = ap.parse_args()
    out_p = args.out_packets or args.dir / 'packets.npz'
    out_f = args.out_flows or args.dir / 'flows.parquet'
    print(f'=== merging shards from {args.dir} ===')
    (p1, f1) = _load_shard(args.dir, '01-12')
    (p3, f3) = _load_shard(args.dir, '03-11')
    n1 = len(f1)
    n3 = len(f3)
    N = n1 + n3
    print(f'01-12 rows: {n1:,} 03-11 rows: {n3:,} total: {N:,}')
    tokens = np.concatenate([p1['packet_tokens'], p3['packet_tokens']], axis=0)
    lengths = np.concatenate([p1['packet_lengths'], p3['packet_lengths']], axis=0)
    # The per-shard arrays are copies of what is now in tokens/lengths; drop
    # them so peak memory during the write is one copy, not two.
    del p1, p3
    flow_id = np.arange(N, dtype=np.uint64)
    print(f' tokens shape={tokens.shape} dtype={tokens.dtype}')
    print(f' lengths shape={lengths.shape} dtype={lengths.dtype}')
    flows = pd.concat([f1.drop(columns=['flow_id']), f3.drop(columns=['flow_id'])], ignore_index=True)
    flows.insert(0, 'flow_id', flow_id)
    print(f" flows rows={len(flows):,} label unique={flows['label'].nunique()}")
    # Explicit raises instead of assert: these checks must survive `python -O`.
    if not len(tokens) == N == len(flows):
        raise RuntimeError(f'merged row counts disagree: tokens={len(tokens):,} expected={N:,} flows={len(flows):,}')
    if not np.array_equal(flow_id, flows['flow_id'].to_numpy()):
        raise RuntimeError('merged flow_id column does not match the packet flow_id array')
    print(f'\n=== writing {out_p} ===')
    out_p.parent.mkdir(parents=True, exist_ok=True)
    # Uncompressed on purpose: this project saw np.savez_compressed produce an
    # unreadable archive for the merged 50+ GB array, and the general-purpose
    # shard merger already writes with np.savez.
    np.savez(out_p, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    sz = out_p.stat().st_size / 1000000000.0
    print(f' wrote {sz:.2f} GB')
    print(f'\n=== writing {out_f} ===')
    # --out-flows may point somewhere other than the packets directory.
    out_f.parent.mkdir(parents=True, exist_ok=True)
    flows.to_parquet(out_f, compression='snappy', index=False)
    sz = out_f.stat().st_size / 1000000.0
    print(f' wrote {sz:.2f} MB')
    print(f'\n=== summary ===')
    print(flows['label'].value_counts().to_string())


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,70 @@
from __future__ import annotations
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
# Label spellings rewritten during the merge (old -> canonical); labels not
# listed here pass through unchanged.
LABEL_ALIASES = {'UDP-lag': 'UDPLag'}
def _infer_flows_path(packets_path: Path) -> Path:
name = packets_path.name
if name.startswith('packets.'):
flows_name = 'flows.' + name[len('packets.'):].removesuffix('.npz') + '.parquet'
else:
raise ValueError(f'Cannot infer flows path from {packets_path}')
return packets_path.parent / flows_name
def main() -> None:
    """Merge any number of packet/flow shards into one packets/flows pair.

    Each ``--in packets.<shard>.npz`` is paired with its inferred
    ``flows.<shard>.parquet``. Shards are concatenated in the order given,
    labels are normalised through ``LABEL_ALIASES`` and ``flow_id`` is
    reassigned as ``0..N-1``. Packets are written uncompressed with
    ``np.savez``.

    Raises:
        FileNotFoundError: If a shard's npz or parquet file is missing.
        ValueError: If a shard's two files disagree, or shards differ in
            their token array's second or third dimension.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--in', dest='inputs', action='append', type=Path, required=True, help='packets.<shard>.npz path. Pass multiple times (one per shard). flows.<shard>.parquet is inferred.')
    ap.add_argument('--out-packets', type=Path, required=True)
    ap.add_argument('--out-flows', type=Path, required=True)
    args = ap.parse_args()
    tok_chunks: list[np.ndarray] = []
    len_chunks: list[np.ndarray] = []
    flow_dfs: list[pd.DataFrame] = []
    META_COLS = ['flow_id', 'label', 'start_ts', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'protocol', 'n_pkts']
    for pkt_path in args.inputs:
        flow_path = _infer_flows_path(pkt_path)
        if not pkt_path.exists():
            raise FileNotFoundError(pkt_path)
        if not flow_path.exists():
            raise FileNotFoundError(flow_path)
        # Read the parquet once and project to the known metadata columns,
        # instead of one full read to list columns plus a second read.
        f = pd.read_parquet(flow_path)
        cols = [c for c in META_COLS if c in f.columns]
        f = f[cols]
        # The context manager closes the archive after the arrays are loaded;
        # flow_id is read once rather than on every use.
        with np.load(pkt_path) as p:
            shard_flow_id = p['flow_id']
            if len(shard_flow_id) != len(f):
                raise ValueError(f'{pkt_path.name}: row count mismatch with {flow_path.name}')
            if not np.array_equal(shard_flow_id, f['flow_id'].to_numpy()):
                raise ValueError(f'{pkt_path.name}: flow_id mismatch with {flow_path.name}')
            tok_chunks.append(np.asarray(p['packet_tokens']))
            len_chunks.append(np.asarray(p['packet_lengths']))
        flow_dfs.append(f)
        print(f"[load] {pkt_path.name} : {len(shard_flow_id):>10,} rows cols={cols}")
    T_full_set = {t.shape[1] for t in tok_chunks}
    D_set = {t.shape[2] for t in tok_chunks}
    if len(T_full_set) != 1 or len(D_set) != 1:
        raise ValueError(f'inconsistent T/D across shards: T={T_full_set} D={D_set}')
    tokens = np.concatenate(tok_chunks, axis=0)
    lengths = np.concatenate(len_chunks, axis=0)
    flow_df = pd.concat(flow_dfs, ignore_index=True)
    # Free the per-shard copies before the large write.
    del tok_chunks, len_chunks, flow_dfs
    if LABEL_ALIASES:
        flow_df['label'] = flow_df['label'].map(lambda s: LABEL_ALIASES.get(s, s)).astype(str)
    N = len(tokens)
    # Per-shard ids overlap, so the merged rows get fresh ids.
    flow_id = np.arange(N, dtype=np.uint64)
    flow_df['flow_id'] = flow_id
    labels = flow_df['label'].to_numpy().astype(str)
    print(f'\n[merge] total rows: {N:,}')
    print(f'[merge] label distribution:')
    (ulabels, counts) = np.unique(labels, return_counts=True)
    for (lbl, cnt) in sorted(zip(ulabels, counts), key=lambda x: -x[1]):
        print(f' {lbl:<40s} {cnt:>10,}')
    args.out_packets.parent.mkdir(parents=True, exist_ok=True)
    np.savez(args.out_packets, packet_tokens=tokens, packet_lengths=lengths, flow_id=flow_id)
    print(f'\n[merge] wrote {args.out_packets} ({args.out_packets.stat().st_size / 1000000000.0:.2f} GB)')
    args.out_flows.parent.mkdir(parents=True, exist_ok=True)
    flow_df.to_parquet(args.out_flows, compression='snappy', index=False)
    print(f'[merge] wrote {args.out_flows} ({args.out_flows.stat().st_size / 1000000.0:.2f} MB) cols={list(flow_df.columns)}')


if __name__ == '__main__':
    main()

124
scripts/repr_experiment.sh Executable file
View File

@@ -0,0 +1,124 @@
#!/bin/bash
# End-to-end representation experiment: re-extract CICIDS2017 + CICDDoS2019
# with metadata columns, then train E0/E1/E2 with fixed 10k benign and
# evaluate on CICDDoS2019.
#
# Stages (each with wall-clock logging + per-stage log file):
# S1 re-extract CICIDS2017 → datasets/cicids2017/processed/*
# S2a re-extract CICDDoS2019 03-11 shard
# S2b re-extract CICDDoS2019 01-12 shard
# S2c merge CICDDoS2019 shards
# S3 train E0 (mixed_dequant, no ctx) [configs/n10k_baseline.yaml]
# S4 train E1 (relative_v2, no ctx) [configs/n10k_relv2.yaml]
# S5 train E2 (relative_v2, with 8-d ctx) [configs/n10k_relv2_ctx.yaml]
# S6 detect+per_class for each on CICDDoS2019
# S7 summary table
#
# Any stage's failure aborts the rest and leaves the partial log intact.
# -u: referencing an unset variable is an error.
# -o pipefail: a pipeline fails if any command in it fails.
# Note: errexit (-e) is NOT enabled, so every command whose failure must stop
# the run has to be checked explicitly.
set -uo pipefail

ROOT=/home/chy/mambafortrafficmodeling
# Every path below (runs/, datasets/, scripts/, configs/) is relative to $ROOT.
# Without this check a failed cd would let the script create the output
# directory and launch stages from whatever directory it was started in.
cd "$ROOT" || { echo "ERROR: cannot cd to $ROOT" >&2; exit 1; }

# One timestamped output directory per invocation.
STAMP=$(date +%Y%m%d_%H%M%S)
OUT_DIR="runs/repr_experiment_${STAMP}"
mkdir -p "$OUT_DIR" || { echo "ERROR: cannot create $OUT_DIR" >&2; exit 1; }

# From here on, mirror stdout and stderr to the terminal and to orch.log.
MAIN_LOG="$OUT_DIR/orch.log"
exec > >(tee -a "$MAIN_LOG") 2>&1

# Values forwarded to the detect stage as --n-val / --n-atk / --seed.
N_VAL=20000
N_ATK=100000
SPLIT_SEED=42

echo "========================================================================"
echo "= $(date): repr_experiment start ="
echo "= output root: $OUT_DIR ="
echo "========================================================================"
# run_stage NAME CMD [ARGS...]
#
# Run CMD with its stdout and stderr redirected to "$OUT_DIR/NAME.log", and
# print START / OK / FAILED markers with the wall-clock duration in seconds.
#   - on success: print the last 10 log lines, each prefixed with " | ";
#   - on failure: print the last 30 log lines and terminate the whole script
#     with exit status 1.
run_stage() {
    local stage_name=$1
    shift
    local stage_log="$OUT_DIR/${stage_name}.log"
    local started finished status

    echo ""
    echo ">>> $(date): [$stage_name] START"
    echo ">>> $(date): [$stage_name] command: $*"

    started=$(date +%s)
    "$@" > "$stage_log" 2>&1
    status=$?
    finished=$(date +%s)

    if (( status != 0 )); then
        echo "!!! $(date): [$stage_name] FAILED after $((finished-started))s — see $stage_log"
        tail -30 "$stage_log"
        exit 1
    fi

    echo "<<< $(date): [$stage_name] OK in $((finished-started))s (log: $stage_log)"
    # Print tail of log so orch.log shows meaningful progress.
    tail -10 "$stage_log" | sed 's/^/ | /'
}
# ====================================================================
# S1 — re-extract CICIDS2017
# ====================================================================
run_stage "s1_extract_cicids2017" \
    uv run python scripts/extract_cicids2017.py --jobs 5 --time-offset 28800

# ====================================================================
# S2 — re-extract CICDDoS2019 (per-shard) + merge
# ====================================================================
ddos_processed=datasets/cicddos2019/processed

# S2a / S2b: one extraction stage per shard, each written to its own
# packets.<shard>.npz / flows.<shard>.parquet pair. Entries are
# "<stage-prefix> <shard>".
for shard_spec in "s2a 03-11" "s2b 01-12"; do
    stage_prefix=${shard_spec%% *}
    shard=${shard_spec#* }
    run_stage "${stage_prefix}_extract_cicddos2019_${shard}" \
        uv run python scripts/extract_cicddos2019.py \
            --shards "$shard" --jobs 1 \
            --out-packets "$ddos_processed/packets.${shard}.npz" \
            --out-flows "$ddos_processed/flows.${shard}.parquet"
done

# S2c: combine both shards into the final packets.npz / flows.parquet.
run_stage "s2c_merge_cicddos2019" \
    uv run python scripts/merge_shard_artifacts.py \
        --in "$ddos_processed/packets.03-11.npz" \
        --in "$ddos_processed/packets.01-12.npz" \
        --out-packets "$ddos_processed/packets.npz" \
        --out-flows "$ddos_processed/flows.parquet"
# ====================================================================
# S3..S5 — train E0 / E1 / E2 with the same 10k benign
# ====================================================================
# train_and_eval TAG CONFIG
#
# Train one experiment variant and evaluate it on CICDDoS2019:
#   1. copy CONFIG to "$OUT_DIR/TAG/config.yaml" and rewrite its top-level
#      `save_dir:` line to point at "$OUT_DIR/TAG";
#   2. run the train, detect and per-class stages through run_stage, which
#      terminates the script if a stage fails.
# Terminates the script with exit status 1 if the config cannot be staged.
train_and_eval() {
    local tag=$1 cfg=$2
    local run_dir="$OUT_DIR/$tag"
    local run_cfg="$run_dir/config.yaml"

    # errexit is not enabled, so check the staging steps explicitly instead
    # of letting the train stage fail later with a less direct error.
    if ! mkdir -p "$run_dir" || ! cp "$cfg" "$run_cfg"; then
        echo "!!! $(date): [$tag] cannot copy $cfg to $run_cfg"
        exit 1
    fi

    # The sed below only rewrites an existing top-level `save_dir:` line. If
    # there is none, it succeeds without changing anything, so say so.
    if ! grep -q '^save_dir:' "$run_cfg"; then
        echo "WARNING: [$tag] no top-level 'save_dir:' line in $cfg; save_dir was not patched to $run_dir"
    fi

    # Copy config and patch save_dir to our per-tag directory.
    if ! sed -i "s#^save_dir:.*#save_dir: $run_dir#" "$run_cfg"; then
        echo "!!! $(date): [$tag] cannot patch save_dir in $run_cfg"
        exit 1
    fi

    run_stage "${tag}_train" \
        uv run python -m train --config "$run_cfg"

    run_stage "${tag}_detect_ddos" \
        uv run python -m detect \
            --save-dir "$run_dir" \
            --packets-npz datasets/cicddos2019/processed/packets.npz \
            --flows-parquet datasets/cicddos2019/processed/flows.parquet \
            --n-val "$N_VAL" --n-atk "$N_ATK" --seed "$SPLIT_SEED"

    run_stage "${tag}_per_class" \
        uv run python -m eval.per_class --save-dir "$run_dir"
}
# Train + evaluate each variant in order. Entries are "<tag> <config>".
for experiment in \
    "e0_baseline configs/n10k_baseline.yaml" \
    "e1_relv2 configs/n10k_relv2.yaml" \
    "e2_relv2_ctx configs/n10k_relv2_ctx.yaml"
do
    train_and_eval "${experiment%% *}" "${experiment#* }"
done

# ====================================================================
# S7 — summary table
# ====================================================================
run_stage "s7_summary" \
    uv run python scripts/summarize_repr_exp.py --root "$OUT_DIR"

# Closing banner: only reached when every stage above succeeded, because
# run_stage exits the script on failure.
banner_rule="========================================================================"
printf '%s\n' \
    "" \
    "$banner_rule" \
    "= $(date): repr_experiment DONE =" \
    "= results under: $OUT_DIR =" \
    "$banner_rule"

View File

@@ -0,0 +1,93 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
# Class names listed in the "focus classes" section of the CICDDoS2019
# comparison block; main() passes this tuple to _render_block().
HARD_CLASSES = ('Syn', 'UDPLag', 'DrDoS_NTP')
def _load_pc(run_dir: Path) -> dict | None:
p = run_dir / 'per_class.json'
if not p.exists():
print(f'[warn] missing {p}')
return None
return json.loads(p.read_text())['terminal_norm']
def _render_block(title: str, data: list[dict], hard_classes: tuple[str, ...]) -> list[str]:
lines: list[str] = []
lines.append('')
lines.append('=' * 96)
lines.append(f'# {title}')
lines.append('=' * 96)
lines.append(f"{'experiment':<40s} {'overall AUROC':>14s} {'macro AUROC':>14s} {'TPR@1%FPR':>12s} {'FPR@95%TPR':>12s}")
lines.append('-' * 96)
if not data:
lines.append('(no results)')
return lines
base = data[0]['pc']
for d in data:
pc = d['pc']
delta_overall = pc['overall_auroc'] - base['overall_auroc']
delta_macro = pc['macro_auroc'] - base['macro_auroc']
delta_tpr = pc['tpr_at_1fpr'] - base['tpr_at_1fpr']
lines.append(f"{d['label']:<40s} {pc['overall_auroc']:>8.4f} ({delta_overall:+.4f}) {pc['macro_auroc']:>8.4f} ({delta_macro:+.4f}) {pc['tpr_at_1fpr']:>6.4f} ({delta_tpr:+.4f}) {pc['fpr_at_95tpr']:>12.4f}")
if hard_classes:
lines.append('')
lines.append(f"--- focus classes: {', '.join(hard_classes)} ---")
for c in hard_classes:
row = f'{c:<18s}'
for d in data:
pc = d['pc']
match = next((r for r in pc['per_class'] if r['class'] == c), None)
if match is None:
row += f" {d['tag']}:n/a"
else:
row += f" {d['tag']}:{match['auroc']:.3f}(tpr={match['tpr_at_1fpr']:.3f})"
lines.append(row)
lines.append('')
lines.append('--- all classes (sorted by E0 AUROC ascending) ---')
base_pc = data[0]['pc']['per_class']
ordered = sorted(base_pc, key=lambda r: r['auroc'])
hdr2 = f"{'class':<22s} {'N':>8s}" + ''.join((f" {d['tag']:>14s}" for d in data))
lines.append(hdr2)
for row_b in ordered:
cls = row_b['class']
row = f"{cls:<22s} {row_b['n']:>8d}"
for d in data:
pc = d['pc']
match = next((r for r in pc['per_class'] if r['class'] == cls), None)
row += f" {match['auroc']:>14.4f}" if match else f" {'':>14s}"
lines.append(row)
return lines
def main():
    """Build the cross-experiment summary for one experiment output root.

    Loads ``per_class.json`` for each known experiment tag under ``--root``
    (and, with ``--with-iscxtor``, under ``<tag>/iscxtor_eval``), renders one
    comparison block per target dataset, prints the result, and writes
    ``summary.txt`` and ``summary.json`` into ``--root``.

    Raises:
        SystemExit: With status 1 when no results could be loaded, so that a
            calling pipeline sees the stage as failed instead of succeeding
            without producing a summary.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--root', type=Path, required=True)
    ap.add_argument('--with-iscxtor', action='store_true', help='Also load iscxtor_eval/per_class.json under each tag and render a second comparison block for the CICIDS2017 → ISCXTor2016 transfer target.')
    args = ap.parse_args()

    # (display label, run sub-directory) in presentation order; the first run
    # that loads successfully becomes the baseline for deltas.
    runs = [
        ('E0 baseline (mixed_dequant)', 'e0_baseline'),
        ('E1 relative_v2 (channel rehab)', 'e1_relv2'),
        ('E2 relative_v2 + 8-d context', 'e2_relv2_ctx'),
    ]

    def collect(subdir: str | None = None) -> list[dict]:
        # Load metrics for every run that has a per_class.json; skip the rest.
        found: list[dict] = []
        for label, tag in runs:
            run_dir = args.root / tag if subdir is None else args.root / tag / subdir
            pc = _load_pc(run_dir)
            if pc is not None:
                found.append({'label': label, 'tag': tag, 'pc': pc})
        return found

    ddos_data = collect()
    iscx_data = collect('iscxtor_eval') if args.with_iscxtor else []

    if not ddos_data and not iscx_data:
        print('[err] no results found under', args.root)
        # Exit non-zero: this is an error path, and returning normally would
        # make the process report success.
        raise SystemExit(1)

    lines: list[str] = []
    if ddos_data:
        lines.extend(_render_block('CICIDS2017 → CICDDoS2019 (target=DDoS attacks; benign=normal)', ddos_data, HARD_CLASSES))
    if iscx_data:
        lines.extend(_render_block('CICIDS2017 → ISCXTor2016 (target=Tor flows; benign=nontor)', iscx_data, ()))
    txt = '\n'.join(lines)
    print(txt)

    summary_txt = args.root / 'summary.txt'
    summary_json = args.root / 'summary.json'
    # The block titles contain '→', so do not depend on the locale's default
    # encoding when writing the text summary.
    summary_txt.write_text(txt + '\n', encoding='utf-8')
    combined = {'cicddos2019': ddos_data, 'iscxtor2016': iscx_data}
    summary_json.write_text(json.dumps(combined, indent=2), encoding='utf-8')
    print(f"\n[saved] {summary_txt}")
    print(f"[saved] {summary_json}")


if __name__ == '__main__':
    main()