from __future__ import annotations import argparse import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent)) from extract_lib import extract_labeled_pcaps DEFAULT_PCAP_ROOT = Path('datasets/ciciot2023/raw/pcap') DEFAULT_OUT_PACKETS = Path('datasets/ciciot2023/processed/packets.npz') DEFAULT_OUT_FLOWS = Path('datasets/ciciot2023/processed/flows.parquet') BENIGN_FOLDER = 'Benign_Final' BENIGN_LABEL = 'normal' def _label_for_folder(folder: str) -> str: if folder == BENIGN_FOLDER: return BENIGN_LABEL return folder.lower() def _find_pcap_files(pcap_root: Path, *, max_pcaps_per_class: int | None) -> list[tuple[Path, str, dict]]: triples: list[tuple[Path, str, dict]] = [] for class_dir in sorted((p for p in pcap_root.iterdir() if p.is_dir())): folder = class_dir.name label = _label_for_folder(folder) pcaps = sorted(class_dir.rglob('*.pcap')) + sorted(class_dir.rglob('*.pcapng')) if max_pcaps_per_class is not None and len(pcaps) > max_pcaps_per_class: pcaps = pcaps[:max_pcaps_per_class] for p in pcaps: triples.append((p, label, {'class_folder': folder})) return triples def main() -> None: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument('--pcap-root', type=Path, default=DEFAULT_PCAP_ROOT) ap.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS) ap.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS) ap.add_argument('--out-store', type=Path, default=None, help='Sharded PacketShardStore output. Recommended for CICIoT2023 since the raw set is large.') ap.add_argument('--shard-size', type=int, default=100000) ap.add_argument('--worker-flush-size', type=int, default=10000) ap.add_argument('--spool-dir', type=Path, default=None) ap.add_argument('--T-full', type=int, default=256) ap.add_argument('--idle-timeout', type=float, default=120.0) ap.add_argument('--jobs', type=int, default=0) ap.add_argument('--max-pcaps-per-class', type=int, default=1, help='Cap pcap files per class folder. Default 1 (single pcap per class) keeps extraction tractable.') ap.add_argument('--max-packets-per-pcap', type=int, default=2000000, help='Cap packets per pcap to bound RAM/IO. Default 2M.') args = ap.parse_args() triples = _find_pcap_files(args.pcap_root, max_pcaps_per_class=args.max_pcaps_per_class) if not triples: raise RuntimeError(f'No pcap files found under {args.pcap_root}') print(f'[discover] {len(triples)} pcap files across {len(set((t[1] for t in triples)))} labels') by_label: dict[str, int] = {} for (_, lbl, _) in triples: by_label[lbl] = by_label.get(lbl, 0) + 1 for (lbl, n) in sorted(by_label.items()): print(f' {lbl:<28s} {n} pcap(s)') extract_labeled_pcaps(pcap_files_with_labels=triples, out_packets=args.out_packets, out_flows=args.out_flows, out_store=args.out_store, shard_size=args.shard_size, worker_flush_size=args.worker_flush_size, spool_dir=args.spool_dir, T_full=args.T_full, idle_timeout=args.idle_timeout, max_packets_per_pcap=args.max_packets_per_pcap, n_jobs=args.jobs, extra_column_names=('class_folder',)) if __name__ == '__main__': main()