# File-viewer metadata (not part of the program): 57 lines, 3.2 KiB, Python.
from __future__ import annotations
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
from extract_lib import extract_labeled_pcaps
|
|
# Default input/output locations used by the command-line options in main()
# when the corresponding flag is omitted. The paths are relative, so they
# resolve against the current working directory.
DEFAULT_PCAP_ROOT: Path = Path('datasets/ciciot2023/raw/pcap')
DEFAULT_OUT_PACKETS: Path = Path('datasets/ciciot2023/processed/packets.npz')
DEFAULT_OUT_FLOWS: Path = Path('datasets/ciciot2023/processed/flows.parquet')

# The class folder with this exact name is relabelled to BENIGN_LABEL; every
# other class folder keeps its own (lower-cased) name as its label.
BENIGN_FOLDER: str = 'Benign_Final'
BENIGN_LABEL: str = 'normal'
|
|
|
|
def _label_for_folder(folder: str) -> str:
    """Return the label to record for pcaps found in class folder *folder*.

    The folder whose name equals ``BENIGN_FOLDER`` (exact, case-sensitive
    match) is mapped to ``BENIGN_LABEL``. Any other folder name is returned
    lower-cased.
    """
    return BENIGN_LABEL if folder == BENIGN_FOLDER else folder.lower()
|
|
|
|
def _find_pcap_files(pcap_root: Path, *, max_pcaps_per_class: int | None) -> list[tuple[Path, str, dict]]:
    """Collect ``(pcap_path, label, extras)`` triples from class sub-folders.

    Every immediate sub-directory of *pcap_root* is treated as one class
    folder. Its name is turned into a label by ``_label_for_folder`` and the
    folder is searched recursively for ``*.pcap`` and ``*.pcapng`` files.

    Args:
        pcap_root: Directory whose immediate sub-directories are class folders.
        max_pcaps_per_class: Maximum number of capture files kept per class
            folder, or ``None`` to keep all of them.

    Returns:
        One triple per selected capture file. Class folders are visited in
        sorted order; within a folder the sorted ``.pcap`` files come before
        the sorted ``.pcapng`` files. ``extras`` is a fresh dict of the form
        ``{'class_folder': <folder name>}`` for each triple.

    Raises:
        ValueError: If *max_pcaps_per_class* is negative.
    """
    # A negative cap would make the slice below count from the END of the list
    # and silently drop the last file(s) of every class, so reject it up front.
    if max_pcaps_per_class is not None and max_pcaps_per_class < 0:
        raise ValueError(
            f'max_pcaps_per_class must be >= 0 or None, got {max_pcaps_per_class}'
        )

    triples: list[tuple[Path, str, dict]] = []
    class_dirs = sorted(p for p in pcap_root.iterdir() if p.is_dir())
    for class_dir in class_dirs:
        folder = class_dir.name
        label = _label_for_folder(folder)
        pcaps = sorted(class_dir.rglob('*.pcap')) + sorted(class_dir.rglob('*.pcapng'))
        if max_pcaps_per_class is not None:
            # Slicing is a no-op when the list is already within the cap.
            pcaps = pcaps[:max_pcaps_per_class]
        # Build a separate extras dict per file so entries never share state.
        triples.extend((p, label, {'class_folder': folder}) for p in pcaps)
    return triples
|
|
|
|
def main() -> None:
    """Parse command-line options, discover labelled pcaps, and extract them.

    Looks up capture files under ``--pcap-root`` with ``_find_pcap_files``,
    prints how many files were found per label, and then passes the file list
    and the parsed options to ``extract_labeled_pcaps``.

    Raises:
        RuntimeError: If no pcap files are found under ``--pcap-root``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--pcap-root', type=Path, default=DEFAULT_PCAP_ROOT)
    parser.add_argument('--out-packets', type=Path, default=DEFAULT_OUT_PACKETS)
    parser.add_argument('--out-flows', type=Path, default=DEFAULT_OUT_FLOWS)
    parser.add_argument(
        '--out-store',
        type=Path,
        default=None,
        help='Sharded PacketShardStore output. Recommended for CICIoT2023 since the raw set is large.',
    )
    parser.add_argument('--shard-size', type=int, default=100000)
    parser.add_argument('--worker-flush-size', type=int, default=10000)
    parser.add_argument('--spool-dir', type=Path, default=None)
    parser.add_argument('--T-full', type=int, default=256)
    parser.add_argument('--idle-timeout', type=float, default=120.0)
    parser.add_argument('--jobs', type=int, default=0)
    parser.add_argument(
        '--max-pcaps-per-class',
        type=int,
        default=1,
        help='Cap pcap files per class folder. Default 1 (single pcap per class) keeps extraction tractable.',
    )
    parser.add_argument(
        '--max-packets-per-pcap',
        type=int,
        default=2000000,
        help='Cap packets per pcap to bound RAM/IO. Default 2M.',
    )
    args = parser.parse_args()

    triples = _find_pcap_files(args.pcap_root, max_pcaps_per_class=args.max_pcaps_per_class)
    if not triples:
        raise RuntimeError(f'No pcap files found under {args.pcap_root}')

    print(f'[discover] {len(triples)} pcap files across {len(set((t[1] for t in triples)))} labels')

    # Tally how many capture files were discovered for each label.
    by_label: dict[str, int] = {}
    for _path, label, _extras in triples:
        by_label.setdefault(label, 0)
        by_label[label] += 1
    for lbl, n in sorted(by_label.items()):
        print(f' {lbl:<28s} {n} pcap(s)')

    extract_labeled_pcaps(
        pcap_files_with_labels=triples,
        out_packets=args.out_packets,
        out_flows=args.out_flows,
        out_store=args.out_store,
        shard_size=args.shard_size,
        worker_flush_size=args.worker_flush_size,
        spool_dir=args.spool_dir,
        T_full=args.T_full,
        idle_timeout=args.idle_timeout,
        max_packets_per_pcap=args.max_packets_per_pcap,
        n_jobs=args.jobs,
        # Each triple's extras dict carries exactly this key.
        extra_column_names=('class_folder',),
    )
|
|
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|