Initial commit: code, paper, small artifacts

This commit is contained in:
2026-05-07 20:47:30 +08:00
commit fae2db8cff
322 changed files with 33159 additions and 0 deletions

112
scripts/download/README.md Normal file
View File

@@ -0,0 +1,112 @@
# Dataset download scripts
Target layout (mirrors `datasets/cicids2017/`):
```
datasets/
ciciot2023/raw/{pcap,csv}
iscxtor2016/raw/{pcap,csv}
cicapt_iiot2024/raw/{pcap,csv}
ustc_tfc2016/raw/pcap
datacon2020/raw/pcap
```
## CICIoT2023 / ISCXTor2016 (automated)
UNB/CIC gates downloads behind a consent form. After submission the site issues
a `Token` cookie (domain `.cicresearch.ca`) that unlocks two endpoints:
- `browse.php?p=<path>` — HTML directory listing
- `download.php?file=<path>` — raw file bytes
`cic_download.py` is a stdlib-only recursive crawler that walks `browse.php`
and fetches each leaf via `download.php`. Files that already exist locally
and are non-empty are skipped (presence-based; the PHP endpoint does not
advertise sizes), and an interrupted download resumes from its `.part` file.
### Workflow
1. Open the dataset page in a browser, fill and submit the form:
- CICIoT2023 : <https://www.unb.ca/cic/datasets/iotdataset-2023.html>
- ISCXTor2016: <https://www.unb.ca/cic/datasets/tor.html>
2. After submit, click through to `cicresearch.ca/.../browse.php`. The page
must load successfully in your browser — this proves the Token is set.
3. Export the cookie in **Netscape format** (tab-separated). One line is
sufficient:
```
# Netscape HTTP Cookie File
.cicresearch.ca TRUE / TRUE <expiry> Token <value>
```
Save as:
- `scripts/download/cookies_ciciot2023.txt`
- `scripts/download/cookies_iscxtor2016.txt`
Tokens are per-dataset — a CICIoT2023 cookie will not work for ISCXTor.
4. Run:
```bash
bash scripts/download/download_ciciot2023.sh
bash scripts/download/download_iscxtor2016.sh
```
Env vars: `WHAT=pcap|csv|both`, `DEST=`, `COOKIES=`, `DRY_RUN=1`, `LIMIT=N`,
`SKIP_EXT=zip,7z`.
For ISCXTor, if the remote subdir names differ from the defaults
(`PCAPs` / `CSVs`), set `PCAP_ROOT=` / `CSV_ROOT=`.
### Known remote tree sizes
- **CICIoT2023** — `CSV/` 328 files (includes `CSV.zip`, `MERGED_CSV.zip`,
`MERGED_CSV/`, and per-attack CSVs), `PCAP/` 311 files across 36 attack
categories. Full dataset is ~12 GB.
### Quick commands
```bash
# Dry-run (enumerate only, no downloads)
DRY_RUN=1 bash scripts/download/download_ciciot2023.sh
# Download first 5 files as a smoke test
LIMIT=5 WHAT=csv bash scripts/download/download_ciciot2023.sh
# Full download
bash scripts/download/download_ciciot2023.sh
```
## CICAPT-IIoT2024 (automated)
Same UNB/CIC pipeline as CICIoT2023, but crawled in a single pass — the
entire `CICAPT-IIoT Dataset/` top-level folder is mirrored (pcap, csv, and
anything else) under `datasets/cicapt_iiot2024/raw/`.
Cookie file: `scripts/download/cookies_cicapt_iiot2024.txt` (Token for
`.cicresearch.ca`).
```bash
# Smoke test first
DRY_RUN=1 LIMIT=5 bash scripts/download/download_cicapt_iiot2024.sh
# Full download
bash scripts/download/download_cicapt_iiot2024.sh
# Skip heavy archives if they duplicate a per-file tree
SKIP_EXT=zip,7z bash scripts/download/download_cicapt_iiot2024.sh
```
Reference URL (browser, with Token cookie live):
<https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset>
## USTC-TFC2016 (manual)
```bash
cd datasets/ustc_tfc2016/raw/pcap
git clone --depth=1 https://github.com/yungshenglu/USTC-TFC2016.git .
```
No official CSV — extract features yourself (CICFlowMeter, USTC-TK2016).
## DataCon2020 (manual)
Register at <https://datacon.qianxin.com/opendata/maliciousstream> and place
the `black/` `white/` `test/` pcap bundles under
`datasets/datacon2020/raw/pcap/`. No official CSV.

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Background wrapper: retry CICIoT2023 PCAP download until it reports
# a clean "Done." with n_files > 0. Each attempt is delimited in the log
# so the monitor can grep for progress.
#
# Invoked detached (nohup ... &). The inner script is resumable via
# the .part-file convention in cic_download.py.
#
# `-e` is deliberately absent: a failed attempt must fall through to the
# retry logic below instead of terminating the wrapper.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
LOG="${REPO_ROOT}/logs/ciciot2023_pcap.log"

# Without `-e`, a missing log directory would make every `>>"$LOG"` redirect
# fail and the loop would spin forever without ever running the download.
if ! mkdir -p -- "$(dirname -- "${LOG}")"; then
  printf 'ERROR: cannot create log directory for %s\n' "${LOG}" >&2
  exit 1
fi

# nohup strips the interactive PATH; re-expose the project venv so
# `python` resolves inside download_ciciot2023.sh.
if [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
  export PATH="${REPO_ROOT}/.venv/bin:${PATH:-/usr/local/bin:/usr/bin:/bin}"
fi

# Route through the local proxy; detached bash does not inherit the
# interactive shell's proxy env, and cicresearch.ca's WAF rate-limits
# bare-IP traffic much more aggressively than the proxy exit.
export HTTP_PROXY="http://127.0.0.1:7093"
export HTTPS_PROXY="http://127.0.0.1:7093"
export ALL_PROXY="socks5h://127.0.0.1:7093"
export NO_PROXY="localhost,127.0.0.1,::1"
export http_proxy="${HTTP_PROXY}"
export https_proxy="${HTTPS_PROXY}"
export all_proxy="${ALL_PROXY}"
export no_proxy="${NO_PROXY}"

i=0
while :; do
  i=$((i + 1))
  ts=$(date +%F\ %T)
  printf '\n=== attempt %d %s ===\n' "$i" "$ts" >>"$LOG"

  # Skip bundle zips (e.g. PCAP.zip) — we want per-attack-class .pcap files,
  # not the whole dataset as one archive.
  WHAT=pcap SKIP_EXT="zip,7z" bash "${SCRIPT_DIR}/download_ciciot2023.sh" >>"$LOG" 2>&1
  rc=$?

  # If inner script exited with 0 AND last "Done." line reports >0 files,
  # we consider the listing+walk to have succeeded at least once. Otherwise
  # keep retrying on network/SSL failures.
  # `|| true`: under pipefail a no-match grep would otherwise fail the pipeline.
  last_done=$(grep -E '^Done\. [0-9]+ files processed' "$LOG" | tail -1 || true)
  # Second whitespace-separated field of "Done. <n> files processed ..." is <n>.
  n=$(printf '%s' "$last_done" | awk '{print $2}')
  if [[ "$rc" -eq 0 && -n "$n" && "$n" -gt 0 ]]; then
    printf '=== loop finished clean %s (files=%s) ===\n' "$(date +%F\ %T)" "$n" >>"$LOG"
    break
  fi

  printf '=== attempt %d ended rc=%s last_done=%q; sleep 60 ===\n' \
    "$i" "$rc" "$last_done" >>"$LOG"
  sleep 60
done

View File

@@ -0,0 +1,185 @@
from __future__ import annotations
import argparse
import http.cookiejar
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
# User-Agent header value sent with every request this script issues.
UA = 'Mozilla/5.0 (cic-downloader)'
# Captures the href target of links that point at either
# browse.php?p=<...> or download.php?file=<...>.
LINK_RE = re.compile('href="(browse\\.php\\?p=[^"]+|download\\.php\\?file=[^"]+)"')
def build_opener(cookies_path: Path) -> urllib.request.OpenerDirector:
    """Create a URL opener that sends the cookies stored in a Netscape-format file.

    The file is loaded with ``ignore_discard`` and ``ignore_expires`` enabled,
    so session-only and already-expired entries are kept in the jar too.
    """
    cookie_jar = http.cookiejar.MozillaCookieJar()
    cookie_jar.load(
        str(cookies_path),
        ignore_discard=True,
        ignore_expires=True,
    )
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    return urllib.request.build_opener(cookie_handler)
def http_get(opener, url: str, timeout: int=60, retries: int=5) -> bytes:
    """GET ``url`` through ``opener`` and return the response body.

    A failed attempt is logged to stderr and followed by a sleep of
    ``min(30, 2 ** attempt)`` seconds, up to ``retries`` attempts in total.

    Raises:
        RuntimeError: immediately (no retry) when the final URL contains
            ``unb.ca/cic/datasets``, or once every attempt has failed. Any
            other RuntimeError raised while fetching also propagates as-is.
    """
    request_headers = {'User-Agent': UA}
    most_recent_error: Exception | None = None
    attempt = 0
    while attempt < retries:
        try:
            request = urllib.request.Request(url, headers=request_headers)
            with opener.open(request, timeout=timeout) as response:
                landed_on = response.geturl()
                if 'unb.ca/cic/datasets' not in landed_on:
                    return response.read()
                raise RuntimeError(f'Got redirected to UNB form page ({landed_on}). Token cookie is missing/expired or wrong dataset scope.')
        except RuntimeError:
            # Retrying cannot fix this class of failure; surface it to the caller.
            raise
        except Exception as exc:
            most_recent_error = exc
            delay = min(30, 2 ** attempt)
            print(f' WARN GET {url} failed ({exc!r}); retry in {delay}s ({attempt + 1}/{retries})', file=sys.stderr)
            time.sleep(delay)
        attempt += 1
    raise RuntimeError(f'GET {url} failed after {retries} attempts: {most_recent_error!r}')
def list_dir(opener, base: str, p: str) -> list[tuple[str, str]]:
    """List one remote directory through ``browse.php``.

    Args:
        opener: object passed through to ``http_get`` to perform the request.
        base: dataset base URL that ``browse.php`` is resolved against.
        p: remote directory path; sent as the ``p`` query parameter.

    Returns:
        ``(kind, path)`` tuples in page order, where ``kind`` is ``'dir'`` for
        ``browse.php`` links and ``'file'`` for ``download.php`` links.

    Raises:
        Whatever ``http_get`` raises when the listing cannot be fetched.
    """
    # Function-scope import: the local name `html` is not used here so the
    # module can be imported without touching the file's import block.
    from html import unescape

    url = urllib.parse.urljoin(base, 'browse.php') + '?p=' + urllib.parse.quote(p, safe='/')
    page = http_get(opener, url).decode('utf-8', 'replace')
    out: list[tuple[str, str]] = []
    for m in LINK_RE.finditer(page):
        # Attribute values are HTML-escaped in the page source (e.g. "&amp;");
        # decode them before treating the href as a URL.
        href = unescape(m.group(1))
        qs = urllib.parse.parse_qs(urllib.parse.urlparse(href).query)
        if href.startswith('browse.php'):
            kind, key = 'dir', 'p'
        else:
            kind, key = 'file', 'file'
        values = qs.get(key)
        if not values:
            # parse_qs drops blank values; one malformed link must not discard
            # the rest of the directory listing.
            continue
        out.append((kind, values[0]))
    return out
def walk(opener, base: str, root: str):
    """Yield the remote path of every file reachable from ``root``.

    Directories are explored depth-first from an explicit stack and each
    directory path is listed at most once. A directory whose listing raises is
    reported on stderr and skipped; the traversal continues with the rest.
    """
    pending = [root]
    visited: set[str] = set()
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        try:
            listing = list_dir(opener, base, current)
        except Exception as exc:
            print(f' WARN list_dir({current}) failed permanently: {exc!r}', file=sys.stderr)
        else:
            ordered = sorted(listing)
            # Sub-directories are queued in sorted order, files are emitted in
            # sorted order; the stack is only consulted once this listing is done.
            pending.extend(path for kind, path in ordered if kind == 'dir')
            yield from (path for kind, path in ordered if kind != 'dir')
def download_file(opener, base: str, remote: str, dest_root: Path, *, root_prefix: str) -> None:
    """Fetch one remote file via ``download.php`` into ``dest_root``.

    The local path is ``remote`` with ``root_prefix`` removed (only when the
    prefix ends on a path-component boundary), placed under ``dest_root``.
    Data is written to ``<name>.part`` first and renamed once the body has been
    read completely. An existing ``.part`` file is resumed with a ``Range``
    request; if the server does not answer 206 the file is restarted from zero.

    Args:
        opener: opener used to issue the request.
        base: dataset base URL that ``download.php`` is resolved against.
        remote: remote file path; sent as the ``file`` query parameter.
        dest_root: local directory the file is mirrored into.
        root_prefix: leading remote path to strip from ``remote``.

    Raises:
        RuntimeError: if the remote path would resolve outside ``dest_root``,
            if the response was redirected to the UNB form page, or after
            5 failed attempts.
    """
    url = urllib.parse.urljoin(base, 'download.php') + '?file=' + urllib.parse.quote(remote, safe='')

    # Strip the prefix only on a component boundary: with prefix "CSV", a bare
    # startswith() would also match "CSVs/a.csv" and mangle it into "s/a.csv".
    prefix = root_prefix.rstrip('/')
    if prefix and remote.startswith(prefix + '/'):
        rel = remote[len(prefix):]
    else:
        rel = remote
    rel = rel.lstrip('/')

    # The remote path comes from server-supplied HTML; never let it climb out
    # of dest_root (".." components) or collapse to dest_root itself.
    if not rel or '..' in rel.split('/'):
        raise RuntimeError(f'refusing unsafe remote path: {remote!r}')

    local = dest_root / rel
    local.parent.mkdir(parents=True, exist_ok=True)
    if local.exists() and local.stat().st_size > 0:
        print(f' SKIP {rel} ({local.stat().st_size} bytes, already present)')
        return
    tmp = local.with_suffix(local.suffix + '.part')
    last: Exception | None = None
    for attempt in range(5):
        # Re-evaluated on every attempt: a failed attempt may have grown the .part file.
        resume_from = tmp.stat().st_size if tmp.exists() else 0
        try:
            headers = {'User-Agent': UA}
            if resume_from > 0:
                headers['Range'] = f'bytes={resume_from}-'
            req = urllib.request.Request(url, headers=headers)
            t0 = time.monotonic()
            bytes_read = 0
            with opener.open(req, timeout=1800) as resp:
                final = resp.geturl()
                if 'unb.ca/cic/datasets' in final:
                    raise RuntimeError('Token cookie invalid mid-download.')
                status = getattr(resp, 'status', None)
                mode = 'ab'
                if resume_from <= 0:
                    mode = 'wb'
                elif status != 206:
                    # Server sent the whole body instead of the requested range.
                    print(f' INFO {rel} resume request ignored (status={status}); restarting from zero')
                    resume_from = 0
                    mode = 'wb'
                with open(tmp, mode) as fh:
                    while True:
                        buf = resp.read(1 << 20)
                        if not buf:
                            break
                        fh.write(buf)
                        bytes_read += len(buf)
            tmp.replace(local)
            dt = time.monotonic() - t0
            total_bytes = local.stat().st_size
            mb = total_bytes / (1 << 20)
            delta_mb = bytes_read / (1 << 20)
            rate = mb / dt if dt > 0 else 0
            if resume_from > 0:
                resumed_mb = resume_from / (1 << 20)
                # Throughput of this attempt only, not of the whole file.
                rate = delta_mb / dt if dt > 0 else 0
                print(f' GOT {rel} {mb:.1f} MB +{delta_mb:.1f} MB from {resumed_mb:.1f} MB {rate:.1f} MB/s')
            else:
                print(f' GOT {rel} {mb:.1f} MB {rate:.1f} MB/s')
            return
        except urllib.error.HTTPError as e:
            last = e
            if e.code == 416 and resume_from > 0:
                # The requested range was rejected: drop the partial file and start over.
                print(f' WARN {rel} resume rejected with 416; restarting from zero', file=sys.stderr)
                try:
                    tmp.unlink(missing_ok=True)
                except OSError:
                    pass
                time.sleep(1)
                continue
            wait = min(30, 2 ** attempt)
            print(f' WARN {rel} failed ({e!r}); retry in {wait}s ({attempt + 1}/5)', file=sys.stderr)
            time.sleep(wait)
        except RuntimeError:
            # Token/redirect problems are not transient; do not retry.
            raise
        except Exception as e:
            last = e
            wait = min(30, 2 ** attempt)
            print(f' WARN {rel} failed ({e!r}); retry in {wait}s ({attempt + 1}/5)', file=sys.stderr)
            time.sleep(wait)
    raise RuntimeError(f'download failed after 5 attempts: {last!r}') from last
def main() -> int:
    """Command-line entry point: crawl ``--root`` under ``--base`` and mirror it into ``--dest``.

    Returns:
        0 when every processed file succeeded (or on ``--dry-run``),
        1 when at least one file failed to download,
        2 when the cookies file is missing or cannot be loaded.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--cookies', required=True, type=Path)
    ap.add_argument('--base', required=True, help='dataset URL ending with /, e.g. https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/')
    ap.add_argument('--root', required=True, help='sub-path to crawl (e.g. PCAP or CSV)')
    ap.add_argument('--dest', required=True, type=Path, help='local directory to mirror into')
    ap.add_argument('--dry-run', action='store_true', help='enumerate only; do not download')
    ap.add_argument('--limit', type=int, default=0, help='stop after N files (0 = no limit)')
    ap.add_argument('--skip-ext', default='', help="comma-separated file extensions to skip (e.g. 'zip,7z'); case-insensitive, no dots")
    args = ap.parse_args()
    skip_exts = {e.strip().lower().lstrip('.') for e in args.skip_ext.split(',') if e.strip()}
    if not args.cookies.is_file():
        print(f'ERROR: cookies file not found: {args.cookies}', file=sys.stderr)
        return 2
    try:
        opener = build_opener(args.cookies)
    except OSError as e:
        # Covers unreadable files and malformed cookie files (LoadError is an OSError).
        print(f'ERROR: cannot load cookies file {args.cookies}: {e}', file=sys.stderr)
        return 2
    args.dest.mkdir(parents=True, exist_ok=True)
    print(f'Base : {args.base}')
    print(f'Root : {args.root}')
    print(f'Dest : {args.dest}')
    print(f'Walking tree...')
    n_files = 0
    n_skipped = 0
    n_failed = 0
    root_prefix = args.root.rstrip('/')
    for remote in walk(opener, args.base, args.root):
        # Take the extension from the file name only, so a dot in a directory
        # name is never mistaken for an extension.
        name = remote.rsplit('/', 1)[-1]
        ext = name.rsplit('.', 1)[-1].lower() if '.' in name else ''
        if ext in skip_exts:
            n_skipped += 1
            print(f" SKIP {remote} (extension '.{ext}' excluded)")
            continue
        n_files += 1
        if args.dry_run:
            print(f' FILE {remote}')
        else:
            try:
                download_file(opener, args.base, remote, args.dest, root_prefix=root_prefix)
            except Exception as e:
                # Best effort: keep going with the remaining files, but remember
                # the failure so the exit status reflects it.
                n_failed += 1
                print(f' FAIL {remote}: {e}', file=sys.stderr)
        if args.limit and n_files >= args.limit:
            print(f'-- stopped after {args.limit} (--limit) --')
            break
    print(f'Done. {n_files} files processed, {n_skipped} skipped by --skip-ext.')
    if n_failed:
        # A zero exit status here would make callers treat a partial mirror as complete.
        print(f'ERROR: {n_failed} of {n_files} files failed to download.', file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1777047525 Token ef8ooumh5qdh42r0k410mjoq0c

View File

@@ -0,0 +1,4 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
.cicresearch.ca TRUE / TRUE 1776910223 Token 8kfh51fj8u46lum8kvu6safonr

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1777518468 Token qn181atofvua6sn8ouv1hlcoo8

View File

@@ -0,0 +1,5 @@
# Netscape HTTP Cookie File
# https://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.cicresearch.ca TRUE / TRUE 1776990463 Token t4sfffhk5mnttgkh300buhg0it

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Download CICAPT-IIoT2024 (entire dataset tree) from UNB CIC via cic_download.py.
#
# Prereq: Token cookie for .cicresearch.ca saved as
# scripts/download/cookies_cicapt_iiot2024.txt
#
# Remote tree is crawled in a single pass under ROOT="CICAPT-IIoT Dataset"
# (the top-level folder at
# https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/browse.php?p=CICAPT-IIoT+Dataset ).
# Every leaf file — pcap, csv, whatever — is mirrored under
# datasets/cicapt_iiot2024/raw/
# preserving the remote subdirectory layout.
#
# Usage:
# bash download_cicapt_iiot2024.sh # full download
# DRY_RUN=1 bash download_cicapt_iiot2024.sh # enumerate only
# LIMIT=5 bash download_cicapt_iiot2024.sh # smoke test (first 5 files)
# SKIP_EXT=zip,7z bash download_cicapt_iiot2024.sh # skip archives
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicapt_iiot2024/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicapt_iiot2024.txt}"
BASE="${BASE:-https://cicresearch.ca/IOTDataset/CICAPT-IIoT-Dataset/}"
ROOT="${ROOT:-CICAPT-IIoT Dataset}"

# Optional flags forwarded to cic_download.py.
EXTRA=()
[[ "${DRY_RUN:-}" == "1" ]] && EXTRA+=(--dry-run)
[[ -n "${LIMIT:-}" ]] && EXTRA+=(--limit "${LIMIT}")
[[ -n "${SKIP_EXT:-}" ]] && EXTRA+=(--skip-ext "${SKIP_EXT}")

# Interpreter selection: an explicit PYTHON wins, then the project venv,
# then python3 / python from PATH.
if [[ -n "${PYTHON:-}" ]]; then
  PYTHON_BIN="${PYTHON}"
elif [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
  PYTHON_BIN="${REPO_ROOT}/.venv/bin/python"
elif command -v python3 >/dev/null 2>&1; then
  PYTHON_BIN="python3"
elif command -v python >/dev/null 2>&1; then
  PYTHON_BIN="python"
else
  echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
  exit 127
fi

echo "=== ${ROOT} -> ${DEST_ROOT} ==="
# ${EXTRA[@]+...}: expanding an empty array under `set -u` is an
# "unbound variable" error on bash older than 4.4.
"${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
  --cookies "${COOKIES}" --base "${BASE}" \
  --root "${ROOT}" --dest "${DEST_ROOT}" ${EXTRA[@]+"${EXTRA[@]}"}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Download CICDDoS2019 (CSV, optionally PCAP) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
# https://www.unb.ca/cic/datasets/ddos-2019.html
# in a browser, then save the issued Token cookie (Netscape format) as
# scripts/download/cookies_cicddos2019.txt
# Tokens are scoped per-dataset — the CICIoT2023 / ISCXTor cookies will NOT
# work here.
#
# PCAPs for this dataset are already downloaded (see datasets/cicddos2019/raw/
# pcap/). Default WHAT=csv reflects that. Switch to WHAT=pcap or WHAT=both if
# you need to re-fetch.
#
# Usage:
# bash download_cicddos2019.sh # CSVs only (default)
# WHAT=pcap bash download_cicddos2019.sh # PCAPs only
# WHAT=both bash download_cicddos2019.sh # everything
# DRY_RUN=1 bash download_cicddos2019.sh # enumerate without downloading
# CSV_ROOT=CSV bash download_cicddos2019.sh # override root if server uses a different name
#
# First-time tip: run with DRY_RUN=1 to discover the exact remote root names.
# The CIC site is inconsistent across datasets (CSV / CSVs / CSV-01-12 ...).
set -euo pipefail

# Absolute directory of this script, and the repository root two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

BASE="https://cicresearch.ca/CICDataset/CICDDoS2019/"

# Environment overrides, each falling back to its default when unset or empty.
: "${WHAT:=csv}"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/cicddos2019/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_cicddos2019.txt}"
# Default root names. Override via env if dry-run shows a different layout.
: "${PCAP_ROOT:=PCAPs}"
: "${CSV_ROOT:=CSVs}"

# Optional flags forwarded to cic_download.py.
EXTRA=()
if [[ "${DRY_RUN:-}" == "1" ]]; then
  EXTRA+=(--dry-run)
fi
if [[ -n "${LIMIT:-}" ]]; then
  EXTRA+=(--limit "${LIMIT}")
fi
if [[ -n "${SKIP_EXT:-}" ]]; then
  EXTRA+=(--skip-ext "${SKIP_EXT}")
fi
#######################################
# Mirror one remote sub-tree with cic_download.py.
# Globals:   PYTHON, REPO_ROOT, SCRIPT_DIR, COOKIES, BASE, EXTRA (all read)
# Arguments: $1 - remote root to crawl
#            $2 - local destination directory
# Returns:   exit status of the interpreter; 127 if no interpreter is found
#######################################
run() {
  local root="$1" dest="$2"
  local py
  # Interpreter selection: an explicit PYTHON wins, then the project venv,
  # then python / python3 from PATH (bare `python` is absent on many hosts).
  if [[ -n "${PYTHON:-}" ]]; then
    py="${PYTHON}"
  elif [[ -n "${REPO_ROOT:-}" && -x "${REPO_ROOT}/.venv/bin/python" ]]; then
    py="${REPO_ROOT}/.venv/bin/python"
  elif command -v python >/dev/null 2>&1; then
    py="python"
  elif command -v python3 >/dev/null 2>&1; then
    py="python3"
  else
    echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
    return 127
  fi
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: expanding an empty array under `set -u` is an
  # "unbound variable" error on bash older than 4.4.
  "${py}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
# Reject an unknown selector first, then run the requested roots
# (PCAP before CSV when both are requested).
case "${WHAT}" in
  pcap | csv | both) ;;
  *)
    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2
    exit 1
    ;;
esac
if [[ "${WHAT}" != "csv" ]]; then
  run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
fi
if [[ "${WHAT}" != "pcap" ]]; then
  run "${CSV_ROOT}" "${DEST_ROOT}/csv"
fi

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Download CICIoT2023 (PCAP + CSV) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
# https://www.unb.ca/cic/datasets/iotdataset-2023.html
# in a browser, then save the issued Token cookie in Netscape format as
# scripts/download/cookies_ciciot2023.txt
# The cookie domain must be .cicresearch.ca and the name must be "Token".
#
# Usage:
# bash download_ciciot2023.sh # both PCAP and CSV
# WHAT=pcap bash download_ciciot2023.sh # PCAP only
# WHAT=csv bash download_ciciot2023.sh # CSV only
# DRY_RUN=1 bash download_ciciot2023.sh # enumerate without downloading
set -euo pipefail

# Absolute directory of this script, and the repository root two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

BASE="https://cicresearch.ca/IOTDataset/CIC_IOT_Dataset2023/"

# Environment overrides, each falling back to its default when unset or empty.
: "${WHAT:=both}"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/ciciot2023/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_ciciot2023.txt}"

# Optional flags forwarded to cic_download.py.
EXTRA=()
if [[ "${DRY_RUN:-}" == "1" ]]; then
  EXTRA+=(--dry-run)
fi
if [[ -n "${LIMIT:-}" ]]; then
  EXTRA+=(--limit "${LIMIT}")
fi
if [[ -n "${SKIP_EXT:-}" ]]; then
  EXTRA+=(--skip-ext "${SKIP_EXT}")
fi
#######################################
# Mirror one remote sub-tree with cic_download.py.
# Globals:   PYTHON, REPO_ROOT, SCRIPT_DIR, COOKIES, BASE, EXTRA (all read)
# Arguments: $1 - remote root to crawl
#            $2 - local destination directory
# Returns:   exit status of the interpreter; 127 if no interpreter is found
#######################################
run() {
  local root="$1" dest="$2"
  local py
  # Interpreter selection: an explicit PYTHON wins, then the project venv,
  # then python / python3 from PATH (bare `python` is absent on many hosts).
  if [[ -n "${PYTHON:-}" ]]; then
    py="${PYTHON}"
  elif [[ -n "${REPO_ROOT:-}" && -x "${REPO_ROOT}/.venv/bin/python" ]]; then
    py="${REPO_ROOT}/.venv/bin/python"
  elif command -v python >/dev/null 2>&1; then
    py="python"
  elif command -v python3 >/dev/null 2>&1; then
    py="python3"
  else
    echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
    return 127
  fi
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: expanding an empty array under `set -u` is an
  # "unbound variable" error on bash older than 4.4.
  "${py}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
# Reject an unknown selector first, then run the requested roots
# (PCAP before CSV when both are requested).
case "${WHAT}" in
  pcap | csv | both) ;;
  *)
    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2
    exit 1
    ;;
esac
if [[ "${WHAT}" != "csv" ]]; then
  run PCAP "${DEST_ROOT}/pcap"
fi
if [[ "${WHAT}" != "pcap" ]]; then
  run CSV "${DEST_ROOT}/csv"
fi

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env bash
# Download ISCXTor2016 (PCAP + CSV) from UNB CIC via cic_download.py.
#
# Prereq: submit the form at
# https://www.unb.ca/cic/datasets/tor.html
# in a browser, then save the issued Token cookie (Netscape format) as
# scripts/download/cookies_iscxtor2016.txt
# Tokens are scoped per-dataset — the CICIoT2023 cookie will NOT work here.
#
# Usage:
# bash download_iscxtor2016.sh
# WHAT=pcap|csv|both DEST=... COOKIES=... DRY_RUN=1 LIMIT=N
# PCAP_ROOT=... CSV_ROOT=... SKIP_EXT=zip,7z
#
# Note: the remote sub-path names ("Pcaps" / "CSVs" or similar) are only
# visible after authenticating. Run with DRY_RUN=1 first to confirm the
# tree; if the roots differ, set PCAP_ROOT=... and/or CSV_ROOT=....
set -euo pipefail

# Absolute directory of this script, and the repository root two levels up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

BASE="https://cicresearch.ca/CICDataset/ISCX-Tor-NonTor-2017/"

# Environment overrides, each falling back to its default when unset or empty.
: "${WHAT:=both}"
DEST_ROOT="${DEST:-${REPO_ROOT}/datasets/iscxtor2016/raw}"
COOKIES="${COOKIES:-${SCRIPT_DIR}/cookies_iscxtor2016.txt}"
# Default root names (override via env if the server uses different casing)
: "${PCAP_ROOT:=PCAPs}"
: "${CSV_ROOT:=CSVs}"

# Optional flags forwarded to cic_download.py.
EXTRA=()
if [[ "${DRY_RUN:-}" == "1" ]]; then
  EXTRA+=(--dry-run)
fi
if [[ -n "${LIMIT:-}" ]]; then
  EXTRA+=(--limit "${LIMIT}")
fi
if [[ -n "${SKIP_EXT:-}" ]]; then
  EXTRA+=(--skip-ext "${SKIP_EXT}")
fi
#######################################
# Print the Python interpreter to use.
# Preference order: $PYTHON, the project venv, `python`, then `python3`.
# Globals:   PYTHON, REPO_ROOT (read)
# Outputs:   interpreter path/name on stdout; error message on stderr
# Returns:   exits with status 127 when no interpreter is available
#######################################
resolve_python() {
  local candidate=""
  if [[ -n "${PYTHON:-}" ]]; then
    candidate="${PYTHON}"
  elif [[ -x "${REPO_ROOT}/.venv/bin/python" ]]; then
    candidate="${REPO_ROOT}/.venv/bin/python"
  elif candidate="$(command -v python 2>/dev/null)"; then
    : # found `python` on PATH
  elif candidate="$(command -v python3 2>/dev/null)"; then
    : # found `python3` on PATH
  else
    echo "ERROR: no Python interpreter found. Set PYTHON=/path/to/python." >&2
    exit 127
  fi
  printf '%s\n' "${candidate}"
}

PYTHON_BIN="$(resolve_python)"
#######################################
# Mirror one remote sub-tree with cic_download.py.
# Globals:   PYTHON_BIN, SCRIPT_DIR, COOKIES, BASE, EXTRA (all read)
# Arguments: $1 - remote root to crawl
#            $2 - local destination directory
# Returns:   exit status of the interpreter
#######################################
run() {
  local root="$1" dest="$2"
  echo "=== ${root} -> ${dest} ==="
  # ${EXTRA[@]+...}: expanding an empty array under `set -u` is an
  # "unbound variable" error on bash older than 4.4.
  "${PYTHON_BIN}" -u "${SCRIPT_DIR}/cic_download.py" \
    --cookies "${COOKIES}" --base "${BASE}" \
    --root "${root}" --dest "${dest}" ${EXTRA[@]+"${EXTRA[@]}"}
}
# Reject an unknown selector first, then run the requested roots
# (PCAP before CSV when both are requested).
case "${WHAT}" in
  pcap | csv | both) ;;
  *)
    echo "Unknown WHAT=${WHAT} (expected pcap|csv|both)" >&2
    exit 1
    ;;
esac
if [[ "${WHAT}" != "csv" ]]; then
  run "${PCAP_ROOT}" "${DEST_ROOT}/pcap"
fi
if [[ "${WHAT}" != "pcap" ]]; then
  run "${CSV_ROOT}" "${DEST_ROOT}/csv"
fi