#!/usr/bin/env bash
#
# Launches scripts/train_dpo.py under torchrun (2 processes on this node) with
# --precompute-only and --precompute-ref-log-probs, pointing it at the DPO
# train/dev JSONL files and a reference-log-prob cache directory.
#
# NOTE(review): judging by the flag names this run only fills the reference
# log-prob cache and does not train; confirm against scripts/train_dpo.py.
#
# Required environment:
#   CONDA_PREFIX  - root of the active conda environment; used to locate the
#                   shared libraries prepended to LD_LIBRARY_PATH below.
#
# Run from the repository root: every path below is relative.
set -euo pipefail

export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Fail with an actionable message instead of a bare "unbound variable" error.
: "${CONDA_PREFIX:?CONDA_PREFIX is not set; activate the conda environment first}"

# Prepend the conda-provided library directories. The previous value is
# appended only when it is set and non-empty: with `set -u` a bare
# $LD_LIBRARY_PATH aborts the script when the variable is unset, and an empty
# value would leave a trailing ':' (an empty entry the dynamic loader treats
# as the current directory).
conda_lib_dirs="${CONDA_PREFIX}/lib"
conda_lib_dirs+=":${CONDA_PREFIX}/targets/x86_64-linux/lib"
conda_lib_dirs+=":${CONDA_PREFIX}/lib/python3.12/site-packages/nvidia/cu13/lib"
export LD_LIBRARY_PATH="${conda_lib_dirs}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

MODEL_NAME="Qwen/Qwen3.5-9B"
TRAIN_FILE="data/train_dpo_clean.jsonl"
DEV_FILE="data/dev_dpo_clean.jsonl"
OUTPUT_DIR="outputs/qwen35_dpo_precompute_only"
REF_LOGPS_CACHE_DIR="cache/ref_logps_qwen35_lowmem"

# Arguments are kept in an array so each flag/value pair stays a single word
# and can be commented individually.
train_args=(
  # Model, data and output locations.
  --model-name "${MODEL_NAME}"
  --train-file "${TRAIN_FILE}"
  --dev-file "${DEV_FILE}"
  --output-dir "${OUTPUT_DIR}"
  --ref-logps-cache-dir "${REF_LOGPS_CACHE_DIR}"
  --precompute-only

  # Batch sizing.
  --per-device-train-batch-size 1
  --per-device-eval-batch-size 1
  --gradient-accumulation-steps 16

  # Sequence budget: 3584 (prompt) + 512 (completion) = 4096 (max length).
  --max-length 4096
  --max-prompt-length 3584
  --max-completion-length 512
  --truncation-mode keep_end

  --precompute-ref-log-probs
  --logging-steps 10
  --torch-empty-cache-steps 10

  # LoRA and optimizer settings.
  # NOTE(review): possibly unused in a precompute-only run; kept unchanged.
  --lora-r 16
  --lora-alpha 32
  --lora-dropout 0.05
  --optim paged_adamw_8bit

  --attn-implementation flash_attention_2
  --bf16
)

torchrun --nproc_per_node=2 scripts/train_dpo.py "${train_args[@]}"