Files
llmiotsafe/DPO_QWEN35_SERVER_BUNDLE/run_precompute_ref_logps_lowmem.sh
2026-05-12 17:01:39 +08:00

36 lines
1.1 KiB
Bash

#!/usr/bin/env bash
#
# Environment and path configuration for the low-memory reference-log-prob
# precompute run. The torchrun invocation of scripts/train_dpo.py follows
# this section and consumes the constants defined here.
#
# All paths are relative, so the script is presumably meant to be started
# from the directory that contains scripts/, data/, outputs/ and cache/
# (TODO confirm against the bundle layout).
#
# Environment:
#   CONDA_PREFIX     optional; when set, the env's CUDA library directories
#                    are prepended to LD_LIBRARY_PATH.
#   LD_LIBRARY_PATH  optional; any existing value is kept after the conda
#                    directories.
set -euo pipefail

# PyTorch CUDA caching-allocator setting (see the PyTorch CUDA memory
# management documentation for `expandable_segments`).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

#######################################
# Join a colon-separated directory list in front of an existing search path.
# Arguments: $1 - directories to prepend (colon-separated)
#            $2 - existing search path; may be empty or omitted
# Outputs:   combined path on stdout; no ':' is appended when $2 is empty,
#            because the dynamic loader treats an empty entry as the
#            current working directory.
#######################################
prepend_search_path() {
  local -r new_dirs=$1
  local -r existing=${2:-}
  printf '%s' "${new_dirs}${existing:+:${existing}}"
}

# `set -u` would abort on a bare $LD_LIBRARY_PATH / $CONDA_PREFIX when either
# is unset, so both are read through guarded expansions.
if [[ -n "${CONDA_PREFIX:-}" ]]; then
  conda_lib_dirs="${CONDA_PREFIX}/lib"
  conda_lib_dirs+=":${CONDA_PREFIX}/targets/x86_64-linux/lib"
  conda_lib_dirs+=":${CONDA_PREFIX}/lib/python3.12/site-packages/nvidia/cu13/lib"
  # Assign first, export second, so a failing substitution is not masked.
  LD_LIBRARY_PATH="$(prepend_search_path "${conda_lib_dirs}" "${LD_LIBRARY_PATH:-}")"
  export LD_LIBRARY_PATH
else
  printf 'warning: CONDA_PREFIX is not set; leaving LD_LIBRARY_PATH unchanged\n' >&2
fi

# Inputs and outputs handed to scripts/train_dpo.py.
readonly MODEL_NAME="Qwen/Qwen3.5-9B"
readonly TRAIN_FILE="data/train_dpo_clean.jsonl"
readonly DEV_FILE="data/dev_dpo_clean.jsonl"
readonly OUTPUT_DIR="outputs/qwen35_dpo_precompute_only"
readonly REF_LOGPS_CACHE_DIR="cache/ref_logps_qwen35_lowmem"
# Run scripts/train_dpo.py under torchrun with 2 processes on this node.
# The arguments are collected in an array so each flag sits on its own line
# without backslash continuations; order matches the original command line.
# Flag meanings are defined by scripts/train_dpo.py (not shown here).
train_dpo_args=(
  # Model, data, output and cache locations.
  --model-name "${MODEL_NAME}"
  --train-file "${TRAIN_FILE}"
  --dev-file "${DEV_FILE}"
  --output-dir "${OUTPUT_DIR}"
  --ref-logps-cache-dir "${REF_LOGPS_CACHE_DIR}"
  --precompute-only

  # Batch sizing.
  --per-device-train-batch-size 1
  --per-device-eval-batch-size 1
  --gradient-accumulation-steps 16

  # Sequence limits: 4096 total = 3584 prompt + 512 completion.
  --max-length 4096
  --max-prompt-length 3584
  --max-completion-length 512
  --truncation-mode keep_end

  --precompute-ref-log-probs
  --logging-steps 10
  --torch-empty-cache-steps 10

  # LoRA settings.
  --lora-r 16
  --lora-alpha 32
  --lora-dropout 0.05

  # Optimizer, attention backend and precision.
  --optim paged_adamw_8bit
  --attn-implementation flash_attention_2
  --bf16
)

torchrun --nproc_per_node=2 scripts/train_dpo.py "${train_dpo_args[@]}"