{ "best_global_step": 77, "best_metric": 0.6231179237365723, "best_model_checkpoint": "outputs/qwen35_dpo_ultralowmem_ref_free/checkpoint-77", "epoch": 1.0, "eval_steps": 200, "global_step": 77, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9454727958887815, "epoch": 0.13136288998357964, "grad_norm": 7.8125, "learning_rate": 4.919332367333748e-07, "logits/chosen": -1.8099737944940273, "logits/rejected": -1.832894790703956, "logps/chosen": -353.753306388855, "logps/rejected": -344.2661180496216, "loss": 0.6934457302093506, "mean_token_accuracy": 0.7538095749914646, "num_tokens": 1287126.0, "rewards/accuracies": 0.409375, "rewards/chosen": -0.00509019878518302, "rewards/margins": 0.0012955188169144094, "rewards/rejected": -0.006385717548255343, "step": 10 }, { "entropy": 0.960740290209651, "epoch": 0.2627257799671593, "grad_norm": 7.0625, "learning_rate": 4.4450893857960984e-07, "logits/chosen": -1.8059819363814396, "logits/rejected": -1.8327529228301944, "logps/chosen": -383.5576331138611, "logps/rejected": -348.3659210205078, "loss": 0.6661795139312744, "mean_token_accuracy": 0.7455122817307711, "num_tokens": 2584081.0, "rewards/accuracies": 0.696875, "rewards/chosen": 0.01022649770602584, "rewards/margins": 0.05804813946597278, "rewards/rejected": -0.04782164186508453, "step": 20 }, { "entropy": 0.9594713591039181, "epoch": 0.39408866995073893, "grad_norm": 6.34375, "learning_rate": 3.625509362044183e-07, "logits/chosen": -1.8090848916866544, "logits/rejected": -1.839708000919806, "logps/chosen": -367.3285415649414, "logps/rejected": -347.3939818382263, "loss": 0.6498698234558106, "mean_token_accuracy": 0.7490173149853945, "num_tokens": 3873129.0, "rewards/accuracies": 0.76875, "rewards/chosen": 0.01563858055451419, "rewards/margins": 0.09292944613844156, "rewards/rejected": -0.0772908657156222, "step": 30 }, { "entropy": 0.9579024501144886, "epoch": 0.5254515599343186, "grad_norm": 7.84375, "learning_rate": 2.606103007990371e-07, "logits/chosen": -1.822963703035332, "logits/rejected": -1.8331565552576632, "logps/chosen": -365.0903636932373, "logps/rejected": -355.14615325927736, "loss": 0.6380878925323487, "mean_token_accuracy": 0.749091599136591, "num_tokens": 5164485.0, "rewards/accuracies": 0.7625, "rewards/chosen": 0.013401065368088893, "rewards/margins": 0.12074792645871639, "rewards/rejected": -0.10734686049545417, "step": 40 }, { "entropy": 0.9612626571208238, "epoch": 0.6568144499178982, "grad_norm": 6.09375, "learning_rate": 1.5678588055492286e-07, "logits/chosen": -1.8040991478766433, "logits/rejected": -1.833159321438822, "logps/chosen": -402.17723083496094, "logps/rejected": -356.4602531433105, "loss": 0.6306396484375, "mean_token_accuracy": 0.7441606149077415, "num_tokens": 6453542.0, "rewards/accuracies": 0.78125, "rewards/chosen": 0.014387460224679672, "rewards/margins": 0.13786569883814082, "rewards/rejected": -0.12347823897434865, "step": 50 }, { "entropy": 0.9673831064254046, "epoch": 0.7881773399014779, "grad_norm": 6.8125, "learning_rate": 6.951097651136889e-08, "logits/chosen": -1.8032587367384516, "logits/rejected": -1.8365809013216652, "logps/chosen": -381.1871561050415, "logps/rejected": -348.20880632400514, "loss": 0.6237552642822266, "mean_token_accuracy": 0.7467762563377619, "num_tokens": 7742778.0, "rewards/accuracies": 0.79375, "rewards/chosen": 0.02942158783553168, "rewards/margins": 0.15444288045400753, "rewards/rejected": -0.12502129255008185, "step": 60 }, { "entropy": 0.9571243241429329, "epoch": 0.9195402298850575, "grad_norm": 6.78125, "learning_rate": 1.4280638634728948e-08, "logits/chosen": -1.8158756551653077, "logits/rejected": -1.8351591651305355, "logps/chosen": -369.4532477378845, "logps/rejected": -360.18324394226073, "loss": 0.6174001693725586, "mean_token_accuracy": 0.7488324739038944, "num_tokens": 9038143.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02844569750013761, "rewards/margins": 0.1687723191542318, "rewards/rejected": -0.14032662139070454, "step": 70 }, { "epoch": 1.0, "eval_entropy": 0.9807351431617998, "eval_logits/chosen": -1.8414135470736719, "eval_logits/rejected": -1.8220581632231228, "eval_logps/chosen": -395.92160922533844, "eval_logps/rejected": -371.32895587241813, "eval_loss": 0.6231179237365723, "eval_mean_token_accuracy": 0.7525672818699928, "eval_num_tokens": 9825739.0, "eval_rewards/accuracies": 0.7602739726027398, "eval_rewards/chosen": 0.013094972891650125, "eval_rewards/margins": 0.15859890370693516, "eval_rewards/rejected": -0.1455039322377846, "eval_runtime": 256.8556, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.568, "step": 77 } ], "logging_steps": 10, "max_steps": 77, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.73228507514667e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }