{ "split": "train", "input_root": "data_dpo_full_log_v1", "total_pairs": 2500, "counters": { "chosen_rule_fallback": 1621, "rejected_weak_model": 1679, "chosen_strong_model": 879, "rejected_constructed_fallback": 821 }, "strong_answer_summary": { "total_episodes": 2500, "detection_accuracy": 0.5428, "miss_rate": 0.3529850746268657, "false_alarm_rate": 0.5577586206896552, "precision": 0.5726552179656539, "recall": 0.6470149253731343, "f1_security": 0.6075683251576735, "threat_type_accuracy": 0.5201845444059977, "parse_failure_rate": 0.0672, "breakdown": { "TP": { "total": 1340, "correct": 867, "accuracy": 0.6470149253731343 }, "FP": { "total": 980, "correct": 431, "accuracy": 0.43979591836734694 }, "TN": { "total": 180, "correct": 59, "accuracy": 0.3277777777777778 } }, "per_sq": { "SQ1": { "total": 575, "accuracy": 0.5565217391304348, "miss_count": 26, "false_alarm_count": 226 }, "SQ2": { "total": 633, "accuracy": 0.6145339652448657, "miss_count": 196, "false_alarm_count": 45 }, "SQ3": { "total": 568, "accuracy": 0.47183098591549294, "miss_count": 166, "false_alarm_count": 123 }, "SQ4": { "total": 399, "accuracy": 0.5012531328320802, "miss_count": 60, "false_alarm_count": 135 }, "SQ5": { "total": 325, "accuracy": 0.5538461538461539, "miss_count": 25, "false_alarm_count": 118 } }, "label": "strong_answers", "count": 2500, "error_taxonomy": { "total_errors": 1559, "error_distribution": { "MISS": 402, "WRONG_TYPE": 416, "PARSE_FAIL": 100, "FALSE_ALARM": 641 }, "error_by_sq": { "SQ1": { "WRONG_TYPE": 65, "PARSE_FAIL": 29, "MISS": 3, "FALSE_ALARM": 223 }, "SQ2": { "MISS": 186, "WRONG_TYPE": 52, "PARSE_FAIL": 13, "FALSE_ALARM": 45 }, "SQ3": { "WRONG_TYPE": 136, "MISS": 145, "PARSE_FAIL": 33, "FALSE_ALARM": 122 }, "SQ4": { "WRONG_TYPE": 110, "MISS": 48, "PARSE_FAIL": 17, "FALSE_ALARM": 134 }, "SQ5": { "MISS": 20, "WRONG_TYPE": 53, "PARSE_FAIL": 8, "FALSE_ALARM": 117 } }, "error_by_category": { "intrusion": 331, "fire_gas": 107, "water_damage": 14, "device_fault": 68, "elderly_specific": 131, "child_specific": 84, "behavioral_anomaly": 83 } } }, "weak_answer_summary": { "total_episodes": 2500, "detection_accuracy": 0.528, "miss_rate": 0.2947761194029851, "false_alarm_rate": 0.6603448275862069, "precision": 0.5523085914669784, "recall": 0.7052238805970149, "f1_security": 0.6194690265486725, "threat_type_accuracy": 0.4804232804232804, "parse_failure_rate": 0.0508, "breakdown": { "TP": { "total": 1340, "correct": 945, "accuracy": 0.7052238805970149 }, "FP": { "total": 980, "correct": 320, "accuracy": 0.32653061224489793 }, "TN": { "total": 180, "correct": 55, "accuracy": 0.3055555555555556 } }, "per_sq": { "SQ1": { "total": 575, "accuracy": 0.551304347826087, "miss_count": 7, "false_alarm_count": 250 }, "SQ2": { "total": 633, "accuracy": 0.5576619273301737, "miss_count": 148, "false_alarm_count": 127 }, "SQ3": { "total": 568, "accuracy": 0.46830985915492956, "miss_count": 139, "false_alarm_count": 154 }, "SQ4": { "total": 399, "accuracy": 0.47117794486215536, "miss_count": 68, "false_alarm_count": 140 }, "SQ5": { "total": 325, "accuracy": 0.6030769230769231, "miss_count": 33, "false_alarm_count": 95 } }, "label": "weak_answers", "count": 2500, "error_taxonomy": { "total_errors": 1671, "error_distribution": { "MISS": 334, "WRONG_TYPE": 491, "PARSE_FAIL": 89, "FALSE_ALARM": 757 }, "error_by_sq": { "SQ1": { "WRONG_TYPE": 45, "PARSE_FAIL": 9, "MISS": 1, "FALSE_ALARM": 248 }, "SQ2": { "MISS": 134, "WRONG_TYPE": 114, "PARSE_FAIL": 21, "FALSE_ALARM": 125 }, "SQ3": { "WRONG_TYPE": 163, "PARSE_FAIL": 31, "MISS": 120, "FALSE_ALARM": 151 }, "SQ4": { "WRONG_TYPE": 114, "PARSE_FAIL": 18, "MISS": 55, "FALSE_ALARM": 138 }, "SQ5": { "MISS": 24, "WRONG_TYPE": 55, "PARSE_FAIL": 10, "FALSE_ALARM": 95 } }, "error_by_category": { "intrusion": 336, "fire_gas": 126, "water_damage": 21, "device_fault": 46, "elderly_specific": 132, "child_specific": 86, "behavioral_anomaly": 78 } } } }