{ "split": "train", "input_root": "data_dpo_v2\\train_pref_v1", "total_pairs": 2500, "assembly_counts": { "chosen_rule_fallback": 1240, "rejected_weak_model": 1319, "rejected_constructed_fallback": 1181, "chosen_strong_model": 1260 }, "strong_answer_summary": { "total_episodes": 2500, "detection_accuracy": 0.61, "miss_rate": 0.4298507462686567, "false_alarm_rate": 0.3439655172413793, "precision": 0.6569217540842648, "recall": 0.5701492537313433, "f1_security": 0.6104674390731122, "threat_type_accuracy": 0.6583769633507853, "parse_failure_rate": 0.0052, "breakdown": { "TP": { "total": 1340, "correct": 764, "accuracy": 0.5701492537313433 }, "FP": { "total": 980, "correct": 615, "accuracy": 0.6275510204081632 }, "TN": { "total": 180, "correct": 146, "accuracy": 0.8111111111111111 } }, "per_sq": { "SQ1": { "total": 575, "accuracy": 0.5634782608695652, "miss_count": 77, "false_alarm_count": 174 }, "SQ2": { "total": 633, "accuracy": 0.6382306477093207, "miss_count": 178, "false_alarm_count": 51 }, "SQ3": { "total": 568, "accuracy": 0.602112676056338, "miss_count": 172, "false_alarm_count": 54 }, "SQ4": { "total": 399, "accuracy": 0.6090225563909775, "miss_count": 89, "false_alarm_count": 67 }, "SQ5": { "total": 325, "accuracy": 0.6523076923076923, "miss_count": 60, "false_alarm_count": 53 } }, "label": "strong_answers", "count": 2500, "error_taxonomy": { "total_errors": 1236, "error_distribution": { "MISS": 568, "WRONG_TYPE": 261, "PARSE_FAIL": 9, "FALSE_ALARM": 398 }, "error_by_sq": { "SQ1": { "MISS": 72, "WRONG_TYPE": 9, "PARSE_FAIL": 5, "FALSE_ALARM": 174 }, "SQ2": { "MISS": 178, "WRONG_TYPE": 26, "FALSE_ALARM": 51 }, "SQ3": { "MISS": 169, "WRONG_TYPE": 122, "PARSE_FAIL": 3, "FALSE_ALARM": 54 }, "SQ4": { "MISS": 89, "WRONG_TYPE": 83, "FALSE_ALARM": 67 }, "SQ5": { "MISS": 60, "WRONG_TYPE": 21, "FALSE_ALARM": 52, "PARSE_FAIL": 1 } }, "error_by_category": { "intrusion": 329, "fire_gas": 107, "device_fault": 81, "elderly_specific": 133, "child_specific": 89, "behavioral_anomaly": 90 } } }, "weak_answer_summary": { "total_episodes": 2500, "detection_accuracy": 0.6124, "miss_rate": 0.3597014925373134, "false_alarm_rate": 0.41206896551724137, "precision": 0.6422155688622755, "recall": 0.6402985074626866, "f1_security": 0.641255605381166, "threat_type_accuracy": 0.6118881118881119, "parse_failure_rate": 0.026, "breakdown": { "TP": { "total": 1340, "correct": 858, "accuracy": 0.6402985074626866 }, "FP": { "total": 980, "correct": 510, "accuracy": 0.5204081632653061 }, "TN": { "total": 180, "correct": 163, "accuracy": 0.9055555555555556 } }, "per_sq": { "SQ1": { "total": 575, "accuracy": 0.5704347826086956, "miss_count": 72, "false_alarm_count": 170 }, "SQ2": { "total": 633, "accuracy": 0.6477093206951027, "miss_count": 142, "false_alarm_count": 79 }, "SQ3": { "total": 568, "accuracy": 0.647887323943662, "miss_count": 121, "false_alarm_count": 77 }, "SQ4": { "total": 399, "accuracy": 0.5588972431077694, "miss_count": 85, "false_alarm_count": 91 }, "SQ5": { "total": 325, "accuracy": 0.6215384615384615, "miss_count": 62, "false_alarm_count": 61 } }, "label": "weak_answers", "count": 2500, "error_taxonomy": { "total_errors": 1302, "error_distribution": { "MISS": 460, "WRONG_TYPE": 333, "PARSE_FAIL": 40, "FALSE_ALARM": 469 }, "error_by_sq": { "SQ1": { "MISS": 67, "WRONG_TYPE": 32, "PARSE_FAIL": 14, "FALSE_ALARM": 166 }, "SQ2": { "MISS": 140, "PARSE_FAIL": 6, "FALSE_ALARM": 77, "WRONG_TYPE": 47 }, "SQ3": { "MISS": 111, "WRONG_TYPE": 142, "PARSE_FAIL": 15, "FALSE_ALARM": 74 }, "SQ4": { "MISS": 81, "WRONG_TYPE": 93, "FALSE_ALARM": 91, "PARSE_FAIL": 4 }, "SQ5": { "MISS": 61, "WRONG_TYPE": 19, "PARSE_FAIL": 1, "FALSE_ALARM": 61 } }, "error_by_category": { "intrusion": 258, "fire_gas": 146, "water_damage": 6, "device_fault": 99, "elderly_specific": 134, "child_specific": 63, "behavioral_anomaly": 87 } } } }