{ "split": "dev", "input_root": "data_dpo_v2\\dev_pref_v1", "total_pairs": 300, "assembly_counts": { "chosen_rule_fallback": 154, "rejected_weak_model": 155, "rejected_constructed_fallback": 145, "chosen_strong_model": 146 }, "strong_answer_summary": { "total_episodes": 300, "detection_accuracy": 0.5466666666666666, "miss_rate": 0.41875, "false_alarm_rate": 0.4928571428571429, "precision": 0.5740740740740741, "recall": 0.58125, "f1_security": 0.577639751552795, "threat_type_accuracy": 0.8387096774193549, "parse_failure_rate": 0.016666666666666666, "breakdown": { "TP": { "total": 160, "correct": 93, "accuracy": 0.58125 }, "FP": { "total": 120, "correct": 56, "accuracy": 0.4666666666666667 }, "TN": { "total": 20, "correct": 15, "accuracy": 0.75 } }, "per_sq": { "SQ1": { "total": 70, "accuracy": 0.5714285714285714, "miss_count": 2, "false_alarm_count": 28 }, "SQ2": { "total": 38, "accuracy": 0.5526315789473685, "miss_count": 13, "false_alarm_count": 4 }, "SQ3": { "total": 71, "accuracy": 0.5492957746478874, "miss_count": 23, "false_alarm_count": 9 }, "SQ4": { "total": 59, "accuracy": 0.559322033898305, "miss_count": 16, "false_alarm_count": 10 }, "SQ5": { "total": 62, "accuracy": 0.5, "miss_count": 13, "false_alarm_count": 18 } }, "label": "strong_answers", "count": 300, "error_taxonomy": { "total_errors": 151, "error_distribution": { "MISS": 65, "FALSE_ALARM": 69, "PARSE_FAIL": 2, "WRONG_TYPE": 15 }, "error_by_sq": { "SQ1": { "PARSE_FAIL": 2, "FALSE_ALARM": 28 }, "SQ2": { "MISS": 13, "FALSE_ALARM": 4 }, "SQ3": { "MISS": 23, "FALSE_ALARM": 9, "WRONG_TYPE": 9 }, "SQ4": { "FALSE_ALARM": 10, "WRONG_TYPE": 6, "MISS": 16 }, "SQ5": { "MISS": 13, "FALSE_ALARM": 18 } }, "error_by_category": { "intrusion": 44, "elderly_specific": 15, "child_specific": 11, "behavioral_anomaly": 10 } } }, "weak_answer_summary": { "total_episodes": 300, "detection_accuracy": 0.5966666666666667, "miss_rate": 0.29375, "false_alarm_rate": 0.5214285714285715, "precision": 0.6075268817204301, "recall": 0.70625, "f1_security": 0.653179190751445, "threat_type_accuracy": 0.7256637168141593, "parse_failure_rate": 0.02666666666666667, "breakdown": { "TP": { "total": 160, "correct": 113, "accuracy": 0.70625 }, "FP": { "total": 120, "correct": 48, "accuracy": 0.4 }, "TN": { "total": 20, "correct": 18, "accuracy": 0.9 } }, "per_sq": { "SQ1": { "total": 70, "accuracy": 0.5428571428571428, "miss_count": 2, "false_alarm_count": 29 }, "SQ2": { "total": 38, "accuracy": 0.6052631578947368, "miss_count": 12, "false_alarm_count": 3 }, "SQ3": { "total": 71, "accuracy": 0.6338028169014085, "miss_count": 11, "false_alarm_count": 15 }, "SQ4": { "total": 59, "accuracy": 0.6101694915254238, "miss_count": 11, "false_alarm_count": 12 }, "SQ5": { "total": 62, "accuracy": 0.5967741935483871, "miss_count": 11, "false_alarm_count": 14 } }, "label": "weak_answers", "count": 300, "error_taxonomy": { "total_errors": 152, "error_distribution": { "MISS": 43, "PARSE_FAIL": 5, "WRONG_TYPE": 31, "FALSE_ALARM": 73 }, "error_by_sq": { "SQ1": { "PARSE_FAIL": 3, "FALSE_ALARM": 29 }, "SQ2": { "PARSE_FAIL": 1, "MISS": 11, "FALSE_ALARM": 3 }, "SQ3": { "MISS": 10, "WRONG_TYPE": 20, "PARSE_FAIL": 1, "FALSE_ALARM": 15 }, "SQ4": { "FALSE_ALARM": 12, "WRONG_TYPE": 11, "MISS": 11 }, "SQ5": { "MISS": 11, "FALSE_ALARM": 14 } }, "error_by_category": { "intrusion": 38, "elderly_specific": 15, "child_specific": 11, "behavioral_anomaly": 10 } } } }