{ "split": "dev", "input_root": "data_dpo_full_log_v1", "total_pairs": 300, "counters": { "chosen_rule_fallback": 201, "rejected_weak_model": 190, "rejected_constructed_fallback": 110, "chosen_strong_model": 99 }, "strong_answer_summary": { "total_episodes": 300, "detection_accuracy": 0.5433333333333333, "miss_rate": 0.25625, "false_alarm_rate": 0.6571428571428571, "precision": 0.5639810426540285, "recall": 0.74375, "f1_security": 0.6415094339622642, "threat_type_accuracy": 0.5714285714285714, "parse_failure_rate": 0.11333333333333333, "breakdown": { "TP": { "total": 160, "correct": 119, "accuracy": 0.74375 }, "FP": { "total": 120, "correct": 35, "accuracy": 0.2916666666666667 }, "TN": { "total": 20, "correct": 9, "accuracy": 0.45 } }, "per_sq": { "SQ1": { "total": 70, "accuracy": 0.5428571428571428, "miss_count": 5, "false_alarm_count": 27 }, "SQ2": { "total": 38, "accuracy": 0.631578947368421, "miss_count": 10, "false_alarm_count": 4 }, "SQ3": { "total": 71, "accuracy": 0.4507042253521127, "miss_count": 14, "false_alarm_count": 25 }, "SQ4": { "total": 59, "accuracy": 0.6610169491525424, "miss_count": 5, "false_alarm_count": 14 }, "SQ5": { "total": 62, "accuracy": 0.4838709677419355, "miss_count": 7, "false_alarm_count": 22 } }, "label": "strong_answers", "count": 300, "error_taxonomy": { "total_errors": 188, "error_distribution": { "MISS": 25, "PARSE_FAIL": 21, "WRONG_TYPE": 51, "FALSE_ALARM": 91 }, "error_by_sq": { "SQ1": { "WRONG_TYPE": 6, "PARSE_FAIL": 6, "FALSE_ALARM": 26 }, "SQ2": { "MISS": 10, "WRONG_TYPE": 3, "FALSE_ALARM": 4 }, "SQ3": { "WRONG_TYPE": 19, "MISS": 7, "PARSE_FAIL": 7, "FALSE_ALARM": 25 }, "SQ4": { "FALSE_ALARM": 14, "PARSE_FAIL": 3, "WRONG_TYPE": 17, "MISS": 3 }, "SQ5": { "MISS": 5, "PARSE_FAIL": 5, "WRONG_TYPE": 6, "FALSE_ALARM": 22 } }, "error_by_category": { "intrusion": 38, "device_fault": 6, "elderly_specific": 12, "child_specific": 11, "behavioral_anomaly": 9 } } }, "weak_answer_summary": { "total_episodes": 300, "detection_accuracy": 0.5566666666666666, "miss_rate": 0.2, "false_alarm_rate": 0.6928571428571428, "precision": 0.5688888888888889, "recall": 0.8, "f1_security": 0.664935064935065, "threat_type_accuracy": 0.5546875, "parse_failure_rate": 0.07666666666666666, "breakdown": { "TP": { "total": 160, "correct": 128, "accuracy": 0.8 }, "FP": { "total": 120, "correct": 33, "accuracy": 0.275 }, "TN": { "total": 20, "correct": 6, "accuracy": 0.3 } }, "per_sq": { "SQ1": { "total": 70, "accuracy": 0.5714285714285714, "miss_count": 0, "false_alarm_count": 30 }, "SQ2": { "total": 38, "accuracy": 0.631578947368421, "miss_count": 8, "false_alarm_count": 6 }, "SQ3": { "total": 71, "accuracy": 0.4647887323943662, "miss_count": 11, "false_alarm_count": 27 }, "SQ4": { "total": 59, "accuracy": 0.5423728813559322, "miss_count": 7, "false_alarm_count": 20 }, "SQ5": { "total": 62, "accuracy": 0.6129032258064516, "miss_count": 6, "false_alarm_count": 14 } }, "label": "weak_answers", "count": 300, "error_taxonomy": { "total_errors": 190, "error_distribution": { "MISS": 21, "WRONG_TYPE": 57, "PARSE_FAIL": 15, "FALSE_ALARM": 97 }, "error_by_sq": { "SQ1": { "WRONG_TYPE": 5, "FALSE_ALARM": 30 }, "SQ2": { "WRONG_TYPE": 8, "MISS": 7, "PARSE_FAIL": 1, "FALSE_ALARM": 6 }, "SQ3": { "MISS": 6, "WRONG_TYPE": 22, "PARSE_FAIL": 5, "FALSE_ALARM": 27 }, "SQ4": { "FALSE_ALARM": 20, "WRONG_TYPE": 15, "MISS": 5, "PARSE_FAIL": 2 }, "SQ5": { "MISS": 3, "PARSE_FAIL": 7, "WRONG_TYPE": 7, "FALSE_ALARM": 14 } }, "error_by_category": { "intrusion": 38, "water_damage": 3, "device_fault": 5, "elderly_specific": 15, "child_specific": 9, "behavioral_anomaly": 8 } } } }