Files
llmiotsafe/tmp_dpo/final_dev_pairs_report.json
2026-05-12 17:01:39 +08:00

215 lines
4.8 KiB
JSON

{
"split": "dev",
"input_root": "data_dpo_v2\\dev_pref_v1",
"total_pairs": 300,
"assembly_counts": {
"chosen_rule_fallback": 154,
"rejected_weak_model": 155,
"rejected_constructed_fallback": 145,
"chosen_strong_model": 146
},
"strong_answer_summary": {
"total_episodes": 300,
"detection_accuracy": 0.5466666666666666,
"miss_rate": 0.41875,
"false_alarm_rate": 0.4928571428571429,
"precision": 0.5740740740740741,
"recall": 0.58125,
"f1_security": 0.577639751552795,
"threat_type_accuracy": 0.8387096774193549,
"parse_failure_rate": 0.016666666666666666,
"breakdown": {
"TP": {
"total": 160,
"correct": 93,
"accuracy": 0.58125
},
"FP": {
"total": 120,
"correct": 56,
"accuracy": 0.4666666666666667
},
"TN": {
"total": 20,
"correct": 15,
"accuracy": 0.75
}
},
"per_sq": {
"SQ1": {
"total": 70,
"accuracy": 0.5714285714285714,
"miss_count": 2,
"false_alarm_count": 28
},
"SQ2": {
"total": 38,
"accuracy": 0.5526315789473685,
"miss_count": 13,
"false_alarm_count": 4
},
"SQ3": {
"total": 71,
"accuracy": 0.5492957746478874,
"miss_count": 23,
"false_alarm_count": 9
},
"SQ4": {
"total": 59,
"accuracy": 0.559322033898305,
"miss_count": 16,
"false_alarm_count": 10
},
"SQ5": {
"total": 62,
"accuracy": 0.5,
"miss_count": 13,
"false_alarm_count": 18
}
},
"label": "strong_answers",
"count": 300,
"error_taxonomy": {
"total_errors": 151,
"error_distribution": {
"MISS": 65,
"FALSE_ALARM": 69,
"PARSE_FAIL": 2,
"WRONG_TYPE": 15
},
"error_by_sq": {
"SQ1": {
"PARSE_FAIL": 2,
"FALSE_ALARM": 28
},
"SQ2": {
"MISS": 13,
"FALSE_ALARM": 4
},
"SQ3": {
"MISS": 23,
"FALSE_ALARM": 9,
"WRONG_TYPE": 9
},
"SQ4": {
"FALSE_ALARM": 10,
"WRONG_TYPE": 6,
"MISS": 16
},
"SQ5": {
"MISS": 13,
"FALSE_ALARM": 18
}
},
"error_by_category": {
"intrusion": 44,
"elderly_specific": 15,
"child_specific": 11,
"behavioral_anomaly": 10
}
}
},
"weak_answer_summary": {
"total_episodes": 300,
"detection_accuracy": 0.5966666666666667,
"miss_rate": 0.29375,
"false_alarm_rate": 0.5214285714285715,
"precision": 0.6075268817204301,
"recall": 0.70625,
"f1_security": 0.653179190751445,
"threat_type_accuracy": 0.7256637168141593,
"parse_failure_rate": 0.02666666666666667,
"breakdown": {
"TP": {
"total": 160,
"correct": 113,
"accuracy": 0.70625
},
"FP": {
"total": 120,
"correct": 48,
"accuracy": 0.4
},
"TN": {
"total": 20,
"correct": 18,
"accuracy": 0.9
}
},
"per_sq": {
"SQ1": {
"total": 70,
"accuracy": 0.5428571428571428,
"miss_count": 2,
"false_alarm_count": 29
},
"SQ2": {
"total": 38,
"accuracy": 0.6052631578947368,
"miss_count": 12,
"false_alarm_count": 3
},
"SQ3": {
"total": 71,
"accuracy": 0.6338028169014085,
"miss_count": 11,
"false_alarm_count": 15
},
"SQ4": {
"total": 59,
"accuracy": 0.6101694915254238,
"miss_count": 11,
"false_alarm_count": 12
},
"SQ5": {
"total": 62,
"accuracy": 0.5967741935483871,
"miss_count": 11,
"false_alarm_count": 14
}
},
"label": "weak_answers",
"count": 300,
"error_taxonomy": {
"total_errors": 152,
"error_distribution": {
"MISS": 43,
"PARSE_FAIL": 5,
"WRONG_TYPE": 31,
"FALSE_ALARM": 73
},
"error_by_sq": {
"SQ1": {
"PARSE_FAIL": 3,
"FALSE_ALARM": 29
},
"SQ2": {
"PARSE_FAIL": 1,
"MISS": 11,
"FALSE_ALARM": 3
},
"SQ3": {
"MISS": 10,
"WRONG_TYPE": 20,
"PARSE_FAIL": 1,
"FALSE_ALARM": 15
},
"SQ4": {
"FALSE_ALARM": 12,
"WRONG_TYPE": 11,
"MISS": 11
},
"SQ5": {
"MISS": 11,
"FALSE_ALARM": 14
}
},
"error_by_category": {
"intrusion": 38,
"elderly_specific": 15,
"child_specific": 11,
"behavioral_anomaly": 10
}
}
}
}