Files
llmiotsafe/tmp_dpo_full_log/final_train_pairs_report.json
2026-05-12 17:01:39 +08:00

235 lines
5.4 KiB
JSON

{
"split": "train",
"input_root": "data_dpo_full_log_v1",
"total_pairs": 2500,
"counters": {
"chosen_rule_fallback": 1621,
"rejected_weak_model": 1679,
"chosen_strong_model": 879,
"rejected_constructed_fallback": 821
},
"strong_answer_summary": {
"total_episodes": 2500,
"detection_accuracy": 0.5428,
"miss_rate": 0.3529850746268657,
"false_alarm_rate": 0.5577586206896552,
"precision": 0.5726552179656539,
"recall": 0.6470149253731343,
"f1_security": 0.6075683251576735,
"threat_type_accuracy": 0.5201845444059977,
"parse_failure_rate": 0.0672,
"breakdown": {
"TP": {
"total": 1340,
"correct": 867,
"accuracy": 0.6470149253731343
},
"FP": {
"total": 980,
"correct": 431,
"accuracy": 0.43979591836734694
},
"TN": {
"total": 180,
"correct": 59,
"accuracy": 0.3277777777777778
}
},
"per_sq": {
"SQ1": {
"total": 575,
"accuracy": 0.5565217391304348,
"miss_count": 26,
"false_alarm_count": 226
},
"SQ2": {
"total": 633,
"accuracy": 0.6145339652448657,
"miss_count": 196,
"false_alarm_count": 45
},
"SQ3": {
"total": 568,
"accuracy": 0.47183098591549294,
"miss_count": 166,
"false_alarm_count": 123
},
"SQ4": {
"total": 399,
"accuracy": 0.5012531328320802,
"miss_count": 60,
"false_alarm_count": 135
},
"SQ5": {
"total": 325,
"accuracy": 0.5538461538461539,
"miss_count": 25,
"false_alarm_count": 118
}
},
"label": "strong_answers",
"count": 2500,
"error_taxonomy": {
"total_errors": 1559,
"error_distribution": {
"MISS": 402,
"WRONG_TYPE": 416,
"PARSE_FAIL": 100,
"FALSE_ALARM": 641
},
"error_by_sq": {
"SQ1": {
"WRONG_TYPE": 65,
"PARSE_FAIL": 29,
"MISS": 3,
"FALSE_ALARM": 223
},
"SQ2": {
"MISS": 186,
"WRONG_TYPE": 52,
"PARSE_FAIL": 13,
"FALSE_ALARM": 45
},
"SQ3": {
"WRONG_TYPE": 136,
"MISS": 145,
"PARSE_FAIL": 33,
"FALSE_ALARM": 122
},
"SQ4": {
"WRONG_TYPE": 110,
"MISS": 48,
"PARSE_FAIL": 17,
"FALSE_ALARM": 134
},
"SQ5": {
"MISS": 20,
"WRONG_TYPE": 53,
"PARSE_FAIL": 8,
"FALSE_ALARM": 117
}
},
"error_by_category": {
"intrusion": 331,
"fire_gas": 107,
"water_damage": 14,
"device_fault": 68,
"elderly_specific": 131,
"child_specific": 84,
"behavioral_anomaly": 83
}
}
},
"weak_answer_summary": {
"total_episodes": 2500,
"detection_accuracy": 0.528,
"miss_rate": 0.2947761194029851,
"false_alarm_rate": 0.6603448275862069,
"precision": 0.5523085914669784,
"recall": 0.7052238805970149,
"f1_security": 0.6194690265486725,
"threat_type_accuracy": 0.4804232804232804,
"parse_failure_rate": 0.0508,
"breakdown": {
"TP": {
"total": 1340,
"correct": 945,
"accuracy": 0.7052238805970149
},
"FP": {
"total": 980,
"correct": 320,
"accuracy": 0.32653061224489793
},
"TN": {
"total": 180,
"correct": 55,
"accuracy": 0.3055555555555556
}
},
"per_sq": {
"SQ1": {
"total": 575,
"accuracy": 0.551304347826087,
"miss_count": 7,
"false_alarm_count": 250
},
"SQ2": {
"total": 633,
"accuracy": 0.5576619273301737,
"miss_count": 148,
"false_alarm_count": 127
},
"SQ3": {
"total": 568,
"accuracy": 0.46830985915492956,
"miss_count": 139,
"false_alarm_count": 154
},
"SQ4": {
"total": 399,
"accuracy": 0.47117794486215536,
"miss_count": 68,
"false_alarm_count": 140
},
"SQ5": {
"total": 325,
"accuracy": 0.6030769230769231,
"miss_count": 33,
"false_alarm_count": 95
}
},
"label": "weak_answers",
"count": 2500,
"error_taxonomy": {
"total_errors": 1671,
"error_distribution": {
"MISS": 334,
"WRONG_TYPE": 491,
"PARSE_FAIL": 89,
"FALSE_ALARM": 757
},
"error_by_sq": {
"SQ1": {
"WRONG_TYPE": 45,
"PARSE_FAIL": 9,
"MISS": 1,
"FALSE_ALARM": 248
},
"SQ2": {
"MISS": 134,
"WRONG_TYPE": 114,
"PARSE_FAIL": 21,
"FALSE_ALARM": 125
},
"SQ3": {
"WRONG_TYPE": 163,
"PARSE_FAIL": 31,
"MISS": 120,
"FALSE_ALARM": 151
},
"SQ4": {
"WRONG_TYPE": 114,
"PARSE_FAIL": 18,
"MISS": 55,
"FALSE_ALARM": 138
},
"SQ5": {
"MISS": 24,
"WRONG_TYPE": 55,
"PARSE_FAIL": 10,
"FALSE_ALARM": 95
}
},
"error_by_category": {
"intrusion": 336,
"fire_gas": 126,
"water_damage": 21,
"device_fault": 46,
"elderly_specific": 132,
"child_specific": 86,
"behavioral_anomaly": 78
}
}
}
}