Files
llmiotsafe/tmp_dpo/final_train_pairs_report.json
2026-05-12 17:01:39 +08:00

232 lines
5.3 KiB
JSON

{
"split": "train",
"input_root": "data_dpo_v2\\train_pref_v1",
"total_pairs": 2500,
"assembly_counts": {
"chosen_rule_fallback": 1240,
"rejected_weak_model": 1319,
"rejected_constructed_fallback": 1181,
"chosen_strong_model": 1260
},
"strong_answer_summary": {
"total_episodes": 2500,
"detection_accuracy": 0.61,
"miss_rate": 0.4298507462686567,
"false_alarm_rate": 0.3439655172413793,
"precision": 0.6569217540842648,
"recall": 0.5701492537313433,
"f1_security": 0.6104674390731122,
"threat_type_accuracy": 0.6583769633507853,
"parse_failure_rate": 0.0052,
"breakdown": {
"TP": {
"total": 1340,
"correct": 764,
"accuracy": 0.5701492537313433
},
"FP": {
"total": 980,
"correct": 615,
"accuracy": 0.6275510204081632
},
"TN": {
"total": 180,
"correct": 146,
"accuracy": 0.8111111111111111
}
},
"per_sq": {
"SQ1": {
"total": 575,
"accuracy": 0.5634782608695652,
"miss_count": 77,
"false_alarm_count": 174
},
"SQ2": {
"total": 633,
"accuracy": 0.6382306477093207,
"miss_count": 178,
"false_alarm_count": 51
},
"SQ3": {
"total": 568,
"accuracy": 0.602112676056338,
"miss_count": 172,
"false_alarm_count": 54
},
"SQ4": {
"total": 399,
"accuracy": 0.6090225563909775,
"miss_count": 89,
"false_alarm_count": 67
},
"SQ5": {
"total": 325,
"accuracy": 0.6523076923076923,
"miss_count": 60,
"false_alarm_count": 53
}
},
"label": "strong_answers",
"count": 2500,
"error_taxonomy": {
"total_errors": 1236,
"error_distribution": {
"MISS": 568,
"WRONG_TYPE": 261,
"PARSE_FAIL": 9,
"FALSE_ALARM": 398
},
"error_by_sq": {
"SQ1": {
"MISS": 72,
"WRONG_TYPE": 9,
"PARSE_FAIL": 5,
"FALSE_ALARM": 174
},
"SQ2": {
"MISS": 178,
"WRONG_TYPE": 26,
"FALSE_ALARM": 51
},
"SQ3": {
"MISS": 169,
"WRONG_TYPE": 122,
"PARSE_FAIL": 3,
"FALSE_ALARM": 54
},
"SQ4": {
"MISS": 89,
"WRONG_TYPE": 83,
"FALSE_ALARM": 67
},
"SQ5": {
"MISS": 60,
"WRONG_TYPE": 21,
"FALSE_ALARM": 52,
"PARSE_FAIL": 1
}
},
"error_by_category": {
"intrusion": 329,
"fire_gas": 107,
"device_fault": 81,
"elderly_specific": 133,
"child_specific": 89,
"behavioral_anomaly": 90
}
}
},
"weak_answer_summary": {
"total_episodes": 2500,
"detection_accuracy": 0.6124,
"miss_rate": 0.3597014925373134,
"false_alarm_rate": 0.41206896551724137,
"precision": 0.6422155688622755,
"recall": 0.6402985074626866,
"f1_security": 0.641255605381166,
"threat_type_accuracy": 0.6118881118881119,
"parse_failure_rate": 0.026,
"breakdown": {
"TP": {
"total": 1340,
"correct": 858,
"accuracy": 0.6402985074626866
},
"FP": {
"total": 980,
"correct": 510,
"accuracy": 0.5204081632653061
},
"TN": {
"total": 180,
"correct": 163,
"accuracy": 0.9055555555555556
}
},
"per_sq": {
"SQ1": {
"total": 575,
"accuracy": 0.5704347826086956,
"miss_count": 72,
"false_alarm_count": 170
},
"SQ2": {
"total": 633,
"accuracy": 0.6477093206951027,
"miss_count": 142,
"false_alarm_count": 79
},
"SQ3": {
"total": 568,
"accuracy": 0.647887323943662,
"miss_count": 121,
"false_alarm_count": 77
},
"SQ4": {
"total": 399,
"accuracy": 0.5588972431077694,
"miss_count": 85,
"false_alarm_count": 91
},
"SQ5": {
"total": 325,
"accuracy": 0.6215384615384615,
"miss_count": 62,
"false_alarm_count": 61
}
},
"label": "weak_answers",
"count": 2500,
"error_taxonomy": {
"total_errors": 1302,
"error_distribution": {
"MISS": 460,
"WRONG_TYPE": 333,
"PARSE_FAIL": 40,
"FALSE_ALARM": 469
},
"error_by_sq": {
"SQ1": {
"MISS": 67,
"WRONG_TYPE": 32,
"PARSE_FAIL": 14,
"FALSE_ALARM": 166
},
"SQ2": {
"MISS": 140,
"PARSE_FAIL": 6,
"FALSE_ALARM": 77,
"WRONG_TYPE": 47
},
"SQ3": {
"MISS": 111,
"WRONG_TYPE": 142,
"PARSE_FAIL": 15,
"FALSE_ALARM": 74
},
"SQ4": {
"MISS": 81,
"WRONG_TYPE": 93,
"FALSE_ALARM": 91,
"PARSE_FAIL": 4
},
"SQ5": {
"MISS": 61,
"WRONG_TYPE": 19,
"PARSE_FAIL": 1,
"FALSE_ALARM": 61
}
},
"error_by_category": {
"intrusion": 258,
"fire_gas": 146,
"water_damage": 6,
"device_fault": 99,
"elderly_specific": 134,
"child_specific": 63,
"behavioral_anomaly": 87
}
}
}
}