215 lines
4.8 KiB
JSON
215 lines
4.8 KiB
JSON
{
|
|
"split": "dev",
|
|
"input_root": "data_dpo_v2\\dev_pref_v1",
|
|
"total_pairs": 300,
|
|
"assembly_counts": {
|
|
"chosen_rule_fallback": 154,
|
|
"rejected_weak_model": 155,
|
|
"rejected_constructed_fallback": 145,
|
|
"chosen_strong_model": 146
|
|
},
|
|
"strong_answer_summary": {
|
|
"total_episodes": 300,
|
|
"detection_accuracy": 0.5466666666666666,
|
|
"miss_rate": 0.41875,
|
|
"false_alarm_rate": 0.4928571428571429,
|
|
"precision": 0.5740740740740741,
|
|
"recall": 0.58125,
|
|
"f1_security": 0.577639751552795,
|
|
"threat_type_accuracy": 0.8387096774193549,
|
|
"parse_failure_rate": 0.016666666666666666,
|
|
"breakdown": {
|
|
"TP": {
|
|
"total": 160,
|
|
"correct": 93,
|
|
"accuracy": 0.58125
|
|
},
|
|
"FP": {
|
|
"total": 120,
|
|
"correct": 56,
|
|
"accuracy": 0.4666666666666667
|
|
},
|
|
"TN": {
|
|
"total": 20,
|
|
"correct": 15,
|
|
"accuracy": 0.75
|
|
}
|
|
},
|
|
"per_sq": {
|
|
"SQ1": {
|
|
"total": 70,
|
|
"accuracy": 0.5714285714285714,
|
|
"miss_count": 2,
|
|
"false_alarm_count": 28
|
|
},
|
|
"SQ2": {
|
|
"total": 38,
|
|
"accuracy": 0.5526315789473685,
|
|
"miss_count": 13,
|
|
"false_alarm_count": 4
|
|
},
|
|
"SQ3": {
|
|
"total": 71,
|
|
"accuracy": 0.5492957746478874,
|
|
"miss_count": 23,
|
|
"false_alarm_count": 9
|
|
},
|
|
"SQ4": {
|
|
"total": 59,
|
|
"accuracy": 0.559322033898305,
|
|
"miss_count": 16,
|
|
"false_alarm_count": 10
|
|
},
|
|
"SQ5": {
|
|
"total": 62,
|
|
"accuracy": 0.5,
|
|
"miss_count": 13,
|
|
"false_alarm_count": 18
|
|
}
|
|
},
|
|
"label": "strong_answers",
|
|
"count": 300,
|
|
"error_taxonomy": {
|
|
"total_errors": 151,
|
|
"error_distribution": {
|
|
"MISS": 65,
|
|
"FALSE_ALARM": 69,
|
|
"PARSE_FAIL": 2,
|
|
"WRONG_TYPE": 15
|
|
},
|
|
"error_by_sq": {
|
|
"SQ1": {
|
|
"PARSE_FAIL": 2,
|
|
"FALSE_ALARM": 28
|
|
},
|
|
"SQ2": {
|
|
"MISS": 13,
|
|
"FALSE_ALARM": 4
|
|
},
|
|
"SQ3": {
|
|
"MISS": 23,
|
|
"FALSE_ALARM": 9,
|
|
"WRONG_TYPE": 9
|
|
},
|
|
"SQ4": {
|
|
"FALSE_ALARM": 10,
|
|
"WRONG_TYPE": 6,
|
|
"MISS": 16
|
|
},
|
|
"SQ5": {
|
|
"MISS": 13,
|
|
"FALSE_ALARM": 18
|
|
}
|
|
},
|
|
"error_by_category": {
|
|
"intrusion": 44,
|
|
"elderly_specific": 15,
|
|
"child_specific": 11,
|
|
"behavioral_anomaly": 10
|
|
}
|
|
}
|
|
},
|
|
"weak_answer_summary": {
|
|
"total_episodes": 300,
|
|
"detection_accuracy": 0.5966666666666667,
|
|
"miss_rate": 0.29375,
|
|
"false_alarm_rate": 0.5214285714285715,
|
|
"precision": 0.6075268817204301,
|
|
"recall": 0.70625,
|
|
"f1_security": 0.653179190751445,
|
|
"threat_type_accuracy": 0.7256637168141593,
|
|
"parse_failure_rate": 0.02666666666666667,
|
|
"breakdown": {
|
|
"TP": {
|
|
"total": 160,
|
|
"correct": 113,
|
|
"accuracy": 0.70625
|
|
},
|
|
"FP": {
|
|
"total": 120,
|
|
"correct": 48,
|
|
"accuracy": 0.4
|
|
},
|
|
"TN": {
|
|
"total": 20,
|
|
"correct": 18,
|
|
"accuracy": 0.9
|
|
}
|
|
},
|
|
"per_sq": {
|
|
"SQ1": {
|
|
"total": 70,
|
|
"accuracy": 0.5428571428571428,
|
|
"miss_count": 2,
|
|
"false_alarm_count": 29
|
|
},
|
|
"SQ2": {
|
|
"total": 38,
|
|
"accuracy": 0.6052631578947368,
|
|
"miss_count": 12,
|
|
"false_alarm_count": 3
|
|
},
|
|
"SQ3": {
|
|
"total": 71,
|
|
"accuracy": 0.6338028169014085,
|
|
"miss_count": 11,
|
|
"false_alarm_count": 15
|
|
},
|
|
"SQ4": {
|
|
"total": 59,
|
|
"accuracy": 0.6101694915254238,
|
|
"miss_count": 11,
|
|
"false_alarm_count": 12
|
|
},
|
|
"SQ5": {
|
|
"total": 62,
|
|
"accuracy": 0.5967741935483871,
|
|
"miss_count": 11,
|
|
"false_alarm_count": 14
|
|
}
|
|
},
|
|
"label": "weak_answers",
|
|
"count": 300,
|
|
"error_taxonomy": {
|
|
"total_errors": 152,
|
|
"error_distribution": {
|
|
"MISS": 43,
|
|
"PARSE_FAIL": 5,
|
|
"WRONG_TYPE": 31,
|
|
"FALSE_ALARM": 73
|
|
},
|
|
"error_by_sq": {
|
|
"SQ1": {
|
|
"PARSE_FAIL": 3,
|
|
"FALSE_ALARM": 29
|
|
},
|
|
"SQ2": {
|
|
"PARSE_FAIL": 1,
|
|
"MISS": 11,
|
|
"FALSE_ALARM": 3
|
|
},
|
|
"SQ3": {
|
|
"MISS": 10,
|
|
"WRONG_TYPE": 20,
|
|
"PARSE_FAIL": 1,
|
|
"FALSE_ALARM": 15
|
|
},
|
|
"SQ4": {
|
|
"FALSE_ALARM": 12,
|
|
"WRONG_TYPE": 11,
|
|
"MISS": 11
|
|
},
|
|
"SQ5": {
|
|
"MISS": 11,
|
|
"FALSE_ALARM": 14
|
|
}
|
|
},
|
|
"error_by_category": {
|
|
"intrusion": 38,
|
|
"elderly_specific": 15,
|
|
"child_specific": 11,
|
|
"behavioral_anomaly": 10
|
|
}
|
|
}
|
|
}
|
|
} |