{ "model": "Qwen/Qwen3.5-9B", "api_base": "http://localhost:8000/v1", "timestamp": "2026-05-04T14:48:06.999403", "total_evaluated": 1200, "summary": { "total_episodes": 1200, "detection_accuracy": 0.5875, "miss_rate": 0.38727272727272727, "false_alarm_rate": 0.42, "precision": 0.5524590163934426, "recall": 0.6127272727272727, "f1_security": 0.5810344827586207, "threat_type_accuracy": 0.5459940652818991, "parse_failure_rate": 0.013333333333333334, "breakdown": { "TP": { "total": 550, "correct": 337, "accuracy": 0.6127272727272727 }, "FP": { "total": 400, "correct": 226, "accuracy": 0.565 }, "TN": { "total": 250, "correct": 142, "accuracy": 0.568 } }, "per_sq": { "SQ1": { "total": 190, "accuracy": 0.5736842105263158, "miss_count": 41, "false_alarm_count": 39 }, "SQ2": { "total": 240, "accuracy": 0.6125, "miss_count": 43, "false_alarm_count": 49 }, "SQ3": { "total": 290, "accuracy": 0.4896551724137931, "miss_count": 80, "false_alarm_count": 67 }, "SQ4": { "total": 290, "accuracy": 0.5448275862068965, "miss_count": 39, "false_alarm_count": 89 }, "SQ5": { "total": 190, "accuracy": 0.7842105263157895, "miss_count": 10, "false_alarm_count": 29 } } }, "errors": { "total_errors": 648, "error_distribution": { "FALSE_ALARM": 273, "PARSE_FAIL": 14, "MISS": 208, "WRONG_TYPE": 153 }, "error_by_sq": { "SQ1": { "FALSE_ALARM": 39, "PARSE_FAIL": 2, "MISS": 40 }, "SQ2": { "FALSE_ALARM": 49, "PARSE_FAIL": 2, "MISS": 42, "WRONG_TYPE": 28 }, "SQ3": { "FALSE_ALARM": 67, "PARSE_FAIL": 3, "MISS": 78, "WRONG_TYPE": 39 }, "SQ4": { "FALSE_ALARM": 89, "PARSE_FAIL": 5, "MISS": 38, "WRONG_TYPE": 50 }, "SQ5": { "FALSE_ALARM": 29, "PARSE_FAIL": 2, "WRONG_TYPE": 36, "MISS": 10 } }, "error_by_category": { "device_fault": 40, "fire_gas": 53, "water_damage": 37, "intrusion": 81, "behavioral_anomaly": 79, "child_specific": 28, "elderly_specific": 43 } }, "pipeline": "EGP" }