{ "model": "Qwen/Qwen2.5-7B-Instruct", "api_base": "http://localhost:8000/v1", "timestamp": "2026-04-30T22:18:00.620006", "total_evaluated": 1200, "avg_latency_seconds": 7.61, "api_errors": 0, "summary": { "total_episodes": 1200, "detection_accuracy": 0.5975, "miss_rate": 0.8781818181818182, "false_alarm_rate": 0.0, "precision": 1.0, "recall": 0.12181818181818181, "f1_security": 0.2171799027552674, "threat_type_accuracy": 0.44776119402985076, "parse_failure_rate": 0.04, "breakdown": { "TP": { "total": 550, "correct": 67, "accuracy": 0.12181818181818181 }, "FP": { "total": 400, "correct": 400, "accuracy": 1.0 }, "TN": { "total": 250, "correct": 250, "accuracy": 1.0 } }, "per_sq": { "SQ1": { "total": 190, "accuracy": 0.5894736842105263, "miss_count": 78, "false_alarm_count": 0 }, "SQ2": { "total": 240, "accuracy": 0.5791666666666667, "miss_count": 101, "false_alarm_count": 0 }, "SQ3": { "total": 290, "accuracy": 0.5551724137931034, "miss_count": 129, "false_alarm_count": 0 }, "SQ4": { "total": 290, "accuracy": 0.5517241379310345, "miss_count": 130, "false_alarm_count": 0 }, "SQ5": { "total": 190, "accuracy": 0.7631578947368421, "miss_count": 45, "false_alarm_count": 0 } } }, "errors": { "total_errors": 520, "error_distribution": { "MISS": 460, "WRONG_TYPE": 37, "PARSE_FAIL": 23 }, "error_by_sq": { "SQ1": { "MISS": 76, "WRONG_TYPE": 2, "PARSE_FAIL": 2 }, "SQ2": { "MISS": 98, "PARSE_FAIL": 3, "WRONG_TYPE": 1 }, "SQ3": { "MISS": 126, "PARSE_FAIL": 3, "WRONG_TYPE": 1 }, "SQ4": { "MISS": 125, "PARSE_FAIL": 5 }, "SQ5": { "PARSE_FAIL": 10, "MISS": 35, "WRONG_TYPE": 33 } }, "error_by_category": { "device_fault": 138, "fire_gas": 116, "water_damage": 52, "intrusion": 115, "behavioral_anomaly": 35, "child_specific": 14, "elderly_specific": 27 } } }