{ "model": "Qwen/Qwen3.5-9B", "api_base": "http://localhost:8000/v1", "timestamp": "2026-05-03T09:31:20.047748", "total_evaluated": 1200, "avg_latency_seconds": 71.08, "api_errors": 1, "summary": { "total_episodes": 1200, "detection_accuracy": 0.5566666666666666, "miss_rate": 0.19454545454545455, "false_alarm_rate": 0.6430769230769231, "precision": 0.5145180023228804, "recall": 0.8054545454545454, "f1_security": 0.6279234585400425, "threat_type_accuracy": 0.40632054176072235, "parse_failure_rate": 0.1825, "breakdown": { "TP": { "total": 550, "correct": 443, "accuracy": 0.8054545454545454 }, "FP": { "total": 400, "correct": 143, "accuracy": 0.3575 }, "TN": { "total": 250, "correct": 82, "accuracy": 0.328 } }, "per_sq": { "SQ1": { "total": 190, "accuracy": 0.4842105263157895, "miss_count": 18, "false_alarm_count": 75 }, "SQ2": { "total": 240, "accuracy": 0.5708333333333333, "miss_count": 12, "false_alarm_count": 91 }, "SQ3": { "total": 290, "accuracy": 0.5344827586206896, "miss_count": 42, "false_alarm_count": 93 }, "SQ4": { "total": 290, "accuracy": 0.5586206896551724, "miss_count": 16, "false_alarm_count": 110 }, "SQ5": { "total": 190, "accuracy": 0.6421052631578947, "miss_count": 19, "false_alarm_count": 49 } } }, "errors": { "total_errors": 795, "error_distribution": { "FALSE_ALARM": 386, "PARSE_FAIL": 91, "WRONG_TYPE": 263, "MISS": 55 }, "error_by_sq": { "SQ1": { "FALSE_ALARM": 68, "PARSE_FAIL": 19, "WRONG_TYPE": 3, "MISS": 11 }, "SQ2": { "PARSE_FAIL": 18, "FALSE_ALARM": 84, "WRONG_TYPE": 56, "MISS": 1 }, "SQ3": { "FALSE_ALARM": 86, "PARSE_FAIL": 21, "MISS": 28, "WRONG_TYPE": 83 }, "SQ4": { "FALSE_ALARM": 104, "PARSE_FAIL": 17, "WRONG_TYPE": 69, "MISS": 7 }, "SQ5": { "FALSE_ALARM": 44, "PARSE_FAIL": 16, "WRONG_TYPE": 52, "MISS": 8 } }, "error_by_category": { "device_fault": 14, "water_damage": 26, "fire_gas": 21, "intrusion": 116, "behavioral_anomaly": 77, "child_specific": 28, "elderly_specific": 36 } } }