{ "model": "Qwen/Qwen3.5-2B", "api_base": "http://localhost:8000/v1", "timestamp": "2026-05-10T13:55:09.665723", "total_evaluated": 1200, "avg_latency_seconds": 8.5, "api_errors": 0, "summary": { "total_episodes": 1200, "detection_accuracy": 0.6291666666666667, "miss_rate": 0.72, "false_alarm_rate": 0.05846153846153846, "precision": 0.8020833333333334, "recall": 0.28, "f1_security": 0.4150943396226416, "threat_type_accuracy": 0.551948051948052, "parse_failure_rate": 0.08833333333333333, "breakdown": { "TP": { "total": 550, "correct": 154, "accuracy": 0.28 }, "FP": { "total": 400, "correct": 365, "accuracy": 0.9125 }, "TN": { "total": 250, "correct": 236, "accuracy": 0.944 } }, "per_sq": { "SQ1": { "total": 190, "accuracy": 0.5578947368421052, "miss_count": 58, "false_alarm_count": 18 }, "SQ2": { "total": 240, "accuracy": 0.6208333333333333, "miss_count": 78, "false_alarm_count": 12 }, "SQ3": { "total": 290, "accuracy": 0.5551724137931034, "miss_count": 129, "false_alarm_count": 0 }, "SQ4": { "total": 290, "accuracy": 0.5689655172413793, "miss_count": 122, "false_alarm_count": 2 }, "SQ5": { "total": 190, "accuracy": 0.9157894736842105, "miss_count": 9, "false_alarm_count": 6 } } }, "errors": { "total_errors": 514, "error_distribution": { "PARSE_FAIL": 49, "FALSE_ALARM": 36, "MISS": 360, "WRONG_TYPE": 69 }, "error_by_sq": { "SQ1": { "PARSE_FAIL": 20, "FALSE_ALARM": 18, "MISS": 46, "WRONG_TYPE": 4 }, "SQ2": { "FALSE_ALARM": 12, "PARSE_FAIL": 6, "MISS": 73, "WRONG_TYPE": 4 }, "SQ3": { "MISS": 119, "PARSE_FAIL": 10, "WRONG_TYPE": 1 }, "SQ4": { "PARSE_FAIL": 7, "MISS": 118, "WRONG_TYPE": 2 }, "SQ5": { "FALSE_ALARM": 6, "PARSE_FAIL": 6, "WRONG_TYPE": 58, "MISS": 4 } }, "error_by_category": { "device_fault": 50, "fire_gas": 77, "intrusion": 135, "water_damage": 15, "behavioral_anomaly": 81, "child_specific": 28, "elderly_specific": 43 } } }