# llmiotsafe/debug_model.py

#!/usr/bin/env python3
"""debug_model.py — 单条 episode 调试，打印完整的 raw response"""
import json, sys, requests
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))
from src.evaluation.prompt_builder import build_prompt

# Take the first TP episode.
# Directory iteration order is filesystem-dependent, so the matches are sorted
# to make "first" deterministic across machines and runs.
episode_dir = PROJECT_ROOT / "data/benchmark/sq1"
tp_files = sorted(episode_dir.glob("SQ1_TP_*.json"))
if not tp_files:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit(f"No SQ1_TP_*.json episode files found in {episode_dir}")
ep_file = tp_files[0]
with open(ep_file, "r", encoding="utf-8") as f:
    episode = json.load(f)

# Optional positional CLI arguments, each with a default: [mode] [model] [api_base].
mode = sys.argv[1] if len(sys.argv) > 1 else "baseline"
model = sys.argv[2] if len(sys.argv) > 2 else "Qwen/Qwen3.5-9B"
api_base = sys.argv[3] if len(sys.argv) > 3 else "http://localhost:8000/v1"

print(f"Episode: {episode['episode_id']}")
print(f"Mode: {mode}")
print(f"Model: {model}")
print(f"API: {api_base}")
print()

# build_prompt() returns a mapping that is read below with a required "user"
# key and an optional "system" key.
prompt = build_prompt(episode, mode=mode)
print(f"System prompt length: {len(prompt.get('system',''))} chars")
print(f"User prompt length: {len(prompt['user'])} chars")
print()

# Prepare the API request: endpoint URL plus the JSON body that is POSTed to it.
# Trailing slashes on the base URL are dropped so the joined path has exactly
# one "/" before "chat/completions".
url = f"{api_base.rstrip('/')}/chat/completions"

# Two-turn conversation: the (possibly empty) system prompt, then the user prompt.
chat_messages = [
    {"role": "system", "content": prompt.get("system", "")},
    {"role": "user", "content": prompt["user"]},
]

# Sampling / generation settings sent alongside the messages.
# NOTE(review): top_k, min_p, repetition_penalty and chat_template_kwargs are
# not part of the core OpenAI schema — presumably the target is a vLLM-style
# OpenAI-compatible server; confirm against the deployment.
sampling_params = dict(
    temperature=0.7,
    top_p=0.95,
    top_k=20,
    min_p=0.0,
    presence_penalty=1.5,
    repetition_penalty=1.0,
    max_tokens=4096,
)

payload = {"model": model, "messages": chat_messages, **sampling_params}
# Presumably turns off the model's "thinking" mode via the chat template — TODO confirm.
payload["chat_template_kwargs"] = {"enable_thinking": False}

def _print_completion(data):
    """Print a human-readable summary of a decoded chat-completions response.

    Args:
        data: The JSON-decoded response body. A dict is expected; any other
            JSON value is reported as an unexpected response instead of
            raising.
    """
    if not isinstance(data, dict):
        print(f"Unexpected response: {json.dumps(data, ensure_ascii=False)[:1000]}")
        return

    print(f"Response keys: {list(data.keys())}")

    if "error" in data:
        print(f"API ERROR: {data['error']}")
        return

    # Covers a missing key, an explicit null and an empty list alike.
    choices = data.get("choices")
    if not choices:
        print(f"Unexpected response: {json.dumps(data, ensure_ascii=False)[:1000]}")
        return

    choice = choices[0]
    print(f"Finish reason: {choice.get('finish_reason', '?')}")
    # "message" may be present but null; treat that like an empty message.
    msg = choice.get("message") or {}
    raw = msg.get("content")
    # The reasoning text is looked up under several alternative key names.
    reasoning = msg.get("reasoning_content") or msg.get("thinking") or msg.get("reasoning")

    if reasoning:
        reasoning_text = str(reasoning)
        print(f"\nTHINKING/REASONING ({len(reasoning_text)} chars):")
        print(reasoning_text[:500])
        # Only show the truncation marker when something was actually cut off.
        if len(reasoning_text) > 500:
            print("...")

    if raw is None:
        print("\nCONTENT IS NONE — model returned no content")
        print(f"Full message object: {json.dumps(msg, ensure_ascii=False)[:1000]}")
    else:
        print(f"\nResponse length: {len(raw)} chars")
        print("=" * 60)
        print("RAW RESPONSE (first 2000 chars):")
        print("=" * 60)
        print(raw[:2000])


print("Calling API...")
try:
    # timeout is in seconds.
    resp = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=300)
    print(f"HTTP Status: {resp.status_code}")
    try:
        data = resp.json()
    except ValueError as json_err:
        # A non-JSON body (e.g. an HTML error page) is shown directly; the
        # raw text is more useful for debugging than a decoder traceback.
        print(f"ERROR: response body is not valid JSON: {json_err}")
        print(f"Raw body (first 1000 chars): {resp.text[:1000]}")
    else:
        _print_completion(data)
except Exception as e:
    # Top-level boundary of a debug script: report everything, never re-raise.
    import traceback
    print(f"ERROR: {e}")
    traceback.print_exc()