#!/usr/bin/env python3
"""debug_model.py — debug a single episode and print the raw model response.

Usage:
    python debug_model.py [mode] [model] [api_base]

The script loads the first ``SQ1_TP_*.json`` episode (in sorted filename
order) from ``data/benchmark/sq1``, builds the prompt with the project's
``build_prompt``, sends it to a ``/chat/completions`` endpoint and prints
the reasoning (if any) and the raw content of the first choice.

The process exit status is 0 when the model returned content and 1 otherwise.
"""
import json
import sys
import traceback
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
EPISODE_DIR = PROJECT_ROOT / "data/benchmark/sq1"
EPISODE_PATTERN = "SQ1_TP_*.json"

DEFAULT_MODE = "baseline"
DEFAULT_MODEL = "Qwen/Qwen3.5-9B"
DEFAULT_API_BASE = "http://localhost:8000/v1"

REQUEST_TIMEOUT_S = 300
REASONING_PREVIEW_CHARS = 500
CONTENT_PREVIEW_CHARS = 2000
DUMP_PREVIEW_CHARS = 1000


def parse_args(argv: list) -> tuple:
    """Return ``(mode, model, api_base)`` from positional command-line args.

    ``argv[0]`` is the program name; every missing positional argument falls
    back to its default.
    """
    mode = argv[1] if len(argv) > 1 else DEFAULT_MODE
    model = argv[2] if len(argv) > 2 else DEFAULT_MODEL
    api_base = argv[3] if len(argv) > 3 else DEFAULT_API_BASE
    return mode, model, api_base


def find_episode_file(directory, pattern: str = EPISODE_PATTERN) -> Path:
    """Return the first file in *directory* matching *pattern*.

    Matches are sorted because glob order is not guaranteed; sorting makes
    "the first episode" the same file on every run.

    Raises:
        FileNotFoundError: if nothing matches.
    """
    candidates = sorted(Path(directory).glob(pattern))
    if not candidates:
        raise FileNotFoundError(
            f"no episode file matching {pattern!r} found in {directory}"
        )
    return candidates[0]


def chat_completions_url(api_base: str) -> str:
    """Return the chat-completions endpoint URL for *api_base*."""
    return api_base.rstrip("/") + "/chat/completions"


def build_payload(prompt: dict, model: str) -> dict:
    """Build the JSON request body for the chat-completions call.

    *prompt* must contain a ``"user"`` entry; ``"system"`` is optional and
    defaults to an empty string.
    """
    return {
        "model": model,
        "messages": [
            {"role": "system", "content": prompt.get("system", "")},
            {"role": "user", "content": prompt["user"]},
        ],
        "temperature": 0.7,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0.0,
        "presence_penalty": 1.5,
        "repetition_penalty": 1.0,
        "max_tokens": 4096,
        "chat_template_kwargs": {"enable_thinking": False},
    }


def preview(text, limit: int) -> str:
    """Return *text* cut to *limit* characters.

    A trailing ``"..."`` line is appended only when something was actually
    cut off, so a short text is not shown as if it had been truncated.
    """
    text = str(text)
    if len(text) <= limit:
        return text
    return text[:limit] + "\n..."


def _dump(obj) -> str:
    """Return a length-limited JSON dump of *obj* for diagnostics."""
    return json.dumps(obj, ensure_ascii=False)[:DUMP_PREVIEW_CHARS]


def report_response(data) -> bool:
    """Print a diagnostic summary of a decoded chat-completions response.

    Returns:
        True when the first choice carries content, False otherwise (API
        error, unexpected response shape, or ``content`` is ``None``).
    """
    if not isinstance(data, dict):
        print(f"Unexpected response: {_dump(data)}")
        return False

    print(f"Response keys: {list(data.keys())}")
    if "error" in data:
        print(f"API ERROR: {data['error']}")
        return False

    choices = data.get("choices")
    if not isinstance(choices, list) or not choices:
        print(f"Unexpected response: {_dump(data)}")
        return False

    choice = choices[0]
    print(f"Finish reason: {choice.get('finish_reason', '?')}")

    # "message" may be absent or null; treat both as an empty message.
    msg = choice.get("message") or {}
    raw = msg.get("content")

    # The reasoning text is looked up under several key names.
    reasoning = (
        msg.get("reasoning_content") or msg.get("thinking") or msg.get("reasoning")
    )
    if reasoning:
        print(f"\nTHINKING/REASONING ({len(str(reasoning))} chars):")
        print(preview(reasoning, REASONING_PREVIEW_CHARS))

    if raw is None:
        print("\nCONTENT IS NONE — model returned no content")
        print(f"Full message object: {_dump(msg)}")
        return False

    print(f"\nResponse length: {len(raw)} chars")
    print("=" * 60)
    print("RAW RESPONSE (first 2000 chars):")
    print("=" * 60)
    print(raw[:CONTENT_PREVIEW_CHARS])
    return True


def post_chat_completion(url: str, payload: dict, timeout: float = REQUEST_TIMEOUT_S):
    """POST *payload* as JSON to *url* and return the ``requests`` response."""
    # Imported here so the pure helpers above stay importable without the
    # third-party HTTP dependency.
    import requests

    return requests.post(
        url,
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )


def main(argv=None) -> int:
    """Run the single-episode debug call and return the process exit status."""
    mode, model, api_base = parse_args(sys.argv if argv is None else argv)

    # build_prompt lives in the project package, which is resolved relative
    # to this file, so the project root must be on sys.path first.
    if str(PROJECT_ROOT) not in sys.path:
        sys.path.insert(0, str(PROJECT_ROOT))
    from src.evaluation.prompt_builder import build_prompt

    # Take the first TP episode.
    try:
        ep_file = find_episode_file(EPISODE_DIR)
    except FileNotFoundError as exc:
        print(f"ERROR: {exc}")
        return 1
    with open(ep_file, "r", encoding="utf-8") as f:
        episode = json.load(f)

    print(f"Episode: {episode['episode_id']}")
    print(f"Mode: {mode}")
    print(f"Model: {model}")
    print(f"API: {api_base}")
    print()

    prompt = build_prompt(episode, mode=mode)
    print(f"System prompt length: {len(prompt.get('system',''))} chars")
    print(f"User prompt length: {len(prompt['user'])} chars")
    print()

    # Call the API.
    url = chat_completions_url(api_base)
    payload = build_payload(prompt, model)
    print("Calling API...")
    try:
        resp = post_chat_completion(url, payload)
        print(f"HTTP Status: {resp.status_code}")
        try:
            data = resp.json()
        except ValueError:
            # JSON decode errors are ValueError subclasses; show the body so
            # a non-JSON error page can be diagnosed.
            print(f"Non-JSON response body: {resp.text[:DUMP_PREVIEW_CHARS]}")
            return 1
        return 0 if report_response(data) else 1
    except Exception as e:  # top-level boundary of a debug tool: report all
        print(f"ERROR: {e}")
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    sys.exit(main())