feat: switch LLM client to OpenAI SDK for DeepSeek compatibility

The previous LLMClient used raw httpx + Claude Messages API (/v1/messages,
x-api-key, Anthropic SSE event types). Incompatible with DeepSeek.

Rewrite LLMClient.__init__/chat/close to use openai.AsyncOpenAI:
- /v1/chat/completions endpoint, OpenAI message format
- Bearer auth, native SDK error types
- Stream chunks via async for + chunk.choices[0].delta.content

Tool calling protocol (ReAct text-based tags) and all surrounding helpers
(_apply_progressive_decay, _fold_old_messages, _partition_tool_calls,
tool_call_loop, etc.) are unchanged — endpoint-agnostic by design.

New optional config params surfaced to config.yaml.agent:
- reasoning_effort: "high" | "medium" | "low" — DeepSeek/o1-style depth
- thinking_enabled: bool — DeepSeek extra_body.thinking switch

main.py and regenerate_report.py pass these through to LLMClient.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
BattleTag
2026-05-12 17:13:54 +08:00
parent 31812a72ee
commit 0a966d8476
5 changed files with 227 additions and 67 deletions

View File

@@ -1,8 +1,9 @@
"""Custom LLM client using httpx for Claude Messages API via third-party proxy.
"""LLM client via the OpenAI SDK (works with DeepSeek's OpenAI-compatible API).
The proxy does not support Claude's native tool_use format (it strips the `tools`
field from requests). So we embed tool definitions in the system prompt and parse
structured JSON tool calls from the model's text output (ReAct-style).
Tool calling is text-based (ReAct pattern): tool schemas are embedded in
the system prompt and tool calls are parsed as <tool_call> JSON blocks
from model output. This keeps the protocol independent of whether the
underlying API supports native function/tool calling.
"""
from __future__ import annotations
@@ -18,6 +19,7 @@ from dataclasses import dataclass, field
from typing import Any
import httpx
from openai import APIConnectionError, APIError, APITimeoutError, AsyncOpenAI
logger = logging.getLogger(__name__)
@@ -301,44 +303,51 @@ _FOLD_SUMMARY_SYSTEM = (
class LLMClient:
"""Calls Claude Messages API through a third-party proxy using raw httpx.
"""Async LLM client via the OpenAI SDK.
Uses prompt-based tool calling (ReAct pattern) since the proxy does not
support Claude's native tool_use format.
Works with any OpenAI-compatible endpoint (OpenAI, DeepSeek, ...).
Tool calling is text-based (ReAct) — see module docstring.
"""
def __init__(
self,
base_url: str,
api_key: str,
model: str = "claude-sonnet-4-6",
model: str = "deepseek-v4-pro",
max_tokens: int = 4096,
proxy: str | None = "auto",
reasoning_effort: str | None = None,
thinking_enabled: bool = False,
) -> None:
self.base_url = base_url.rstrip("/")
self.api_key = api_key
self.model = model
self.max_tokens = max_tokens
# proxy="auto": read from env; proxy=None/""/"none": no proxy; proxy="http://...": use it
self.reasoning_effort = reasoning_effort
self.thinking_enabled = thinking_enabled
# proxy="auto": read from env; proxy=None/""/"none": no proxy
if proxy == "auto":
proxy_url = os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY")
elif proxy and proxy.lower() != "none":
proxy_url = proxy
else:
proxy_url = None
self._client = httpx.AsyncClient(
http_client = (
httpx.AsyncClient(proxy=proxy_url, timeout=300.0)
if proxy_url else None
)
self._client = AsyncOpenAI(
api_key=self.api_key,
base_url=self.base_url,
headers={
"x-api-key": self.api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
},
timeout=300.0,
proxy=proxy_url,
http_client=http_client,
)
async def close(self) -> None:
await self._client.aclose()
await self._client.close()
async def chat(
self,
@@ -346,79 +355,53 @@ class LLMClient:
system: str | None = None,
max_retries: int = 5,
) -> str:
"""Send a streaming chat request and return the assembled text response.
"""Send a streaming chat completion and return the assembled text."""
full_messages: list[dict] = []
if system:
full_messages.append({"role": "system", "content": system})
full_messages.extend(messages)
Uses SSE streaming to keep the connection alive and avoid gateway
timeouts (504/524) on long-running completions.
"""
import asyncio as _asyncio
payload: dict[str, Any] = {
kwargs: dict[str, Any] = {
"model": self.model,
"messages": full_messages,
"max_tokens": self.max_tokens,
"messages": messages,
"stream": True,
}
if system:
payload["system"] = system
if self.reasoning_effort:
kwargs["reasoning_effort"] = self.reasoning_effort
if self.thinking_enabled:
kwargs["extra_body"] = {"thinking": {"type": "enabled"}}
for attempt in range(max_retries):
logger.debug("LLM request (stream): %d messages (attempt %d)", len(messages), attempt + 1)
logger.debug(
"LLM request (stream): %d messages (attempt %d)",
len(messages), attempt + 1,
)
text_parts: list[str] = []
try:
async with self._client.stream(
"POST", "/v1/messages", json=payload,
) as resp:
# Check for HTTP errors before consuming stream
if resp.status_code >= 400:
body = await resp.aread()
raise httpx.HTTPStatusError(
f"Server error '{resp.status_code}' for url '{resp.url}'",
request=resp.request,
response=resp,
)
# Parse SSE events
async for line in resp.aiter_lines():
if not line.startswith("data: "):
continue
data_str = line[6:] # strip "data: " prefix
if data_str.strip() == "[DONE]":
break
try:
event = json.loads(data_str)
except json.JSONDecodeError:
continue
event_type = event.get("type", "")
if event_type == "content_block_delta":
delta = event.get("delta", {})
if delta.get("type") == "text_delta":
text_parts.append(delta["text"])
elif event_type == "message_stop":
break
elif event_type == "error":
err_msg = event.get("error", {}).get("message", "Unknown streaming error")
raise httpx.HTTPStatusError(
err_msg, request=resp.request, response=resp,
)
stream = await self._client.chat.completions.create(**kwargs)
async for chunk in stream:
if not chunk.choices:
continue
delta = chunk.choices[0].delta
if delta.content:
text_parts.append(delta.content)
text = "".join(text_parts)
logger.debug("LLM response (stream): %d chars", len(text))
return text
except (httpx.HTTPStatusError, httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as e:
except (APIConnectionError, APITimeoutError, APIError) as e:
if attempt < max_retries - 1:
wait = 2 ** attempt * 10
logger.warning("Request failed (%s), retrying in %ds...", e, wait)
await _asyncio.sleep(wait)
await asyncio.sleep(wait)
else:
raise LLMAPIError(
f"LLM API unreachable after {max_retries} attempts: {e}",
attempts=max_retries,
) from e
# Should not reach here, but just in case
return ""
async def tool_call_loop(