feat: switch LLM client to OpenAI SDK for DeepSeek compatibility
The previous LLMClient used raw httpx against the Claude Messages API (/v1/messages, x-api-key header, Anthropic SSE event types), which is incompatible with DeepSeek.

Rewrite LLMClient.__init__/chat/close to use openai.AsyncOpenAI:
- /v1/chat/completions endpoint, OpenAI message format
- Bearer auth, native SDK error types
- Stream chunks via `async for` + chunk.choices[0].delta.content

The tool-calling protocol (ReAct text-based tags) and all surrounding helpers (_apply_progressive_decay, _fold_old_messages, _partition_tool_calls, tool_call_loop, etc.) are unchanged — they are endpoint-agnostic by design.

New optional config params surfaced to config.yaml.agent:
- reasoning_effort: "high" | "medium" | "low" — DeepSeek/o1-style reasoning depth
- thinking_enabled: bool — DeepSeek extra_body.thinking switch

main.py and regenerate_report.py pass these through to LLMClient.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
117
llm_client.py
117
llm_client.py
@@ -1,8 +1,9 @@
|
||||
"""Custom LLM client using httpx for Claude Messages API via third-party proxy.
|
||||
"""LLM client via the OpenAI SDK (works with DeepSeek's OpenAI-compatible API).
|
||||
|
||||
The proxy does not support Claude's native tool_use format (it strips the `tools`
|
||||
field from requests). So we embed tool definitions in the system prompt and parse
|
||||
structured JSON tool calls from the model's text output (ReAct-style).
|
||||
Tool calling is text-based (ReAct pattern): tool schemas are embedded in
|
||||
the system prompt and tool calls are parsed as <tool_call> JSON blocks
|
||||
from model output. This keeps the protocol independent of whether the
|
||||
underlying API supports native function/tool calling.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -18,6 +19,7 @@ from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from openai import APIConnectionError, APIError, APITimeoutError, AsyncOpenAI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -301,44 +303,51 @@ _FOLD_SUMMARY_SYSTEM = (
|
||||
|
||||
|
||||
class LLMClient:
|
||||
"""Calls Claude Messages API through a third-party proxy using raw httpx.
|
||||
"""Async LLM client via the OpenAI SDK.
|
||||
|
||||
Uses prompt-based tool calling (ReAct pattern) since the proxy does not
|
||||
support Claude's native tool_use format.
|
||||
Works with any OpenAI-compatible endpoint (OpenAI, DeepSeek, ...).
|
||||
Tool calling is text-based (ReAct) — see module docstring.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_url: str,
|
||||
api_key: str,
|
||||
model: str = "claude-sonnet-4-6",
|
||||
model: str = "deepseek-v4-pro",
|
||||
max_tokens: int = 4096,
|
||||
proxy: str | None = "auto",
|
||||
reasoning_effort: str | None = None,
|
||||
thinking_enabled: bool = False,
|
||||
) -> None:
|
||||
self.base_url = base_url.rstrip("/")
|
||||
self.api_key = api_key
|
||||
self.model = model
|
||||
self.max_tokens = max_tokens
|
||||
# proxy="auto": read from env; proxy=None/""/"none": no proxy; proxy="http://...": use it
|
||||
self.reasoning_effort = reasoning_effort
|
||||
self.thinking_enabled = thinking_enabled
|
||||
|
||||
# proxy="auto": read https_proxy/HTTPS_PROXY from env; proxy=None/""/"none": no proxy; any other string (e.g. "http://..."): use it as the proxy URL
|
||||
if proxy == "auto":
|
||||
proxy_url = os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY")
|
||||
elif proxy and proxy.lower() != "none":
|
||||
proxy_url = proxy
|
||||
else:
|
||||
proxy_url = None
|
||||
self._client = httpx.AsyncClient(
|
||||
|
||||
http_client = (
|
||||
httpx.AsyncClient(proxy=proxy_url, timeout=300.0)
|
||||
if proxy_url else None
|
||||
)
|
||||
|
||||
self._client = AsyncOpenAI(
|
||||
api_key=self.api_key,
|
||||
base_url=self.base_url,
|
||||
headers={
|
||||
"x-api-key": self.api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
"content-type": "application/json",
|
||||
},
|
||||
timeout=300.0,
|
||||
proxy=proxy_url,
|
||||
http_client=http_client,
|
||||
)
|
||||
|
||||
async def close(self) -> None:
|
||||
await self._client.aclose()
|
||||
await self._client.close()
|
||||
|
||||
async def chat(
|
||||
self,
|
||||
@@ -346,79 +355,53 @@ class LLMClient:
|
||||
system: str | None = None,
|
||||
max_retries: int = 5,
|
||||
) -> str:
|
||||
"""Send a streaming chat request and return the assembled text response.
|
||||
"""Send a streaming chat completion and return the assembled text."""
|
||||
full_messages: list[dict] = []
|
||||
if system:
|
||||
full_messages.append({"role": "system", "content": system})
|
||||
full_messages.extend(messages)
|
||||
|
||||
Uses SSE streaming to keep the connection alive and avoid gateway
|
||||
timeouts (504/524) on long-running completions.
|
||||
"""
|
||||
import asyncio as _asyncio
|
||||
|
||||
payload: dict[str, Any] = {
|
||||
kwargs: dict[str, Any] = {
|
||||
"model": self.model,
|
||||
"messages": full_messages,
|
||||
"max_tokens": self.max_tokens,
|
||||
"messages": messages,
|
||||
"stream": True,
|
||||
}
|
||||
if system:
|
||||
payload["system"] = system
|
||||
if self.reasoning_effort:
|
||||
kwargs["reasoning_effort"] = self.reasoning_effort
|
||||
if self.thinking_enabled:
|
||||
kwargs["extra_body"] = {"thinking": {"type": "enabled"}}
|
||||
|
||||
for attempt in range(max_retries):
|
||||
logger.debug("LLM request (stream): %d messages (attempt %d)", len(messages), attempt + 1)
|
||||
logger.debug(
|
||||
"LLM request (stream): %d messages (attempt %d)",
|
||||
len(messages), attempt + 1,
|
||||
)
|
||||
text_parts: list[str] = []
|
||||
try:
|
||||
async with self._client.stream(
|
||||
"POST", "/v1/messages", json=payload,
|
||||
) as resp:
|
||||
# Check for HTTP errors before consuming stream
|
||||
if resp.status_code >= 400:
|
||||
body = await resp.aread()
|
||||
raise httpx.HTTPStatusError(
|
||||
f"Server error '{resp.status_code}' for url '{resp.url}'",
|
||||
request=resp.request,
|
||||
response=resp,
|
||||
)
|
||||
|
||||
# Parse SSE events
|
||||
async for line in resp.aiter_lines():
|
||||
if not line.startswith("data: "):
|
||||
continue
|
||||
data_str = line[6:] # strip "data: " prefix
|
||||
if data_str.strip() == "[DONE]":
|
||||
break
|
||||
try:
|
||||
event = json.loads(data_str)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
event_type = event.get("type", "")
|
||||
if event_type == "content_block_delta":
|
||||
delta = event.get("delta", {})
|
||||
if delta.get("type") == "text_delta":
|
||||
text_parts.append(delta["text"])
|
||||
elif event_type == "message_stop":
|
||||
break
|
||||
elif event_type == "error":
|
||||
err_msg = event.get("error", {}).get("message", "Unknown streaming error")
|
||||
raise httpx.HTTPStatusError(
|
||||
err_msg, request=resp.request, response=resp,
|
||||
)
|
||||
stream = await self._client.chat.completions.create(**kwargs)
|
||||
async for chunk in stream:
|
||||
if not chunk.choices:
|
||||
continue
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.content:
|
||||
text_parts.append(delta.content)
|
||||
|
||||
text = "".join(text_parts)
|
||||
logger.debug("LLM response (stream): %d chars", len(text))
|
||||
return text
|
||||
|
||||
except (httpx.HTTPStatusError, httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as e:
|
||||
except (APIConnectionError, APITimeoutError, APIError) as e:
|
||||
if attempt < max_retries - 1:
|
||||
wait = 2 ** attempt * 10
|
||||
logger.warning("Request failed (%s), retrying in %ds...", e, wait)
|
||||
await _asyncio.sleep(wait)
|
||||
await asyncio.sleep(wait)
|
||||
else:
|
||||
raise LLMAPIError(
|
||||
f"LLM API unreachable after {max_retries} attempts: {e}",
|
||||
attempts=max_retries,
|
||||
) from e
|
||||
|
||||
# Should not reach here, but just in case
|
||||
return ""
|
||||
|
||||
async def tool_call_loop(
|
||||
|
||||
Reference in New Issue
Block a user