feat: switch LLM client to OpenAI SDK for DeepSeek compatibility

The previous LLMClient used raw httpx + the Claude Messages API (/v1/messages,
x-api-key, Anthropic SSE event types), which is incompatible with DeepSeek.

Rewrite LLMClient.__init__/chat/close to use openai.AsyncOpenAI:
- /v1/chat/completions endpoint, OpenAI message format
- Bearer auth, native SDK error types
- Stream chunks via async for + chunk.choices[0].delta.content

Tool calling protocol (ReAct text-based tags) and all surrounding helpers
(_apply_progressive_decay, _fold_old_messages, _partition_tool_calls,
tool_call_loop, etc.) are unchanged — endpoint-agnostic by design.

New optional config params surfaced to config.yaml.agent:
- reasoning_effort: "high" | "medium" | "low" — DeepSeek/o1-style depth
- thinking_enabled: bool — DeepSeek extra_body.thinking switch

main.py and regenerate_report.py pass these through to LLMClient.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 17:13:54 +08:00
parent 31812a72ee
commit 0a966d8476
5 changed files with 227 additions and 67 deletions
--- a/llm_client.py
+++ b/llm_client.py
@@ -1,8 +1,9 @@
-"""Custom LLM client using httpx for Claude Messages API via third-party proxy.
+"""LLM client via the OpenAI SDK (works with DeepSeek's OpenAI-compatible API).

-The proxy does not support Claude's native tool_use format (it strips the `tools`
-field from requests). So we embed tool definitions in the system prompt and parse
-structured JSON tool calls from the model's text output (ReAct-style).
+Tool calling is text-based (ReAct pattern): tool schemas are embedded in
+the system prompt and tool calls are parsed as <tool_call> JSON blocks
+from model output. This keeps the protocol independent of whether the
+underlying API supports native function/tool calling.
 """

 from __future__ import annotations
@@ -18,6 +19,7 @@ from dataclasses import dataclass, field
 from typing import Any

 import httpx
+from openai import APIConnectionError, APIError, APITimeoutError, AsyncOpenAI

 logger = logging.getLogger(__name__)

@@ -301,44 +303,51 @@ _FOLD_SUMMARY_SYSTEM = (


 class LLMClient:
-    """Calls Claude Messages API through a third-party proxy using raw httpx.
+    """Async LLM client via the OpenAI SDK.

-    Uses prompt-based tool calling (ReAct pattern) since the proxy does not
-    support Claude's native tool_use format.
+    Works with any OpenAI-compatible endpoint (OpenAI, DeepSeek, ...).
+    Tool calling is text-based (ReAct) — see module docstring.
    """

    def __init__(
        self,
        base_url: str,
        api_key: str,
-        model: str = "claude-sonnet-4-6",
+        model: str = "deepseek-v4-pro",
        max_tokens: int = 4096,
        proxy: str | None = "auto",
+        reasoning_effort: str | None = None,
+        thinking_enabled: bool = False,
    ) -> None:
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.model = model
        self.max_tokens = max_tokens
-        # proxy="auto": read from env; proxy=None/""/"none": no proxy; proxy="http://...": use it
+        self.reasoning_effort = reasoning_effort
+        self.thinking_enabled = thinking_enabled
+
+        # proxy="auto": read from env; proxy=None/""/"none": no proxy
        if proxy == "auto":
            proxy_url = os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY")
        elif proxy and proxy.lower() != "none":
            proxy_url = proxy
        else:
            proxy_url = None
-        self._client = httpx.AsyncClient(
+
+        http_client = (
+            httpx.AsyncClient(proxy=proxy_url, timeout=300.0)
+            if proxy_url else None
+        )
+
+        self._client = AsyncOpenAI(
+            api_key=self.api_key,
            base_url=self.base_url,
-            headers={
-                "x-api-key": self.api_key,
-                "anthropic-version": "2023-06-01",
-                "content-type": "application/json",
-            },
            timeout=300.0,
-            proxy=proxy_url,
+            http_client=http_client,
        )

    async def close(self) -> None:
-        await self._client.aclose()
+        await self._client.close()

    async def chat(
        self,
@@ -346,79 +355,53 @@ class LLMClient:
        system: str | None = None,
        max_retries: int = 5,
    ) -> str:
-        """Send a streaming chat request and return the assembled text response.
+        """Send a streaming chat completion and return the assembled text."""
+        full_messages: list[dict] = []
+        if system:
+            full_messages.append({"role": "system", "content": system})
+        full_messages.extend(messages)

-        Uses SSE streaming to keep the connection alive and avoid gateway
-        timeouts (504/524) on long-running completions.
-        """
-        import asyncio as _asyncio
-
-        payload: dict[str, Any] = {
+        kwargs: dict[str, Any] = {
            "model": self.model,
+            "messages": full_messages,
            "max_tokens": self.max_tokens,
-            "messages": messages,
            "stream": True,
        }
-        if system:
-            payload["system"] = system
+        if self.reasoning_effort:
+            kwargs["reasoning_effort"] = self.reasoning_effort
+        if self.thinking_enabled:
+            kwargs["extra_body"] = {"thinking": {"type": "enabled"}}

        for attempt in range(max_retries):
-            logger.debug("LLM request (stream): %d messages (attempt %d)", len(messages), attempt + 1)
+            logger.debug(
+                "LLM request (stream): %d messages (attempt %d)",
+                len(messages), attempt + 1,
+            )
            text_parts: list[str] = []
            try:
-                async with self._client.stream(
-                    "POST", "/v1/messages", json=payload,
-                ) as resp:
-                    # Check for HTTP errors before consuming stream
-                    if resp.status_code >= 400:
-                        body = await resp.aread()
-                        raise httpx.HTTPStatusError(
-                            f"Server error '{resp.status_code}' for url '{resp.url}'",
-                            request=resp.request,
-                            response=resp,
-                        )
-
-                    # Parse SSE events
-                    async for line in resp.aiter_lines():
-                        if not line.startswith("data: "):
-                            continue
-                        data_str = line[6:]  # strip "data: " prefix
-                        if data_str.strip() == "[DONE]":
-                            break
-                        try:
-                            event = json.loads(data_str)
-                        except json.JSONDecodeError:
-                            continue
-
-                        event_type = event.get("type", "")
-                        if event_type == "content_block_delta":
-                            delta = event.get("delta", {})
-                            if delta.get("type") == "text_delta":
-                                text_parts.append(delta["text"])
-                        elif event_type == "message_stop":
-                            break
-                        elif event_type == "error":
-                            err_msg = event.get("error", {}).get("message", "Unknown streaming error")
-                            raise httpx.HTTPStatusError(
-                                err_msg, request=resp.request, response=resp,
-                            )
+                stream = await self._client.chat.completions.create(**kwargs)
+                async for chunk in stream:
+                    if not chunk.choices:
+                        continue
+                    delta = chunk.choices[0].delta
+                    if delta.content:
+                        text_parts.append(delta.content)

                text = "".join(text_parts)
                logger.debug("LLM response (stream): %d chars", len(text))
                return text

-            except (httpx.HTTPStatusError, httpx.ConnectError, httpx.ReadTimeout, httpx.RemoteProtocolError) as e:
+            except (APIConnectionError, APITimeoutError, APIError) as e:
                if attempt < max_retries - 1:
                    wait = 2 ** attempt * 10
                    logger.warning("Request failed (%s), retrying in %ds...", e, wait)
-                    await _asyncio.sleep(wait)
+                    await asyncio.sleep(wait)
                else:
                    raise LLMAPIError(
                        f"LLM API unreachable after {max_retries} attempts: {e}",
                        attempts=max_retries,
                    ) from e

-        # Should not reach here, but just in case
        return ""

    async def tool_call_loop(