From 12b8332b2f9979d5aa2e804b5648e7512ee52886 Mon Sep 17 00:00:00 2001 From: bboxwtf Date: Sat, 21 Mar 2026 20:01:26 +0000 Subject: [PATCH] =?UTF-8?q?feat(retry):=20LLM=20retry-on-failure=20for=20o?= =?UTF-8?q?rchestrator=20=E2=80=94=20never=20returns=20empty=20response?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: when LLM returned empty content or network error, the orchestrator immediately stopped with (no response) — visible to user as blank reply. Solution — 4-layer retry system: ## Go Gateway (gateway/internal/orchestrator/orchestrator.go) - Extracted shared runLoop() used by Chat(), ChatWithEvents(), ChatWithEventsAndRetry() - Added RetryPolicy struct: MaxLLMRetries (default 3), InitialDelay (2s), MaxDelay (30s), RetryOnEmpty (true) - callLLMWithRetry(): wraps every LLM call with exponential back-off: * retries on HTTP/network error * retries on empty choices array * retries when content=="" AND finish_reason!="tool_calls" (soft empty) * strips tools on attempt > 1 (avoids repeated tool-format errors) * logs each attempt; total attempts = MaxLLMRetries + 1 (default: 4) - Added ChatWithEventsAndRetry() with onRetry callback for client visibility - SetRetryPolicy() for runtime override ## Config (gateway/config/config.go) - New fields: MaxLLMRetries (GATEWAY_MAX_LLM_RETRIES, default 3) RetryDelaySecs (GATEWAY_RETRY_DELAY_SECS, default 2) ## main.go — wires retry policy from config into orchestrator ## docker-compose.yml - GATEWAY_REQUEST_TIMEOUT_SECS: 120 → 300 (accommodates up to 4 retries) - GATEWAY_MAX_LLM_RETRIES=3, GATEWAY_RETRY_DELAY_SECS=2 env vars ## API (handlers.go) - StartChatSession goroutine now uses ChatWithEventsAndRetry - onRetry callback emits "thinking" DB event with content "⟳ Retry N: reason" so the client sees retry progress in the console panel ## Frontend (client/src/lib/chatStore.ts + client/src/pages/Chat.tsx) - ConsoleEntry gains content?: string and new type "retry" - thinking 
events with content starting "⟳ Retry" → type=retry (amber) - Chat ConsolePanel renders retry events in amber with RefreshCw icon and shows the retry reason string underneath --- client/src/lib/chatStore.ts | 15 +- client/src/pages/Chat.tsx | 10 + docker/docker-compose.yml | 6 +- gateway/cmd/gateway/main.go | 8 + gateway/config/config.go | 10 + gateway/internal/api/handlers.go | 49 ++- gateway/internal/orchestrator/orchestrator.go | 415 ++++++++++-------- 7 files changed, 314 insertions(+), 199 deletions(-) diff --git a/client/src/lib/chatStore.ts b/client/src/lib/chatStore.ts index c40c7ff..e3a252e 100644 --- a/client/src/lib/chatStore.ts +++ b/client/src/lib/chatStore.ts @@ -53,7 +53,7 @@ export interface Conversation { export interface ConsoleEntry { id: string; - type: "thinking" | "tool_call" | "done" | "error"; + type: "thinking" | "tool_call" | "done" | "error" | "retry"; tool?: string; args?: any; result?: any; @@ -62,6 +62,8 @@ export interface ConsoleEntry { durationMs?: number; timestamp: string; model?: string; + /** For thinking events: extra message text (e.g. 
retry reason) */ + content?: string; } type StoreEvent = "update" | "console"; @@ -439,9 +441,16 @@ class ChatStore { if (ev.seq > maxSeq) maxSeq = ev.seq; switch (ev.eventType) { - case "thinking": - this.addConsoleEntry({ type: "thinking" }); + case "thinking": { + // If content starts with retry prefix, show as retry event + const thinkMsg = ev.content || ""; + if (thinkMsg.startsWith("⟳ Retry")) { + this.addConsoleEntry({ type: "retry", content: thinkMsg }); + } else { + this.addConsoleEntry({ type: "thinking", content: thinkMsg || undefined }); + } break; + } case "tool_call": { let args: any = {}; diff --git a/client/src/pages/Chat.tsx b/client/src/pages/Chat.tsx index c394d78..74a129c 100644 --- a/client/src/pages/Chat.tsx +++ b/client/src/pages/Chat.tsx @@ -51,6 +51,7 @@ import { PanelRightOpen, Shell, StopCircle, + RefreshCw, } from "lucide-react"; // ─── useChatStore hook ──────────────────────────────────────────────────────── @@ -295,6 +296,8 @@ function ConsolePanel({ entries }: { entries: ConsoleEntry[] }) { animate={{ opacity: 1, x: 0 }} className={`rounded p-2 border ${ e.type === "thinking" ? "bg-cyan-500/10 border-cyan-500/20 text-cyan-400" + : e.type === "retry" + ? "bg-amber-500/10 border-amber-500/20 text-amber-400" : e.type === "tool_call" ? e.success !== false ? "bg-green-500/10 border-green-500/20 text-green-300" : "bg-red-500/10 border-red-500/20 text-red-300" @@ -304,11 +307,13 @@ function ConsolePanel({ entries }: { entries: ConsoleEntry[] }) { >
{e.type === "thinking" && } + {e.type === "retry" && } {e.type === "tool_call" && } {e.type === "done" && } {e.type === "error" && } {e.type === "thinking" ? "thinking…" + : e.type === "retry" ? "retry" : e.type === "tool_call" ? toolLabel(e.tool ?? "") : e.type === "done" ? `done · ${e.model ?? ""}` : "error"} @@ -316,6 +321,11 @@ function ConsolePanel({ entries }: { entries: ConsoleEntry[] }) { {e.timestamp}
+ {/* Retry reason or thinking message */} + {(e.type === "retry" || e.type === "thinking") && e.content && ( +
{e.content}
+ )} + {e.type === "tool_call" && e.args && (
               {JSON.stringify(e.args, null, 1).slice(0, 200)}
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index b1c9865..1fd5e98 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -109,8 +109,12 @@ services:
       DEFAULT_MODEL: "${DEFAULT_MODEL:-qwen2.5:7b}"
       DATABASE_URL: "${MYSQL_USER:-goclaw}:${MYSQL_PASSWORD:-goClawPass123}@tcp(db:3306)/${MYSQL_DATABASE:-goclaw}?parseTime=true"
       PROJECT_ROOT: "/app"
-      GATEWAY_REQUEST_TIMEOUT_SECS: "120"
+      # Request timeout — must cover the worst-case retry sequence: exponential back-off totals InitialDelay*(2^MaxLLMRetries - 1) (≈14s with defaults, each step capped at MaxDelay) plus the LLM time of every attempt
+      GATEWAY_REQUEST_TIMEOUT_SECS: "300"
       GATEWAY_MAX_TOOL_ITERATIONS: "10"
+      # LLM retry policy: retry up to N times on empty response or network error
+      GATEWAY_MAX_LLM_RETRIES: "${GATEWAY_MAX_LLM_RETRIES:-3}"
+      GATEWAY_RETRY_DELAY_SECS: "${GATEWAY_RETRY_DELAY_SECS:-2}"
       LOG_LEVEL: "info"
     depends_on:
       db:
diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go
index f5d9c2d..ccad82a 100644
--- a/gateway/cmd/gateway/main.go
+++ b/gateway/cmd/gateway/main.go
@@ -47,6 +47,14 @@ func main() {
 
 	// ── Orchestrator ─────────────────────────────────────────────────────────
 	orch := orchestrator.New(llmClient, database, cfg.ProjectRoot)
+	// Apply retry policy from config
+	orch.SetRetryPolicy(orchestrator.RetryPolicy{
+		MaxLLMRetries:  cfg.MaxLLMRetries,
+		InitialDelay:   time.Duration(cfg.RetryDelaySecs) * time.Second,
+		MaxDelay:       30 * time.Second,
+		RetryOnEmpty:   true,
+	})
+	log.Printf("[Gateway] LLM retry policy: maxRetries=%d, initialDelay=%ds", cfg.MaxLLMRetries, cfg.RetryDelaySecs)
 
 	// ── HTTP Handlers ────────────────────────────────────────────────────────
 	h := api.NewHandler(cfg, llmClient, orch, database)
diff --git a/gateway/config/config.go b/gateway/config/config.go
index 7945659..6f31862 100644
--- a/gateway/config/config.go
+++ b/gateway/config/config.go
@@ -46,6 +46,12 @@ type Config struct {
 	DefaultModel       string
 	MaxToolIterations  int
 	RequestTimeoutSecs int
+
+	// LLM retry policy
+	// GATEWAY_MAX_LLM_RETRIES — additional attempts after a failure/empty response (default 3).
+	MaxLLMRetries int
+	// GATEWAY_RETRY_DELAY_SECS — initial delay before first retry in seconds (default 2).
+	RetryDelaySecs int
 }
 
 func Load() *Config {
@@ -55,6 +61,8 @@ func Load() *Config {
 
 	maxIter, _ := strconv.Atoi(getEnv("GATEWAY_MAX_TOOL_ITERATIONS", "10"))
 	timeout, _ := strconv.Atoi(getEnv("GATEWAY_REQUEST_TIMEOUT_SECS", "120"))
+	maxLLMRetries, _ := strconv.Atoi(getEnv("GATEWAY_MAX_LLM_RETRIES", "3"))
+	retryDelaySecs, _ := strconv.Atoi(getEnv("GATEWAY_RETRY_DELAY_SECS", "2"))
 
 	// Resolve LLM base URL — priority: LLM_BASE_URL > OLLAMA_BASE_URL > default cloud
 	rawLLMURL := getEnvFirst(
@@ -82,6 +90,8 @@ func Load() *Config {
 		DefaultModel:       getEnv("DEFAULT_MODEL", "qwen2.5:7b"),
 		MaxToolIterations:  maxIter,
 		RequestTimeoutSecs: timeout,
+		MaxLLMRetries:      maxLLMRetries,
+		RetryDelaySecs:     retryDelaySecs,
 	}
 
 	if cfg.LLMAPIKey == "" {
diff --git a/gateway/internal/api/handlers.go b/gateway/internal/api/handlers.go
index 0d99cb0..193aa93 100644
--- a/gateway/internal/api/handlers.go
+++ b/gateway/internal/api/handlers.go
@@ -761,24 +761,37 @@ func (h *Handler) StartChatSession(w http.ResponseWriter, r *http.Request) {
 			time.Duration(h.cfg.RequestTimeoutSecs)*time.Second)
 		defer cancel()
 
-		result := h.orch.ChatWithEvents(ctx, messages, model, maxIter, func(step orchestrator.ToolCallStep) {
-			argsJSON, _ := json.Marshal(step.Args)
-			resultStr := ""
-			if step.Result != nil {
-				b, _ := json.Marshal(step.Result)
-				resultStr = string(b)
-			}
-			_ = h.db.AppendEvent(db.ChatEventRow{
-				SessionID:   sessionID,
-				EventType:   "tool_call",
-				ToolName:    step.Tool,
-				ToolArgs:    string(argsJSON),
-				ToolResult:  resultStr,
-				ToolSuccess: step.Success,
-				DurationMs:  int(step.DurationMs),
-				ErrorMsg:    step.Error,
-			})
-		})
+		result := h.orch.ChatWithEventsAndRetry(ctx, messages, model, maxIter,
+			// onToolCall — store each tool execution as an event
+			func(step orchestrator.ToolCallStep) {
+				argsJSON, _ := json.Marshal(step.Args)
+				resultStr := ""
+				if step.Result != nil {
+					b, _ := json.Marshal(step.Result)
+					resultStr = string(b)
+				}
+				_ = h.db.AppendEvent(db.ChatEventRow{
+					SessionID:   sessionID,
+					EventType:   "tool_call",
+					ToolName:    step.Tool,
+					ToolArgs:    string(argsJSON),
+					ToolResult:  resultStr,
+					ToolSuccess: step.Success,
+					DurationMs:  int(step.DurationMs),
+					ErrorMsg:    step.Error,
+				})
+			},
+			// onRetry — emit a "thinking" event so the client sees retry progress
+			func(attempt int, reason string) {
+				msg := fmt.Sprintf("⟳ Retry %d: %s", attempt, reason)
+				log.Printf("[Orchestrator] %s", msg)
+				_ = h.db.AppendEvent(db.ChatEventRow{
+					SessionID: sessionID,
+					EventType: "thinking",
+					Content:   msg,
+				})
+			},
+		)
 
 		processingMs := time.Since(startTime).Milliseconds()
 
diff --git a/gateway/internal/orchestrator/orchestrator.go b/gateway/internal/orchestrator/orchestrator.go
index d77bbc4..62247b9 100644
--- a/gateway/internal/orchestrator/orchestrator.go
+++ b/gateway/internal/orchestrator/orchestrator.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"strings"
 	"time"
 
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
@@ -53,6 +54,30 @@ type OrchestratorConfig struct {
 	MaxTokens    int
 }
 
+// RetryPolicy controls how the orchestrator retries failed or empty LLM calls.
+type RetryPolicy struct {
+	// MaxLLMRetries is the number of additional attempts after a failure.
+	// Total attempts = MaxLLMRetries + 1.  Default: 3 (4 total).
+	MaxLLMRetries int
+	// InitialDelay before the first retry.  Default: 2s.
+	InitialDelay time.Duration
+	// MaxDelay caps the exponential back-off.  Default: 30s.
+	MaxDelay time.Duration
+	// RetryOnEmpty means an empty-content response is treated as a soft failure
+	// and triggers a retry.  Default: true.
+	RetryOnEmpty bool
+}
+
+// defaultRetryPolicy returns the default retry policy.
+func defaultRetryPolicy() RetryPolicy {
+	return RetryPolicy{
+		MaxLLMRetries: 3,
+		InitialDelay:  2 * time.Second,
+		MaxDelay:      30 * time.Second,
+		RetryOnEmpty:  true,
+	}
+}
+
 // ─── Default System Prompt ────────────────────────────────────────────────────
 
 const defaultSystemPrompt = `You are GoClaw Orchestrator — the main AI agent managing the GoClaw distributed AI system.
@@ -88,6 +113,7 @@ type Orchestrator struct {
 	executor    *tools.Executor
 	database    *db.DB
 	projectRoot string
+	retry       RetryPolicy
 }
 
 func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrator {
@@ -95,12 +121,18 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat
 		llmClient:   llmClient,
 		database:    database,
 		projectRoot: projectRoot,
+		retry:       defaultRetryPolicy(),
 	}
 	// Inject agent list function to avoid circular dependency
 	o.executor = tools.NewExecutor(projectRoot, o.listAgentsFn)
 	return o
 }
 
+// SetRetryPolicy overrides the default retry policy.
+func (o *Orchestrator) SetRetryPolicy(p RetryPolicy) {
+	o.retry = p
+}
+
 // GetConfig loads orchestrator config from DB, falls back to defaults.
 func (o *Orchestrator) GetConfig() *OrchestratorConfig {
 	if o.database != nil {
@@ -159,28 +191,160 @@ func (o *Orchestrator) resolveModel(ctx context.Context, desired string) (model
 	return fallback, warning
 }
 
-// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
-func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
-	if maxIter <= 0 {
-		maxIter = 10
+// ─── LLM call with retry ──────────────────────────────────────────────────────
+
+// llmCallResult holds one attempt's outcome.
+type llmCallResult struct {
+	resp        *llm.ChatResponse
+	usedTools   bool // whether the call was made with tools enabled
+	err         error
+	attemptNum  int
+}
+
+// callLLMWithRetry calls the LLM and retries on error or empty response.
+// On every attempt after the first it strips tools from the request, avoiding repeated tool-format errors.
+func (o *Orchestrator) callLLMWithRetry(
+	ctx context.Context,
+	req llm.ChatRequest,
+	model string,
+	onRetry func(attempt int, reason string), // optional event callback (may be nil)
+) llmCallResult {
+	policy := o.retry
+	delay := policy.InitialDelay
+	maxAttempts := policy.MaxLLMRetries + 1
+	hasTools := len(req.Tools) > 0
+
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		// On attempt > 1, always strip tools (avoid repeated tool-format errors)
+		useTools := hasTools && attempt == 1
+		r := req
+		if !useTools {
+			r.Tools = nil
+			r.ToolChoice = ""
+		}
+
+		resp, err := o.llmClient.Chat(ctx, r)
+
+		// ── Hard error (network, auth, etc.) ─────────────────────────
+		if err != nil {
+			reason := fmt.Sprintf("LLM error (attempt %d/%d): %v", attempt, maxAttempts, err)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{err: fmt.Errorf("LLM error after %d attempts (model: %s): %w", maxAttempts, model, err), attemptNum: attempt}
+		}
+
+		// ── Context cancelled ─────────────────────────────────────────
+		if ctx.Err() != nil {
+			return llmCallResult{err: ctx.Err(), attemptNum: attempt}
+		}
+
+		// ── Empty choices ─────────────────────────────────────────────
+		if len(resp.Choices) == 0 {
+			reason := fmt.Sprintf("empty choices (attempt %d/%d)", attempt, maxAttempts)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		content := strings.TrimSpace(resp.Choices[0].Message.Content)
+		finishReason := resp.Choices[0].FinishReason
+
+		// ── Empty content AND no tool calls — retry ───────────────────
+		if policy.RetryOnEmpty &&
+			content == "" &&
+			finishReason != "tool_calls" &&
+			len(resp.Choices[0].Message.ToolCalls) == 0 {
+
+			reason := fmt.Sprintf("empty response content (attempt %d/%d, finish_reason=%q)", attempt, maxAttempts, finishReason)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			// Exhausted retries — return what we have (even if empty)
+			log.Printf("[Orchestrator] All %d attempts exhausted — returning empty response", maxAttempts)
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		// ── Success ───────────────────────────────────────────────────
+		if attempt > 1 {
+			log.Printf("[Orchestrator] Succeeded on attempt %d/%d", attempt, maxAttempts)
+		}
+		return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+	}
+
+	// Should not be reached
+	return llmCallResult{err: fmt.Errorf("retry loop exited unexpectedly"), attemptNum: maxAttempts}
+}
+
+// sleep waits for d, returning early if ctx is cancelled.
+func (o *Orchestrator) sleep(ctx context.Context, d time.Duration) {
+	select {
+	case <-ctx.Done():
+	case <-time.After(d):
+	}
+}
+
+// min returns the smaller of two durations. NOTE(review): shadows the Go 1.21+ built-in min — confirm the module's Go version before keeping/removing this helper.
+func min(a, b time.Duration) time.Duration {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// ─── Core loop (shared by Chat and ChatWithEvents) ────────────────────────────
+
+type loopOptions struct {
+	messages      []Message
+	overrideModel string
+	maxIter       int
+	onToolCall    func(ToolCallStep) // may be nil
+	onRetry       func(attempt int, reason string) // may be nil
+}
+
+func (o *Orchestrator) runLoop(ctx context.Context, opts loopOptions) ChatResult {
+	if opts.maxIter <= 0 {
+		opts.maxIter = 10
 	}
 
 	cfg := o.GetConfig()
 	model := cfg.Model
-	if overrideModel != "" {
-		model = overrideModel
+	if opts.overrideModel != "" {
+		model = opts.overrideModel
 	}
 
 	// Validate model against LLM API — fall back if unavailable (prevents 401/404)
 	model, modelWarning := o.resolveModel(ctx, model)
-
-	log.Printf("[Orchestrator] Chat started: model=%s, messages=%d", model, len(messages))
+	log.Printf("[Orchestrator] Loop started: model=%s, messages=%d, maxIter=%d, maxRetries=%d",
+		model, len(opts.messages), opts.maxIter, o.retry.MaxLLMRetries)
 
 	// Build conversation
 	conv := []llm.Message{
 		{Role: "system", Content: cfg.SystemPrompt},
 	}
-	for _, m := range messages {
+	for _, m := range opts.messages {
 		conv = append(conv, llm.Message{Role: m.Role, Content: m.Content})
 	}
 
@@ -206,7 +370,7 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 	var lastUsage *llm.Usage
 	var lastModel string
 
-	for iter := 0; iter < maxIter; iter++ {
+	for iter := 0; iter < opts.maxIter; iter++ {
 		req := llm.ChatRequest{
 			Model:       model,
 			Messages:    conv,
@@ -216,29 +380,22 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			ToolChoice:  "auto",
 		}
 
-		resp, err := o.llmClient.Chat(ctx, req)
-		if err != nil {
-			// Fallback: try without tools
-			log.Printf("[Orchestrator] LLM error with tools: %v — retrying without tools", err)
-			req.Tools = nil
-			req.ToolChoice = ""
-			resp2, err2 := o.llmClient.Chat(ctx, req)
-			if err2 != nil {
-				return ChatResult{
-					Success:      false,
-					ModelWarning: modelWarning,
-					Error:        fmt.Sprintf("LLM error (model: %s): %v", model, err2),
-				}
+		// ── LLM call with retry ────────────────────────────────────
+		callRes := o.callLLMWithRetry(ctx, req, model, opts.onRetry)
+
+		if callRes.err != nil {
+			return ChatResult{
+				Success:      false,
+				ToolCalls:    toolCallSteps,
+				Model:        model,
+				ModelWarning: modelWarning,
+				Error:        callRes.err.Error(),
 			}
-			if len(resp2.Choices) > 0 {
-				finalResponse = resp2.Choices[0].Message.Content
-				lastUsage = resp2.Usage
-				lastModel = resp2.Model
-			}
-			break
 		}
 
+		resp := callRes.resp
 		if len(resp.Choices) == 0 {
+			log.Printf("[Orchestrator] No choices in response — stopping loop at iter %d", iter)
 			break
 		}
 
@@ -249,19 +406,17 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			lastModel = model
 		}
 
-		// Check if LLM wants to call tools
+		// ── Tool calls ─────────────────────────────────────────────
 		if choice.FinishReason == "tool_calls" && len(choice.Message.ToolCalls) > 0 {
 			// Add assistant message with tool calls to conversation
 			conv = append(conv, choice.Message)
 
-			// Execute each tool call
 			for _, tc := range choice.Message.ToolCalls {
 				toolName := tc.Function.Name
 				argsJSON := tc.Function.Arguments
 
 				log.Printf("[Orchestrator] Executing tool: %s args=%s", toolName, argsJSON)
 				start := time.Now()
-
 				result := o.executor.Execute(ctx, toolName, argsJSON)
 
 				step := ToolCallStep{
@@ -270,7 +425,6 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 					DurationMs: time.Since(start).Milliseconds(),
 				}
 
-				// Parse args for display
 				var argsMap any
 				_ = json.Unmarshal([]byte(argsJSON), &argsMap)
 				step.Args = argsMap
@@ -287,7 +441,10 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 
 				toolCallSteps = append(toolCallSteps, step)
 
-				// Add tool result to conversation
+				if opts.onToolCall != nil {
+					opts.onToolCall(step)
+				}
+
 				conv = append(conv, llm.Message{
 					Role:       "tool",
 					Content:    toolResultContent,
@@ -299,7 +456,7 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			continue
 		}
 
-		// LLM finished — extract final response
+		// ── Final response ─────────────────────────────────────────
 		finalResponse = choice.Message.Content
 		break
 	}
@@ -314,151 +471,55 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 	}
 }
 
-// ChatWithEvents runs the full orchestration loop and calls onToolCall for each tool execution.
-// This enables SSE streaming of tool calls in real time.
-func (o *Orchestrator) ChatWithEvents(ctx context.Context, messages []Message, overrideModel string, maxIter int, onToolCall func(ToolCallStep)) ChatResult {
-	if maxIter <= 0 {
-		maxIter = 10
-	}
+// ─── Public API ───────────────────────────────────────────────────────────────
 
-	cfg := o.GetConfig()
-	model := cfg.Model
-	if overrideModel != "" {
-		model = overrideModel
-	}
-
-	model, modelWarning := o.resolveModel(ctx, model)
-	log.Printf("[Orchestrator] ChatWithEvents started: model=%s, messages=%d", model, len(messages))
-
-	conv := []llm.Message{
-		{Role: "system", Content: cfg.SystemPrompt},
-	}
-	for _, m := range messages {
-		conv = append(conv, llm.Message{Role: m.Role, Content: m.Content})
-	}
-
-	toolDefs := tools.OrchestratorTools()
-	llmTools := make([]llm.Tool, len(toolDefs))
-	for i, t := range toolDefs {
-		llmTools[i] = llm.Tool{
-			Type: t.Type,
-			Function: llm.ToolFunction{
-				Name:        t.Function.Name,
-				Description: t.Function.Description,
-				Parameters:  t.Function.Parameters,
-			},
-		}
-	}
-
-	temp := cfg.Temperature
-	maxTok := cfg.MaxTokens
-
-	var toolCallSteps []ToolCallStep
-	var finalResponse string
-	var lastUsage *llm.Usage
-	var lastModel string
-
-	for iter := 0; iter < maxIter; iter++ {
-		req := llm.ChatRequest{
-			Model:       model,
-			Messages:    conv,
-			Temperature: &temp,
-			MaxTokens:   &maxTok,
-			Tools:       llmTools,
-			ToolChoice:  "auto",
-		}
-
-		resp, err := o.llmClient.Chat(ctx, req)
-		if err != nil {
-			log.Printf("[Orchestrator] LLM error with tools: %v — retrying without tools", err)
-			req.Tools = nil
-			req.ToolChoice = ""
-			resp2, err2 := o.llmClient.Chat(ctx, req)
-			if err2 != nil {
-				return ChatResult{
-					Success:      false,
-					ModelWarning: modelWarning,
-					Error:        fmt.Sprintf("LLM error (model: %s): %v", model, err2),
-				}
-			}
-			if len(resp2.Choices) > 0 {
-				finalResponse = resp2.Choices[0].Message.Content
-				lastUsage = resp2.Usage
-				lastModel = resp2.Model
-			}
-			break
-		}
-
-		if len(resp.Choices) == 0 {
-			break
-		}
-
-		choice := resp.Choices[0]
-		lastUsage = resp.Usage
-		lastModel = resp.Model
-		if lastModel == "" {
-			lastModel = model
-		}
-
-		if choice.FinishReason == "tool_calls" && len(choice.Message.ToolCalls) > 0 {
-			conv = append(conv, choice.Message)
-
-			for _, tc := range choice.Message.ToolCalls {
-				toolName := tc.Function.Name
-				argsJSON := tc.Function.Arguments
-
-				log.Printf("[Orchestrator] Executing tool: %s args=%s", toolName, argsJSON)
-				start := time.Now()
-				result := o.executor.Execute(ctx, toolName, argsJSON)
-
-				step := ToolCallStep{
-					Tool:       toolName,
-					Success:    result.Success,
-					DurationMs: time.Since(start).Milliseconds(),
-				}
-				var argsMap any
-				_ = json.Unmarshal([]byte(argsJSON), &argsMap)
-				step.Args = argsMap
-
-				var toolResultContent string
-				if result.Success {
-					step.Result = result.Result
-					resultBytes, _ := json.Marshal(result.Result)
-					toolResultContent = string(resultBytes)
-				} else {
-					step.Error = result.Error
-					toolResultContent = fmt.Sprintf(`{"error": %q}`, result.Error)
-				}
-
-				toolCallSteps = append(toolCallSteps, step)
-				if onToolCall != nil {
-					onToolCall(step)
-				}
-
-				conv = append(conv, llm.Message{
-					Role:       "tool",
-					Content:    toolResultContent,
-					ToolCallID: tc.ID,
-					Name:       toolName,
-				})
-			}
-			continue
-		}
-
-		finalResponse = choice.Message.Content
-		break
-	}
-
-	return ChatResult{
-		Success:      true,
-		Response:     finalResponse,
-		ToolCalls:    toolCallSteps,
-		Model:        lastModel,
-		ModelWarning: modelWarning,
-		Usage:        lastUsage,
-	}
+// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
+func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+	})
 }
 
+// ChatWithEvents runs the full orchestration loop and calls callbacks for each
+// tool execution and each retry attempt.  Used for SSE streaming and DB event logging.
+func (o *Orchestrator) ChatWithEvents(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+	})
+}
+
+// ChatWithEventsAndRetry is the full-featured variant that also reports retry
+// attempts through onRetry so they can be streamed to the client.
+func (o *Orchestrator) ChatWithEventsAndRetry(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+	onRetry func(attempt int, reason string),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+		onRetry:       onRetry,
+	})
+}
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
 // listAgentsFn is injected into the tool executor to list agents from DB.
 func (o *Orchestrator) listAgentsFn() ([]map[string]any, error) {
 	if o.database == nil {