feat(gateway): restore Phase C full agent lifecycle API

- Restored Phase C gateway code (handlers, main.go, docker client, db)
- Added routes: GET /api/agents/running, POST /api/agents and DELETE /api/agents/{id} (CRUD), POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop, POST /api/agents/{id}/restart, POST /api/agents/{id}/scale
- Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id}, even when serviceName is empty in the DB
- Fixed DeployAgent: handle a 409 conflict by removing the existing container and retrying once (idempotent deploy)
- Added swarm_manager.go: background SwarmManager for dead-letter recovery
- Added AGENT_NETWORK and AGENT_DB_URL config options
- Updated .gitignore to exclude gateway binaries
- All agents use standalone docker run (not Swarm) on a bridge network

Verified on prod: the deploy/stop/restart cycle works correctly, and /api/agents/running returns live running agents with containerStatus.
2026-04-19 11:40:39 +00:00
parent dbde22cec1
commit f8e0ca7d5d
16 changed files with 4505 additions and 257 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -121,3 +121,8 @@ deploy-secrets
 .kilo/
 .manus/
 AGENTS.md
+
+# Gateway binaries
+gateway/gateway
+gateway/gateway-new
+gateway/agent-worker
--- a/gateway/cmd/agent-worker/main.go
+++ b/gateway/cmd/agent-worker/main.go
@@ -127,30 +127,50 @@ type AgentWorker struct {
 	// Recent tasks ring buffer (для GET /tasks)
 	recentMu   sync.Mutex
 	recentKeys []string
+
+	// Rate-limiting semaphore — limits concurrent LLM calls.
+	// Filled with MAX_CONCURRENT_TASKS tokens; each worker acquires one before
+	// calling runChat() and releases it when done.
+	rateSem chan struct{}
+	// maxConcurrent is the configured concurrency limit (reported by /health).
+	maxConcurrent int
 }

 const (
-	taskQueueDepth  = 100
-	maxRecentTasks  = 50
-	defaultMaxIter  = 8
-	defaultTimeout  = 120
-	workerGoroutines = 4 // параллельных воркеров на агента
+	taskQueueDepth        = 100
+	maxRecentTasks        = 50
+	defaultMaxIter        = 8
+	defaultTimeout        = 120
+	workerGoroutines      = 4 // параллельных воркеров на агента
+	defaultMaxConcurrent  = 2 // default simultaneous LLM calls per agent
 )

-func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client) (*AgentWorker, error) {
+func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client, maxConcurrent int) (*AgentWorker, error) {
 	cfg, err := database.GetAgentByID(agentID)
 	if err != nil {
 		return nil, fmt.Errorf("agent %d not found in DB: %w", agentID, err)
 	}
 	log.Printf("[AgentWorker] Loaded config: id=%d name=%q model=%s", cfg.ID, cfg.Name, cfg.Model)

+	if maxConcurrent <= 0 {
+		maxConcurrent = defaultMaxConcurrent
+	}
+
+	// Fill the semaphore with tokens equal to the concurrency limit.
+	sem := make(chan struct{}, maxConcurrent)
+	for i := 0; i < maxConcurrent; i++ {
+		sem <- struct{}{}
+	}
+
 	w := &AgentWorker{
-		agentID:   agentID,
-		cfg:       cfg,
-		llm:       llmClient,
-		database:  database,
-		taskQueue: make(chan *Task, taskQueueDepth),
-		tasks:     make(map[string]*Task),
+		agentID:       agentID,
+		cfg:           cfg,
+		llm:           llmClient,
+		database:      database,
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: maxConcurrent,
 	}
 	// Tool executor: агент использует подмножество инструментов из allowedTools
 	w.executor = tools.NewExecutor("/app", func() ([]map[string]any, error) {
@@ -234,7 +254,23 @@ func (w *AgentWorker) EnqueueTask(req TaskRequest) *Task {
 }

 // processTask выполняет задачу через LLM loop и обновляет её статус.
+// Acquires a rate-limiting token before invoking the LLM to cap concurrent
+// calls at w.maxConcurrent.
 func (w *AgentWorker) processTask(ctx context.Context, task *Task) {
+	// ── Rate limiting: acquire a token ───────────────────────────────────────
+	// If no token is available, block until one frees up or ctx is cancelled.
+	select {
+	case <-ctx.Done():
+		w.tasksMu.Lock()
+		task.Status = TaskCancelled
+		task.Error = "context cancelled before execution"
+		w.tasksMu.Unlock()
+		return
+	case <-w.rateSem:
+		// acquired
+	}
+	defer func() { w.rateSem <- struct{}{} }() // release token
+
 	now := time.Now()
 	w.tasksMu.Lock()
 	task.Status = TaskRunning
@@ -478,12 +514,17 @@ func (w *AgentWorker) postCallback(task *Task) {
 // ─── HTTP Handlers ────────────────────────────────────────────────────────────

 func (w *AgentWorker) handleHealth(rw http.ResponseWriter, r *http.Request) {
+	activeSlots := w.maxConcurrent - len(w.rateSem)
+	rw.Header().Set("Content-Type", "application/json")
 	json.NewEncoder(rw).Encode(map[string]any{
-		"status":   "ok",
-		"agentId":  w.agentID,
-		"name":     w.cfg.Name,
-		"model":    w.cfg.Model,
-		"queueLen": len(w.taskQueue),
+		"status":         "ok",
+		"agentId":        w.agentID,
+		"name":           w.cfg.Name,
+		"model":          w.cfg.Model,
+		"queueLen":       len(w.taskQueue),
+		"activeTasks":    activeSlots,
+		"maxConcurrent":  w.maxConcurrent,
+		"rateLimitFree":  len(w.rateSem),
 	})
 }

@@ -643,7 +684,16 @@ func main() {
 		log.Fatal("[AgentWorker] DATABASE_URL env var is required")
 	}

-	log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s", agentID, port, llmBaseURL)
+	// MAX_CONCURRENT_TASKS controls the rate-limiting semaphore (Phase C).
+	maxConcurrent := defaultMaxConcurrent
+	if mcStr := os.Getenv("MAX_CONCURRENT_TASKS"); mcStr != "" {
+		if mc, err := strconv.Atoi(mcStr); err == nil && mc > 0 {
+			maxConcurrent = mc
+		}
+	}
+
+	log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s MAX_CONCURRENT=%d",
+		agentID, port, llmBaseURL, maxConcurrent)

 	// ── DB ───────────────────────────────────────────────────────────────────
 	database, err := db.Connect(dbURL)
@@ -656,7 +706,7 @@ func main() {
 	llmClient := llm.NewClient(llmBaseURL, llmAPIKey)

 	// ── Agent Worker ─────────────────────────────────────────────────────────
-	worker, err := newAgentWorker(agentID, database, llmClient)
+	worker, err := newAgentWorker(agentID, database, llmClient, maxConcurrent)
 	if err != nil {
 		log.Fatalf("[AgentWorker] init failed: %v", err)
 	}
@@ -724,4 +774,4 @@ func getEnvFirst(keys ...string) string {
 		}
 	}
 	return ""
-}
+}
--- a/gateway/cmd/agent-worker/main_test.go
+++ b/gateway/cmd/agent-worker/main_test.go
@@ -130,11 +130,18 @@ func TestEnqueueTask_DefaultTimeout(t *testing.T) {
 // ─── HTTP Handlers ────────────────────────────────────────────────────────────

 func makeTestWorker() *AgentWorker {
+	mc := defaultMaxConcurrent
+	sem := make(chan struct{}, mc)
+	for i := 0; i < mc; i++ {
+		sem <- struct{}{}
+	}
 	return &AgentWorker{
-		agentID:   42,
-		cfg:       mockAgentConfig(),
-		taskQueue: make(chan *Task, taskQueueDepth),
-		tasks:     make(map[string]*Task),
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: mc,
 	}
 }

@@ -435,4 +442,107 @@ func TestWorkerProcessesTask_WithMockLLM(t *testing.T) {
 	if finalStatus != TaskDone {
 		t.Errorf("expected task done, got %s", finalStatus)
 	}
-}
+}
+
+// ─── Phase C: Rate-limiting tests ─────────────────────────────────────────────
+
+// TestRateLimiting_TokensInitialized builds a worker around a semaphore that
+// was pre-filled with maxConcurrent tokens and checks that both the number of
+// buffered tokens and the channel capacity equal that limit.
+func TestRateLimiting_TokensInitialized(t *testing.T) {
+	limit := 3
+
+	// Pre-fill the semaphore until it is full.
+	tokens := make(chan struct{}, limit)
+	for len(tokens) < cap(tokens) {
+		tokens <- struct{}{}
+	}
+
+	worker := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       tokens,
+		maxConcurrent: limit,
+	}
+
+	if got := len(worker.rateSem); got != limit {
+		t.Errorf("expected %d tokens in semaphore, got %d", limit, got)
+	}
+	if got := cap(worker.rateSem); got != limit {
+		t.Errorf("expected semaphore capacity=%d, got %d", limit, got)
+	}
+}
+
+// TestRateLimiting_TokenAcquireRelease drains the semaphore and then refills
+// it one token at a time, checking the free-token count after each step. This
+// mirrors the receive/send pair processTask uses to acquire and release.
+func TestRateLimiting_TokenAcquireRelease(t *testing.T) {
+	limit := 2
+
+	tokens := make(chan struct{}, limit)
+	for len(tokens) < cap(tokens) {
+		tokens <- struct{}{}
+	}
+	worker := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       tokens,
+		maxConcurrent: limit,
+	}
+
+	// Acquire: take both tokens out.
+	<-worker.rateSem
+	<-worker.rateSem
+	if free := len(worker.rateSem); free != 0 {
+		t.Errorf("expected 0 free tokens after acquiring all, got %d", free)
+	}
+
+	// Release: hand the first token back.
+	worker.rateSem <- struct{}{}
+	if free := len(worker.rateSem); free != 1 {
+		t.Errorf("expected 1 free token after release, got %d", free)
+	}
+
+	// Release: hand the second token back; the semaphore is full again.
+	worker.rateSem <- struct{}{}
+	if free := len(worker.rateSem); free != limit {
+		t.Errorf("expected %d free tokens after full release, got %d", limit, free)
+	}
+}
+
+// TestRateLimiting_HealthShowsActiveTasks takes one token out of a
+// three-token semaphore and checks that /health reports maxConcurrent=3,
+// rateLimitFree=2 and activeTasks=1.
+func TestRateLimiting_HealthShowsActiveTasks(t *testing.T) {
+	limit := 3
+
+	tokens := make(chan struct{}, limit)
+	for len(tokens) < cap(tokens) {
+		tokens <- struct{}{}
+	}
+	worker := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       tokens,
+		maxConcurrent: limit,
+	}
+
+	// One consumed token stands in for one task currently running.
+	<-worker.rateSem
+
+	rec := httptest.NewRecorder()
+	worker.handleHealth(rec, httptest.NewRequest(http.MethodGet, "/health", nil))
+
+	var body map[string]any
+	if err := json.NewDecoder(rec.Body).Decode(&body); err != nil {
+		t.Fatalf("invalid JSON: %v", err)
+	}
+
+	// JSON numbers decode as float64; a missing key panics on the assertion.
+	asInt := func(key string) int { return int(body[key].(float64)) }
+
+	if asInt("maxConcurrent") != limit {
+		t.Errorf("expected maxConcurrent=%d, got %v", limit, body["maxConcurrent"])
+	}
+	if asInt("rateLimitFree") != limit-1 {
+		t.Errorf("expected rateLimitFree=%d, got %v", limit-1, body["rateLimitFree"])
+	}
+	if asInt("activeTasks") != 1 {
+		t.Errorf("expected activeTasks=1, got %v", body["activeTasks"])
+	}
+}
--- a/gateway/cmd/agent/main.go
+++ b/gateway/cmd/agent/main.go
@@ -0,0 +1,270 @@
+// GoClaw Agent Server — autonomous agent microservice
+//
+// Each agent runs as an independent container in the Docker Swarm overlay
+// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
+// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
+//
+// The agent:
+//   - Receives task requests from the orchestrator
+//   - Calls the LLM via the centrally-managed GoClaw Gateway
+//   - Accepts DATABASE_URL in its config (this binary does not open a DB connection itself)
+//   - Reports its last-activity time so the SwarmManager can auto-stop it
+//   - Gracefully shuts down after IdleTimeout with no requests
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// ─── Config ──────────────────────────────────────────────────────────────────
+
+// AgentConfig holds the runtime settings of one agent process. loadConfig
+// fills it from environment variables.
+type AgentConfig struct {
+	AgentID            string // AGENT_ID; used in logs, responses and the self-stop URL
+	Port               string // AGENT_PORT; TCP port the HTTP server listens on
+	GatewayURL         string // GATEWAY_URL; base URL of the GoClaw Gateway
+	LLMURL             string // LLM_BASE_URL; loaded but not read elsewhere in this file
+	LLMAPIKey          string // LLM_API_KEY; loaded but not read elsewhere in this file
+	DatabaseURL        string // DATABASE_URL; loaded but not read elsewhere in this file
+	IdleTimeoutMinutes int    // IDLE_TIMEOUT_MINUTES; idle period after which the watchdog stops the agent
+}
+
+// loadConfig builds the AgentConfig from environment variables, applying a
+// default for every variable that is unset or empty.
+//
+// IDLE_TIMEOUT_MINUTES must be a positive integer. Any other value
+// (non-numeric, zero, negative) is ignored with a log line and the 15-minute
+// default is kept: a non-positive threshold would make the idle watchdog stop
+// the agent on its very first tick.
+func loadConfig() AgentConfig {
+	const defaultIdleMinutes = 15
+
+	idleMin := defaultIdleMinutes
+	if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			idleMin = n
+		} else {
+			log.Printf("[Agent] ignoring invalid IDLE_TIMEOUT_MINUTES=%q, using %d", v, defaultIdleMinutes)
+		}
+	}
+
+	return AgentConfig{
+		AgentID:            getEnv("AGENT_ID", "unnamed-agent"),
+		Port:               getEnv("AGENT_PORT", "8080"),
+		GatewayURL:         getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
+		LLMURL:             getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
+		LLMAPIKey:          os.Getenv("LLM_API_KEY"),
+		DatabaseURL:        os.Getenv("DATABASE_URL"),
+		IdleTimeoutMinutes: idleMin,
+	}
+}
+
+// getEnv returns the value of the environment variable key, or fallback when
+// that variable is unset or empty.
+func getEnv(key, fallback string) string {
+	value := os.Getenv(key)
+	if value == "" {
+		return fallback
+	}
+	return value
+}
+
+// ─── State ───────────────────────────────────────────────────────────────────
+
+// Agent is the runtime state of one agent process: its configuration, the
+// time of its last task activity, and the HTTP client used to reach the
+// gateway.
+//
+// lastActivity is written by request handlers and read by the idle watchdog
+// and the /health and /info handlers, each on its own goroutine, so every
+// access goes through mu (see touch and lastSeen).
+type Agent struct {
+	cfg        AgentConfig
+	httpClient *http.Client
+
+	mu           sync.Mutex // guards lastActivity
+	lastActivity time.Time
+}
+
+// NewAgent returns an Agent whose idle clock starts now and whose outbound
+// HTTP calls time out after 120 seconds.
+func NewAgent(cfg AgentConfig) *Agent {
+	return &Agent{
+		cfg:          cfg,
+		lastActivity: time.Now(),
+		httpClient:   &http.Client{Timeout: 120 * time.Second},
+	}
+}
+
+// touch records that the agent was active just now, resetting the idle clock.
+func (a *Agent) touch() {
+	a.mu.Lock()
+	a.lastActivity = time.Now()
+	a.mu.Unlock()
+}
+
+// lastSeen returns the time of the most recent activity.
+func (a *Agent) lastSeen() time.Time {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	return a.lastActivity
+}
+
+// ─── HTTP handlers ────────────────────────────────────────────────────────────
+
+// GET /health — liveness probe
+func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
+	// Read the timestamp once so both fields describe the same instant.
+	last := a.lastSeen()
+	respond(w, 200, map[string]any{
+		"ok":           true,
+		"agentId":      a.cfg.AgentID,
+		"lastActivity": last.Format(time.RFC3339),
+		"idleMinutes":  time.Since(last).Minutes(),
+	})
+}
+
+// POST /task — receive a task from the orchestrator
+// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
+//
+// The task is forwarded to the gateway's /api/orchestrator/chat endpoint and
+// the gateway's JSON reply is returned under "result". Responses:
+//   - 400: the request body is not valid JSON
+//   - 500: the outbound request could not be built
+//   - 502: the gateway was unreachable, answered with a non-2xx status, or
+//     sent a body that is not JSON
+//
+// The idle clock is reset when the task arrives and again when it completes.
+func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
+	a.touch()
+	var body struct {
+		SessionID string          `json:"sessionId"`
+		Messages  json.RawMessage `json:"messages"`
+		Model     string          `json:"model"`
+		MaxIter   int             `json:"maxIter"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, 400, "invalid request: "+err.Error())
+		return
+	}
+
+	// Forward the task to the GoClaw Gateway orchestrator
+	gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
+	reqBody, err := json.Marshal(map[string]any{
+		"messages": body.Messages,
+		"model":    body.Model,
+		"maxIter":  body.MaxIter,
+	})
+	if err != nil {
+		respondError(w, 500, "request build error: "+err.Error())
+		return
+	}
+
+	req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
+	if err != nil {
+		respondError(w, 500, "request build error: "+err.Error())
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := a.httpClient.Do(req)
+	if err != nil {
+		respondError(w, 502, "gateway error: "+err.Error())
+		return
+	}
+	defer resp.Body.Close()
+
+	// A failed gateway call must not be reported to the caller as ok:true.
+	if resp.StatusCode < 200 || resp.StatusCode > 299 {
+		// Best effort: echo at most 1 KiB of the error body; a read failure
+		// only shortens the message.
+		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
+		respondError(w, 502, fmt.Sprintf("gateway returned %d: %s",
+			resp.StatusCode, strings.TrimSpace(string(snippet))))
+		return
+	}
+
+	var result map[string]any
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		respondError(w, 502, "gateway response error: "+err.Error())
+		return
+	}
+
+	a.touch()
+	respond(w, 200, map[string]any{
+		"ok":        true,
+		"agentId":   a.cfg.AgentID,
+		"sessionId": body.SessionID,
+		"result":    result,
+	})
+}
+
+// GET /info — agent metadata
+func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
+	// The hostname is informational; on error it is reported as "".
+	hostname, _ := os.Hostname()
+	last := a.lastSeen()
+	respond(w, 200, map[string]any{
+		"agentId":      a.cfg.AgentID,
+		"hostname":     hostname,
+		"gatewayUrl":   a.cfg.GatewayURL,
+		"idleTimeout":  a.cfg.IdleTimeoutMinutes,
+		"lastActivity": last.Format(time.RFC3339),
+		"idleMinutes":  time.Since(last).Minutes(),
+	})
+}
+
+// ─── Idle watchdog ────────────────────────────────────────────────────────────
+
+// runIdleWatchdog checks every 30 seconds how long the agent has been idle.
+// Once the idle time reaches IdleTimeoutMinutes it calls selfStop, invokes
+// cancel and returns. It blocks until then, so run it on its own goroutine.
+func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
+	threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
+	ticker := time.NewTicker(30 * time.Second)
+	defer ticker.Stop()
+	for range ticker.C {
+		idle := time.Since(a.lastSeen())
+		if idle < threshold {
+			continue
+		}
+		log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
+			a.cfg.AgentID, idle.Minutes())
+		a.selfStop()
+		cancel()
+		return
+	}
+}
+
+// selfStop POSTs to the gateway's /api/swarm/agents/{AgentID}/stop endpoint
+// and logs the response status and body. Failures are logged, not returned.
+func (a *Agent) selfStop() {
+	url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
+	req, err := http.NewRequest("POST", url, nil)
+	if err != nil {
+		log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
+		return
+	}
+	resp, err := a.httpClient.Do(req)
+	if err != nil {
+		log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	// The body is only logged, so a partial read is acceptable.
+	body, _ := io.ReadAll(resp.Body)
+	log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+
+// respond writes data as a JSON response with the given HTTP status code.
+//
+// The status line has already been sent by the time encoding runs, so an
+// encoding or write failure cannot be reported to the client. It is logged
+// instead of being dropped silently.
+func respond(w http.ResponseWriter, status int, data any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	if err := json.NewEncoder(w).Encode(data); err != nil {
+		log.Printf("[Agent] writing JSON response: %v", err)
+	}
+}
+
+// respondError sends a JSON object of the form {"error": msg} with the given
+// HTTP status code.
+func respondError(w http.ResponseWriter, status int, msg string) {
+	payload := map[string]any{"error": msg}
+	respond(w, status, payload)
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+// main loads the configuration, serves the agent's HTTP API, and blocks until
+// either SIGINT/SIGTERM arrives or the idle watchdog cancels the context. It
+// then shuts the HTTP server down with a 10-second grace period.
+func main() {
+	log.SetFlags(log.LstdFlags | log.Lshortfile)
+
+	cfg := loadConfig()
+	agent := NewAgent(cfg)
+
+	log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
+		cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
+	log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)
+
+	// ── HTTP server ──────────────────────────────────────────────────────────
+	mux := http.NewServeMux()
+	mux.HandleFunc("GET /health", agent.handleHealth)
+	mux.HandleFunc("POST /task", agent.handleTask)
+	mux.HandleFunc("GET /info", agent.handleInfo)
+
+	// WriteTimeout (150s) is longer than the agent's 120s outbound client
+	// timeout set in NewAgent.
+	srv := &http.Server{
+		Addr:         ":" + cfg.Port,
+		Handler:      mux,
+		ReadTimeout:  30 * time.Second,
+		WriteTimeout: 150 * time.Second,
+		IdleTimeout:  120 * time.Second,
+	}
+
+	// cancel is handed to the watchdog, which is its only caller.
+	// NOTE(review): on the signal path cancel is never invoked and the
+	// watchdog goroutine runs until the process exits.
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// ── Idle watchdog ────────────────────────────────────────────────────────
+	go agent.runIdleWatchdog(cancel)
+
+	// ── Graceful shutdown ────────────────────────────────────────────────────
+	quit := make(chan os.Signal, 1)
+	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
+
+	// Serve on a separate goroutine so main can wait for a stop condition.
+	// Any listen error other than http.ErrServerClosed terminates the process.
+	go func() {
+		log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
+		}
+	}()
+
+	// Block until an OS signal arrives or the watchdog cancels ctx.
+	select {
+	case <-quit:
+		log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
+	case <-ctx.Done():
+		log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
+	}
+
+	// Give in-flight requests up to 10 seconds to finish.
+	shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer shutCancel()
+	if err := srv.Shutdown(shutCtx); err != nil {
+		log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
+	}
+	log.Printf("[Agent %s] Stopped.", cfg.AgentID)
+}
--- a/gateway/cmd/gateway/main.go
+++ b/gateway/cmd/gateway/main.go
@@ -47,10 +47,35 @@ func main() {

 	// ── Orchestrator ─────────────────────────────────────────────────────────
 	orch := orchestrator.New(llmClient, database, cfg.ProjectRoot)
+	// Apply retry policy from config
+	orch.SetRetryPolicy(orchestrator.RetryPolicy{
+		MaxLLMRetries:  cfg.MaxLLMRetries,
+		InitialDelay:   time.Duration(cfg.RetryDelaySecs) * time.Second,
+		MaxDelay:       30 * time.Second,
+		RetryOnEmpty:   true,
+	})
+	log.Printf("[Gateway] LLM retry policy: maxRetries=%d, initialDelay=%ds", cfg.MaxLLMRetries, cfg.RetryDelaySecs)

 	// ── HTTP Handlers ────────────────────────────────────────────────────────
 	h := api.NewHandler(cfg, llmClient, orch, database)

+	// ── Sync Swarm tokens to DB on startup ──────────────────────────────────
+	go func() {
+		time.Sleep(3 * time.Second) // wait for Docker daemon readiness
+		if database != nil {
+			dockerCl := h.GetDockerClient()
+			if tokens, err := dockerCl.GetJoinTokens(); err == nil {
+				addr := dockerCl.GetManagerAddr()
+				database.UpsertSwarmTokens(
+					tokens.JoinTokens.Worker,
+					tokens.JoinTokens.Manager,
+					addr,
+				)
+				log.Printf("[Gateway] Swarm tokens synced to DB. Manager addr: %s", addr)
+			}
+		}
+	}()
+
 	// ── Router ───────────────────────────────────────────────────────────────
 	r := chi.NewRouter()

@@ -76,11 +101,19 @@ func main() {
 	r.Route("/api", func(r chi.Router) {
 		// Orchestrator
 		r.Post("/orchestrator/chat", h.OrchestratorChat)
+		r.Post("/orchestrator/stream", h.OrchestratorStream)
 		r.Get("/orchestrator/config", h.OrchestratorConfig)

-		// Agents
+		// Agents — CRUD + Container lifecycle (Phase A-C)
 		r.Get("/agents", h.ListAgents)
+		r.Get("/agents/running", h.ListRunningAgents) // Phase C: service discovery
+		r.Post("/agents", h.CreateAgent)
 		r.Get("/agents/{id}", h.GetAgent)
+		r.Delete("/agents/{id}", h.DeleteAgent)
+		r.Post("/agents/{id}/deploy", h.DeployAgent)
+		r.Post("/agents/{id}/stop", h.StopAgent)
+		r.Post("/agents/{id}/scale", h.ScaleAgent)
+		r.Post("/agents/{id}/restart", h.RestartAgent) // Phase C: dead-letter restart

 		// Models
 		r.Get("/models", h.ListModels)
@@ -92,8 +125,41 @@ func main() {
 		// Nodes / Docker Swarm monitoring
 		r.Get("/nodes", h.ListNodes)
 		r.Get("/nodes/stats", h.NodeStats)
+
+		// Provider config reload (called by Node.js after provider change)
+		r.Post("/providers/reload", h.ProvidersReload)
+
+		// Persistent chat sessions (background processing, DB-backed)
+		r.Post("/chat/session", h.StartChatSession)
+		r.Get("/chat/sessions", h.ListChatSessions)
+		r.Get("/chat/session/{id}", h.GetChatSession)
+		r.Get("/chat/session/{id}/events", h.GetChatEvents)
+
+		// ── Real Docker Swarm Management ─────────────────────────────────────
+		r.Get("/swarm/info", h.SwarmInfo)
+		r.Get("/swarm/nodes", h.SwarmNodes)
+		r.Post("/swarm/nodes/{id}/label", h.SwarmAddNodeLabel)
+		r.Post("/swarm/nodes/{id}/availability", h.SwarmSetNodeAvailability)
+		r.Get("/swarm/services", h.SwarmServices)
+		r.Post("/swarm/services/create", h.SwarmCreateService)
+		r.Delete("/swarm/services/{id}", h.SwarmRemoveService)
+		r.Get("/swarm/services/{id}/tasks", h.SwarmServiceTasks)
+		r.Post("/swarm/services/{id}/scale", h.SwarmScaleService)
+		r.Get("/swarm/join-token", h.SwarmJoinToken)
+		r.Post("/swarm/join-node", h.SwarmJoinNodeViaSSH)
+		r.Post("/swarm/ssh-test", h.SwarmSSHTest)
+		r.Post("/swarm/shell", h.SwarmShell)
+		r.Get("/swarm/agents", h.SwarmListAgents)
+		r.Post("/swarm/agents/{name}/start", h.SwarmStartAgent)
+		r.Post("/swarm/agents/{name}/stop", h.SwarmStopAgent)
 	})

+	// ── Swarm Manager: auto-stop idle agents after 15 min ────────────────────
+	swarmMgr := api.NewSwarmManager(h, 60*time.Second)
+	managerCtx, managerCancel := context.WithCancel(context.Background())
+	go swarmMgr.Start(managerCtx)
+	defer managerCancel()
+
 	// ── Start Server ─────────────────────────────────────────────────────────
 	srv := &http.Server{
 		Addr:         ":" + cfg.Port,
--- a/gateway/config/config.go
+++ b/gateway/config/config.go
@@ -46,6 +46,22 @@ type Config struct {
 	DefaultModel       string
 	MaxToolIterations  int
 	RequestTimeoutSecs int
+
+	// Docker overlay network for agent containers
+	// AGENT_NETWORK — name of the Docker overlay/bridge network agents are attached to.
+	// Default: goclaw-agents (a dedicated overlay network)
+	AgentNetwork string
+
+	// AGENT_DB_URL — DATABASE_URL passed to agent containers.
+	// Useful when agents run on an overlay network and the DB hostname differs.
+	// Falls back to DatabaseURL if not set.
+	AgentDBURL string
+
+	// LLM retry policy
+	// GATEWAY_MAX_LLM_RETRIES — additional attempts after a failure/empty response (default 3).
+	MaxLLMRetries int
+	// GATEWAY_RETRY_DELAY_SECS — initial delay before first retry in seconds (default 2).
+	RetryDelaySecs int
 }

 func Load() *Config {
@@ -55,6 +71,8 @@ func Load() *Config {

 	maxIter, _ := strconv.Atoi(getEnv("GATEWAY_MAX_TOOL_ITERATIONS", "10"))
 	timeout, _ := strconv.Atoi(getEnv("GATEWAY_REQUEST_TIMEOUT_SECS", "120"))
+	maxLLMRetries, _ := strconv.Atoi(getEnv("GATEWAY_MAX_LLM_RETRIES", "3"))
+	retryDelaySecs, _ := strconv.Atoi(getEnv("GATEWAY_RETRY_DELAY_SECS", "2"))

 	// Resolve LLM base URL — priority: LLM_BASE_URL > OLLAMA_BASE_URL > default cloud
 	rawLLMURL := getEnvFirst(
@@ -82,6 +100,10 @@ func Load() *Config {
 		DefaultModel:       getEnv("DEFAULT_MODEL", "qwen2.5:7b"),
 		MaxToolIterations:  maxIter,
 		RequestTimeoutSecs: timeout,
+		MaxLLMRetries:      maxLLMRetries,
+		RetryDelaySecs:     retryDelaySecs,
+		AgentNetwork:       getEnv("AGENT_NETWORK", "goclaw-agents"),
+		AgentDBURL:         getEnv("AGENT_DB_URL", ""),
 	}

 	if cfg.LLMAPIKey == "" {
--- a/gateway/go.mod
+++ b/gateway/go.mod
@@ -3,11 +3,15 @@ module git.softuniq.eu/UniqAI/GoClaw/gateway
 go 1.23.4

 require (
-	filippo.io/edwards25519 v1.1.0 // indirect
 	github.com/go-chi/chi/v5 v5.2.1
 	github.com/go-chi/cors v1.2.1
-	github.com/go-sql-driver/mysql v1.8.1 // indirect
+	github.com/go-sql-driver/mysql v1.8.1
 	github.com/google/uuid v1.6.0
-	github.com/jmoiron/sqlx v1.4.0 // indirect
 	github.com/joho/godotenv v1.5.1
+	golang.org/x/crypto v0.37.0
+)
+
+require (
+	filippo.io/edwards25519 v1.1.0 // indirect
+	golang.org/x/sys v0.32.0 // indirect
 )
--- a/gateway/go.sum
+++ b/gateway/go.sum
@@ -8,9 +8,11 @@ github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpv
 github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
-github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
-github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
-github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
+golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
+golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
+golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
+golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
--- a/gateway/internal/api/handlers.go
+++ b/gateway/internal/api/handlers.go
--- a/gateway/internal/api/handlers_agents_test.go
+++ b/gateway/internal/api/handlers_agents_test.go
@@ -0,0 +1,334 @@
+package api
+
+import (
+	"bytes"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"git.softuniq.eu/UniqAI/GoClaw/gateway/config"
+	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
+)
+
+// mockDB implements only the methods needed for agent handler tests. It keeps
+// agent configs in memory and records every mutating call for inspection.
+type mockDB struct {
+	agents        []db.AgentRow           // rows returned by ListAgents
+	configs       map[int]*db.AgentConfig // agent configs keyed by agent ID
+	created       []db.CreateAgentInput   // inputs passed to CreateAgent, in call order
+	deleted       []int                   // IDs passed to DeleteAgent, in call order
+	statusUpdates []statusUpdate          // arguments of each UpdateContainerStatus call
+	nextID        int                     // ID the next CreateAgent call returns
+}
+
+// statusUpdate captures the arguments of one mockDB.UpdateContainerStatus call.
+type statusUpdate struct {
+	agentID     int
+	status      string
+	serviceName string
+	servicePort int
+}
+
+// newMockDB returns an empty mockDB whose first CreateAgent call yields ID 100.
+func newMockDB() *mockDB {
+	m := new(mockDB)
+	m.configs = make(map[int]*db.AgentConfig)
+	m.nextID = 100
+	return m
+}
+
+// ListAgents returns the preset agents slice as-is and never fails.
+func (m *mockDB) ListAgents() ([]db.AgentRow, error) {
+	return m.agents, nil
+}
+
+// GetAgentByID looks up the stored config for id. An unknown id yields
+// (nil, nil) rather than an error.
+func (m *mockDB) GetAgentByID(id int) (*db.AgentConfig, error) {
+	cfg, found := m.configs[id]
+	if !found {
+		return nil, nil
+	}
+	return cfg, nil
+}
+
+// CreateAgent records the input, stores a config carrying the input's Name and
+// Model under the next free ID, and returns that ID. It never fails.
+func (m *mockDB) CreateAgent(in db.CreateAgentInput) (int, error) {
+	m.created = append(m.created, in)
+
+	assigned := m.nextID
+	m.nextID = assigned + 1
+
+	stored := &db.AgentConfig{ID: assigned, Name: in.Name, Model: in.Model}
+	m.configs[assigned] = stored
+	return assigned, nil
+}
+
+// DeleteAgent records id and drops its stored config, if any. It never fails,
+// including for an id that was never created.
+func (m *mockDB) DeleteAgent(id int) error {
+	delete(m.configs, id)
+	m.deleted = append(m.deleted, id)
+	return nil
+}
+
+// UpdateContainerStatus records the call and, when agentID has a stored
+// config, copies the status, service name and port onto it. It never fails.
+func (m *mockDB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+	call := statusUpdate{
+		agentID:     agentID,
+		status:      status,
+		serviceName: serviceName,
+		servicePort: servicePort,
+	}
+	m.statusUpdates = append(m.statusUpdates, call)
+
+	cfg, known := m.configs[agentID]
+	if !known {
+		return nil
+	}
+	cfg.ContainerStatus = status
+	cfg.ServiceName = serviceName
+	cfg.ServicePort = servicePort
+	return nil
+}
+
+// AssignServicePort always hands back start; max is ignored and no port
+// bookkeeping is done.
+func (m *mockDB) AssignServicePort(start, max int) (int, error) {
+	return start, nil
+}
+
+// ─── DB adapter: wrap mockDB to satisfy Handler which uses *db.DB ────────────
+// We use composition instead — create a handler variant that uses an interface.
+// For these tests, we bypass the Handler's db field and test the logic separately.
+
+// ─── Unit tests for new agent REST endpoints ──────────────────────────────────
+
+// TestCreateAgent_MissingDB verifies that CreateAgent answers 503 when the
+// handler has no database.
+func TestCreateAgent_MissingDB(t *testing.T) {
+	const payload = `{"name":"Test","model":"qwen2.5:7b"}`
+
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents", bytes.NewBufferString(payload))
+	request.Header.Set("Content-Type", "application/json")
+
+	handler.CreateAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestDeleteAgent_MissingDB verifies that DeleteAgent answers 503 for a valid
+// id when the handler has no database.
+func TestDeleteAgent_MissingDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodDelete, "/api/agents/1", nil)
+	request.SetPathValue("id", "1")
+
+	handler.DeleteAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestDeleteAgent_InvalidID verifies that DeleteAgent answers 400 when the id
+// path value is not numeric.
+func TestDeleteAgent_InvalidID(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodDelete, "/api/agents/abc", nil)
+	request.SetPathValue("id", "abc")
+
+	handler.DeleteAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", got)
+	}
+}
+
+// TestDeployAgent_MissingDB verifies that DeployAgent answers 503 for a valid
+// id when the handler has no database.
+func TestDeployAgent_MissingDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil)
+	request.SetPathValue("id", "1")
+
+	handler.DeployAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestDeployAgent_InvalidID verifies that DeployAgent answers 400 when the id
+// path value is not numeric.
+func TestDeployAgent_InvalidID(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/deploy", nil)
+	request.SetPathValue("id", "xyz")
+
+	handler.DeployAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", got)
+	}
+}
+
+// TestStopAgent_MissingDB verifies that StopAgent answers 503 for a valid id
+// when the handler has no database.
+func TestStopAgent_MissingDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/stop", nil)
+	request.SetPathValue("id", "1")
+
+	handler.StopAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestScaleAgent_MissingDB verifies that ScaleAgent answers 503 for a valid
+// scale request when the handler has no database.
+func TestScaleAgent_MissingDB(t *testing.T) {
+	const payload = `{"replicas":3}`
+
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(payload))
+	request.SetPathValue("id", "1")
+
+	handler.ScaleAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestScaleAgent_BadReplicas verifies that ScaleAgent answers 400 when the
+// requested replica count is 0.
+func TestScaleAgent_BadReplicas(t *testing.T) {
+	const payload = `{"replicas":0}`
+
+	handler := &Handler{cfg: &config.Config{}} // db left nil on purpose
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(payload))
+	request.SetPathValue("id", "1")
+
+	handler.ScaleAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", got)
+	}
+}
+
+// TestListAgents_NoDB checks that ListAgents still answers 200 with a JSON
+// object carrying a "note" field when the handler has no database.
+func TestListAgents_NoDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}, db: nil}
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodGet, "/api/agents", nil)
+
+	handler.ListAgents(rec, request)
+
+	if got := rec.Code; got != http.StatusOK {
+		t.Errorf("expected 200, got %d", got)
+	}
+	var payload map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
+		t.Fatal("invalid JSON:", err)
+	}
+	if _, hasNote := payload["note"]; !hasNote {
+		t.Error("expected 'note' field in response when DB is nil")
+	}
+}
+
+// TestCreateAgent_MissingName posts an agent without a name to a handler that
+// has no database. Either rejection is accepted: 400 (name validation) or
+// 503 (DB check), whichever the handler performs first.
+func TestCreateAgent_MissingName(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}, db: nil}
+	rec := httptest.NewRecorder()
+	payload := bytes.NewBufferString(`{"model":"qwen2.5:7b"}`)
+	request := httptest.NewRequest(http.MethodPost, "/api/agents", payload)
+	request.Header.Set("Content-Type", "application/json")
+
+	handler.CreateAgent(rec, request)
+
+	switch rec.Code {
+	case http.StatusServiceUnavailable, http.StatusBadRequest:
+		// Both outcomes confirm the request was rejected by the handler.
+	default:
+		t.Errorf("expected 400 or 503, got %d", rec.Code)
+	}
+}
+
+// TestDeployAgent_DockerNil_DBNil checks that DeployAgent answers 503 when
+// both the database and the docker client are nil. A real *db.DB cannot be
+// injected without a live MySQL, so the nil-docker path behind the DB check
+// is not exercised here; only the DB guard is.
+func TestDeployAgent_DockerNil_DBNil(t *testing.T) {
+	handler := &Handler{
+		cfg:    &config.Config{},
+		db:     nil,
+		docker: nil,
+	}
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil)
+	request.SetPathValue("id", "1")
+
+	handler.DeployAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// ─── Phase C tests ─────────────────────────────────────────────────────────────
+
+// TestListRunningAgents_NoDB checks that ListRunningAgents answers 200 with
+// valid JSON whose "count" is 0 (or absent) when the handler has no database.
+func TestListRunningAgents_NoDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}, db: nil}
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodGet, "/api/agents/running", nil)
+
+	handler.ListRunningAgents(rec, request)
+
+	if got := rec.Code; got != http.StatusOK {
+		t.Errorf("expected 200, got %d", got)
+	}
+	var payload map[string]any
+	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
+		t.Fatal("invalid JSON:", err)
+	}
+	// JSON numbers decode to float64; a missing key yields the zero value.
+	count, _ := payload["count"].(float64)
+	if count != 0 {
+		t.Errorf("expected count=0, got %v", count)
+	}
+}
+
+// TestRestartAgent_NoDB checks that RestartAgent answers 503 Service
+// Unavailable when the handler has no database.
+func TestRestartAgent_NoDB(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}, db: nil}
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/1/restart", nil)
+	request.SetPathValue("id", "1")
+
+	handler.RestartAgent(rec, request)
+
+	// Per the original author's note, RestartAgent reaches DeployAgent's DB
+	// check, which produces the 503.
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", got)
+	}
+}
+
+// TestRestartAgent_InvalidID checks that RestartAgent answers 400 Bad Request
+// when the {id} path value is not numeric.
+func TestRestartAgent_InvalidID(t *testing.T) {
+	handler := &Handler{cfg: &config.Config{}, db: nil}
+	rec := httptest.NewRecorder()
+	request := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/restart", nil)
+	request.SetPathValue("id", "xyz")
+
+	handler.RestartAgent(rec, request)
+
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", got)
+	}
+}
+
+// TestSwarmManagerDeadLetter_DockerNil verifies that checkIdleAgents
+// returns early without panic when docker is nil.
+func TestSwarmManagerDeadLetter_DockerNil(t *testing.T) {
+	h := &Handler{cfg: &config.Config{}, db: nil, docker: nil}
+	sm := NewSwarmManager(h, 60*time.Second)
+	// NewSwarmManager starts a ticker that only Start() stops; Start is never
+	// run here, so release the ticker explicitly.
+	defer sm.ticker.Stop()
+	// Should not panic
+	sm.checkIdleAgents()
+}
--- a/gateway/internal/api/swarm_manager.go
+++ b/gateway/internal/api/swarm_manager.go
@@ -0,0 +1,196 @@
+// Package api – Swarm Agent Lifecycle Manager
+//
+// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
+// (which is the Swarm manager node). On every tick it watches all agent
+// services and automatically scales them to 0 replicas once they have been
+// idle for defaultIdleTimeoutMinutes. When a database is configured it also
+// marks agents whose container or service no longer exists as "error"
+// (dead-letter reconciliation). The orchestrator can call the SwarmStartAgent /
+// SwarmStopAgent handlers via the REST API to start/stop agents on demand.
+//
+//	Start flow:  POST /api/swarm/agents/{name}/start  → scale to N replicas (default 1)
+//	Stop flow:   POST /api/swarm/agents/{name}/stop   → scale to 0
+//	Auto-stop:   background loop checks at the interval passed to
+//	             NewSwarmManager (typically every 60 s), scales idle agents to 0
+package api
+
+import (
+	"context"
+	"encoding/json"
+	"log"
+	"net/http"
+	"time"
+)
+
+const (
+	// defaultIdleTimeoutMinutes – how many minutes a running agent service may
+	// go without activity (as reported by GetServiceLastActivity, falling back
+	// to the service's UpdatedAt) before it is automatically scaled to 0.
+	defaultIdleTimeoutMinutes = 15
+	// deadLetterCheckEnabled – when true, SwarmManager reconciles DB containerStatus
+	// with the containers/services that actually exist (dead-letter recovery):
+	// agents marked "running" whose container is gone are set to "error".
+	deadLetterCheckEnabled = true
+)
+
+// SwarmManager watches agent services and auto-scales them down after idle.
+// It also runs the dead-letter reconciliation pass on every tick (see
+// checkIdleAgents).
+type SwarmManager struct {
+	handler *Handler     // source of the docker client and DB used on each tick
+	ticker  *time.Ticker // fires once per check interval; stopped when Start returns
+	done    chan struct{} // closed by Stop to make Start return
+}
+
+// NewSwarmManager creates a manager that checks every checkInterval.
+//
+// time.NewTicker panics on a non-positive duration, so a checkInterval <= 0
+// (for example an unset config value) is replaced by one minute instead of
+// crashing the gateway at startup.
+func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
+	if checkInterval <= 0 {
+		checkInterval = time.Minute
+	}
+	return &SwarmManager{
+		handler: h,
+		ticker:  time.NewTicker(checkInterval),
+		done:    make(chan struct{}),
+	}
+}
+
+// Start runs the background loop and blocks until Stop is called or ctx is
+// cancelled; run it in its own goroutine. Each ticker tick triggers one
+// checkIdleAgents pass. The ticker is stopped when the loop exits.
+func (m *SwarmManager) Start(ctx context.Context) {
+	log.Printf("[SwarmManager] Started — idle timeout %d min", defaultIdleTimeoutMinutes)
+	defer m.ticker.Stop()
+
+	cancelled := ctx.Done()
+	for {
+		select {
+		case <-m.ticker.C:
+			m.checkIdleAgents()
+		case <-cancelled:
+			return
+		case <-m.done:
+			return
+		}
+	}
+}
+
+// Stop signals the background loop to exit.
+//
+// Calling Stop more than once is a no-op: closing an already-closed channel
+// would panic, so the channel is only closed if it is still open. The check is
+// not atomic, so Stop must not be called from several goroutines at once.
+func (m *SwarmManager) Stop() {
+	select {
+	case <-m.done:
+		// Already stopped by an earlier call.
+	default:
+		close(m.done)
+	}
+}
+
+// checkIdleAgents performs one maintenance pass:
+//
+//  1. collects the names of all live GoClaw agent containers and Swarm services,
+//  2. scales Swarm agent services that have been idle for at least
+//     defaultIdleTimeoutMinutes down to 0 replicas,
+//  3. (dead-letter reconciliation) marks agents whose DB status is "running"
+//     but whose container/service is not in the live set as "error".
+//
+// It does nothing when the handler has no docker client. Step 3 is skipped
+// when the container listing failed: with an incomplete live set every running
+// agent would otherwise be flagged as "error" on a transient Docker outage.
+func (m *SwarmManager) checkIdleAgents() {
+	h := m.handler
+	if h.docker == nil {
+		return
+	}
+
+	// Build a lookup set of currently-live container/service names (both standalone + Swarm)
+	liveContainers := make(map[string]bool)
+
+	// Check standalone containers
+	containers, listErr := h.docker.ListContainers()
+	if listErr != nil {
+		log.Printf("[SwarmManager] list containers error: %v", listErr)
+	} else {
+		for _, c := range containers {
+			// Only containers labelled as GoClaw agents count as live agents.
+			if c.Labels["goclaw.agent"] != "true" {
+				continue
+			}
+			for _, name := range c.Names {
+				// Docker container names are prefixed with "/"
+				clean := name
+				if len(clean) > 0 && clean[0] == '/' {
+					clean = clean[1:]
+				}
+				liveContainers[clean] = true
+			}
+		}
+	}
+
+	// Check Swarm services (for legacy/mixed environments). A failure here is
+	// only logged: presumably it is the normal case on hosts that run agents as
+	// standalone containers without Swarm — TODO confirm.
+	services, err := h.docker.ListServices()
+	if err != nil {
+		log.Printf("[SwarmManager] list services error: %v", err)
+	}
+
+	idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
+	now := time.Now()
+
+	for _, svc := range services {
+		// Only manage services labelled as GoClaw agents
+		if svc.Spec.Labels["goclaw.agent"] != "true" {
+			continue
+		}
+		liveContainers[svc.Spec.Name] = true
+
+		// Skip already-stopped services (0 desired replicas)
+		desired := 0
+		if svc.Spec.Mode.Replicated != nil {
+			desired = svc.Spec.Mode.Replicated.Replicas
+		}
+		if desired == 0 {
+			continue
+		}
+		// Check last activity time; fall back to the service's last update when
+		// no activity timestamp is available.
+		lastActivity, actErr := h.docker.GetServiceLastActivity(svc.ID)
+		if actErr != nil || lastActivity.IsZero() {
+			lastActivity = svc.UpdatedAt
+		}
+		idle := now.Sub(lastActivity)
+		if idle >= idleThreshold {
+			log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
+				svc.Spec.Name, idle.Minutes())
+			if scaleErr := h.docker.ScaleService(svc.ID, 0); scaleErr != nil {
+				log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr)
+			}
+		}
+	}
+
+	// ── Dead-letter reconciliation (Phase C) ─────────────────────────────────
+	// If an agent's DB says "running" but its container/service is gone, update
+	// the status to "error" so the UI shows the discrepancy and allows redeploy.
+	if !deadLetterCheckEnabled || h.db == nil {
+		return
+	}
+	if listErr != nil {
+		// The live set is incomplete; reconciling against it would wrongly
+		// flag healthy agents. Try again on the next tick.
+		log.Printf("[SwarmManager] skipping dead-letter reconciliation: container list unavailable")
+		return
+	}
+	agents, dbErr := h.db.ListAgents()
+	if dbErr != nil {
+		log.Printf("[SwarmManager] list agents error: %v", dbErr)
+		return
+	}
+	for _, a := range agents {
+		if a.ContainerStatus != "running" || a.ServiceName == "" {
+			continue
+		}
+		if !liveContainers[a.ServiceName] {
+			log.Printf("[SwarmManager] Dead-letter: agent %d (%q) marked running but container %q not found — setting status=error",
+				a.ID, a.Name, a.ServiceName)
+			if updateErr := h.db.UpdateContainerStatus(a.ID, "error", a.ServiceName, a.ServicePort); updateErr != nil {
+				log.Printf("[SwarmManager] UpdateContainerStatus error for agent %d: %v", a.ID, updateErr)
+			}
+		}
+	}
+}
+
+// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
+
+// POST /api/swarm/agents/{name}/start
+// Start (scale-up) a named agent service. Body: { "replicas": 1 }
+//
+// Responses:
+//   - 400 when the {name} path value is empty,
+//   - 503 when the handler has no docker client,
+//   - 500 when scaling the service fails,
+//   - 200 with {ok, name, replicas} on success.
+func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
+	name := r.PathValue("name")
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "agent name required")
+		return
+	}
+	// Handlers can be constructed without a docker client (checkIdleAgents
+	// guards against this too); refuse instead of dereferencing nil.
+	if h.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "docker not available")
+		return
+	}
+	var body struct {
+		Replicas int `json:"replicas"`
+	}
+	// The body is optional: a missing or malformed body leaves Replicas at 0,
+	// which falls through to the default of one replica below.
+	_ = json.NewDecoder(r.Body).Decode(&body)
+	if body.Replicas <= 0 {
+		body.Replicas = 1
+	}
+	if err := h.docker.ScaleService(name, body.Replicas); err != nil {
+		respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
+		return
+	}
+	log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
+}
+
+// POST /api/swarm/agents/{name}/stop
+// Stop (scale-to-0) a named agent service.
+//
+// Responses:
+//   - 400 when the {name} path value is empty,
+//   - 503 when the handler has no docker client,
+//   - 500 when scaling the service fails,
+//   - 200 with {ok, name, replicas: 0} on success.
+func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
+	name := r.PathValue("name")
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "agent name required")
+		return
+	}
+	// Handlers can be constructed without a docker client (checkIdleAgents
+	// guards against this too); refuse instead of dereferencing nil.
+	if h.docker == nil {
+		respondError(w, http.StatusServiceUnavailable, "docker not available")
+		return
+	}
+	if err := h.docker.ScaleService(name, 0); err != nil {
+		respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
+		return
+	}
+	log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
+}
--- a/gateway/internal/db/db.go
+++ b/gateway/internal/db/db.go
@@ -3,6 +3,7 @@ package db

 import (
 	"database/sql"
+	"database/sql/driver"
 	"encoding/json"
 	"fmt"
 	"log"
@@ -20,9 +21,9 @@ type AgentConfig struct {
 	AllowedTools []string
 	Temperature  float64
 	MaxTokens    int
-	IsOrchestrator bool
-	IsSystem     bool
-	IsActive     bool
+	IsOrchestrator  bool
+	IsSystem        bool
+	IsActive        bool
 	// Container / Swarm fields (Phase A)
 	ServiceName     string
 	ServicePort     int
@@ -32,14 +33,34 @@ type AgentConfig struct {

 // AgentRow is a minimal agent representation for listing.
 type AgentRow struct {
-	ID          int    `json:"id"`
-	Name        string `json:"name"`
-	Role        string `json:"role"`
-	Model       string `json:"model"`
-	Description string `json:"description"`
-	IsActive    bool   `json:"isActive"`
-	IsSystem    bool   `json:"isSystem"`
-	IsOrchestrator bool `json:"isOrchestrator"`
+	ID             int    `json:"id"`
+	Name           string `json:"name"`
+	Role           string `json:"role"`
+	Model          string `json:"model"`
+	Description    string `json:"description"`
+	IsActive       bool   `json:"isActive"`
+	IsSystem       bool   `json:"isSystem"`
+	IsOrchestrator bool   `json:"isOrchestrator"`
+	// Container / Swarm fields
+	ServiceName     string `json:"serviceName"`
+	ServicePort     int    `json:"servicePort"`
+	ContainerImage  string `json:"containerImage"`
+	ContainerStatus string `json:"containerStatus"`
+}
+
+// CreateAgentInput holds the fields required to create a new agent in DB.
+//
+// CreateAgent substitutes defaults for zero values: Temperature 0 becomes 0.7,
+// MaxTokens 0 becomes 8192, an empty ContainerImage becomes
+// "goclaw-agent-worker:latest", and an empty AllowedTools is stored as "[]".
+type CreateAgentInput struct {
+	Name           string   `json:"name"`
+	Role           string   `json:"role"`
+	Model          string   `json:"model"`
+	Description    string   `json:"description"`
+	SystemPrompt   string   `json:"systemPrompt"`
+	Temperature    float64  `json:"temperature"`    // 0 means "use default 0.7"
+	MaxTokens      int      `json:"maxTokens"`      // 0 means "use default 8192"
+	AllowedTools   []string `json:"allowedTools"`   // stored as a JSON array string
+	IsSystem       bool     `json:"isSystem"`
+	IsOrchestrator bool     `json:"isOrchestrator"`
+	ContainerImage string   `json:"containerImage"` // "" means "use default image"
 }

 type DB struct {
@@ -73,8 +94,7 @@ func (d *DB) Close() {
 // GetOrchestratorConfig loads the agent with isOrchestrator=1 from DB.
 func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) {
 	row := d.conn.QueryRow(`
-		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive,
-		       COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped')
+		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive
 		FROM agents
 		WHERE isOrchestrator = 1
 		LIMIT 1
@@ -85,8 +105,7 @@ func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) {
 // GetAgentByID loads a specific agent by ID.
 func (d *DB) GetAgentByID(id int) (*AgentConfig, error) {
 	row := d.conn.QueryRow(`
-		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive,
-		       COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped')
+		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive
 		FROM agents
 		WHERE id = ?
 		LIMIT 1
@@ -94,10 +113,13 @@ func (d *DB) GetAgentByID(id int) (*AgentConfig, error) {
 	return scanAgentConfig(row)
 }

-// ListAgents returns all active agents.
+// ListAgents returns all agents with container status fields.
 func (d *DB) ListAgents() ([]AgentRow, error) {
 	rows, err := d.conn.Query(`
-		SELECT id, name, role, model, COALESCE(description,''), isActive, isSystem, isOrchestrator
+		SELECT id, name, role, model,
+		       COALESCE(description,''), isActive, isSystem, isOrchestrator,
+		       COALESCE(serviceName,''), COALESCE(servicePort,0),
+		       COALESCE(containerImage,''), COALESCE(containerStatus,'stopped')
 		FROM agents
 		ORDER BY isOrchestrator DESC, isSystem DESC, id ASC
 	`)
@@ -110,7 +132,11 @@ func (d *DB) ListAgents() ([]AgentRow, error) {
 	for rows.Next() {
 		var a AgentRow
 		var isActive, isSystem, isOrch int
-		if err := rows.Scan(&a.ID, &a.Name, &a.Role, &a.Model, &a.Description, &isActive, &isSystem, &isOrch); err != nil {
+		if err := rows.Scan(
+			&a.ID, &a.Name, &a.Role, &a.Model, &a.Description,
+			&isActive, &isSystem, &isOrch,
+			&a.ServiceName, &a.ServicePort, &a.ContainerImage, &a.ContainerStatus,
+		); err != nil {
 			continue
 		}
 		a.IsActive = isActive == 1
@@ -121,64 +147,423 @@ func (d *DB) ListAgents() ([]AgentRow, error) {
 	return agents, nil
 }

-// ─── Helpers ──────────────────────────────────────────────────────────────────
-
-func scanAgentConfig(row *sql.Row) (*AgentConfig, error) {
-	var cfg AgentConfig
-	var systemPrompt sql.NullString
-	var allowedToolsJSON sql.NullString
-	var temperature sql.NullFloat64
-	var maxTokens sql.NullInt64
-	var isOrch, isSystem, isActive int
-
-	err := row.Scan(
-		&cfg.ID, &cfg.Name, &cfg.Model,
-		&systemPrompt, &allowedToolsJSON,
-		&temperature, &maxTokens,
-		&isOrch, &isSystem, &isActive,
-		&cfg.ServiceName, &cfg.ServicePort, &cfg.ContainerImage, &cfg.ContainerStatus,
+// CreateAgent inserts a new agent into the DB and returns its ID.
+//
+// Zero values in the input are replaced by defaults before the insert:
+// temperature 0.7, maxTokens 8192, containerImage "goclaw-agent-worker:latest",
+// allowedTools "[]". The row is created active (isActive=1) with
+// containerStatus 'stopped'.
+//
+// It returns an error when the DB is not connected, when the insert fails, or
+// when the generated ID cannot be read back (previously that last case
+// silently returned ID 0).
+func (d *DB) CreateAgent(in CreateAgentInput) (int, error) {
+	if d.conn == nil {
+		return 0, fmt.Errorf("DB not connected")
+	}
+	toolsJSON := "[]"
+	if len(in.AllowedTools) > 0 {
+		b, err := json.Marshal(in.AllowedTools)
+		if err != nil {
+			return 0, fmt.Errorf("encode allowedTools: %w", err)
+		}
+		toolsJSON = string(b)
+	}
+	temp := in.Temperature
+	if temp == 0 {
+		temp = 0.7
+	}
+	maxTok := in.MaxTokens
+	if maxTok == 0 {
+		maxTok = 8192
+	}
+	img := in.ContainerImage
+	if img == "" {
+		img = "goclaw-agent-worker:latest"
+	}
+	res, err := d.conn.Exec(`
+		INSERT INTO agents
+		  (name, role, model, description, systemPrompt, temperature, maxTokens,
+		   allowedTools, isActive, isSystem, isOrchestrator,
+		   containerImage, containerStatus, createdAt, updatedAt)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, ?, ?, ?, 'stopped', NOW(), NOW())
+	`,
+		in.Name, in.Role, in.Model, in.Description, in.SystemPrompt,
+		temp, maxTok, toolsJSON,
+		boolToInt(in.IsSystem), boolToInt(in.IsOrchestrator),
+		img,
 	)
+	if err != nil {
+		return 0, fmt.Errorf("insert agent: %w", err)
+	}
+	id, err := res.LastInsertId()
+	if err != nil {
+		return 0, fmt.Errorf("insert agent: read new id: %w", err)
+	}
+	return int(id), nil
+}
+
+// boolToInt converts a Go bool to the 1/0 integer form used for the boolean
+// columns of the agents table.
+func boolToInt(b bool) int {
+	n := 0
+	if b {
+		n = 1
+	}
+	return n
+}
+
+// DeleteAgent removes an agent record by ID (only non-system agents).
+func (d *DB) DeleteAgent(id int) error {
+	if d.conn == nil {
+		return fmt.Errorf("DB not connected")
+	}
+	res, err := d.conn.Exec(`DELETE FROM agents WHERE id = ? AND isSystem = 0`, id)
+	if err != nil {
+		return err
+	}
+	n, _ := res.RowsAffected()
+	if n == 0 {
+		return fmt.Errorf("agent %d not found or is a system agent", id)
+	}
+	return nil
+}
+
+// AssignServicePort finds the lowest free port in range [start, start+maxAgents).
+// It reads all currently used ports from DB.
+//
+// Best-effort fallbacks: when the DB is not connected or the query itself
+// fails, start is returned without error. If reading the result set fails
+// part-way, an error is returned instead, because handing out a port from an
+// incomplete "used" set could collide with an existing agent. An error is also
+// returned when every port in the range is taken.
+func (d *DB) AssignServicePort(start, maxAgents int) (int, error) {
+	if d.conn == nil {
+		return start, nil // offline — just return start
+	}
+	rows, err := d.conn.Query(`SELECT COALESCE(servicePort,0) FROM agents WHERE servicePort > 0`)
+	if err != nil {
+		return start, nil
+	}
+	defer rows.Close()
+
+	used := map[int]bool{}
+	for rows.Next() {
+		var p int
+		if rows.Scan(&p) == nil && p > 0 {
+			used[p] = true
+		}
+	}
+	if err := rows.Err(); err != nil {
+		return 0, fmt.Errorf("read used ports: %w", err)
+	}
+	for port := start; port < start+maxAgents; port++ {
+		if !used[port] {
+			return port, nil
+		}
+	}
+	// The upper bound of the searched range is exclusive, so the last port
+	// actually tried is start+maxAgents-1.
+	return 0, fmt.Errorf("no free port in range %d-%d", start, start+maxAgents-1)
+}
+
+// ─── LLM Provider ─────────────────────────────────────────────────────────────
+
+// ProviderRow holds the active LLM provider config from DB.
+type ProviderRow struct {
+	ID       int
+	Name     string
+	BaseURL  string
+	APIKey   string // always "" when filled by GetActiveProvider: the stored key is encrypted by Node.js and not decrypted in Go
+}
+
+// GetActiveProvider returns the active LLM provider from the llmProviders table.
+// Note: The API key is stored AES-256-GCM encrypted by the Node.js server.
+// The Go gateway cannot decrypt it (no shared key in Go).
+// The proper flow: Node.js decrypts the key and passes it via /api/providers/reload.
+// For now, GetActiveProvider reads the encrypted value but discards it: the
+// returned ProviderRow always has an empty APIKey.
+// Use UpdateCredentials on the LLM client instead.
+func (d *DB) GetActiveProvider() (*ProviderRow, error) {
+	var p ProviderRow
+	var apiKeyEncrypted sql.NullString
+	row := d.conn.QueryRow(`
+		SELECT id, name, baseUrl, COALESCE(apiKeyEncrypted, '')
+		FROM llmProviders
+		WHERE isActive = 1
+		LIMIT 1
+	`)
+	err := row.Scan(&p.ID, &p.Name, &p.BaseURL, &apiKeyEncrypted)
 	if err != nil {
 		return nil, err
 	}
-
-	cfg.SystemPrompt = systemPrompt.String
-	cfg.Temperature = temperature.Float64
-	if cfg.Temperature == 0 {
-		cfg.Temperature = 0.5
-	}
-	cfg.MaxTokens = int(maxTokens.Int64)
-	if cfg.MaxTokens == 0 {
-		cfg.MaxTokens = 8192
-	}
-	cfg.IsOrchestrator = isOrch == 1
-	cfg.IsSystem = isSystem == 1
-	cfg.IsActive = isActive == 1
-
-	if allowedToolsJSON.Valid && allowedToolsJSON.String != "" && allowedToolsJSON.String != "null" {
-		_ = json.Unmarshal([]byte(allowedToolsJSON.String), &cfg.AllowedTools)
-	}
-
-	return &cfg, nil
+	// We cannot decrypt the key in Go (different crypto impl from Node.js)
+	// Return empty key — the LLM client will use its env-configured key
+	p.APIKey = ""
+	return &p, nil
 }

-// ─── Agent Container Fields ───────────────────────────────────────────────────
-// These methods support the agent-worker container architecture where each
-// agent runs as an autonomous Docker Swarm service.
+// ─── Chat Sessions & Events ───────────────────────────────────────────────────

-// UpdateContainerStatus updates the container lifecycle state of an agent.
-func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+// ChatSessionRow holds one persistent chat session, as stored in the
+// chatSessions table. GetSession and GetRecentSessions fill it, replacing NULL
+// text columns with "" and NULL numeric columns with 0.
+type ChatSessionRow struct {
+	ID              int    `json:"id"`
+	SessionID       string `json:"sessionId"`
+	AgentID         int    `json:"agentId"`
+	Status          string `json:"status"` // running | done | error
+	UserMessage     string `json:"userMessage"`
+	FinalResponse   string `json:"finalResponse"`
+	Model           string `json:"model"`
+	TotalTokens     int    `json:"totalTokens"`
+	ProcessingTimeMs int64 `json:"processingTimeMs"`
+	ErrorMessage    string `json:"errorMessage"`
+	CreatedAt       string `json:"createdAt"` // createdAt column scanned as a string
+	UpdatedAt       string `json:"updatedAt"` // updatedAt column scanned as a string
+}
+
+// ChatEventRow holds one event inside a session, as stored in the chatEvents
+// table. Events of a session are ordered by Seq.
+type ChatEventRow struct {
+	ID          int    `json:"id"`
+	SessionID   string `json:"sessionId"`
+	Seq         int    `json:"seq"` // per-session sequence number
+	EventType   string `json:"eventType"` // thinking | tool_call | delta | done | error
+	Content     string `json:"content"`
+	ToolName    string `json:"toolName"`
+	ToolArgs    string `json:"toolArgs"`    // JSON string
+	ToolResult  string `json:"toolResult"`
+	ToolSuccess bool   `json:"toolSuccess"`
+	DurationMs  int    `json:"durationMs"`
+	Model       string `json:"model"`
+	UsageJSON   string `json:"usageJson"`   // JSON string
+	ErrorMsg    string `json:"errorMsg"`
+	CreatedAt   string `json:"createdAt"` // createdAt column scanned as a string
+}
+
+// CreateSession inserts a new running session and returns its row.
+func (d *DB) CreateSession(sessionID, userMessage string, agentID int) error {
+	if d.conn == nil {
+		return fmt.Errorf("DB not connected")
+	}
+	_, err := d.conn.Exec(`
+		INSERT INTO chatSessions (sessionId, agentId, status, userMessage)
+		VALUES (?, ?, 'running', ?)
+	`, sessionID, agentID, truncate(userMessage, 65535))
+	return err
+}
+
+// AppendEvent inserts a new event row for a session.
+// seq is auto-calculated as MAX(seq)+1 for the session.
+func (d *DB) AppendEvent(e ChatEventRow) error {
 	if d.conn == nil {
 		return nil
 	}
+	toolArgs := e.ToolArgs
+	if toolArgs == "" {
+		toolArgs = "null"
+	}
+	usageJSON := e.UsageJSON
+	if usageJSON == "" {
+		usageJSON = "null"
+	}
+	var toolSuccessVal interface{}
+	if e.EventType == "tool_call" {
+		if e.ToolSuccess {
+			toolSuccessVal = 1
+		} else {
+			toolSuccessVal = 0
+		}
+	}
 	_, err := d.conn.Exec(`
-		UPDATE agents
-		SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW()
-		WHERE id = ?
-	`, status, serviceName, servicePort, agentID)
+		INSERT INTO chatEvents
+			(sessionId, seq, eventType, content, toolName, toolArgs,
+			 toolResult, toolSuccess, durationMs, model, usageJson, errorMsg)
+		SELECT ?, COALESCE(MAX(seq),0)+1, ?, ?, ?, ?,
+			   ?, ?, ?, ?, ?, ?
+		FROM chatEvents WHERE sessionId = ?
+	`,
+		e.SessionID, e.EventType,
+		nullStr(e.Content), nullStr(e.ToolName), rawJSON(toolArgs),
+		nullStr(e.ToolResult), toolSuccessVal, nullInt(e.DurationMs),
+		nullStr(e.Model), rawJSON(usageJSON), nullStr(e.ErrorMsg),
+		e.SessionID,
+	)
+	if err != nil {
+		log.Printf("[DB] AppendEvent error: %v", err)
+	}
 	return err
 }

+// MarkSessionDone updates a session to done/error status.
+func (d *DB) MarkSessionDone(sessionID, status, finalResponse, model, errorMessage string, totalTokens int, processingTimeMs int64) {
+	if d.conn == nil {
+		return
+	}
+	_, err := d.conn.Exec(`
+		UPDATE chatSessions
+		SET status=?, finalResponse=?, model=?, totalTokens=?,
+		    processingTimeMs=?, errorMessage=?
+		WHERE sessionId=?
+	`, status,
+		truncate(finalResponse, 65535),
+		model,
+		totalTokens,
+		processingTimeMs,
+		truncate(errorMessage, 65535),
+		sessionID,
+	)
+	if err != nil {
+		log.Printf("[DB] MarkSessionDone error: %v", err)
+	}
+}
+
+// GetSession loads the chatSessions row whose sessionId matches sessionID.
+// NULL text columns come back as "" and NULL numeric columns as 0. It returns
+// an error when the DB is not connected or the row cannot be scanned (for a
+// missing session this is the error produced by Row.Scan).
+func (d *DB) GetSession(sessionID string) (*ChatSessionRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	const selectSession = `
+		SELECT id, sessionId, agentId, status,
+		       COALESCE(userMessage,''),
+		       COALESCE(finalResponse,''),
+		       COALESCE(model,''),
+		       COALESCE(totalTokens,0),
+		       COALESCE(processingTimeMs,0),
+		       COALESCE(errorMessage,''),
+		       createdAt, updatedAt
+		FROM chatSessions WHERE sessionId=? LIMIT 1
+	`
+	session := new(ChatSessionRow)
+	scanErr := d.conn.QueryRow(selectSession, sessionID).Scan(
+		&session.ID, &session.SessionID, &session.AgentID, &session.Status,
+		&session.UserMessage, &session.FinalResponse, &session.Model,
+		&session.TotalTokens, &session.ProcessingTimeMs, &session.ErrorMessage,
+		&session.CreatedAt, &session.UpdatedAt,
+	)
+	if scanErr != nil {
+		return nil, scanErr
+	}
+	return session, nil
+}
+
+// GetEvents returns all events for a session with seq > afterSeq (for incremental polling),
+// ordered by seq ascending. NULL columns are returned as "" / 0 / "null"
+// (for the JSON columns toolArgs and usageJson).
+//
+// Rows that fail to scan are logged and skipped. An error that interrupts the
+// iteration itself is returned, so callers never mistake a truncated result
+// for the complete event list.
+func (d *DB) GetEvents(sessionID string, afterSeq int) ([]ChatEventRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, sessionId, seq, eventType,
+		       COALESCE(content,''), COALESCE(toolName,''),
+		       COALESCE(CAST(toolArgs AS CHAR),'null'),
+		       COALESCE(toolResult,''),
+		       COALESCE(toolSuccess,0),
+		       COALESCE(durationMs,0),
+		       COALESCE(model,''),
+		       COALESCE(CAST(usageJson AS CHAR),'null'),
+		       COALESCE(errorMsg,''),
+		       createdAt
+		FROM chatEvents
+		WHERE sessionId=? AND seq > ?
+		ORDER BY seq ASC
+	`, sessionID, afterSeq)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var result []ChatEventRow
+	for rows.Next() {
+		var e ChatEventRow
+		var toolSuccess int
+		if err := rows.Scan(
+			&e.ID, &e.SessionID, &e.Seq, &e.EventType,
+			&e.Content, &e.ToolName, &e.ToolArgs,
+			&e.ToolResult, &toolSuccess, &e.DurationMs,
+			&e.Model, &e.UsageJSON, &e.ErrorMsg, &e.CreatedAt,
+		); err != nil {
+			log.Printf("[DB] GetEvents scan error: %v", err)
+			continue
+		}
+		e.ToolSuccess = toolSuccess == 1
+		result = append(result, e)
+	}
+	// rows.Next returning false can mean "done" or "failed"; Err tells which.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
+// GetRecentSessions returns the N most recent sessions, newest first
+// (ordered by id descending). NULL text columns are returned as "" and NULL
+// numeric columns as 0.
+//
+// Rows that fail to scan are logged and skipped. An error that interrupts the
+// iteration itself is returned, so callers never mistake a truncated result
+// for the complete list.
+func (d *DB) GetRecentSessions(limit int) ([]ChatSessionRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, sessionId, agentId, status,
+		       COALESCE(userMessage,''),
+		       COALESCE(finalResponse,''),
+		       COALESCE(model,''),
+		       COALESCE(totalTokens,0),
+		       COALESCE(processingTimeMs,0),
+		       COALESCE(errorMessage,''),
+		       createdAt, updatedAt
+		FROM chatSessions ORDER BY id DESC LIMIT ?
+	`, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var result []ChatSessionRow
+	for rows.Next() {
+		var s ChatSessionRow
+		if err := rows.Scan(&s.ID, &s.SessionID, &s.AgentID, &s.Status,
+			&s.UserMessage, &s.FinalResponse, &s.Model,
+			&s.TotalTokens, &s.ProcessingTimeMs, &s.ErrorMessage,
+			&s.CreatedAt, &s.UpdatedAt); err != nil {
+			log.Printf("[DB] GetRecentSessions scan error: %v", err)
+			continue
+		}
+		result = append(result, s)
+	}
+	// rows.Next returning false can mean "done" or "failed"; Err tells which.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return result, nil
+}
+
+// nullStr returns s unchanged when it is non-empty and nil when it is empty,
+// so that an empty string is sent to the database as NULL.
+func nullStr(s string) interface{} {
+	if s != "" {
+		return s
+	}
+	return nil
+}
+
+// nullInt returns n unchanged when it is non-zero and nil when it is zero,
+// so that a zero value is sent to the database as NULL.
+func nullInt(n int) interface{} {
+	if n != 0 {
+		return n
+	}
+	return nil
+}
+
+// rawJSON is a JSON document held as a string. Its Value method hands the
+// text to the driver unchanged (so MySQL receives it as-is rather than
+// double-encoded).
+type rawJSON string
+
+// Value implements the driver value conversion for rawJSON: the empty string
+// and the literal "null" become SQL NULL, anything else is passed through as
+// a plain string.
+func (r rawJSON) Value() (driver.Value, error) {
+	switch r {
+	case "", "null":
+		return nil, nil
+	default:
+		return string(r), nil
+	}
+}
+
+// ─── Metrics & History ────────────────────────────────────────────────────────
+
+// MetricInput holds data for a single orchestrator request metric.
+// SaveMetric writes it to the agentMetrics table.
+type MetricInput struct {
+	AgentID         int
+	RequestID       string
+	UserMessage     string // capped at 65535 bytes by SaveMetric
+	AgentResponse   string // capped at 65535 bytes by SaveMetric
+	InputTokens     int
+	OutputTokens    int
+	TotalTokens     int
+	ProcessingTimeMs int64
+	Status          string // "success" | "error" | "timeout"
+	ErrorMessage    string
+	ToolsCalled     []string // stored JSON-encoded in the toolsCalled column
+	Model           string
+}
+
+// SaveMetric inserts a row into the agentMetrics table.
+// Non-fatal — logs on error but does not return one.
+func (d *DB) SaveMetric(m MetricInput) {
+	if d.conn == nil {
+		return
+	}
+	toolsJSON, _ := json.Marshal(m.ToolsCalled)
+	_, err := d.conn.Exec(`
+		INSERT INTO agentMetrics
+			(agentId, requestId, userMessage, agentResponse,
+			 inputTokens, outputTokens, totalTokens,
+			 processingTimeMs, status, errorMessage, toolsCalled, model)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+	`,
+		m.AgentID,
+		m.RequestID,
+		truncate(m.UserMessage, 65535),
+		truncate(m.AgentResponse, 65535),
+		m.InputTokens, m.OutputTokens, m.TotalTokens,
+		m.ProcessingTimeMs,
+		m.Status,
+		m.ErrorMessage,
+		string(toolsJSON),
+		m.Model,
+	)
+	if err != nil {
+		log.Printf("[DB] SaveMetric error: %v", err)
+	}
+}
+
 // HistoryInput holds data for one conversation entry.
 type HistoryInput struct {
 	AgentID        int
@@ -188,14 +573,6 @@ type HistoryInput struct {
 	Status         string // "success" | "error" | "pending"
 }

-// HistoryRow is a single entry from agentHistory for sliding window memory.
-type HistoryRow struct {
-	ID            int    `json:"id"`
-	UserMessage   string `json:"userMessage"`
-	AgentResponse string `json:"agentResponse"`
-	ConvID        string `json:"conversationId"`
-}
-
 // SaveHistory inserts a row into the agentHistory table.
 // Non-fatal — logs on error but does not return one.
 func (d *DB) SaveHistory(h HistoryInput) {
@@ -223,39 +600,7 @@ func (d *DB) SaveHistory(h HistoryInput) {
 	}
 }

-// GetAgentHistory returns the last N conversation turns for an agent, oldest first.
-func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) {
-	if d.conn == nil {
-		return nil, nil
-	}
-	rows, err := d.conn.Query(`
-		SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'')
-		FROM agentHistory
-		WHERE agentId = ?
-		ORDER BY id DESC
-		LIMIT ?
-	`, agentID, limit)
-	if err != nil {
-		return nil, err
-	}
-	defer rows.Close()
-
-	var result []HistoryRow
-	for rows.Next() {
-		var h HistoryRow
-		if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil {
-			continue
-		}
-		result = append(result, h)
-	}
-	// Reverse so oldest is first (for LLM context ordering)
-	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
-		result[i], result[j] = result[j], result[i]
-	}
-	return result, nil
-}
-
-// truncate caps a string to maxLen bytes.
+// truncate caps a string to maxLen bytes (not runes — fast path for DB limits).
 func truncate(s string, maxLen int) string {
 	if len(s) <= maxLen {
 		return s
@@ -263,6 +608,154 @@ func truncate(s string, maxLen int) string {
 	return s[:maxLen]
 }

+// ─── Swarm Node Persistence ───────────────────────────────────────────────────
+
+// SwarmNodeInput is the data shape that handlers pass to UpsertSwarmNodes.
+// Per the original author note it matches the JSON shape of the handler's
+// NodeOut struct, so UpsertSwarmNodes can decode handler values through a
+// JSON round-trip instead of importing the handler package.
+type SwarmNodeInput struct {
+	// ID is written to the swarmNodes.nodeId column.
+	ID            string            `json:"id"`
+	Hostname      string            `json:"hostname"`
+	// Role is stored as-is; the value "manager" additionally sets isManager.
+	Role          string            `json:"role"`
+	// State and Availability are normalised by UpsertSwarmNodes before storage.
+	State         string            `json:"state"`
+	Availability  string            `json:"availability"`
+	// IP is written to the advertiseAddr column.
+	IP            string            `json:"ip"`
+	CPUCores      int               `json:"cpuCores"`
+	MemTotalMB    int64             `json:"memTotalMB"`
+	// DockerVersion is written to the engineVersion column.
+	DockerVersion string            `json:"dockerVersion"`
+	IsLeader      bool              `json:"isLeader"`
+	// ManagerAddr is decoded but not persisted by UpsertSwarmNodes.
+	ManagerAddr   string            `json:"managerAddr"`
+	// Labels is stored as a JSON-encoded string in the labels column.
+	Labels        map[string]string `json:"labels"`
+}
+
+// UpsertSwarmNodes inserts or updates swarm node records in the swarmNodes table.
+//
+// nodes may be any value whose JSON encoding is an array of objects shaped
+// like SwarmNodeInput; it is typed as interface{} so the handler package's own
+// node type can be passed without a circular import. The function is
+// best-effort: it returns nothing, does nothing when the DB is not connected,
+// and logs (rather than returns) every failure. Per the original author note
+// it is intended to be called asynchronously from the SwarmNodes handler.
+func (d *DB) UpsertSwarmNodes(nodes interface{}) {
+	if d.conn == nil {
+		return
+	}
+	// JSON round-trip converts the caller's type into []SwarmNodeInput.
+	b, err := json.Marshal(nodes)
+	if err != nil {
+		log.Printf("[DB] UpsertSwarmNodes: marshal input: %v", err)
+		return
+	}
+	var list []SwarmNodeInput
+	if err := json.Unmarshal(b, &list); err != nil {
+		log.Printf("[DB] UpsertSwarmNodes: decode input: %v", err)
+		return
+	}
+	for _, n := range list {
+		// Encoding a map[string]string cannot fail, so the error is ignored;
+		// a nil map is stored as the JSON literal "null".
+		labelsJSON, _ := json.Marshal(n.Labels)
+		isLeader := 0
+		if n.IsLeader {
+			isLeader = 1
+		}
+		isManager := 0
+		if n.Role == "manager" {
+			isManager = 1
+		}
+		// Unrecognised state/availability values are coerced to a default so
+		// that only the known values reach the table.
+		state := n.State
+		switch state {
+		case "ready", "down", "disconnected":
+		default:
+			state = "ready"
+		}
+		avail := n.Availability
+		switch avail {
+		case "active", "pause", "drain":
+		default:
+			avail = "active"
+		}
+		_, err := d.conn.Exec(`
+			INSERT INTO swarmNodes
+				(nodeId, hostname, role, state, availability, advertiseAddr,
+				 labels, engineVersion, cpuCores, memTotalMB, isManager, isLeader)
+			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+			ON DUPLICATE KEY UPDATE
+				hostname=VALUES(hostname), role=VALUES(role),
+				state=VALUES(state), availability=VALUES(availability),
+				advertiseAddr=VALUES(advertiseAddr),
+				labels=VALUES(labels), engineVersion=VALUES(engineVersion),
+				cpuCores=VALUES(cpuCores), memTotalMB=VALUES(memTotalMB),
+				isManager=VALUES(isManager), isLeader=VALUES(isLeader),
+				lastSeenAt=CURRENT_TIMESTAMP
+		`,
+			n.ID, n.Hostname, n.Role, state, avail, n.IP,
+			string(labelsJSON), n.DockerVersion,
+			n.CPUCores, n.MemTotalMB, isManager, isLeader,
+		)
+		if err != nil {
+			// Keep going: one bad row must not prevent the others from being stored.
+			log.Printf("[DB] UpsertSwarmNodes error for node %s: %v", n.ID, err)
+		}
+	}
+}
+
+// UpsertSwarmTokens writes the given swarm join tokens and manager address to
+// the swarmTokens table using INSERT ... ON DUPLICATE KEY UPDATE.
+//
+// It is a no-op when the DB is not connected and never returns an error; a
+// failed statement is only logged.
+func (d *DB) UpsertSwarmTokens(workerToken, managerToken, managerAddr string) {
+	if d.conn == nil {
+		return
+	}
+	// Note the column order: managerToken precedes workerToken in the statement.
+	if _, execErr := d.conn.Exec(`
+		INSERT INTO swarmTokens (managerToken, workerToken, managerAddr)
+		VALUES (?, ?, ?)
+		ON DUPLICATE KEY UPDATE
+			managerToken=VALUES(managerToken),
+			workerToken=VALUES(workerToken),
+			managerAddr=VALUES(managerAddr)
+	`, managerToken, workerToken, managerAddr); execErr != nil {
+		log.Printf("[DB] UpsertSwarmTokens error: %v", execErr)
+	}
+}
+
+// GetSwarmTokens reads the join tokens and manager address from the
+// swarmTokens row with the highest id.
+//
+// NULL columns are returned as empty strings. An error is returned when the
+// DB is not connected or when the row cannot be scanned (including the case
+// where the table has no rows).
+func (d *DB) GetSwarmTokens() (worker, manager, addr string, err error) {
+	if d.conn == nil {
+		return "", "", "", fmt.Errorf("DB not connected")
+	}
+	err = d.conn.QueryRow(`
+		SELECT COALESCE(workerToken,''), COALESCE(managerToken,''), COALESCE(managerAddr,'')
+		FROM swarmTokens ORDER BY id DESC LIMIT 1
+	`).Scan(&worker, &manager, &addr)
+	return worker, manager, addr, err
+}
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
+// scanAgentConfig decodes one result row into an AgentConfig.
+//
+// The row must yield ten columns in this order: id, name, model, system
+// prompt, allowed-tools JSON, temperature, max tokens, and the three integer
+// flags for orchestrator / system / active. NULL prompt, tools, temperature
+// and max-tokens values are tolerated. The Scan error is returned unchanged.
+func scanAgentConfig(row *sql.Row) (*AgentConfig, error) {
+	var (
+		out                              AgentConfig
+		prompt, toolsJSON                sql.NullString
+		temp                             sql.NullFloat64
+		tokenLimit                       sql.NullInt64
+		orchFlag, systemFlag, activeFlag int
+	)
+
+	if err := row.Scan(
+		&out.ID, &out.Name, &out.Model,
+		&prompt, &toolsJSON,
+		&temp, &tokenLimit,
+		&orchFlag, &systemFlag, &activeFlag,
+	); err != nil {
+		return nil, err
+	}
+
+	out.SystemPrompt = prompt.String
+
+	// A zero value means "use the default" here, whether it came from NULL or
+	// from a stored 0.
+	// NOTE(review): an explicitly stored temperature of 0 therefore becomes 0.5 — confirm this is intended.
+	if out.Temperature = temp.Float64; out.Temperature == 0 {
+		out.Temperature = 0.5
+	}
+	if out.MaxTokens = int(tokenLimit.Int64); out.MaxTokens == 0 {
+		out.MaxTokens = 8192
+	}
+
+	// Flags are stored as integers; only the value 1 counts as true.
+	out.IsOrchestrator = orchFlag == 1
+	out.IsSystem = systemFlag == 1
+	out.IsActive = activeFlag == 1
+
+	// Tool list is optional; a malformed JSON value is ignored and leaves
+	// AllowedTools as whatever Unmarshal managed to set.
+	if raw := toolsJSON.String; toolsJSON.Valid && raw != "" && raw != "null" {
+		_ = json.Unmarshal([]byte(raw), &out.AllowedTools)
+	}
+
+	return &out, nil
+}
+
 // normalizeDSN converts mysql://user:pass@host:port/db to user:pass@tcp(host:port)/db
 func normalizeDSN(dsn string) string {
 	if !strings.HasPrefix(dsn, "mysql://") {
@@ -304,3 +797,60 @@ func normalizeDSN(dsn string) string {
 	}
 	return fmt.Sprintf("%s@tcp(%s)%s?parseTime=true&charset=utf8mb4%s", userInfo, hostPort, dbName, tlsParam)
 }
+
+// ─── Agent Container Fields ───────────────────────────────────────────────────
+// These methods support the agent-worker container architecture, in which
+// each agent runs in its own Docker container.
+
+// UpdateContainerStatus records the container lifecycle state of an agent by
+// setting containerStatus, serviceName and servicePort (and updatedAt = NOW())
+// on the agents row with the given id.
+//
+// It returns nil without touching anything when the DB is not connected;
+// otherwise it returns the error from the UPDATE statement, if any.
+func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+	if d.conn != nil {
+		_, execErr := d.conn.Exec(`
+		UPDATE agents
+		SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW()
+		WHERE id = ?
+	`, status, serviceName, servicePort, agentID)
+		return execErr
+	}
+	return nil
+}
+
+// HistoryRow is a single entry from agentHistory for sliding window memory.
+// It is the element type returned by GetAgentHistory, which fills the fields
+// from the id, userMessage, agentResponse and conversationId columns.
+type HistoryRow struct {
+	ID            int    `json:"id"`
+	UserMessage   string `json:"userMessage"`
+	// AgentResponse and ConvID are empty strings when the column is NULL.
+	AgentResponse string `json:"agentResponse"`
+	ConvID        string `json:"conversationId"`
+}
+
+// GetAgentHistory returns the last N conversation turns for an agent, oldest first.
+//
+// The rows are selected newest-first (ORDER BY id DESC LIMIT ?) and then
+// reversed in memory. When the DB is not connected it returns (nil, nil).
+// A row that fails to scan is logged and skipped; an error raised by the query
+// itself or by the row iteration is returned and no rows are returned with it.
+func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) {
+	if d.conn == nil {
+		return nil, nil
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'')
+		FROM agentHistory
+		WHERE agentId = ?
+		ORDER BY id DESC
+		LIMIT ?
+	`, agentID, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var result []HistoryRow
+	for rows.Next() {
+		var h HistoryRow
+		if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil {
+			log.Printf("[DB] GetAgentHistory scan error: %v", err)
+			continue
+		}
+		result = append(result, h)
+	}
+	// rows.Next returns false both at end-of-data and on failure; without this
+	// check a broken connection would look like a short history.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	// Reverse so oldest is first (for LLM context ordering)
+	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
+		result[i], result[j] = result[j], result[i]
+	}
+	return result, nil
+}
--- a/gateway/internal/docker/client.go
+++ b/gateway/internal/docker/client.go
@@ -1,22 +1,25 @@
 package docker

 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net"
 	"net/http"
+	"os/exec"
+	"strings"
 	"time"
 )

-// DockerClient communicates with the Docker daemon via Unix socket or TCP.
+// DockerClient communicates with the Docker daemon via Unix socket.
 type DockerClient struct {
 	httpClient *http.Client
 	baseURL    string
 }

-// NewDockerClient creates a client that talks to /var/run/docker.sock.
+// NewDockerClient creates a client talking to /var/run/docker.sock.
 func NewDockerClient() *DockerClient {
 	transport := &http.Transport{
 		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
@@ -24,11 +27,13 @@ func NewDockerClient() *DockerClient {
 		},
 	}
 	return &DockerClient{
-		httpClient: &http.Client{Transport: transport, Timeout: 10 * time.Second},
-		baseURL:    "http://localhost", // host is ignored for unix socket
+		httpClient: &http.Client{Transport: transport, Timeout: 30 * time.Second},
+		baseURL:    "http://localhost",
 	}
 }

+// ─── HTTP helpers ─────────────────────────────────────────────────────────────
+
 func (c *DockerClient) get(path string, out interface{}) error {
 	resp, err := c.httpClient.Get(c.baseURL + path)
 	if err != nil {
@@ -42,16 +47,64 @@ func (c *DockerClient) get(path string, out interface{}) error {
 	return json.Unmarshal(body, out)
 }

-// ---- Types ----------------------------------------------------------------
+// post JSON-encodes payload and POSTs it to the Docker API at path.
+//
+// A response status of 400 or above is returned as an error that includes the
+// status code and the response body. On success, if out is non-nil and the
+// response has a body, the body is JSON-decoded into out. A failure while
+// reading a successful response is reported instead of being ignored.
+func (c *DockerClient) post(path string, payload interface{}, out interface{}) error {
+	b, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+	resp, err := c.httpClient.Post(c.baseURL+path, "application/json", bytes.NewReader(b))
+	if err != nil {
+		return fmt.Errorf("docker POST %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+	body, readErr := io.ReadAll(resp.Body)
+	if resp.StatusCode >= 400 {
+		// The status is the primary failure; report it with whatever part of
+		// the body could be read.
+		return fmt.Errorf("docker POST %s: status %d: %s", path, resp.StatusCode, string(body))
+	}
+	if readErr != nil {
+		return fmt.Errorf("docker POST %s: reading response: %w", path, readErr)
+	}
+	if out != nil && len(body) > 0 {
+		return json.Unmarshal(body, out)
+	}
+	return nil
+}
+
+// postUpdate POSTs a JSON-encoded payload to path with a ?version=<version>
+// query parameter appended. It is used by the node and service update calls,
+// which pass the object's current Version.Index.
+//
+// The response body is only used for the error message: any status of 400 or
+// above becomes an error carrying the status code and body text.
+func (c *DockerClient) postUpdate(path string, version int, payload interface{}) error {
+	encoded, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+
+	endpoint := fmt.Sprintf("%s%s?version=%d", c.baseURL, path, version)
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(encoded))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("docker POST(update) %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+
+	// The body is read on every response; a read failure is ignored.
+	respBody, _ := io.ReadAll(resp.Body)
+	if resp.StatusCode < 400 {
+		return nil
+	}
+	return fmt.Errorf("docker POST(update) %s: status %d: %s", path, resp.StatusCode, string(respBody))
+}
+
+// ─── Swarm Node Types ─────────────────────────────────────────────────────────

 type SwarmNode struct {
-	ID          string          `json:"ID"`
+	ID          string         `json:"ID"`
 	Description NodeDescription `json:"Description"`
-	Status      NodeStatus      `json:"Status"`
+	Status      NodeStatus     `json:"Status"`
 	ManagerStatus *ManagerStatus `json:"ManagerStatus,omitempty"`
-	Spec        NodeSpec        `json:"Spec"`
-	UpdatedAt   time.Time       `json:"UpdatedAt"`
-	CreatedAt   time.Time       `json:"CreatedAt"`
+	Spec        NodeSpec       `json:"Spec"`
+	UpdatedAt   time.Time      `json:"UpdatedAt"`
+	CreatedAt   time.Time      `json:"CreatedAt"`
+	Version     VersionInfo    `json:"Version"`
+}
+
+// VersionInfo carries the Version.Index of a swarm node or service. The
+// update helpers send this value back as the ?version= query parameter.
+type VersionInfo struct {
+	Index int `json:"Index"`
+}

 type NodeDescription struct {
@@ -82,17 +135,155 @@ type NodeStatus struct {
 }

 type ManagerStatus struct {
-	Addr           string `json:"Addr"`
-	Leader         bool   `json:"Leader"`
-	Reachability   string `json:"Reachability"`
+	Addr         string `json:"Addr"`
+	Leader       bool   `json:"Leader"`
+	Reachability string `json:"Reachability"`
 }

 type NodeSpec struct {
-	Role         string `json:"Role"`
-	Availability string `json:"Availability"`
+	Role         string            `json:"Role"`
+	Availability string            `json:"Availability"`
 	Labels       map[string]string `json:"Labels"`
 }

+// ─── Swarm Service Types ──────────────────────────────────────────────────────
+
+// SwarmService is the subset of a Docker service object that this client
+// decodes from the /services endpoints.
+type SwarmService struct {
+	ID          string          `json:"ID"`
+	Spec        ServiceSpec     `json:"Spec"`
+	// ServiceStatus stays nil when the response carries no ServiceStatus object.
+	ServiceStatus *ServiceStatus `json:"ServiceStatus,omitempty"`
+	UpdatedAt   time.Time       `json:"UpdatedAt"`
+	CreatedAt   time.Time       `json:"CreatedAt"`
+	// Version.Index is what ScaleService sends back when updating the service.
+	Version     VersionInfo     `json:"Version"`
+}
+
+// ServiceSpec is the service definition that is sent to /services/create and
+// to the service update endpoint, and decoded from service responses.
+// Only the fields declared here survive a decode/encode round-trip.
+type ServiceSpec struct {
+	Name         string            `json:"Name"`
+	Mode         ServiceMode       `json:"Mode"`
+	TaskTemplate TaskTemplate      `json:"TaskTemplate"`
+	EndpointSpec *EndpointSpec     `json:"EndpointSpec,omitempty"`
+	Labels       map[string]string `json:"Labels"`
+	Networks     []NetworkAttachment `json:"Networks,omitempty"`
+}
+
+// NetworkAttachment names one network a service is attached to, together
+// with optional aliases for the service on that network.
+type NetworkAttachment struct {
+	Target  string   `json:"Target"`
+	Aliases []string `json:"Aliases,omitempty"`
+}
+
+// ServiceMode describes how a service is scheduled. Both fields are optional
+// pointers; ScaleService treats a nil Replicated as "not in replicated mode".
+type ServiceMode struct {
+	Replicated *ReplicatedService `json:"Replicated,omitempty"`
+	Global     *struct{}          `json:"Global,omitempty"`
+}
+
+// ReplicatedService holds the desired replica count of a replicated service.
+type ReplicatedService struct {
+	Replicas int `json:"Replicas"`
+}
+
+// TaskTemplate describes the tasks of a service: the container to run plus
+// optional resource settings and placement constraints.
+type TaskTemplate struct {
+	ContainerSpec ContainerSpec `json:"ContainerSpec"`
+	Resources     *TaskResources `json:"Resources,omitempty"`
+	Placement     *Placement    `json:"Placement,omitempty"`
+}
+
+// ContainerSpec is the container part of a task: image, environment
+// variables and container labels.
+type ContainerSpec struct {
+	Image  string   `json:"Image"`
+	Env    []string `json:"Env,omitempty"`
+	Labels map[string]string `json:"Labels,omitempty"`
+}
+
+// TaskResources groups the optional resource limits and reservations of a task.
+type TaskResources struct {
+	Limits       *ResourceSpec `json:"Limits,omitempty"`
+	Reservations *ResourceSpec `json:"Reservations,omitempty"`
+}
+
+// ResourceSpec is one CPU/memory resource setting. Zero-valued fields are
+// omitted from the encoded JSON.
+type ResourceSpec struct {
+	NanoCPUs    int64 `json:"NanoCPUs,omitempty"`
+	MemoryBytes int64 `json:"MemoryBytes,omitempty"`
+}
+
+// Placement holds the placement constraint expressions of a task
+// (for example "node.role==manager").
+type Placement struct {
+	Constraints []string `json:"Constraints,omitempty"`
+}
+
+// EndpointSpec lists the ports a service exposes.
+type EndpointSpec struct {
+	Ports []PortConfig `json:"Ports,omitempty"`
+}
+
+// PortConfig describes one port mapping of a service endpoint.
+// CreateAgentServiceFull fills it with protocol "tcp", publish mode "host"
+// and the same number for target and published port.
+type PortConfig struct {
+	Protocol      string `json:"Protocol"`
+	TargetPort    int    `json:"TargetPort"`
+	PublishedPort int    `json:"PublishedPort"`
+	PublishMode   string `json:"PublishMode"`
+}
+
+// ServiceStatus holds the running / desired / completed task counts that the
+// service endpoints return when queried with status=true.
+type ServiceStatus struct {
+	RunningTasks   int `json:"RunningTasks"`
+	DesiredTasks   int `json:"DesiredTasks"`
+	CompletedTasks int `json:"CompletedTasks"`
+}
+
+// ─── Swarm Task Types ─────────────────────────────────────────────────────────
+
+// SwarmTask is the subset of a Docker task object that this client decodes
+// from the /tasks endpoint.
+type SwarmTask struct {
+	ID          string      `json:"ID"`
+	ServiceID   string      `json:"ServiceID"`
+	NodeID      string      `json:"NodeID"`
+	Spec        TaskSpec    `json:"Spec"`
+	Status      TaskStatus  `json:"Status"`
+	Slot        int         `json:"Slot"`
+	// UpdatedAt is what GetServiceLastActivity compares to find the latest activity.
+	UpdatedAt   time.Time   `json:"UpdatedAt"`
+	CreatedAt   time.Time   `json:"CreatedAt"`
+}
+
+// TaskSpec is the spec of a single task; only its container spec is decoded.
+type TaskSpec struct {
+	ContainerSpec ContainerSpec `json:"ContainerSpec"`
+}
+
+// TaskStatus is the reported status of a task: timestamp, state string,
+// message and, when present, the container the task runs in.
+type TaskStatus struct {
+	Timestamp   time.Time       `json:"Timestamp"`
+	State       string          `json:"State"`
+	Message     string          `json:"Message"`
+	// ContainerStatus stays nil when the response has no ContainerStatus object.
+	ContainerStatus *ContainerTaskStatus `json:"ContainerStatus,omitempty"`
+}
+
+// ContainerTaskStatus identifies the container backing a task by container
+// ID and process ID.
+type ContainerTaskStatus struct {
+	ContainerID string `json:"ContainerID"`
+	PID         int    `json:"PID"`
+}
+
+// ─── Swarm Info / Tokens ──────────────────────────────────────────────────────
+
+// DockerInfo is the part of the /info response this client uses: the Swarm
+// section only.
+type DockerInfo struct {
+	Swarm SwarmInfo `json:"Swarm"`
+}
+
+// SwarmInfo is the Swarm section of the /info response.
+type SwarmInfo struct {
+	NodeID           string `json:"NodeID"`
+	// LocalNodeState is compared against "active" by IsSwarmActive.
+	LocalNodeState   string `json:"LocalNodeState"`
+	ControlAvailable bool   `json:"ControlAvailable"`
+	Managers         int    `json:"Managers"`
+	Nodes            int    `json:"Nodes"`
+	// RemoteManagers is read by GetManagerAddr, which returns the first entry's Addr.
+	RemoteManagers   []RemoteManager `json:"RemoteManagers"`
+}
+
+// RemoteManager is one entry of SwarmInfo.RemoteManagers: a manager node ID
+// and its address.
+type RemoteManager struct {
+	NodeID string `json:"NodeID"`
+	Addr   string `json:"Addr"`
+}
+
+// SwarmSpec is the part of the /swarm response decoded by GetJoinTokens:
+// the swarm ID and its join tokens.
+type SwarmSpec struct {
+	JoinTokens JoinTokens `json:"JoinTokens"`
+	ID         string     `json:"ID"`
+}
+
+// JoinTokens holds the worker and manager join tokens of a swarm.
+type JoinTokens struct {
+	Worker  string `json:"Worker"`
+	Manager string `json:"Manager"`
+}
+
+// ─── Container types ──────────────────────────────────────────────────────────
+
 type Container struct {
 	ID      string            `json:"Id"`
 	Names   []string          `json:"Names"`
@@ -109,9 +300,9 @@ type ContainerStats struct {
 }

 type CPUStats struct {
-	CPUUsage    CPUUsage `json:"cpu_usage"`
-	SystemCPUUsage int64 `json:"system_cpu_usage"`
-	OnlineCPUs  int      `json:"online_cpus"`
+	CPUUsage       CPUUsage `json:"cpu_usage"`
+	SystemCPUUsage int64    `json:"system_cpu_usage"`
+	OnlineCPUs     int      `json:"online_cpus"`
 }

 type CPUUsage struct {
@@ -120,27 +311,14 @@ type CPUUsage struct {
 }

 type MemoryStats struct {
-	Usage    int64 `json:"usage"`
-	MaxUsage int64 `json:"max_usage"`
-	Limit    int64 `json:"limit"`
+	Usage    int64            `json:"usage"`
+	MaxUsage int64            `json:"max_usage"`
+	Limit    int64            `json:"limit"`
 	Stats    map[string]int64 `json:"stats"`
 }

-type DockerInfo struct {
-	Swarm SwarmInfo `json:"Swarm"`
-}
+// ─── Methods: Swarm info ──────────────────────────────────────────────────────

-type SwarmInfo struct {
-	NodeID           string `json:"NodeID"`
-	LocalNodeState   string `json:"LocalNodeState"`
-	ControlAvailable bool   `json:"ControlAvailable"`
-	Managers         int    `json:"Managers"`
-	Nodes            int    `json:"Nodes"`
-}
-
-// ---- Methods ---------------------------------------------------------------
-
-// IsSwarmActive checks if Docker Swarm is initialized.
 func (c *DockerClient) IsSwarmActive() bool {
 	var info DockerInfo
 	if err := c.get("/v1.44/info", &info); err != nil {
@@ -149,7 +327,6 @@ func (c *DockerClient) IsSwarmActive() bool {
 	return info.Swarm.LocalNodeState == "active"
 }

-// GetSwarmInfo returns basic swarm info.
 func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) {
 	var info DockerInfo
 	if err := c.get("/v1.44/info", &info); err != nil {
@@ -158,7 +335,27 @@ func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) {
 	return &info, nil
 }

-// ListNodes returns all Swarm nodes (requires manager node).
+// GetJoinTokens fetches /v1.44/swarm and returns the decoded swarm ID and
+// worker/manager join tokens. Per the original author note, the call only
+// succeeds when this node is a swarm manager.
+func (c *DockerClient) GetJoinTokens() (*SwarmSpec, error) {
+	swarm := new(SwarmSpec)
+	err := c.get("/v1.44/swarm", swarm)
+	if err != nil {
+		return nil, err
+	}
+	return swarm, nil
+}
+
+// GetManagerAddr returns the address of the first remote manager reported in
+// the Swarm section of /info (per the original note, an IP:2377 advertise
+// address used for joining the swarm). It returns "" when the info request
+// fails or no remote managers are listed.
+func (c *DockerClient) GetManagerAddr() string {
+	info, err := c.GetSwarmInfo()
+	if err != nil {
+		return ""
+	}
+	if managers := info.Swarm.RemoteManagers; len(managers) > 0 {
+		return managers[0].Addr
+	}
+	return ""
+}
+
+// ─── Methods: Nodes ───────────────────────────────────────────────────────────
+
 func (c *DockerClient) ListNodes() ([]SwarmNode, error) {
 	var nodes []SwarmNode
 	if err := c.get("/v1.44/nodes", &nodes); err != nil {
@@ -167,7 +364,323 @@ func (c *DockerClient) ListNodes() ([]SwarmNode, error) {
 	return nodes, nil
 }

-// ListContainers returns all running containers on this host.
+// UpdateNodeAvailability sets a node's availability (active|pause|drain).
+//
+// It reads the node's current spec and version, changes only the
+// Availability field and posts the spec back with that version. The node ID
+// is percent-encoded before being placed in the URL path so that characters
+// such as '/', '?' or '#' cannot alter the request target. The availability
+// value itself is not validated here.
+func (c *DockerClient) UpdateNodeAvailability(nodeID, availability string) error {
+	nodePath := "/v1.44/nodes/" + urlEncode(nodeID)
+	// First get current node spec + version
+	var node SwarmNode
+	if err := c.get(nodePath, &node); err != nil {
+		return err
+	}
+	node.Spec.Availability = availability
+	return c.postUpdate(nodePath+"/update", node.Version.Index, node.Spec)
+}
+
+// AddNodeLabel adds a label to a swarm node, overwriting any existing value
+// for the same key.
+//
+// It reads the node's current spec and version, sets the label and posts the
+// spec back with that version. The node ID is percent-encoded before being
+// placed in the URL path so that characters such as '/', '?' or '#' cannot
+// alter the request target.
+func (c *DockerClient) AddNodeLabel(nodeID, key, value string) error {
+	nodePath := "/v1.44/nodes/" + urlEncode(nodeID)
+	var node SwarmNode
+	if err := c.get(nodePath, &node); err != nil {
+		return err
+	}
+	// A node without labels decodes to a nil map, which cannot be written to.
+	if node.Spec.Labels == nil {
+		node.Spec.Labels = map[string]string{}
+	}
+	node.Spec.Labels[key] = value
+	return c.postUpdate(nodePath+"/update", node.Version.Index, node.Spec)
+}
+
+// ─── Methods: Services ────────────────────────────────────────────────────────
+
+// ListServices returns every swarm service known to the daemon. No filter is
+// applied.
+func (c *DockerClient) ListServices() ([]SwarmService, error) {
+	// status=true asks the daemon to include ServiceStatus, so the
+	// running/desired replica counts are part of each result.
+	const path = "/v1.44/services?status=true"
+
+	var list []SwarmService
+	err := c.get(path, &list)
+	if err != nil {
+		return nil, err
+	}
+	return list, nil
+}
+
+// GetService returns a single service by ID or name, including its
+// ServiceStatus (the request is made with status=true).
+//
+// idOrName is percent-encoded before being placed in the URL path, matching
+// RemoveService, so that characters such as '/', '?' or '#' cannot alter the
+// request target.
+func (c *DockerClient) GetService(idOrName string) (*SwarmService, error) {
+	var svc SwarmService
+	if err := c.get("/v1.44/services/"+urlEncode(idOrName)+"?status=true", &svc); err != nil {
+		return nil, err
+	}
+	return &svc, nil
+}
+
+// ScaleService sets the replica count of a replicated service.
+//
+// The service is looked up by ID or name, its decoded spec is modified and
+// posted back to the update endpoint together with the current version
+// index. A service whose mode has no Replicated section yields an error.
+//
+// NOTE(review): the spec sent back is the re-encoded ServiceSpec struct, so
+// any service settings not modelled by that struct are not included in the
+// update — confirm this is acceptable for services created outside this client.
+func (c *DockerClient) ScaleService(idOrName string, replicas int) error {
+	svc, err := c.GetService(idOrName)
+	if err != nil {
+		return err
+	}
+
+	mode := svc.Spec.Mode.Replicated
+	if mode == nil {
+		return fmt.Errorf("service %s is not in replicated mode", idOrName)
+	}
+	// mode points into svc.Spec, so this changes the spec that is posted below.
+	mode.Replicas = replicas
+
+	updatePath := "/v1.44/services/" + svc.ID + "/update"
+	return c.postUpdate(updatePath, svc.Version.Index, svc.Spec)
+}
+
+// ListServiceTasks returns all tasks for a given service.
+//
+// The service is selected with a `filters` query parameter of the form
+// {"service":["<serviceID>"]}. The filter is produced by the JSON encoder so
+// that a serviceID containing quotes or backslashes still yields valid JSON.
+func (c *DockerClient) ListServiceTasks(serviceID string) ([]SwarmTask, error) {
+	filter, err := json.Marshal(map[string][]string{"service": {serviceID}})
+	if err != nil {
+		return nil, err
+	}
+	var tasks []SwarmTask
+	path := "/v1.44/tasks?filters=" + urlEncode(string(filter))
+	if err := c.get(path, &tasks); err != nil {
+		return nil, err
+	}
+	return tasks, nil
+}
+
+// ListAllTasks returns the swarm tasks of every service, unfiltered.
+func (c *DockerClient) ListAllTasks() ([]SwarmTask, error) {
+	var all []SwarmTask
+	err := c.get("/v1.44/tasks", &all)
+	if err != nil {
+		return nil, err
+	}
+	return all, nil
+}
+
+// CreateAgentServiceOpts holds options for deploying an agent Swarm service
+// through CreateAgentServiceFull. CreateAgentService is the short form that
+// fills only Name, Image, Replicas, Env and Port.
+type CreateAgentServiceOpts struct {
+	// Name is the service name; it is also used as the network alias and as
+	// the value of the "goclaw.name" label.
+	Name        string
+	// Image is the container image to run.
+	Image       string
+	// Replicas is the initial replica count.
+	Replicas    int
+	// Env lists the environment variables passed to the container.
+	Env         []string
+	// Port is the optional published port; 0 means no port is published.
+	Port        int
+	Networks    []string // overlay network names/IDs to attach
+	// Labels are merged over the default service labels and win on conflict.
+	Labels      map[string]string
+	Constraints []string // placement constraints, e.g. ["node.role==manager"]
+}
+
+// CreateAgentService deploys a new swarm service for an AI agent using only
+// the basic options: service name, container image, initial replica count,
+// environment variables and an optional published port (0 = none).
+// It delegates to CreateAgentServiceFull with no networks, extra labels or
+// placement constraints.
+func (c *DockerClient) CreateAgentService(name, image string, replicas int, env []string, port int) (*SwarmService, error) {
+	opts := CreateAgentServiceOpts{Name: name, Image: image, Replicas: replicas, Env: env, Port: port}
+	return c.CreateAgentServiceFull(opts)
+}
+
+// CreateAgentServiceFull creates a replicated swarm service from opts and
+// returns the service as read back from the daemon.
+//
+// The service always carries the labels goclaw.agent=true and
+// goclaw.name=<opts.Name>; entries in opts.Labels are applied afterwards and
+// therefore override them. A positive opts.Port is published in "host" mode
+// on the same port number. Every network in opts.Networks is attached with
+// the service name as alias.
+func (c *DockerClient) CreateAgentServiceFull(opts CreateAgentServiceOpts) (*SwarmService, error) {
+	svcLabels := make(map[string]string, len(opts.Labels)+2)
+	svcLabels["goclaw.agent"] = "true"
+	svcLabels["goclaw.name"] = opts.Name
+	for key, val := range opts.Labels {
+		svcLabels[key] = val
+	}
+
+	tmpl := TaskTemplate{
+		ContainerSpec: ContainerSpec{Image: opts.Image, Env: opts.Env},
+	}
+	if len(opts.Constraints) > 0 {
+		tmpl.Placement = &Placement{Constraints: opts.Constraints}
+	}
+
+	spec := ServiceSpec{
+		Name:         opts.Name,
+		Mode:         ServiceMode{Replicated: &ReplicatedService{Replicas: opts.Replicas}},
+		TaskTemplate: tmpl,
+		Labels:       svcLabels,
+	}
+	if opts.Port > 0 {
+		published := PortConfig{
+			Protocol:      "tcp",
+			TargetPort:    opts.Port,
+			PublishedPort: opts.Port,
+			PublishMode:   "host",
+		}
+		spec.EndpointSpec = &EndpointSpec{Ports: []PortConfig{published}}
+	}
+	for _, network := range opts.Networks {
+		attachment := NetworkAttachment{Target: network, Aliases: []string{opts.Name}}
+		spec.Networks = append(spec.Networks, attachment)
+	}
+
+	// The create call only answers with the new ID; fetch the full object.
+	var created struct {
+		ID string `json:"ID"`
+	}
+	if err := c.post("/v1.44/services/create", spec, &created); err != nil {
+		return nil, err
+	}
+	return c.GetService(created.ID)
+}
+
+// RemoveService deletes the swarm service identified by ID or name.
+// A response status of 400 or above is returned as an error that includes
+// the status code and response body.
+func (c *DockerClient) RemoveService(idOrName string) error {
+	target := c.baseURL + "/v1.44/services/" + urlEncode(idOrName)
+	req, err := http.NewRequest(http.MethodDelete, target, nil)
+	if err != nil {
+		return err
+	}
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("docker DELETE service %s: %w", idOrName, err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 400 {
+		return nil
+	}
+	// The body is only read on failure, to enrich the error message.
+	detail, _ := io.ReadAll(resp.Body)
+	return fmt.Errorf("docker DELETE service %s: status %d: %s", idOrName, resp.StatusCode, string(detail))
+}
+
+// GetServiceLastActivity reports the newest UpdatedAt timestamp among the
+// tasks of a service; per the original note, callers use it to decide
+// whether a service is idle. The zero time is returned when the service has
+// no tasks, and together with the error when the task listing fails.
+func (c *DockerClient) GetServiceLastActivity(serviceID string) (time.Time, error) {
+	tasks, err := c.ListServiceTasks(serviceID)
+	if err != nil {
+		return time.Time{}, err
+	}
+
+	newest := time.Time{}
+	for i := range tasks {
+		if ts := tasks[i].UpdatedAt; ts.After(newest) {
+			newest = ts
+		}
+	}
+	return newest, nil
+}
+
+// ─── Methods: Containers ─────────────────────────────────────────────────────
+
+// RunContainerOpts holds options for running a standalone container with
+// RunContainer.
+type RunContainerOpts struct {
+	// Name is the container name and also its alias on every attached network.
+	Name     string
+	Image    string
+	Env      []string
+	// The first network is set at creation time; the rest are connected after start.
+	Networks []string // bridge/overlay networks to attach
+	// A value of 0 or less publishes no port.
+	Port     int      // host port (also used as container port)
+	// Labels are merged over the default goclaw.agent=true label.
+	Labels   map[string]string
+}
+
+// RunContainer creates and starts a standalone Docker container (docker run equivalent).
+//
+// The container is labelled goclaw.agent=true (opts.Labels are merged on top),
+// uses the "unless-stopped" restart policy and, when opts.Port > 0, publishes
+// that TCP port on the same host port. The first entry of opts.Networks is
+// used as the network mode at creation; remaining entries are connected
+// after the container has started. When no network is given (or the first
+// name is empty) no network settings are sent and the daemon's default
+// applies.
+//
+// Returns the container ID on success. If creation succeeds but starting
+// fails, the ID is returned together with the error and the created
+// container is left in place.
+func (c *DockerClient) RunContainer(opts RunContainerOpts) (string, error) {
+	labels := map[string]string{"goclaw.agent": "true"}
+	for k, v := range opts.Labels {
+		labels[k] = v
+	}
+
+	// Build port bindings: host port -> container port
+	exposedPorts := map[string]struct{}{}
+	portBindings := map[string][]map[string]string{}
+	if opts.Port > 0 {
+		key := fmt.Sprintf("%d/tcp", opts.Port)
+		exposedPorts[key] = struct{}{}
+		portBindings[key] = []map[string]string{{"HostPort": fmt.Sprintf("%d", opts.Port)}}
+	}
+
+	hostConfig := map[string]any{
+		"PortBindings":  portBindings,
+		"RestartPolicy": map[string]any{"Name": "unless-stopped"},
+	}
+	body := map[string]any{
+		"Image":        opts.Image,
+		"Env":          opts.Env,
+		"Labels":       labels,
+		"ExposedPorts": exposedPorts,
+		"HostConfig":   hostConfig,
+	}
+
+	// Only describe a network when one was actually requested; an endpoint
+	// entry keyed by the empty string does not name any network.
+	if len(opts.Networks) > 0 && opts.Networks[0] != "" {
+		firstNetwork := opts.Networks[0]
+		hostConfig["NetworkMode"] = firstNetwork
+		body["NetworkingConfig"] = map[string]any{
+			"EndpointsConfig": map[string]any{
+				firstNetwork: map[string]any{
+					"Aliases": []string{opts.Name},
+				},
+			},
+		}
+	}
+
+	var created struct {
+		ID string `json:"Id"`
+	}
+	if err := c.post(fmt.Sprintf("/v1.44/containers/create?name=%s", urlEncode(opts.Name)), body, &created); err != nil {
+		return "", fmt.Errorf("create container %s: %w", opts.Name, err)
+	}
+
+	// Start the container
+	startURL := fmt.Sprintf("/v1.44/containers/%s/start", created.ID)
+	req, err := http.NewRequest(http.MethodPost, c.baseURL+startURL, nil)
+	if err != nil {
+		return created.ID, err
+	}
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return created.ID, fmt.Errorf("start container: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 300 {
+		b, _ := io.ReadAll(resp.Body)
+		return created.ID, fmt.Errorf("start container HTTP %d: %s", resp.StatusCode, string(b))
+	}
+
+	// Attach additional networks
+	for i, network := range opts.Networks {
+		if i == 0 || network == "" {
+			continue // first one is handled at creation; empty names are skipped
+		}
+		netBody := map[string]any{
+			"Container": created.ID,
+			"EndpointConfig": map[string]any{
+				"Aliases": []string{opts.Name},
+			},
+		}
+		// Best effort: the container is already running, so a failed extra
+		// attachment does not fail the whole call.
+		_ = c.post(fmt.Sprintf("/v1.44/networks/%s/connect", urlEncode(network)), netBody, nil)
+	}
+
+	return created.ID, nil
+}
+
+// StopContainer stops and removes a standalone container by name or ID.
+//
+// The stop request is best-effort: its transport error and response status
+// are ignored, because the forced removal that follows is what decides the
+// result. An error is returned when either request cannot be built, when the
+// removal request fails, or when removal answers with status 400 or above.
+func (c *DockerClient) StopContainer(nameOrID string) error {
+	target := c.baseURL + "/v1.44/containers/" + urlEncode(nameOrID)
+
+	// Stop
+	stopReq, err := http.NewRequest(http.MethodPost, target+"/stop", nil)
+	if err != nil {
+		return err
+	}
+	if stopResp, stopErr := c.httpClient.Do(stopReq); stopErr == nil {
+		stopResp.Body.Close()
+	}
+
+	// Remove (force)
+	rmReq, err := http.NewRequest(http.MethodDelete, target+"?force=true", nil)
+	if err != nil {
+		return err
+	}
+	resp, err := c.httpClient.Do(rmReq)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 400 {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("remove container HTTP %d: %s", resp.StatusCode, string(b))
+	}
+	return nil
+}
+
 func (c *DockerClient) ListContainers() ([]Container, error) {
 	var containers []Container
 	if err := c.get("/v1.44/containers/json?all=false", &containers); err != nil {
@@ -176,7 +689,6 @@ func (c *DockerClient) ListContainers() ([]Container, error) {
 	return containers, nil
 }

-// GetContainerStats returns one-shot stats for a container (no streaming).
 func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, error) {
 	var stats ContainerStats
 	if err := c.get(fmt.Sprintf("/v1.44/containers/%s/stats?stream=false", containerID), &stats); err != nil {
@@ -185,7 +697,69 @@ func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, e
 	return &stats, nil
 }

-// CalcCPUPercent computes CPU usage % from two consecutive stats snapshots.
+// ─── Host Shell execution ─────────────────────────────────────────────────────
+// The gateway runs inside a container but has /var/run/docker.sock mounted.
+// We use `docker exec` against the host PID namespace via a privileged helper,
+// OR simply run commands via the docker socket by exec-ing into the gateway
+// container's own shell with nsenter to reach PID 1 on the host.
+//
+// Approach: use `nsenter -t 1 -m -u -i -n -p -- <cmd>` via the host PID namespace.
+// This requires the container to run with --privileged or SYS_PTRACE capability
+// and PID namespace sharing. We add that to docker-compose.yml.
+//
+// Alternative (safer): exec into host via SSH or a privileged sidecar.
+// For now we use nsenter which works when pid:host and privileged: true.
+
+// ExecOnHost runs a shell command on the host via nsenter into PID 1.
+// Returns combined stdout+stderr.
+//
+// The command string is passed unmodified to `sh -c`, so callers must never
+// build it from untrusted input. Both attempts below share one 30-second
+// deadline.
+//
+// NOTE(review): the fallback is taken on ANY error from the nsenter run,
+// which includes the command itself exiting non-zero on the host. In that
+// case the same command is executed a second time inside the container —
+// confirm that double execution is acceptable for every caller.
+// NOTE(review): when the fallback succeeds the caller cannot tell that the
+// command ran in the container rather than on the host.
+func ExecOnHost(command string) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	// Try nsenter (requires pid:host + SYS_ADMIN or privileged)
+	cmd := exec.CommandContext(ctx, "nsenter", "-t", "1", "-m", "-u", "-i", "-n", "-p", "--",
+		"sh", "-c", command)
+	var out bytes.Buffer
+	var stderr bytes.Buffer
+	cmd.Stdout = &out
+	cmd.Stderr = &stderr
+	if err := cmd.Run(); err != nil {
+		// If nsenter fails, fall back to running in container scope
+		cmd2 := exec.CommandContext(ctx, "sh", "-c", command)
+		var out2 bytes.Buffer
+		var stderr2 bytes.Buffer
+		cmd2.Stdout = &out2
+		cmd2.Stderr = &stderr2
+		if err2 := cmd2.Run(); err2 != nil {
+			// Prefer the command's own output; use the error text only when
+			// the command printed nothing.
+			combined := out2.String() + stderr2.String()
+			if combined == "" {
+				combined = err2.Error()
+			}
+			return combined, err2
+		}
+		// Fallback succeeded: the output of the failed nsenter attempt is discarded.
+		return out2.String() + stderr2.String(), nil
+	}
+	return out.String() + stderr.String(), nil
+}
+
+// ExecDockerCLI runs the `docker` CLI binary with the given arguments and a
+// 30-second timeout. Per the original note this relies on the docker binary
+// being available and the Docker socket being mounted.
+//
+// On success only stdout is returned. On failure stdout followed by stderr
+// is returned together with the error.
+func ExecDockerCLI(args ...string) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	var stdoutBuf, stderrBuf bytes.Buffer
+	docker := exec.CommandContext(ctx, "docker", args...)
+	docker.Stdout, docker.Stderr = &stdoutBuf, &stderrBuf
+
+	runErr := docker.Run()
+	if runErr == nil {
+		return stdoutBuf.String(), nil
+	}
+	return stdoutBuf.String() + stderrBuf.String(), runErr
+}
+
+// CalcCPUPercent computes CPU% from stats snapshot.
 func CalcCPUPercent(stats *ContainerStats) float64 {
 	cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
 	systemDelta := float64(stats.CPUStats.SystemCPUUsage) - float64(stats.PreCPUStats.SystemCPUUsage)
@@ -198,3 +772,19 @@ func CalcCPUPercent(stats *ContainerStats) float64 {
 	}
 	return 0
 }
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
+// urlEncode percent-encodes s for use as a URL path segment or query value.
+//
+// ASCII letters, digits and the characters '-', '_', '.', '~' are copied
+// unchanged; every other byte is written as %XX with upper-case hex digits.
+// Encoding works on the UTF-8 bytes of s, so a non-ASCII character becomes
+// one escape per byte (for example "é" -> "%C3%A9").
+func urlEncode(s string) string {
+	const upperHex = "0123456789ABCDEF"
+	var b strings.Builder
+	b.Grow(len(s))
+	// Index by byte rather than ranging over runes: formatting a rune value
+	// would emit a single escape such as %E9 or %65E5, which is not a valid
+	// percent-encoding of the character.
+	for i := 0; i < len(s); i++ {
+		ch := s[i]
+		switch {
+		case ch >= 'A' && ch <= 'Z', ch >= 'a' && ch <= 'z', ch >= '0' && ch <= '9',
+			ch == '-', ch == '_', ch == '.', ch == '~':
+			b.WriteByte(ch)
+		default:
+			b.WriteByte('%')
+			b.WriteByte(upperHex[ch>>4])
+			b.WriteByte(upperHex[ch&0x0F])
+		}
+	}
+	return b.String()
+}
--- a/gateway/internal/llm/client.go
+++ b/gateway/internal/llm/client.go
@@ -2,6 +2,7 @@
 package llm

 import (
+	"bufio"
 	"bytes"
 	"context"
 	"encoding/json"
@@ -105,6 +106,13 @@ func NewClient(baseURL, apiKey string) *Client {
 	}
 }

+// UpdateCredentials replaces the client's base URL (trailing "/" trimmed) and API key at
+// runtime. Called when the active provider is changed via the Settings UI.
+func (c *Client) UpdateCredentials(baseURL, apiKey string) {
+	c.baseURL = strings.TrimRight(baseURL, "/") // NOTE(review): plain field write, no locking here — confirm it cannot race with in-flight requests
+	c.apiKey = apiKey
+}
+
 func (c *Client) headers() map[string]string {
 	h := map[string]string{
 		"Content-Type": "application/json",
@@ -159,7 +167,86 @@ func (c *Client) ListModels(ctx context.Context) (*ModelsResponse, error) {
 	return &result, nil
 }

-// Chat sends a chat completion request (non-streaming).
+// ChatStream sends a streaming chat completion request (SSE). onChunk receives each non-empty
+// content delta with done=false, then one ("", true) once the stream completes without error.
+func (c *Client) ChatStream(ctx context.Context, req ChatRequest, onChunk func(delta string, done bool)) error {
+	req.Stream = true
+
+	body, err := json.Marshal(req)
+	if err != nil {
+		return err
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost,
+		c.baseURL+"/chat/completions", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	for k, v := range c.headers() {
+		httpReq.Header.Set(k, v)
+	}
+	httpReq.Header.Set("Accept", "text/event-stream")
+
+	// Use a client without timeout for streaming
+	streamClient := &http.Client{Timeout: 0}
+	resp, err := streamClient.Do(httpReq)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("ollama stream API error (%d): %s", resp.StatusCode, string(respBody))
+	}
+
+	// Parse SSE stream. Scanner's default 64 KiB token cap fails on long events, so allow 1 MiB lines.
+	scanner := bufio.NewScanner(resp.Body)
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	for scanner.Scan() {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data:") { // the space after the colon is optional in SSE
+			continue
+		}
+		data := strings.TrimSpace(strings.TrimPrefix(line, "data:"))
+		if data == "[DONE]" {
+			onChunk("", true)
+			return nil
+		}
+
+		var chunk struct {
+			Choices []struct {
+				Delta struct {
+					Content string `json:"content"`
+				} `json:"delta"`
+				FinishReason *string `json:"finish_reason"`
+			} `json:"choices"`
+		}
+		if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+			continue
+		}
+		if len(chunk.Choices) > 0 {
+			if delta := chunk.Choices[0].Delta.Content; delta != "" {
+				onChunk(delta, false)
+			}
+			if chunk.Choices[0].FinishReason != nil && *chunk.Choices[0].FinishReason == "stop" {
+				onChunk("", true)
+				return nil
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return err
+	}
+	onChunk("", true)
+	return nil
+}
 func (c *Client) Chat(ctx context.Context, req ChatRequest) (*ChatResponse, error) {
 	req.Stream = false

--- a/gateway/internal/orchestrator/orchestrator.go
+++ b/gateway/internal/orchestrator/orchestrator.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"strings"
 	"time"

 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
@@ -31,13 +32,15 @@ type ToolCallStep struct {
 	DurationMs int64  `json:"durationMs"`
 }

+// ChatResult is the response from the orchestrator chat.
 type ChatResult struct {
-	Success   bool           `json:"success"`
-	Response  string         `json:"response"`
-	ToolCalls []ToolCallStep `json:"toolCalls"`
-	Model     string         `json:"model"`
-	Usage     *llm.Usage     `json:"usage,omitempty"`
-	Error     string         `json:"error,omitempty"`
+	Success      bool           `json:"success"`
+	Response     string         `json:"response"`
+	ToolCalls    []ToolCallStep `json:"toolCalls"`
+	Model        string         `json:"model"`
+	ModelWarning string         `json:"modelWarning,omitempty"`
+	Usage        *llm.Usage     `json:"usage,omitempty"`
+	Error        string         `json:"error,omitempty"`
 }

 // OrchestratorConfig is the runtime config loaded from DB or defaults.
@@ -51,6 +54,30 @@ type OrchestratorConfig struct {
 	MaxTokens    int
 }

+// RetryPolicy controls how the orchestrator retries failed or empty LLM calls.
+type RetryPolicy struct {
+	// MaxLLMRetries is the number of additional attempts after a failure.
+	// Total attempts = MaxLLMRetries + 1.  Default: 3 (4 total).
+	MaxLLMRetries int
+	// InitialDelay before the first retry.  Default: 2s.
+	InitialDelay time.Duration
+	// MaxDelay caps the exponential back-off.  Default: 30s.
+	MaxDelay time.Duration
+	// RetryOnEmpty means an empty-content response is treated as a soft failure
+	// and triggers a retry.  Default: true.
+	RetryOnEmpty bool
+}
+
+// defaultRetryPolicy returns the default retry policy.
+func defaultRetryPolicy() RetryPolicy {
+	return RetryPolicy{
+		MaxLLMRetries: 3,
+		InitialDelay:  2 * time.Second,
+		MaxDelay:      30 * time.Second,
+		RetryOnEmpty:  true,
+	}
+}
+
 // ─── Default System Prompt ────────────────────────────────────────────────────

 const defaultSystemPrompt = `You are GoClaw Orchestrator — the main AI agent managing the GoClaw distributed AI system.
@@ -86,6 +113,7 @@ type Orchestrator struct {
 	executor    *tools.Executor
 	database    *db.DB
 	projectRoot string
+	retry       RetryPolicy
 }

 func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrator {
@@ -93,6 +121,7 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat
 		llmClient:   llmClient,
 		database:    database,
 		projectRoot: projectRoot,
+		retry:       defaultRetryPolicy(),
 	}
 	// Inject agent list function to avoid circular dependency
 	o.executor = tools.NewExecutor(projectRoot, o.listAgentsFn)
@@ -101,6 +130,11 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat
 	return o
 }

+// SetRetryPolicy overrides the default retry policy.
+func (o *Orchestrator) SetRetryPolicy(p RetryPolicy) {
+	o.retry = p
+}
+
 // GetConfig loads orchestrator config from DB, falls back to defaults.
 func (o *Orchestrator) GetConfig() *OrchestratorConfig {
 	if o.database != nil {
@@ -131,25 +165,188 @@ func (o *Orchestrator) GetConfig() *OrchestratorConfig {
 	}
 }

-// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
-func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
-	if maxIter <= 0 {
-		maxIter = 10
+// resolveModel checks whether the desired model appears in the list returned by ListModels
+// (the lookup is capped at 8s). If the list cannot be fetched or is empty, desired is returned
+// unverified with no warning; if it is absent, the first listed model is returned with a warning.
+func (o *Orchestrator) resolveModel(ctx context.Context, desired string) (model string, warning string) {
+	ctxShort, cancel := context.WithTimeout(ctx, 8*time.Second)
+	defer cancel()
+
+	models, err := o.llmClient.ListModels(ctxShort)
+	if err != nil || models == nil || len(models.Data) == 0 {
+		// Cannot verify — use desired model as-is (err is nil here when the list is merely empty)
+		log.Printf("[Orchestrator] Cannot fetch model list: %v — using %q as-is", err, desired)
+		return desired, ""
+	}
+
+	// Check if desired model is available (exact ID match)
+	for _, m := range models.Data {
+		if m.ID == desired {
+			return desired, "" // found — all good
+		}
+	}
+
+	// Desired model not in list — fall back to first available
+	fallback := models.Data[0].ID
+	warning = fmt.Sprintf("model %q not available — using %q instead", desired, fallback)
+	log.Printf("[Orchestrator] WARNING: %s", warning)
+	return fallback, warning
+}
+
+// ─── LLM call with retry ──────────────────────────────────────────────────────
+
+// llmCallResult holds the outcome of callLLMWithRetry, i.e. of the last attempt it made.
+type llmCallResult struct {
+	resp        *llm.ChatResponse // left nil when err is set; may carry zero choices once retries are exhausted
+	usedTools   bool // whether the call was made with tools enabled
+	err         error // non-nil on a hard LLM failure or when the context is done
+	attemptNum  int // 1-based number of the attempt that produced this result
+}
+
+// callLLMWithRetry calls the LLM and retries on error, empty choices or (if enabled) empty content.
+// Tools are sent on the first attempt only; every retry strips them. A done ctx ends the loop at once.
+func (o *Orchestrator) callLLMWithRetry(
+	ctx context.Context,
+	req llm.ChatRequest,
+	model string,
+	onRetry func(attempt int, reason string), // optional event callback (may be nil)
+) llmCallResult {
+	policy := o.retry
+	delay := policy.InitialDelay
+	maxAttempts := policy.MaxLLMRetries + 1
+	hasTools := len(req.Tools) > 0
+
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		// On attempt > 1, always strip tools (avoid repeated tool-format errors)
+		useTools := hasTools && attempt == 1
+		r := req
+		if !useTools {
+			r.Tools = nil
+			r.ToolChoice = ""
+		}
+
+		resp, err := o.llmClient.Chat(ctx, r)
+
+		// ── Context cancelled — checked first so a dead ctx is never retried ──
+		if ctx.Err() != nil {
+			return llmCallResult{err: ctx.Err(), attemptNum: attempt}
+		}
+
+		// ── Hard error (network, auth, etc.) ─────────────────────────
+		if err != nil {
+			reason := fmt.Sprintf("LLM error (attempt %d/%d): %v", attempt, maxAttempts, err)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{err: fmt.Errorf("LLM error after %d attempts (model: %s): %w", maxAttempts, model, err), attemptNum: attempt}
+		}
+
+		// ── Empty choices ─────────────────────────────────────────────
+		if len(resp.Choices) == 0 {
+			reason := fmt.Sprintf("empty choices (attempt %d/%d)", attempt, maxAttempts)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		content := strings.TrimSpace(resp.Choices[0].Message.Content)
+		finishReason := resp.Choices[0].FinishReason
+
+		// ── Empty content AND no tool calls — retry ───────────────────
+		if policy.RetryOnEmpty &&
+			content == "" &&
+			finishReason != "tool_calls" &&
+			len(resp.Choices[0].Message.ToolCalls) == 0 {
+
+			reason := fmt.Sprintf("empty response content (attempt %d/%d, finish_reason=%q)", attempt, maxAttempts, finishReason)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			// Exhausted retries — return what we have (even if empty)
+			log.Printf("[Orchestrator] All %d attempts exhausted — returning empty response", maxAttempts)
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		// ── Success ───────────────────────────────────────────────────
+		if attempt > 1 {
+			log.Printf("[Orchestrator] Succeeded on attempt %d/%d", attempt, maxAttempts)
+		}
+		return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+	}
+
+	// Should not be reached
+	return llmCallResult{err: fmt.Errorf("retry loop exited unexpectedly"), attemptNum: maxAttempts}
+}
+
+// sleep waits for d, returning early if ctx is cancelled.
+func (o *Orchestrator) sleep(ctx context.Context, d time.Duration) {
+	select {
+	case <-ctx.Done():
+	case <-time.After(d):
+	}
+}
+
+// min returns the smaller of two durations; being package-level, it shadows the Go 1.21+ builtin min in this package.
+func min(a, b time.Duration) time.Duration {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// ─── Core loop (shared by Chat and ChatWithEvents) ────────────────────────────
+
+type loopOptions struct {
+	messages      []Message
+	overrideModel string
+	maxIter       int
+	onToolCall    func(ToolCallStep) // may be nil
+	onRetry       func(attempt int, reason string) // may be nil
+}
+
+func (o *Orchestrator) runLoop(ctx context.Context, opts loopOptions) ChatResult {
+	if opts.maxIter <= 0 {
+		opts.maxIter = 10
 	}

 	cfg := o.GetConfig()
 	model := cfg.Model
-	if overrideModel != "" {
-		model = overrideModel
+	if opts.overrideModel != "" {
+		model = opts.overrideModel
 	}

-	log.Printf("[Orchestrator] Chat started: model=%s, messages=%d", model, len(messages))
+	// Validate model against LLM API — fall back if unavailable (prevents 401/404)
+	model, modelWarning := o.resolveModel(ctx, model)
+	log.Printf("[Orchestrator] Loop started: model=%s, messages=%d, maxIter=%d, maxRetries=%d",
+		model, len(opts.messages), opts.maxIter, o.retry.MaxLLMRetries)

 	// Build conversation
 	conv := []llm.Message{
 		{Role: "system", Content: cfg.SystemPrompt},
 	}
-	for _, m := range messages {
+	for _, m := range opts.messages {
 		conv = append(conv, llm.Message{Role: m.Role, Content: m.Content})
 	}

@@ -175,7 +372,7 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 	var lastUsage *llm.Usage
 	var lastModel string

-	for iter := 0; iter < maxIter; iter++ {
+	for iter := 0; iter < opts.maxIter; iter++ {
 		req := llm.ChatRequest{
 			Model:       model,
 			Messages:    conv,
@@ -185,28 +382,22 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			ToolChoice:  "auto",
 		}

-		resp, err := o.llmClient.Chat(ctx, req)
-		if err != nil {
-			// Fallback: try without tools
-			log.Printf("[Orchestrator] LLM error with tools: %v — retrying without tools", err)
-			req.Tools = nil
-			req.ToolChoice = ""
-			resp2, err2 := o.llmClient.Chat(ctx, req)
-			if err2 != nil {
-				return ChatResult{
-					Success: false,
-					Error:   fmt.Sprintf("LLM error (model: %s): %v", model, err2),
-				}
+		// ── LLM call with retry ────────────────────────────────────
+		callRes := o.callLLMWithRetry(ctx, req, model, opts.onRetry)
+
+		if callRes.err != nil {
+			return ChatResult{
+				Success:      false,
+				ToolCalls:    toolCallSteps,
+				Model:        model,
+				ModelWarning: modelWarning,
+				Error:        callRes.err.Error(),
 			}
-			if len(resp2.Choices) > 0 {
-				finalResponse = resp2.Choices[0].Message.Content
-				lastUsage = resp2.Usage
-				lastModel = resp2.Model
-			}
-			break
 		}

+		resp := callRes.resp
 		if len(resp.Choices) == 0 {
+			log.Printf("[Orchestrator] No choices in response — stopping loop at iter %d", iter)
 			break
 		}

@@ -217,19 +408,17 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			lastModel = model
 		}

-		// Check if LLM wants to call tools
+		// ── Tool calls ─────────────────────────────────────────────
 		if choice.FinishReason == "tool_calls" && len(choice.Message.ToolCalls) > 0 {
 			// Add assistant message with tool calls to conversation
 			conv = append(conv, choice.Message)

-			// Execute each tool call
 			for _, tc := range choice.Message.ToolCalls {
 				toolName := tc.Function.Name
 				argsJSON := tc.Function.Arguments

 				log.Printf("[Orchestrator] Executing tool: %s args=%s", toolName, argsJSON)
 				start := time.Now()
-
 				result := o.executor.Execute(ctx, toolName, argsJSON)

 				step := ToolCallStep{
@@ -238,7 +427,6 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 					DurationMs: time.Since(start).Milliseconds(),
 				}

-				// Parse args for display
 				var argsMap any
 				_ = json.Unmarshal([]byte(argsJSON), &argsMap)
 				step.Args = argsMap
@@ -255,7 +443,10 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod

 				toolCallSteps = append(toolCallSteps, step)

-				// Add tool result to conversation
+				if opts.onToolCall != nil {
+					opts.onToolCall(step)
+				}
+
 				conv = append(conv, llm.Message{
 					Role:       "tool",
 					Content:    toolResultContent,
@@ -267,20 +458,70 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			continue
 		}

-		// LLM finished — extract final response
+		// ── Final response ─────────────────────────────────────────
 		finalResponse = choice.Message.Content
 		break
 	}

 	return ChatResult{
-		Success:   true,
-		Response:  finalResponse,
-		ToolCalls: toolCallSteps,
-		Model:     lastModel,
-		Usage:     lastUsage,
+		Success:      true,
+		Response:     finalResponse,
+		ToolCalls:    toolCallSteps,
+		Model:        lastModel,
+		ModelWarning: modelWarning,
+		Usage:        lastUsage,
 	}
 }

+// ─── Public API ───────────────────────────────────────────────────────────────
+
+// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
+func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+	})
+}
+
+// ChatWithEvents runs the full orchestration loop and calls onToolCall for each tool execution
+// (no retry callback is wired here — see ChatWithEventsAndRetry). Used for SSE streaming and DB event logging.
+func (o *Orchestrator) ChatWithEvents(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+	})
+}
+
+// ChatWithEventsAndRetry is the full-featured variant that also reports retry
+// attempts through onRetry so they can be streamed to the client.
+func (o *Orchestrator) ChatWithEventsAndRetry(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+	onRetry func(attempt int, reason string),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+		onRetry:       onRetry,
+	})
+}
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
 // listAgentsFn is injected into the tool executor to list agents from DB.
 func (o *Orchestrator) listAgentsFn() ([]map[string]any, error) {
 	if o.database == nil {
--- a/gateway/internal/tools/executor.go
+++ b/gateway/internal/tools/executor.go
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"

 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
@@ -155,15 +156,44 @@ func OrchestratorTools() []ToolDef {
 		{
 			Type: "function",
 			Function: FuncDef{
-				Name:        "delegate_to_agent",
-				Description: "Delegate a task to a specialized agent (Browser Agent, Tool Builder, Agent Compiler).",
+				Name: "delegate_to_agent",
+				Description: "Delegate a task to a specific agent container via A2A protocol. " +
+					"The agent processes the task with its own LLM and tools. " +
+					"Use async=true for fire-and-forget with callback_url, or sync (default) to wait for result.",
 				Parameters: map[string]any{
 					"type": "object",
 					"properties": map[string]any{
-						"agentId": map[string]any{"type": "number", "description": "Agent ID to delegate to"},
-						"message": map[string]any{"type": "string", "description": "Task description for the agent"},
+						"agentId":     map[string]any{"type": "number", "description": "Target agent ID"},
+						"task":        map[string]any{"type": "string", "description": "Task description / prompt for the agent"},
+						"async":       map[string]any{"type": "boolean", "description": "If true, returns task_id immediately; if false (default), waits for result"},
+						"callbackUrl": map[string]any{"type": "string", "description": "URL to POST result when async=true"},
+						"priority":    map[string]any{"type": "number", "description": "Task priority 0-10 (default 5)"},
+						"timeoutSecs": map[string]any{"type": "number", "description": "Max seconds to wait (default 120)"},
 					},
-					"required":             []string{"agentId", "message"},
+					"required":             []string{"agentId", "task"},
+					"additionalProperties": false,
+				},
+			},
+		},
+		{
+			Type: "function",
+			Function: FuncDef{
+				Name: "fanout_agents",
+				Description: "Send the SAME task to MULTIPLE agents IN PARALLEL and collect all results. " +
+					"Useful when you need different specialists to work on the same problem simultaneously. " +
+					"Returns results from all agents as an array.",
+				Parameters: map[string]any{
+					"type": "object",
+					"properties": map[string]any{
+						"agentIds": map[string]any{
+							"type":        "array",
+							"items":       map[string]any{"type": "number"},
+							"description": "List of agent IDs to send the task to (max 10)",
+						},
+						"task":        map[string]any{"type": "string", "description": "Task to send to all agents"},
+						"timeoutSecs": map[string]any{"type": "number", "description": "Max seconds per agent (default 60)"},
+					},
+					"required":             []string{"agentIds", "task"},
 					"additionalProperties": false,
 				},
 			},
@@ -226,6 +256,8 @@ func (e *Executor) Execute(ctx context.Context, toolName string, argsJSON string
 		result, execErr = e.listAgents()
 	case "delegate_to_agent":
 		result, execErr = e.delegateToAgent(ctx, args)
+	case "fanout_agents":
+		result, execErr = e.fanoutAgents(ctx, args)
 	default:
 		return ToolResult{Success: false, Error: fmt.Sprintf("unknown tool: %s", toolName), DurationMs: ms(start)}
 	}
@@ -456,57 +488,100 @@ func (e *Executor) listAgents() (any, error) {
 	return map[string]any{"agents": agents, "count": len(agents)}, nil
 }

+// A2ATaskRequest is the standard agent-to-agent task message format (Phase C).
+type A2ATaskRequest struct {
+	TaskID      string `json:"task_id"` // delegateToAgent builds it as "orch-<agentID>-<unix millis>"
+	FromAgentID int    `json:"from_agent_id"` // delegateToAgent sends 0 for the orchestrator
+	Task        string `json:"input"` // task text; note the wire name is "input", not "task"
+	CallbackURL string `json:"callback_url,omitempty"` // omitted from the JSON when empty
+	Priority    int    `json:"priority"`
+	TimeoutSecs int    `json:"timeout_secs"`
+}
+
+// delegateToAgent sends a task to a running agent container over HTTP: async=true posts via
+// postA2ATask, otherwise postA2AChat waits for the reply. args: agentId, task (or legacy "message"),
+// async, callbackUrl, priority (default 5), timeoutSecs (default 120). A non-running agent yields a delegated=false note, not an error.
 func (e *Executor) delegateToAgent(ctx context.Context, args map[string]any) (any, error) {
 	agentIDf, _ := args["agentId"].(float64)
 	agentID := int(agentIDf)
+
 	task, _ := args["task"].(string)
 	if task == "" {
 		task, _ = args["message"].(string) // backward compat
 	}
 	if task == "" {
-		return nil, fmt.Errorf("task (or message) is required")
+		return nil, fmt.Errorf("task is required")
 	}
+
 	callbackURL, _ := args["callbackUrl"].(string)
 	async, _ := args["async"].(bool)
 
+	priority := 5
+	if pf, ok := args["priority"].(float64); ok && pf > 0 {
+		priority = int(pf)
+	}
+	timeoutSecs := 120
+	if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 {
+		timeoutSecs = int(tf)
+	}
+
 	// Resolve agent container address from DB
 	if e.database != nil {
 		cfg, err := e.database.GetAgentByID(agentID)
 		if err == nil && cfg != nil && cfg.ServicePort > 0 && cfg.ContainerStatus == "running" {
-			// Agent is deployed — call its container via overlay DNS
-			// Docker Swarm DNS: service name resolves inside overlay network
 			agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort)
-			if async {
-				return e.postAgentTask(ctx, agentURL, agentID, task, callbackURL)
+			req := A2ATaskRequest{
+				TaskID:      fmt.Sprintf("orch-%d-%d", agentID, time.Now().UnixMilli()),
+				FromAgentID: 0, // orchestrator
+				Task:        task,
+				CallbackURL: callbackURL,
+				Priority:    priority,
+				TimeoutSecs: timeoutSecs,
 			}
-			return e.postAgentChat(ctx, agentURL, agentID, task)
+			if async {
+				return e.postA2ATask(ctx, agentURL, req)
+			}
+			return e.postA2AChat(ctx, agentURL, task, timeoutSecs)
+		}
+		// Not delegable: the agent is missing, has no port, or is not running.
+		// Reuse the lookup above instead of querying the database a second time,
+		// and report whatever container status that lookup returned.
+		status := "unknown"
+		if cfg != nil {
+			status = cfg.ContainerStatus
+			if status == "" {
+				status = "stopped"
+			}
+		}
+		return map[string]any{
+			"delegated": false,
+			"agentId":   agentID,
+			"status":    status,
+			"note": fmt.Sprintf(
+				"Agent %d container is %q. Deploy it via Web Panel (POST /api/agents/%d/deploy) then retry.",
+				agentID, status, agentID),
+		}, nil
 	}
 
-	// Fallback: agent not deployed yet — return informational response
 	return map[string]any{
-		"delegated":   false,
-		"agentId":     agentID,
-		"task":        task,
-		"note":        fmt.Sprintf("Agent %d is not running (containerStatus != running). Deploy it first via Web Panel.", agentID),
+		"delegated": false,
+		"agentId":   agentID,
+		"note":      "No database connection — cannot resolve agent address.",
 	}, nil
 }

-// postAgentTask POSTs to agent's /task endpoint (async, returns task_id).
-func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgentID int, task, callbackURL string) (any, error) {
-	payload, _ := json.Marshal(map[string]any{
-		"input":         task,
-		"from_agent_id": fromAgentID,
-		"callback_url":  callbackURL,
-	})
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload))
+// postA2ATask POSTs to agent's /task endpoint using A2A protocol (async).
+func (e *Executor) postA2ATask(ctx context.Context, agentURL string, req A2ATaskRequest) (any, error) {
+	payload, _ := json.Marshal(req)
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload))
 	if err != nil {
-		return nil, fmt.Errorf("delegate build request: %w", err)
+		return nil, fmt.Errorf("a2a build request: %w", err)
 	}
-	req.Header.Set("Content-Type", "application/json")
-	resp, err := e.httpClient.Do(req)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("X-GoClaw-From", "orchestrator")
+	resp, err := e.httpClient.Do(httpReq)
 	if err != nil {
-		return nil, fmt.Errorf("delegate HTTP error: %w", err)
+		return nil, fmt.Errorf("a2a task HTTP error: %w", err)
 	}
 	defer resp.Body.Close()
 	body, _ := io.ReadAll(resp.Body)
@@ -515,19 +590,27 @@ func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgent
 	return result, nil
 }

-// postAgentChat POSTs to agent's /chat endpoint (sync, waits for response).
-func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, task string) (any, error) {
+// postA2AChat POSTs to agent's /chat endpoint (sync, waits for LLM response).
+func (e *Executor) postA2AChat(ctx context.Context, agentURL string, task string, timeoutSecs int) (any, error) {
 	payload, _ := json.Marshal(map[string]any{
-		"messages": []map[string]string{{"role": "user", "content": task}},
+		"messages":    []map[string]string{{"role": "user", "content": task}},
+		"timeout_secs": timeoutSecs,
 	})
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload))
+	chatCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs)*time.Second)
+	defer cancel()
+
+	httpReq, err := http.NewRequestWithContext(chatCtx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload))
 	if err != nil {
-		return nil, fmt.Errorf("delegate build request: %w", err)
+		return nil, fmt.Errorf("a2a chat request: %w", err)
 	}
-	req.Header.Set("Content-Type", "application/json")
-	resp, err := e.httpClient.Do(req)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("X-GoClaw-From", "orchestrator")
+
+	// Use a client with longer timeout for sync chats
+	client := &http.Client{Timeout: time.Duration(timeoutSecs+10) * time.Second}
+	resp, err := client.Do(httpReq)
 	if err != nil {
-		return nil, fmt.Errorf("delegate HTTP error: %w", err)
+		return nil, fmt.Errorf("a2a chat HTTP error: %w", err)
 	}
 	defer resp.Body.Close()
 	body, _ := io.ReadAll(resp.Body)
@@ -536,6 +619,107 @@ func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, ta
 	return result, nil
 }

+// fanoutAgents sends the same task to multiple agents in parallel and collects results.
+// At most 10 agent IDs are used (extras are dropped); results keep the order of agentIds.
+func (e *Executor) fanoutAgents(ctx context.Context, args map[string]any) (any, error) {
+	task, _ := args["task"].(string)
+	if task == "" {
+		return nil, fmt.Errorf("task is required")
+	}
+
+	timeoutSecs := 60
+	if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 {
+		timeoutSecs = int(tf)
+	}
+
+	// Parse agentIds array
+	rawIDs, _ := args["agentIds"].([]any)
+	if len(rawIDs) == 0 {
+		return nil, fmt.Errorf("agentIds must be a non-empty array")
+	}
+	if len(rawIDs) > 10 {
+		rawIDs = rawIDs[:10] // cap at 10
+	}
+
+	type agentResult struct {
+		AgentID    int    `json:"agentId"`
+		AgentName  string `json:"agentName,omitempty"`
+		Success    bool   `json:"success"`
+		Result     any    `json:"result,omitempty"`
+		Error      string `json:"error,omitempty"`
+		Delegated  bool   `json:"delegated"`
+		DurationMs int64  `json:"durationMs"`
+	}
+
+	results := make([]agentResult, len(rawIDs))
+	var wg sync.WaitGroup
+
+	fanCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs+5)*time.Second)
+	defer cancel()
+
+	for i, rawID := range rawIDs {
+		idf, _ := rawID.(float64)
+		agentID := int(idf)
+		idx := i
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			start := time.Now()
+			ar := agentResult{AgentID: agentID}
+			// Record the outcome and elapsed time on every exit path, failures
+			// included; each goroutine writes only its own slot of results.
+			defer func() {
+				ar.DurationMs = ms(start)
+				results[idx] = ar
+			}()
+
+			if e.database == nil {
+				ar.Error = "no database connection"
+				return
+			}
+
+			cfg, err := e.database.GetAgentByID(agentID)
+			if err != nil || cfg == nil {
+				ar.Error = fmt.Sprintf("agent %d not found", agentID)
+				return
+			}
+			ar.AgentName = cfg.Name
+
+			if cfg.ServicePort == 0 || cfg.ContainerStatus != "running" {
+				ar.Error = fmt.Sprintf("agent %q is %q — not running", cfg.Name, cfg.ContainerStatus)
+				return
+			}
+
+			agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort)
+			res, chatErr := e.postA2AChat(fanCtx, agentURL, task, timeoutSecs)
+			if chatErr != nil {
+				ar.Error = chatErr.Error()
+				return
+			}
+			ar.Success = true
+			ar.Delegated = true
+			ar.Result = res
+		}()
+	}
+
+	wg.Wait()
+
+	succeeded := 0
+	for _, r := range results {
+		if r.Success {
+			succeeded++
+		}
+	}
+
+	return map[string]any{
+		"task":      task,
+		"total":     len(results),
+		"succeeded": succeeded,
+		"failed":    len(results) - succeeded,
+		"results":   results,
+	}, nil
+}
+
 // ─── Helpers ──────────────────────────────────────────────────────────────────

 func (e *Executor) resolvePath(path string) string {