From f8e0ca7d5dd133a5be6bb32d8067d63f250a7004 Mon Sep 17 00:00:00 2001
From: bboxwtf <genspark_dev@genspark.ai>
Date: Sun, 19 Apr 2026 11:40:39 +0000
Subject: [PATCH] feat(gateway): restore Phase C full agent lifecycle API

- Restored Phase C gateway code (handlers, main.go, docker client, db)
- Added routes: GET /api/agents/running, POST /api/agents (CRUD),
  POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop,
  POST /api/agents/{id}/restart, POST /api/agents/{id}/scale
- Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id}
  even when serviceName is empty in DB
- Fixed DeployAgent: handle 409 conflict by removing existing container
  and retrying once (idempotent deploy)
- Added swarm_manager.go: background SwarmManager for dead-letter recovery
- Added AGENT_NETWORK and AGENT_DB_URL config options
- Updated .gitignore to exclude gateway binaries
- All agents use standalone docker run (not Swarm) on bridge network

Verified on prod: deploy/stop/restart cycle works correctly,
/api/agents/running returns live running agents with containerStatus
---
 .gitignore                                    |    5 +
 gateway/cmd/agent-worker/main.go              |   90 +-
 gateway/cmd/agent-worker/main_test.go         |  120 +-
 gateway/cmd/agent/main.go                     |  270 +++
 gateway/cmd/gateway/main.go                   |   68 +-
 gateway/config/config.go                      |   22 +
 gateway/go.mod                                |   10 +-
 gateway/go.sum                                |   10 +-
 gateway/internal/api/handlers.go              | 1537 +++++++++++++++++
 gateway/internal/api/handlers_agents_test.go  |  334 ++++
 gateway/internal/api/swarm_manager.go         |  196 +++
 gateway/internal/db/db.go                     |  756 ++++++--
 gateway/internal/docker/client.go             |  670 ++++++-
 gateway/internal/llm/client.go                |   89 +-
 gateway/internal/orchestrator/orchestrator.go |  329 +++-
 gateway/internal/tools/executor.go            |  256 ++-
 16 files changed, 4505 insertions(+), 257 deletions(-)
 create mode 100644 gateway/cmd/agent/main.go
 create mode 100644 gateway/internal/api/handlers_agents_test.go
 create mode 100644 gateway/internal/api/swarm_manager.go

diff --git a/.gitignore b/.gitignore
index 6f04d85..00c980f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,3 +121,8 @@ deploy-secrets
 .kilo/
 .manus/
 AGENTS.md
+
+# Gateway binaries
+gateway/gateway
+gateway/gateway-new
+gateway/agent-worker
diff --git a/gateway/cmd/agent-worker/main.go b/gateway/cmd/agent-worker/main.go
index 2a72a0c..d63d260 100644
--- a/gateway/cmd/agent-worker/main.go
+++ b/gateway/cmd/agent-worker/main.go
@@ -127,30 +127,50 @@ type AgentWorker struct {
 	// Recent tasks ring buffer (для GET /tasks)
 	recentMu   sync.Mutex
 	recentKeys []string
+
+	// Rate-limiting semaphore — limits concurrent LLM calls.
+	// Filled with MAX_CONCURRENT_TASKS tokens; each worker acquires one before
+	// calling runChat() and releases it when done.
+	rateSem chan struct{}
+	// maxConcurrent is the configured concurrency limit (exported for /health).
+	maxConcurrent int
 }
 
 const (
-	taskQueueDepth  = 100
-	maxRecentTasks  = 50
-	defaultMaxIter  = 8
-	defaultTimeout  = 120
-	workerGoroutines = 4 // параллельных воркеров на агента
+	taskQueueDepth        = 100
+	maxRecentTasks        = 50
+	defaultMaxIter        = 8
+	defaultTimeout        = 120
+	workerGoroutines      = 4 // параллельных воркеров на агента
+	defaultMaxConcurrent  = 2 // default simultaneous LLM calls per agent
 )
 
-func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client) (*AgentWorker, error) {
+func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client, maxConcurrent int) (*AgentWorker, error) {
 	cfg, err := database.GetAgentByID(agentID)
 	if err != nil {
 		return nil, fmt.Errorf("agent %d not found in DB: %w", agentID, err)
 	}
 	log.Printf("[AgentWorker] Loaded config: id=%d name=%q model=%s", cfg.ID, cfg.Name, cfg.Model)
 
+	if maxConcurrent <= 0 {
+		maxConcurrent = defaultMaxConcurrent
+	}
+
+	// Fill the semaphore with tokens equal to the concurrency limit.
+	sem := make(chan struct{}, maxConcurrent)
+	for i := 0; i < maxConcurrent; i++ {
+		sem <- struct{}{}
+	}
+
 	w := &AgentWorker{
-		agentID:   agentID,
-		cfg:       cfg,
-		llm:       llmClient,
-		database:  database,
-		taskQueue: make(chan *Task, taskQueueDepth),
-		tasks:     make(map[string]*Task),
+		agentID:       agentID,
+		cfg:           cfg,
+		llm:           llmClient,
+		database:      database,
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: maxConcurrent,
 	}
 	// Tool executor: агент использует подмножество инструментов из allowedTools
 	w.executor = tools.NewExecutor("/app", func() ([]map[string]any, error) {
@@ -234,7 +254,23 @@ func (w *AgentWorker) EnqueueTask(req TaskRequest) *Task {
 }
 
 // processTask выполняет задачу через LLM loop и обновляет её статус.
+// Acquires a rate-limiting token before invoking the LLM to cap concurrent
+// calls at w.maxConcurrent.
 func (w *AgentWorker) processTask(ctx context.Context, task *Task) {
+	// ── Rate limiting: acquire a token ───────────────────────────────────────
+	// If no token is available, block until one frees up or ctx is cancelled.
+	select {
+	case <-ctx.Done():
+		w.tasksMu.Lock()
+		task.Status = TaskCancelled
+		task.Error = "context cancelled before execution"
+		w.tasksMu.Unlock()
+		return
+	case <-w.rateSem:
+		// acquired
+	}
+	defer func() { w.rateSem <- struct{}{} }() // release token
+
 	now := time.Now()
 	w.tasksMu.Lock()
 	task.Status = TaskRunning
@@ -478,12 +514,17 @@ func (w *AgentWorker) postCallback(task *Task) {
 // ─── HTTP Handlers ────────────────────────────────────────────────────────────
 
 func (w *AgentWorker) handleHealth(rw http.ResponseWriter, r *http.Request) {
+	activeSlots := w.maxConcurrent - len(w.rateSem)
+	rw.Header().Set("Content-Type", "application/json")
 	json.NewEncoder(rw).Encode(map[string]any{
-		"status":   "ok",
-		"agentId":  w.agentID,
-		"name":     w.cfg.Name,
-		"model":    w.cfg.Model,
-		"queueLen": len(w.taskQueue),
+		"status":         "ok",
+		"agentId":        w.agentID,
+		"name":           w.cfg.Name,
+		"model":          w.cfg.Model,
+		"queueLen":       len(w.taskQueue),
+		"activeTasks":    activeSlots,
+		"maxConcurrent":  w.maxConcurrent,
+		"rateLimitFree":  len(w.rateSem),
 	})
 }
 
@@ -643,7 +684,16 @@ func main() {
 		log.Fatal("[AgentWorker] DATABASE_URL env var is required")
 	}
 
-	log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s", agentID, port, llmBaseURL)
+	// MAX_CONCURRENT_TASKS controls the rate-limiting semaphore (Phase C).
+	maxConcurrent := defaultMaxConcurrent
+	if mcStr := os.Getenv("MAX_CONCURRENT_TASKS"); mcStr != "" {
+		if mc, err := strconv.Atoi(mcStr); err == nil && mc > 0 {
+			maxConcurrent = mc
+		}
+	}
+
+	log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s MAX_CONCURRENT=%d",
+		agentID, port, llmBaseURL, maxConcurrent)
 
 	// ── DB ───────────────────────────────────────────────────────────────────
 	database, err := db.Connect(dbURL)
@@ -656,7 +706,7 @@ func main() {
 	llmClient := llm.NewClient(llmBaseURL, llmAPIKey)
 
 	// ── Agent Worker ─────────────────────────────────────────────────────────
-	worker, err := newAgentWorker(agentID, database, llmClient)
+	worker, err := newAgentWorker(agentID, database, llmClient, maxConcurrent)
 	if err != nil {
 		log.Fatalf("[AgentWorker] init failed: %v", err)
 	}
@@ -724,4 +774,4 @@ func getEnvFirst(keys ...string) string {
 		}
 	}
 	return ""
-}
\ No newline at end of file
+}
diff --git a/gateway/cmd/agent-worker/main_test.go b/gateway/cmd/agent-worker/main_test.go
index 8245e87..3b917de 100644
--- a/gateway/cmd/agent-worker/main_test.go
+++ b/gateway/cmd/agent-worker/main_test.go
@@ -130,11 +130,18 @@ func TestEnqueueTask_DefaultTimeout(t *testing.T) {
 // ─── HTTP Handlers ────────────────────────────────────────────────────────────
 
 func makeTestWorker() *AgentWorker {
+	mc := defaultMaxConcurrent
+	sem := make(chan struct{}, mc)
+	for i := 0; i < mc; i++ {
+		sem <- struct{}{}
+	}
 	return &AgentWorker{
-		agentID:   42,
-		cfg:       mockAgentConfig(),
-		taskQueue: make(chan *Task, taskQueueDepth),
-		tasks:     make(map[string]*Task),
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: mc,
 	}
 }
 
@@ -435,4 +442,107 @@ func TestWorkerProcessesTask_WithMockLLM(t *testing.T) {
 	if finalStatus != TaskDone {
 		t.Errorf("expected task done, got %s", finalStatus)
 	}
-}
\ No newline at end of file
+}
+
+// ─── Phase C: Rate-limiting tests ─────────────────────────────────────────────
+
+// TestRateLimiting_TokensInitialized verifies that the semaphore is filled with
+// maxConcurrent tokens on worker creation.
+func TestRateLimiting_TokensInitialized(t *testing.T) {
+	mc := 3
+	sem := make(chan struct{}, mc)
+	for i := 0; i < mc; i++ {
+		sem <- struct{}{}
+	}
+	w := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: mc,
+	}
+	if len(w.rateSem) != mc {
+		t.Errorf("expected %d tokens in semaphore, got %d", mc, len(w.rateSem))
+	}
+	if cap(w.rateSem) != mc {
+		t.Errorf("expected semaphore capacity=%d, got %d", mc, cap(w.rateSem))
+	}
+}
+
+// TestRateLimiting_TokenAcquireRelease verifies that tokens can be acquired and
+// released correctly (simulating what processTask does).
+func TestRateLimiting_TokenAcquireRelease(t *testing.T) {
+	mc := 2
+	sem := make(chan struct{}, mc)
+	for i := 0; i < mc; i++ {
+		sem <- struct{}{}
+	}
+	w := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: mc,
+	}
+
+	// Acquire both tokens
+	<-w.rateSem
+	<-w.rateSem
+
+	if len(w.rateSem) != 0 {
+		t.Errorf("expected 0 free tokens after acquiring all, got %d", len(w.rateSem))
+	}
+
+	// Release one token
+	w.rateSem <- struct{}{}
+	if len(w.rateSem) != 1 {
+		t.Errorf("expected 1 free token after release, got %d", len(w.rateSem))
+	}
+
+	// Release second token
+	w.rateSem <- struct{}{}
+	if len(w.rateSem) != mc {
+		t.Errorf("expected %d free tokens after full release, got %d", mc, len(w.rateSem))
+	}
+}
+
+// TestRateLimiting_HealthShowsActiveTasks verifies the /health endpoint reports
+// active task count and rate-limit info.
+func TestRateLimiting_HealthShowsActiveTasks(t *testing.T) {
+	mc := 3
+	sem := make(chan struct{}, mc)
+	for i := 0; i < mc; i++ {
+		sem <- struct{}{}
+	}
+	w := &AgentWorker{
+		agentID:       42,
+		cfg:           mockAgentConfig(),
+		taskQueue:     make(chan *Task, taskQueueDepth),
+		tasks:         make(map[string]*Task),
+		rateSem:       sem,
+		maxConcurrent: mc,
+	}
+	// Simulate 1 active task (consume 1 token)
+	<-w.rateSem
+
+	rr := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, "/health", nil)
+	w.handleHealth(rr, req)
+
+	var body map[string]any
+	if err := json.NewDecoder(rr.Body).Decode(&body); err != nil {
+		t.Fatalf("invalid JSON: %v", err)
+	}
+
+	if int(body["maxConcurrent"].(float64)) != mc {
+		t.Errorf("expected maxConcurrent=%d, got %v", mc, body["maxConcurrent"])
+	}
+	if int(body["rateLimitFree"].(float64)) != mc-1 {
+		t.Errorf("expected rateLimitFree=%d, got %v", mc-1, body["rateLimitFree"])
+	}
+	if int(body["activeTasks"].(float64)) != 1 {
+		t.Errorf("expected activeTasks=1, got %v", body["activeTasks"])
+	}
+}
diff --git a/gateway/cmd/agent/main.go b/gateway/cmd/agent/main.go
new file mode 100644
index 0000000..58ab376
--- /dev/null
+++ b/gateway/cmd/agent/main.go
@@ -0,0 +1,270 @@
+// GoClaw Agent Server — autonomous agent microservice
+//
+// Each agent runs as an independent container in the Docker Swarm overlay
+// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
+// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
+//
+// The agent:
+//   - Receives task requests from the orchestrator
+//   - Calls the LLM via the centrally-managed GoClaw Gateway
+//   - Reads/writes shared state in the MySQL database
+//   - Reports its last-activity time so the SwarmManager can auto-stop it
+//   - Gracefully shuts down after IdleTimeout with no requests
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"syscall"
+	"time"
+)
+
+// ─── Config ──────────────────────────────────────────────────────────────────
+
+type AgentConfig struct {
+	AgentID            string
+	Port               string
+	GatewayURL         string
+	LLMURL             string
+	LLMAPIKey          string
+	DatabaseURL        string
+	IdleTimeoutMinutes int
+}
+
+func loadConfig() AgentConfig {
+	idleMin := 15
+	if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil {
+			idleMin = n
+		}
+	}
+	port := os.Getenv("AGENT_PORT")
+	if port == "" {
+		port = "8080"
+	}
+	return AgentConfig{
+		AgentID:            getEnv("AGENT_ID", "unnamed-agent"),
+		Port:               port,
+		GatewayURL:         getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
+		LLMURL:             getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
+		LLMAPIKey:          os.Getenv("LLM_API_KEY"),
+		DatabaseURL:        os.Getenv("DATABASE_URL"),
+		IdleTimeoutMinutes: idleMin,
+	}
+}
+
+func getEnv(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
+
+// ─── State ───────────────────────────────────────────────────────────────────
+
+type Agent struct {
+	cfg          AgentConfig
+	lastActivity time.Time
+	httpClient   *http.Client
+}
+
+func NewAgent(cfg AgentConfig) *Agent {
+	return &Agent{
+		cfg:          cfg,
+		lastActivity: time.Now(),
+		httpClient:   &http.Client{Timeout: 120 * time.Second},
+	}
+}
+
+func (a *Agent) touch() {
+	a.lastActivity = time.Now()
+}
+
+// ─── HTTP handlers ────────────────────────────────────────────────────────────
+
+// GET /health — liveness probe
+func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
+	respond(w, 200, map[string]any{
+		"ok":           true,
+		"agentId":      a.cfg.AgentID,
+		"lastActivity": a.lastActivity.Format(time.RFC3339),
+		"idleMinutes":  time.Since(a.lastActivity).Minutes(),
+	})
+}
+
+// POST /task — receive a task from the orchestrator
+// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
+func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
+	a.touch()
+	var body struct {
+		SessionID string          `json:"sessionId"`
+		Messages  json.RawMessage `json:"messages"`
+		Model     string          `json:"model"`
+		MaxIter   int             `json:"maxIter"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, 400, "invalid request: "+err.Error())
+		return
+	}
+	// Forward the task to the GoClaw Gateway orchestrator
+	gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
+	reqBody, _ := json.Marshal(map[string]any{
+		"messages": body.Messages,
+		"model":    body.Model,
+		"maxIter":  body.MaxIter,
+	})
+
+	req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
+	if err != nil {
+		respondError(w, 500, "request build error: "+err.Error())
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := a.httpClient.Do(req)
+	if err != nil {
+		respondError(w, 502, "gateway error: "+err.Error())
+		return
+	}
+	defer resp.Body.Close()
+
+	var result map[string]any
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		respondError(w, 502, "gateway response error: "+err.Error())
+		return
+	}
+
+	a.touch()
+	respond(w, 200, map[string]any{
+		"ok":        true,
+		"agentId":   a.cfg.AgentID,
+		"sessionId": body.SessionID,
+		"result":    result,
+	})
+}
+
+// GET /info — agent metadata
+func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
+	hostname, _ := os.Hostname()
+	respond(w, 200, map[string]any{
+		"agentId":       a.cfg.AgentID,
+		"hostname":      hostname,
+		"gatewayUrl":    a.cfg.GatewayURL,
+		"idleTimeout":   a.cfg.IdleTimeoutMinutes,
+		"lastActivity":  a.lastActivity.Format(time.RFC3339),
+		"idleMinutes":   time.Since(a.lastActivity).Minutes(),
+	})
+}
+
+// ─── Idle watchdog ────────────────────────────────────────────────────────────
+
+func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
+	threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
+	ticker := time.NewTicker(30 * time.Second)
+	defer ticker.Stop()
+	for range ticker.C {
+		idle := time.Since(a.lastActivity)
+		if idle >= threshold {
+			log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
+				a.cfg.AgentID, idle.Minutes())
+			a.selfStop()
+			cancel()
+			return
+		}
+	}
+}
+
+// selfStop asks the GoClaw Gateway to scale this service to 0.
+func (a *Agent) selfStop() {
+	url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
+	req, err := http.NewRequest("POST", url, nil)
+	if err != nil {
+		log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
+		return
+	}
+	resp, err := a.httpClient.Do(req)
+	if err != nil {
+		log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
+		return
+	}
+	body, _ := io.ReadAll(resp.Body)
+	resp.Body.Close()
+	log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
+}
+
+// ─── Helpers ─────────────────────────────────────────────────────────────────
+
+func respond(w http.ResponseWriter, status int, data any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	json.NewEncoder(w).Encode(data)
+}
+
+func respondError(w http.ResponseWriter, status int, msg string) {
+	respond(w, status, map[string]any{"error": msg})
+}
+
+// ─── Main ─────────────────────────────────────────────────────────────────────
+
+func main() {
+	log.SetFlags(log.LstdFlags | log.Lshortfile)
+
+	cfg := loadConfig()
+	agent := NewAgent(cfg)
+
+	log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
+		cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
+	log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)
+
+	// ── HTTP server ──────────────────────────────────────────────────────────
+	mux := http.NewServeMux()
+	mux.HandleFunc("GET /health", agent.handleHealth)
+	mux.HandleFunc("POST /task", agent.handleTask)
+	mux.HandleFunc("GET /info", agent.handleInfo)
+
+	srv := &http.Server{
+		Addr:         ":" + cfg.Port,
+		Handler:      mux,
+		ReadTimeout:  30 * time.Second,
+		WriteTimeout: 150 * time.Second,
+		IdleTimeout:  120 * time.Second,
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// ── Idle watchdog ────────────────────────────────────────────────────────
+	go agent.runIdleWatchdog(cancel)
+
+	// ── Graceful shutdown ────────────────────────────────────────────────────
+	quit := make(chan os.Signal, 1)
+	signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
+
+	go func() {
+		log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
+		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
+		}
+	}()
+
+	select {
+	case <-quit:
+		log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
+	case <-ctx.Done():
+		log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
+	}
+
+	shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer shutCancel()
+	if err := srv.Shutdown(shutCtx); err != nil {
+		log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
+	}
+	log.Printf("[Agent %s] Stopped.", cfg.AgentID)
+}
diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go
index 18d3c1b..145d052 100644
--- a/gateway/cmd/gateway/main.go
+++ b/gateway/cmd/gateway/main.go
@@ -47,10 +47,35 @@ func main() {
 
 	// ── Orchestrator ─────────────────────────────────────────────────────────
 	orch := orchestrator.New(llmClient, database, cfg.ProjectRoot)
+	// Apply retry policy from config
+	orch.SetRetryPolicy(orchestrator.RetryPolicy{
+		MaxLLMRetries:  cfg.MaxLLMRetries,
+		InitialDelay:   time.Duration(cfg.RetryDelaySecs) * time.Second,
+		MaxDelay:       30 * time.Second,
+		RetryOnEmpty:   true,
+	})
+	log.Printf("[Gateway] LLM retry policy: maxRetries=%d, initialDelay=%ds", cfg.MaxLLMRetries, cfg.RetryDelaySecs)
 
 	// ── HTTP Handlers ────────────────────────────────────────────────────────
 	h := api.NewHandler(cfg, llmClient, orch, database)
 
+	// ── Sync Swarm tokens to DB on startup ──────────────────────────────────
+	go func() {
+		time.Sleep(3 * time.Second) // wait for Docker daemon readiness
+		if database != nil {
+			dockerCl := h.GetDockerClient()
+			if tokens, err := dockerCl.GetJoinTokens(); err == nil {
+				addr := dockerCl.GetManagerAddr()
+				database.UpsertSwarmTokens(
+					tokens.JoinTokens.Worker,
+					tokens.JoinTokens.Manager,
+					addr,
+				)
+				log.Printf("[Gateway] Swarm tokens synced to DB. Manager addr: %s", addr)
+			}
+		}
+	}()
+
 	// ── Router ───────────────────────────────────────────────────────────────
 	r := chi.NewRouter()
 
@@ -76,11 +101,19 @@ func main() {
 	r.Route("/api", func(r chi.Router) {
 		// Orchestrator
 		r.Post("/orchestrator/chat", h.OrchestratorChat)
+		r.Post("/orchestrator/stream", h.OrchestratorStream)
 		r.Get("/orchestrator/config", h.OrchestratorConfig)
 
-		// Agents
+		// Agents — CRUD + Container lifecycle (Phase A-C)
 		r.Get("/agents", h.ListAgents)
+		r.Get("/agents/running", h.ListRunningAgents) // Phase C: service discovery
+		r.Post("/agents", h.CreateAgent)
 		r.Get("/agents/{id}", h.GetAgent)
+		r.Delete("/agents/{id}", h.DeleteAgent)
+		r.Post("/agents/{id}/deploy", h.DeployAgent)
+		r.Post("/agents/{id}/stop", h.StopAgent)
+		r.Post("/agents/{id}/scale", h.ScaleAgent)
+		r.Post("/agents/{id}/restart", h.RestartAgent) // Phase C: dead-letter restart
 
 		// Models
 		r.Get("/models", h.ListModels)
@@ -92,8 +125,41 @@ func main() {
 		// Nodes / Docker Swarm monitoring
 		r.Get("/nodes", h.ListNodes)
 		r.Get("/nodes/stats", h.NodeStats)
+
+		// Provider config reload (called by Node.js after provider change)
+		r.Post("/providers/reload", h.ProvidersReload)
+
+		// Persistent chat sessions (background processing, DB-backed)
+		r.Post("/chat/session", h.StartChatSession)
+		r.Get("/chat/sessions", h.ListChatSessions)
+		r.Get("/chat/session/{id}", h.GetChatSession)
+		r.Get("/chat/session/{id}/events", h.GetChatEvents)
+
+		// ── Real Docker Swarm Management ─────────────────────────────────────
+		r.Get("/swarm/info", h.SwarmInfo)
+		r.Get("/swarm/nodes", h.SwarmNodes)
+		r.Post("/swarm/nodes/{id}/label", h.SwarmAddNodeLabel)
+		r.Post("/swarm/nodes/{id}/availability", h.SwarmSetNodeAvailability)
+		r.Get("/swarm/services", h.SwarmServices)
+		r.Post("/swarm/services/create", h.SwarmCreateService)
+		r.Delete("/swarm/services/{id}", h.SwarmRemoveService)
+		r.Get("/swarm/services/{id}/tasks", h.SwarmServiceTasks)
+		r.Post("/swarm/services/{id}/scale", h.SwarmScaleService)
+		r.Get("/swarm/join-token", h.SwarmJoinToken)
+		r.Post("/swarm/join-node", h.SwarmJoinNodeViaSSH)
+		r.Post("/swarm/ssh-test", h.SwarmSSHTest)
+		r.Post("/swarm/shell", h.SwarmShell)
+		r.Get("/swarm/agents", h.SwarmListAgents)
+		r.Post("/swarm/agents/{name}/start", h.SwarmStartAgent)
+		r.Post("/swarm/agents/{name}/stop", h.SwarmStopAgent)
 	})
 
+	// ── Swarm Manager: auto-stop idle agents after 15 min ────────────────────
+	swarmMgr := api.NewSwarmManager(h, 60*time.Second)
+	managerCtx, managerCancel := context.WithCancel(context.Background())
+	go swarmMgr.Start(managerCtx)
+	defer managerCancel()
+
 	// ── Start Server ─────────────────────────────────────────────────────────
 	srv := &http.Server{
 		Addr:         ":" + cfg.Port,
diff --git a/gateway/config/config.go b/gateway/config/config.go
index 7945659..7961f48 100644
--- a/gateway/config/config.go
+++ b/gateway/config/config.go
@@ -46,6 +46,22 @@ type Config struct {
 	DefaultModel       string
 	MaxToolIterations  int
 	RequestTimeoutSecs int
+
+	// Docker overlay network for agent containers
+	// AGENT_NETWORK — name of the Docker overlay/bridge network agents are attached to.
+	// Default: goclaw-agents (a dedicated overlay network)
+	AgentNetwork string
+
+	// AGENT_DB_URL — DATABASE_URL passed to agent containers.
+	// Useful when agents run on an overlay network and the DB hostname differs.
+	// Falls back to DatabaseURL if not set.
+	AgentDBURL string
+
+	// LLM retry policy
+	// GATEWAY_MAX_LLM_RETRIES — additional attempts after a failure/empty response (default 3).
+	MaxLLMRetries int
+	// GATEWAY_RETRY_DELAY_SECS — initial delay before first retry in seconds (default 2).
+	RetryDelaySecs int
 }
 
 func Load() *Config {
@@ -55,6 +71,8 @@ func Load() *Config {
 
 	maxIter, _ := strconv.Atoi(getEnv("GATEWAY_MAX_TOOL_ITERATIONS", "10"))
 	timeout, _ := strconv.Atoi(getEnv("GATEWAY_REQUEST_TIMEOUT_SECS", "120"))
+	maxLLMRetries, _ := strconv.Atoi(getEnv("GATEWAY_MAX_LLM_RETRIES", "3"))
+	retryDelaySecs, _ := strconv.Atoi(getEnv("GATEWAY_RETRY_DELAY_SECS", "2"))
 
 	// Resolve LLM base URL — priority: LLM_BASE_URL > OLLAMA_BASE_URL > default cloud
 	rawLLMURL := getEnvFirst(
@@ -82,6 +100,10 @@ func Load() *Config {
 		DefaultModel:       getEnv("DEFAULT_MODEL", "qwen2.5:7b"),
 		MaxToolIterations:  maxIter,
 		RequestTimeoutSecs: timeout,
+		MaxLLMRetries:      maxLLMRetries,
+		RetryDelaySecs:     retryDelaySecs,
+		AgentNetwork:       getEnv("AGENT_NETWORK", "goclaw-agents"),
+		AgentDBURL:         getEnv("AGENT_DB_URL", ""),
 	}
 
 	if cfg.LLMAPIKey == "" {
diff --git a/gateway/go.mod b/gateway/go.mod
index af39820..c9c1135 100644
--- a/gateway/go.mod
+++ b/gateway/go.mod
@@ -3,11 +3,15 @@ module git.softuniq.eu/UniqAI/GoClaw/gateway
 go 1.23.4
 
 require (
-	filippo.io/edwards25519 v1.1.0 // indirect
 	github.com/go-chi/chi/v5 v5.2.1
 	github.com/go-chi/cors v1.2.1
-	github.com/go-sql-driver/mysql v1.8.1 // indirect
+	github.com/go-sql-driver/mysql v1.8.1
 	github.com/google/uuid v1.6.0
-	github.com/jmoiron/sqlx v1.4.0 // indirect
 	github.com/joho/godotenv v1.5.1
+	golang.org/x/crypto v0.37.0
+)
+
+require (
+	filippo.io/edwards25519 v1.1.0 // indirect
+	golang.org/x/sys v0.32.0 // indirect
 )
diff --git a/gateway/go.sum b/gateway/go.sum
index 807180e..6962f72 100644
--- a/gateway/go.sum
+++ b/gateway/go.sum
@@ -8,9 +8,11 @@ github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpv
 github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
-github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
 github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
 github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
-github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
-github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
+golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
+golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
+golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
+golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
diff --git a/gateway/internal/api/handlers.go b/gateway/internal/api/handlers.go
index 2aa64bc..531bf98 100644
--- a/gateway/internal/api/handlers.go
+++ b/gateway/internal/api/handlers.go
@@ -8,6 +8,7 @@ import (
 	"log"
 	"net/http"
 	"strconv"
+	"strings"
 	"time"
 
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/config"
@@ -16,6 +17,7 @@ import (
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/llm"
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/orchestrator"
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/tools"
+	"golang.org/x/crypto/ssh"
 )
 
 // Handler holds all dependencies for HTTP handlers.
@@ -37,6 +39,11 @@ func NewHandler(cfg *config.Config, llmClient *llm.Client, orch *orchestrator.Or
 	}
 }
 
+// GetDockerClient exposes the docker client for use in main.go startup routines.
+func (h *Handler) GetDockerClient() *dockerclient.DockerClient {
+	return h.docker
+}
+
 // ─── Health ───────────────────────────────────────────────────────────────────
 
 // GET /health
@@ -99,6 +106,277 @@ func (h *Handler) OrchestratorConfig(w http.ResponseWriter, r *http.Request) {
 	})
 }
 
+// ─── SSE Stream ───────────────────────────────────────────────────────────────
+
+// SSE event types
+const (
+	sseEventToolCall  = "tool_call"
+	sseEventDelta     = "delta"
+	sseEventDone      = "done"
+	sseEventError     = "error"
+	sseEventThinking  = "thinking"
+)
+
+// streamEvent is a single SSE event sent to the client.
+type streamEvent struct {
+	Type string `json:"type"`
+	// For delta events
+	Content string `json:"content,omitempty"`
+	// For tool_call events
+	Tool       string `json:"tool,omitempty"`
+	Args       any    `json:"args,omitempty"`
+	Result     any    `json:"result,omitempty"`
+	Success    *bool  `json:"success,omitempty"`
+	DurationMs *int64 `json:"durationMs,omitempty"`
+	// For done events
+	Model        string     `json:"model,omitempty"`
+	ModelWarning string     `json:"modelWarning,omitempty"`
+	Usage        *llm.Usage `json:"usage,omitempty"`
+	// For error events
+	Error string `json:"error,omitempty"`
+}
+
+// writeSSE writes a single SSE event to the response writer and flushes.
+func writeSSE(w http.ResponseWriter, flusher http.Flusher, event streamEvent) {
+	data, err := json.Marshal(event)
+	if err != nil {
+		return
+	}
+	fmt.Fprintf(w, "data: %s\n\n", data)
+	flusher.Flush()
+}
+
+// POST /api/orchestrator/stream
+// SSE endpoint: streams tool-call events and LLM delta tokens in real time.
+func (h *Handler) OrchestratorStream(w http.ResponseWriter, r *http.Request) {
+	flusher, ok := w.(http.Flusher)
+	if !ok {
+		respondError(w, http.StatusInternalServerError, "streaming not supported")
+		return
+	}
+
+	var req struct {
+		Messages []orchestrator.Message `json:"messages"`
+		Model    string                 `json:"model,omitempty"`
+		MaxIter  int                    `json:"maxIter,omitempty"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
+		return
+	}
+	if len(req.Messages) == 0 {
+		respondError(w, http.StatusBadRequest, "messages array is required")
+		return
+	}
+
+	// Set SSE headers
+	w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
+	w.Header().Set("Cache-Control", "no-cache")
+	w.Header().Set("Connection", "keep-alive")
+	w.Header().Set("Access-Control-Allow-Origin", "*")
+	w.WriteHeader(http.StatusOK)
+	flusher.Flush()
+
+	log.Printf("[API] POST /api/orchestrator/stream — messages=%d model=%q", len(req.Messages), req.Model)
+
+	// Extract the last user message for history/metrics storage
+	userMessage := ""
+	for i := len(req.Messages) - 1; i >= 0; i-- {
+		if req.Messages[i].Role == "user" {
+			userMessage = req.Messages[i].Content
+			break
+		}
+	}
+
+	// Determine orchestrator agent ID (look for isOrchestrator=1 in DB)
+	orchAgentID := 1 // fallback to agent ID 1
+	if h.db != nil {
+		if cfg, err := h.db.GetOrchestratorConfig(); err == nil && cfg != nil {
+			orchAgentID = cfg.ID
+		}
+	}
+
+	startTime := time.Now()
+
+	ctx, cancel := context.WithTimeout(r.Context(), time.Duration(h.cfg.RequestTimeoutSecs)*time.Second)
+	defer cancel()
+
+	// Run orchestration in a goroutine, streaming events via channel
+	type toolEvent struct {
+		step orchestrator.ToolCallStep
+	}
+	toolCh := make(chan toolEvent, 32)
+	doneCh := make(chan orchestrator.ChatResult, 1)
+
+	// Custom streaming orchestrator
+	go func() {
+		result := h.orch.ChatWithEvents(ctx, req.Messages, req.Model, req.MaxIter, func(step orchestrator.ToolCallStep) {
+			toolCh <- toolEvent{step: step}
+		})
+		close(toolCh)
+		doneCh <- result
+	}()
+
+	// Send thinking event
+	writeSSE(w, flusher, streamEvent{Type: sseEventThinking})
+
+	// Drain tool events
+	for ev := range toolCh {
+		success := ev.step.Success
+		dur := ev.step.DurationMs
+		writeSSE(w, flusher, streamEvent{
+			Type:       sseEventToolCall,
+			Tool:       ev.step.Tool,
+			Args:       ev.step.Args,
+			Result:     ev.step.Result,
+			Success:    &success,
+			DurationMs: &dur,
+			Error:      ev.step.Error,
+		})
+	}
+
+	// Get final result
+	result := <-doneCh
+
+	if !result.Success {
+		writeSSE(w, flusher, streamEvent{Type: sseEventError, Error: result.Error})
+		fmt.Fprintf(w, "data: [DONE]\n\n")
+		flusher.Flush()
+		// Persist error metric + history (fire-and-forget goroutine)
+		if h.db != nil {
+			go func() {
+				reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
+				h.db.SaveMetric(db.MetricInput{
+					AgentID:          orchAgentID,
+					RequestID:        reqID,
+					UserMessage:      userMessage,
+					ProcessingTimeMs: time.Since(startTime).Milliseconds(),
+					Status:           "error",
+					ErrorMessage:     result.Error,
+					Model:            result.Model,
+				})
+				h.db.SaveHistory(db.HistoryInput{
+					AgentID:       orchAgentID,
+					UserMessage:   userMessage,
+					AgentResponse: "",
+					Status:        "error",
+				})
+			}()
+		}
+		return
+	}
+
+	// Stream the response in rune-safe chunks (important for UTF-8 / Cyrillic).
+	// We convert to []rune first so we never split a multi-byte character.
+	const runeChunkSize = 6
+	runes := []rune(result.Response)
+	for i := 0; i < len(runes); i += runeChunkSize {
+		end := i + runeChunkSize
+		if end > len(runes) {
+			end = len(runes)
+		}
+		writeSSE(w, flusher, streamEvent{
+			Type:    sseEventDelta,
+			Content: string(runes[i:end]),
+		})
+		select {
+		case <-ctx.Done():
+			return
+		default:
+		}
+	}
+
+	// Send done event
+	writeSSE(w, flusher, streamEvent{
+		Type:         sseEventDone,
+		Model:        result.Model,
+		ModelWarning: result.ModelWarning,
+		Usage:        result.Usage,
+	})
+	fmt.Fprintf(w, "data: [DONE]\n\n")
+	flusher.Flush()
+
+	// Persist metrics + history asynchronously (never blocks the response)
+	if h.db != nil {
+		go func() {
+			reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
+			var inputTok, outputTok, totalTok int
+			if result.Usage != nil {
+				inputTok = result.Usage.PromptTokens
+				outputTok = result.Usage.CompletionTokens
+				totalTok = result.Usage.TotalTokens
+			}
+			toolNames := make([]string, len(result.ToolCalls))
+			for i, tc := range result.ToolCalls {
+				toolNames[i] = tc.Tool
+			}
+			h.db.SaveMetric(db.MetricInput{
+				AgentID:          orchAgentID,
+				RequestID:        reqID,
+				UserMessage:      userMessage,
+				AgentResponse:    result.Response,
+				InputTokens:      inputTok,
+				OutputTokens:     outputTok,
+				TotalTokens:      totalTok,
+				ProcessingTimeMs: time.Since(startTime).Milliseconds(),
+				Status:           "success",
+				ToolsCalled:      toolNames,
+				Model:            result.Model,
+			})
+			h.db.SaveHistory(db.HistoryInput{
+				AgentID:       orchAgentID,
+				UserMessage:   userMessage,
+				AgentResponse: result.Response,
+				Status:        "success",
+			})
+		}()
+	}
+}
+
+// ─── Providers Reload ─────────────────────────────────────────────────────────
+
+// POST /api/providers/reload
+// Node.js calls this after activating a provider, sending the decrypted API key in the body.
+// Body: { "name": "...", "baseUrl": "...", "apiKey": "...", "modelDefault": "..." }
+func (h *Handler) ProvidersReload(w http.ResponseWriter, r *http.Request) {
+	// Try to read the decrypted credentials from the request body (preferred path)
+	var body struct {
+		Name         string `json:"name"`
+		BaseURL      string `json:"baseUrl"`
+		APIKey       string `json:"apiKey"`
+		ModelDefault string `json:"modelDefault"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err == nil && body.BaseURL != "" {
+		h.llm.UpdateCredentials(body.BaseURL, body.APIKey)
+		log.Printf("[API] Provider reloaded from body: %s (%s)", body.Name, body.BaseURL)
+		respond(w, http.StatusOK, map[string]any{
+			"ok":      true,
+			"name":    body.Name,
+			"baseUrl": body.BaseURL,
+		})
+		return
+	}
+
+	// Fallback: try to read from DB (key will be empty since Go can't decrypt it)
+	if h.db != nil {
+		provider, err := h.db.GetActiveProvider()
+		if err == nil && provider != nil {
+			h.llm.UpdateCredentials(provider.BaseURL, provider.APIKey)
+			log.Printf("[API] Provider reloaded from DB: %s (%s)", provider.Name, provider.BaseURL)
+			respond(w, http.StatusOK, map[string]any{
+				"ok":      true,
+				"name":    provider.Name,
+				"baseUrl": provider.BaseURL,
+			})
+			return
+		}
+		if err != nil {
+			log.Printf("[API] ProvidersReload: DB error: %v", err)
+		}
+	}
+	respond(w, http.StatusOK, map[string]any{"ok": true, "note": "No provider data received"})
+}
+
 // ─── Agents ───────────────────────────────────────────────────────────────────
 
 // GET /api/agents
@@ -135,6 +413,342 @@ func (h *Handler) GetAgent(w http.ResponseWriter, r *http.Request) {
 	respond(w, http.StatusOK, agent)
 }
 
+// POST /api/agents — create agent in DB (no container yet)
+func (h *Handler) CreateAgent(w http.ResponseWriter, r *http.Request) {
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	var in db.CreateAgentInput
+	if err := json.NewDecoder(r.Body).Decode(&in); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
+		return
+	}
+	if in.Name == "" {
+		respondError(w, http.StatusBadRequest, "name is required")
+		return
+	}
+	if in.Model == "" {
+		in.Model = "qwen2.5:7b"
+	}
+	if in.Role == "" {
+		in.Role = "assistant"
+	}
+	id, err := h.db.CreateAgent(in)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to create agent: "+err.Error())
+		return
+	}
+	respond(w, http.StatusCreated, map[string]any{"ok": true, "id": id, "message": "Agent created. Use POST /api/agents/{id}/deploy to start the container."})
+}
+
+// DELETE /api/agents/{id} — delete agent from DB and remove Swarm service if running
+func (h *Handler) DeleteAgent(w http.ResponseWriter, r *http.Request) {
+	idStr := r.PathValue("id")
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid agent id")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	// Try to get agent and stop its container/service if running
+	agent, _ := h.db.GetAgentByID(id)
+	if agent != nil && agent.ServiceName != "" && h.docker != nil {
+		// Try standalone container first
+		if err := h.docker.StopContainer(agent.ServiceName); err != nil {
+			// Fallback: Swarm service
+			svcs, svcErr := h.docker.ListServices()
+			if svcErr == nil {
+				for _, svc := range svcs {
+					if svc.Spec.Name == agent.ServiceName {
+						if rmErr := h.docker.RemoveService(svc.ID); rmErr != nil {
+							log.Printf("[AgentDeploy] warn: remove service %s: %v", svc.Spec.Name, rmErr)
+						}
+						break
+					}
+				}
+			}
+		}
+	}
+	if err := h.db.DeleteAgent(id); err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to delete agent: "+err.Error())
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{"ok": true, "deleted": id})
+}
+
+// POST /api/agents/{id}/deploy — deploy agent as a standalone Docker container
+func (h *Handler) DeployAgent(w http.ResponseWriter, r *http.Request) {
+	idStr := r.PathValue("id")
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid agent id")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+
+	agent, err := h.db.GetAgentByID(id)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "agent not found")
+		return
+	}
+
+	// Assign a free port from pool 8100-8999
+	port, err := h.db.AssignServicePort(8100, 900)
+	if err != nil {
+		respondError(w, http.StatusConflict, "no free port available: "+err.Error())
+		return
+	}
+
+	svcName := fmt.Sprintf("goclaw-agent-%d", id)
+	image := agent.ContainerImage
+	if image == "" {
+		image = "goclaw-agent-worker:latest"
+	}
+
+	// Build env vars for the agent container
+	// Note: agent-worker reads AGENT_PORT (not PORT) for its HTTP listen address
+	envVars := []string{
+		fmt.Sprintf("AGENT_ID=%d", id),
+		fmt.Sprintf("AGENT_PORT=%d", port),
+		fmt.Sprintf("PORT=%d", port), // alias for compatibility
+	}
+	// Pass through gateway env vars for DB and LLM.
+	// AGENT_DB_URL overrides DATABASE_URL for agent containers
+	// (useful when agents run on overlay network with different DB hostname/IP).
+	agentDBURL := h.cfg.AgentDBURL
+	if agentDBURL == "" {
+		// fallback: replace short "db" alias with "goclaw-db" for overlay DNS
+		agentDBURL = strings.ReplaceAll(h.cfg.DatabaseURL, "tcp(db:", "tcp(goclaw-db:")
+	}
+	if agentDBURL != "" {
+		envVars = append(envVars, "DATABASE_URL="+agentDBURL)
+	}
+	if llmURL := h.cfg.LLMBaseURL; llmURL != "" {
+		envVars = append(envVars, "LLM_BASE_URL="+llmURL)
+	}
+	if apiKey := h.cfg.LLMAPIKey; apiKey != "" {
+		envVars = append(envVars, "LLM_API_KEY="+apiKey)
+	}
+
+	// Update DB status to "deploying" before creating service
+	_ = h.db.UpdateContainerStatus(id, "deploying", svcName, port)
+
+	if h.docker == nil {
+		// Docker not available (dev mode) — just update status
+		_ = h.db.UpdateContainerStatus(id, "running", svcName, port)
+		respond(w, http.StatusOK, map[string]any{
+			"ok":          true,
+			"note":        "Docker not available — simulated deploy",
+			"serviceName": svcName,
+			"servicePort": port,
+		})
+		return
+	}
+
+	// Use configurable network (AGENT_NETWORK env var, default: "goclaw-agents")
+	agentNetwork := "goclaw-agents"
+	if h.cfg != nil && h.cfg.AgentNetwork != "" {
+		agentNetwork = h.cfg.AgentNetwork
+	}
+
+	// Deploy agent as a standalone container (docker run) so it can share
+	// the same bridge/overlay network as goclaw-db and goclaw-gateway.
+	runOpts := dockerclient.RunContainerOpts{
+		Name:     svcName,
+		Image:    image,
+		Env:      envVars,
+		Networks: []string{agentNetwork},
+		Port:     port,
+		Labels: map[string]string{
+			"goclaw.agent":    "true",
+			"goclaw.agent.id": fmt.Sprintf("%d", id),
+		},
+	}
+	containerID, err := h.docker.RunContainer(runOpts)
+	if err != nil {
+		// If container already exists (409 Conflict), remove it and retry once
+		if strings.Contains(err.Error(), "409") || strings.Contains(err.Error(), "already in use") {
+			log.Printf("[AgentDeploy] container %s already exists, removing and retrying", svcName)
+			_ = h.docker.StopContainer(svcName)
+			containerID, err = h.docker.RunContainer(runOpts)
+		}
+	}
+	if err != nil {
+		_ = h.db.UpdateContainerStatus(id, "error", svcName, port)
+		respondError(w, http.StatusInternalServerError, "failed to deploy agent container: "+err.Error())
+		return
+	}
+
+	_ = h.db.UpdateContainerStatus(id, "running", svcName, port)
+	log.Printf("[AgentDeploy] Agent %d deployed as container %s (id=%s) on port %d", id, svcName, containerID, port)
+
+	respond(w, http.StatusOK, map[string]any{
+		"ok":          true,
+		"serviceId":   containerID,
+		"serviceName": svcName,
+		"servicePort": port,
+		"image":       image,
+		"agentId":     id,
+	})
+}
+
+// POST /api/agents/{id}/stop — stop and remove the agent container
+func (h *Handler) StopAgent(w http.ResponseWriter, r *http.Request) {
+	idStr := r.PathValue("id")
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid agent id")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	agent, err := h.db.GetAgentByID(id)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "agent not found")
+		return
+	}
+	// Use serviceName from DB if available, otherwise derive from agent ID
+	svcName := agent.ServiceName
+	if svcName == "" {
+		svcName = fmt.Sprintf("goclaw-agent-%d", id)
+	}
+	if h.docker != nil {
+		// Try to stop as standalone container first (always try by canonical name too)
+		_ = h.docker.StopContainer(svcName)
+		// Also try Swarm service fallback
+		svcs, svcErr := h.docker.ListServices()
+		if svcErr == nil {
+			for _, svc := range svcs {
+				if svc.Spec.Name == svcName {
+					_ = h.docker.RemoveService(svc.ID)
+					break
+				}
+			}
+		}
+	}
+	_ = h.db.UpdateContainerStatus(id, "stopped", "", 0)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "stopped": svcName})
+}
+
+// POST /api/agents/{id}/scale — scale replicas of agent Swarm service
+func (h *Handler) ScaleAgent(w http.ResponseWriter, r *http.Request) {
+	idStr := r.PathValue("id")
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid agent id")
+		return
+	}
+	var req struct {
+		Replicas int `json:"replicas"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Replicas < 1 {
+		respondError(w, http.StatusBadRequest, "replicas must be >= 1")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	agent, err := h.db.GetAgentByID(id)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "agent not found")
+		return
+	}
+	if agent.ServiceName == "" {
+		respondError(w, http.StatusBadRequest, "agent has no deployed service — deploy first")
+		return
+	}
+	if h.docker != nil {
+		svcs, err := h.docker.ListServices()
+		if err == nil {
+			for _, svc := range svcs {
+				if svc.Spec.Name == agent.ServiceName {
+					if err := h.docker.ScaleService(svc.ID, req.Replicas); err != nil {
+						respondError(w, http.StatusInternalServerError, "scale error: "+err.Error())
+						return
+					}
+					break
+				}
+			}
+		}
+	}
+	respond(w, http.StatusOK, map[string]any{"ok": true, "serviceName": agent.ServiceName, "replicas": req.Replicas})
+}
+
+// GET /api/agents/running — service discovery: only agents with containerStatus=running
+// Used by agents for peer-discovery (A2A protocol, Phase C).
+func (h *Handler) ListRunningAgents(w http.ResponseWriter, r *http.Request) {
+	if h.db == nil {
+		respond(w, http.StatusOK, map[string]any{"agents": []any{}, "count": 0})
+		return
+	}
+	all, err := h.db.ListAgents()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to list agents: "+err.Error())
+		return
+	}
+	running := make([]db.AgentRow, 0, len(all))
+	for _, a := range all {
+		if a.ContainerStatus == "running" && a.ServicePort > 0 {
+			running = append(running, a)
+		}
+	}
+	respond(w, http.StatusOK, map[string]any{"agents": running, "count": len(running)})
+}
+
+// POST /api/agents/{id}/restart — restart a stopped/error agent (dead-letter recovery).
+// Removes the old Swarm service (if any) and re-deploys.
+func (h *Handler) RestartAgent(w http.ResponseWriter, r *http.Request) {
+	idStr := r.PathValue("id")
+	id, err := strconv.Atoi(idStr)
+	if err != nil {
+		respondError(w, http.StatusBadRequest, "invalid agent id")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	agent, err := h.db.GetAgentByID(id)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "agent not found")
+		return
+	}
+	// Stop existing container/service if present
+	if agent.ServiceName != "" && h.docker != nil {
+		// Try standalone container first
+		if err := h.docker.StopContainer(agent.ServiceName); err != nil {
+			// Fallback: Swarm service
+			svcs, lsErr := h.docker.ListServices()
+			if lsErr == nil {
+				for _, svc := range svcs {
+					if svc.Spec.Name == agent.ServiceName {
+						if rmErr := h.docker.RemoveService(svc.ID); rmErr != nil {
+							log.Printf("[AgentRestart] remove old service %s: %v", svc.Spec.Name, rmErr)
+						}
+						break
+					}
+				}
+			}
+		}
+		// Reset DB status to stopped so DeployAgent can redeploy
+		h.db.UpdateContainerStatus(id, "stopped", "", 0)
+	}
+	// Delegate to DeployAgent logic by calling it directly
+	// We reuse the same request, just changing the path param
+	r2 := r.WithContext(r.Context())
+	h.DeployAgent(w, r2)
+}
+
 // ─── Models ───────────────────────────────────────────────────────────────────
 
 // GET /api/models
@@ -406,3 +1020,926 @@ func round2(f float64) float64 {
 func init() {
 	_ = fmt.Sprintf // suppress unused import
 }
+
+// ─── Persistent Chat Sessions ─────────────────────────────────────────────────
+
+// POST /api/chat/session
+// Creates a DB session, fires off the orchestrator in the background,
+// returns {"sessionId":"..."} immediately. The client polls for events.
+func (h *Handler) StartChatSession(w http.ResponseWriter, r *http.Request) {
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected — persistent sessions unavailable")
+		return
+	}
+
+	var req struct {
+		Messages  []orchestrator.Message `json:"messages"`
+		Model     string                 `json:"model,omitempty"`
+		MaxIter   int                    `json:"maxIter,omitempty"`
+		SessionID string                 `json:"sessionId,omitempty"` // client can supply its own ID
+	}
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid body: "+err.Error())
+		return
+	}
+	if len(req.Messages) == 0 {
+		respondError(w, http.StatusBadRequest, "messages array is required")
+		return
+	}
+
+	// Use client-supplied ID or generate one
+	sessionID := req.SessionID
+	if sessionID == "" {
+		sessionID = fmt.Sprintf("cs-%d", time.Now().UnixNano())
+	}
+
+	// Extract last user message for storage
+	userMessage := ""
+	for i := len(req.Messages) - 1; i >= 0; i-- {
+		if req.Messages[i].Role == "user" {
+			userMessage = req.Messages[i].Content
+			break
+		}
+	}
+
+	// Resolve orchestrator agent ID
+	orchAgentID := 1
+	if cfg, err := h.db.GetOrchestratorConfig(); err == nil && cfg != nil {
+		orchAgentID = cfg.ID
+	}
+
+	// Create session row in DB
+	if err := h.db.CreateSession(sessionID, userMessage, orchAgentID); err != nil {
+		respondError(w, http.StatusInternalServerError, "failed to create session: "+err.Error())
+		return
+	}
+
+	maxIter := req.MaxIter
+	if maxIter <= 0 {
+		maxIter = 10
+	}
+	model := req.Model
+
+	// Snapshot messages + config for the goroutine
+	messages := req.Messages
+
+	// Launch orchestration in a fully detached goroutine.
+	// This goroutine runs independently — survives HTTP disconnect.
+	go func() {
+		startTime := time.Now()
+
+		// Append initial "thinking" event
+		_ = h.db.AppendEvent(db.ChatEventRow{
+			SessionID: sessionID,
+			EventType: "thinking",
+		})
+
+		ctx, cancel := context.WithTimeout(context.Background(),
+			time.Duration(h.cfg.RequestTimeoutSecs)*time.Second)
+		defer cancel()
+
+		result := h.orch.ChatWithEventsAndRetry(ctx, messages, model, maxIter,
+			// onToolCall — store each tool execution as an event
+			func(step orchestrator.ToolCallStep) {
+				argsJSON, _ := json.Marshal(step.Args)
+				resultStr := ""
+				if step.Result != nil {
+					b, _ := json.Marshal(step.Result)
+					resultStr = string(b)
+				}
+				_ = h.db.AppendEvent(db.ChatEventRow{
+					SessionID:   sessionID,
+					EventType:   "tool_call",
+					ToolName:    step.Tool,
+					ToolArgs:    string(argsJSON),
+					ToolResult:  resultStr,
+					ToolSuccess: step.Success,
+					DurationMs:  int(step.DurationMs),
+					ErrorMsg:    step.Error,
+				})
+			},
+			// onRetry — emit a "thinking" event so the client sees retry progress
+			func(attempt int, reason string) {
+				msg := fmt.Sprintf("⟳ Retry %d: %s", attempt, reason)
+				log.Printf("[Orchestrator] %s", msg)
+				_ = h.db.AppendEvent(db.ChatEventRow{
+					SessionID: sessionID,
+					EventType: "thinking",
+					Content:   msg,
+				})
+			},
+		)
+
+		processingMs := time.Since(startTime).Milliseconds()
+
+		if !result.Success {
+			_ = h.db.AppendEvent(db.ChatEventRow{
+				SessionID: sessionID,
+				EventType: "error",
+				ErrorMsg:  result.Error,
+			})
+			h.db.MarkSessionDone(sessionID, "error", "", result.Model, result.Error, 0, processingMs)
+			return
+		}
+
+		// Append full response as a single delta (client will display it)
+		_ = h.db.AppendEvent(db.ChatEventRow{
+			SessionID: sessionID,
+			EventType: "delta",
+			Content:   result.Response,
+		})
+
+		// Append done event
+		totalTok := 0
+		usageStr := "null"
+		if result.Usage != nil {
+			totalTok = result.Usage.TotalTokens
+			b, _ := json.Marshal(result.Usage)
+			usageStr = string(b)
+		}
+		_ = h.db.AppendEvent(db.ChatEventRow{
+			SessionID: sessionID,
+			EventType: "done",
+			Model:     result.Model,
+			UsageJSON: usageStr,
+		})
+
+		h.db.MarkSessionDone(sessionID, "done", result.Response, result.Model, "", totalTok, processingMs)
+
+		// Also save to legacy metrics/history tables
+		reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
+		toolNames := make([]string, len(result.ToolCalls))
+		for i, tc := range result.ToolCalls {
+			toolNames[i] = tc.Tool
+		}
+		inputTok, outputTok := 0, 0
+		if result.Usage != nil {
+			inputTok = result.Usage.PromptTokens
+			outputTok = result.Usage.CompletionTokens
+		}
+		h.db.SaveMetric(db.MetricInput{
+			AgentID:          orchAgentID,
+			RequestID:        reqID,
+			UserMessage:      userMessage,
+			AgentResponse:    result.Response,
+			InputTokens:      inputTok,
+			OutputTokens:     outputTok,
+			TotalTokens:      totalTok,
+			ProcessingTimeMs: processingMs,
+			Status:           "success",
+			ToolsCalled:      toolNames,
+			Model:            result.Model,
+		})
+		h.db.SaveHistory(db.HistoryInput{
+			AgentID:       orchAgentID,
+			UserMessage:   userMessage,
+			AgentResponse: result.Response,
+			Status:        "success",
+		})
+	}()
+
+	respond(w, http.StatusOK, map[string]any{
+		"sessionId": sessionID,
+		"status":    "running",
+	})
+}
+
+// GET /api/chat/session/:id
+func (h *Handler) GetChatSession(w http.ResponseWriter, r *http.Request) {
+	sessionID := r.PathValue("id")
+	if sessionID == "" {
+		respondError(w, http.StatusBadRequest, "sessionId required")
+		return
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	sess, err := h.db.GetSession(sessionID)
+	if err != nil {
+		respondError(w, http.StatusNotFound, "session not found")
+		return
+	}
+	respond(w, http.StatusOK, sess)
+}
+
+// GET /api/chat/session/:id/events?after=N
+func (h *Handler) GetChatEvents(w http.ResponseWriter, r *http.Request) {
+	sessionID := r.PathValue("id")
+	if sessionID == "" {
+		respondError(w, http.StatusBadRequest, "sessionId required")
+		return
+	}
+	afterSeq := 0
+	if v := r.URL.Query().Get("after"); v != "" {
+		fmt.Sscanf(v, "%d", &afterSeq)
+	}
+	if h.db == nil {
+		respondError(w, http.StatusServiceUnavailable, "DB not connected")
+		return
+	}
+	events, err := h.db.GetEvents(sessionID, afterSeq)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	// Also return session status so client knows when to stop polling
+	var status string
+	if sess, err := h.db.GetSession(sessionID); err == nil {
+		status = sess.Status
+	}
+	respond(w, http.StatusOK, map[string]any{
+		"sessionId": sessionID,
+		"status":    status,
+		"events":    events,
+	})
+}
+
+// GET /api/chat/sessions?limit=N
+func (h *Handler) ListChatSessions(w http.ResponseWriter, r *http.Request) {
+	if h.db == nil {
+		respond(w, http.StatusOK, map[string]any{"sessions": []any{}})
+		return
+	}
+	limit := 50
+	if v := r.URL.Query().Get("limit"); v != "" {
+		fmt.Sscanf(v, "%d", &limit)
+	}
+	sessions, err := h.db.GetRecentSessions(limit)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{"sessions": sessions})
+}
+
+// ─── Real Docker Swarm Management ─────────────────────────────────────────────
+
+// GET /api/swarm/info
+// Returns swarm status, node count, join tokens, and manager address.
+func (h *Handler) SwarmInfo(w http.ResponseWriter, r *http.Request) {
+	info, err := h.docker.GetSwarmInfo()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "docker info error: "+err.Error())
+		return
+	}
+	tokens, err := h.docker.GetJoinTokens()
+	if err != nil {
+		tokens = nil
+	}
+	managerAddr := h.docker.GetManagerAddr()
+
+	result := map[string]any{
+		"nodeId":          info.Swarm.NodeID,
+		"localNodeState":  info.Swarm.LocalNodeState,
+		"isManager":       info.Swarm.ControlAvailable,
+		"managers":        info.Swarm.Managers,
+		"nodes":           info.Swarm.Nodes,
+		"managerAddr":     managerAddr,
+	}
+	if tokens != nil {
+		result["joinTokens"] = map[string]string{
+			"worker":  tokens.JoinTokens.Worker,
+			"manager": tokens.JoinTokens.Manager,
+		}
+	}
+	respond(w, http.StatusOK, result)
+}
+
+// GET /api/swarm/nodes
+// Returns all swarm nodes with their live status, labels, and resource info.
+func (h *Handler) SwarmNodes(w http.ResponseWriter, r *http.Request) {
+	nodes, err := h.docker.ListNodes()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list nodes: "+err.Error())
+		return
+	}
+
+	type NodeOut struct {
+		ID           string            `json:"id"`
+		Hostname     string            `json:"hostname"`
+		Role         string            `json:"role"`
+		State        string            `json:"state"`
+		Availability string            `json:"availability"`
+		IP           string            `json:"ip"`
+		OS           string            `json:"os"`
+		Arch         string            `json:"arch"`
+		CPUCores     int               `json:"cpuCores"`
+		MemTotalMB   int64             `json:"memTotalMB"`
+		DockerVersion string           `json:"dockerVersion"`
+		IsLeader     bool              `json:"isLeader"`
+		ManagerAddr  string            `json:"managerAddr,omitempty"`
+		Labels       map[string]string `json:"labels"`
+		UpdatedAt    string            `json:"updatedAt"`
+	}
+
+	out := make([]NodeOut, 0, len(nodes))
+	for _, n := range nodes {
+		no := NodeOut{
+			ID:            n.ID[:min(12, len(n.ID))],
+			Hostname:      n.Description.Hostname,
+			Role:          n.Spec.Role,
+			State:         n.Status.State,
+			Availability:  n.Spec.Availability,
+			IP:            n.Status.Addr,
+			OS:            n.Description.Platform.OS,
+			Arch:          n.Description.Platform.Architecture,
+			CPUCores:      int(n.Description.Resources.NanoCPUs / 1e9),
+			MemTotalMB:    n.Description.Resources.MemoryBytes / (1024 * 1024),
+			DockerVersion: n.Description.Engine.EngineVersion,
+			Labels:        n.Spec.Labels,
+			UpdatedAt:     n.UpdatedAt.UTC().Format(time.RFC3339),
+		}
+		if no.Labels == nil {
+			no.Labels = map[string]string{}
+		}
+		if n.ManagerStatus != nil {
+			no.IsLeader = n.ManagerStatus.Leader
+			no.ManagerAddr = n.ManagerStatus.Addr
+		}
+		out = append(out, no)
+	}
+
+	// Persist/update nodes in DB for history
+	if h.db != nil {
+		go h.db.UpsertSwarmNodes(out)
+	}
+
+	respond(w, http.StatusOK, map[string]any{
+		"nodes":    out,
+		"count":    len(out),
+		"fetchedAt": time.Now().UTC().Format(time.RFC3339),
+	})
+}
+
+// POST /api/swarm/nodes/{id}/label
+// Body: { "key": "gpu", "value": "true" }
+func (h *Handler) SwarmAddNodeLabel(w http.ResponseWriter, r *http.Request) {
+	nodeID := r.PathValue("id")
+	if nodeID == "" {
+		respondError(w, http.StatusBadRequest, "nodeId required")
+		return
+	}
+	var body struct {
+		Key   string `json:"key"`
+		Value string `json:"value"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Key == "" {
+		respondError(w, http.StatusBadRequest, "key required in body")
+		return
+	}
+	if err := h.docker.AddNodeLabel(nodeID, body.Key, body.Value); err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{"ok": true})
+}
+
+// POST /api/swarm/nodes/{id}/availability
+// Body: { "availability": "active|pause|drain" }
+func (h *Handler) SwarmSetNodeAvailability(w http.ResponseWriter, r *http.Request) {
+	nodeID := r.PathValue("id")
+	var body struct {
+		Availability string `json:"availability"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid body")
+		return
+	}
+	if body.Availability != "active" && body.Availability != "pause" && body.Availability != "drain" {
+		respondError(w, http.StatusBadRequest, "availability must be active|pause|drain")
+		return
+	}
+	if err := h.docker.UpdateNodeAvailability(nodeID, body.Availability); err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{"ok": true})
+}
+
+// GET /api/swarm/services
+// Returns all swarm services with replica counts and task status.
+func (h *Handler) SwarmServices(w http.ResponseWriter, r *http.Request) {
+	services, err := h.docker.ListServices()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list services: "+err.Error())
+		return
+	}
+
+	type ServiceOut struct {
+		ID           string `json:"id"`
+		Name         string `json:"name"`
+		Image        string `json:"image"`
+		Mode         string `json:"mode"`         // replicated | global
+		DesiredReplicas int  `json:"desiredReplicas"`
+		RunningTasks int    `json:"runningTasks"`
+		DesiredTasks int    `json:"desiredTasks"`
+		Labels       map[string]string `json:"labels"`
+		UpdatedAt    string `json:"updatedAt"`
+		Ports        []string `json:"ports"`
+		IsGoClaw     bool   `json:"isGoClaw"`     // goclaw.agent label present
+	}
+
+	out := make([]ServiceOut, 0, len(services))
+	for _, svc := range services {
+		mode := "replicated"
+		desired := 0
+		if svc.Spec.Mode.Replicated != nil {
+			desired = svc.Spec.Mode.Replicated.Replicas
+		} else if svc.Spec.Mode.Global != nil {
+			mode = "global"
+		}
+		running, desiredT := 0, 0
+		if svc.ServiceStatus != nil {
+			running = svc.ServiceStatus.RunningTasks
+			desiredT = svc.ServiceStatus.DesiredTasks
+		}
+		var ports []string
+		if svc.Spec.EndpointSpec != nil {
+			for _, p := range svc.Spec.EndpointSpec.Ports {
+				if p.PublishedPort > 0 {
+					ports = append(ports, fmt.Sprintf("%d:%d/%s", p.PublishedPort, p.TargetPort, p.Protocol))
+				}
+			}
+		}
+		labels := svc.Spec.Labels
+		if labels == nil {
+			labels = map[string]string{}
+		}
+		_, isGoClaw := labels["goclaw.agent"]
+		out = append(out, ServiceOut{
+			ID:              svc.ID[:min(12, len(svc.ID))],
+			Name:            svc.Spec.Name,
+			Image:           svc.Spec.TaskTemplate.ContainerSpec.Image,
+			Mode:            mode,
+			DesiredReplicas: desired,
+			RunningTasks:    running,
+			DesiredTasks:    desiredT,
+			Labels:          labels,
+			UpdatedAt:       svc.UpdatedAt.UTC().Format(time.RFC3339),
+			Ports:           ports,
+			IsGoClaw:        isGoClaw,
+		})
+	}
+	respond(w, http.StatusOK, map[string]any{"services": out, "count": len(out)})
+}
+
+// GET /api/swarm/services/{id}/tasks
+// Returns all tasks for a service (shows which node each replica runs on).
+func (h *Handler) SwarmServiceTasks(w http.ResponseWriter, r *http.Request) {
+	serviceID := r.PathValue("id")
+	if serviceID == "" {
+		respondError(w, http.StatusBadRequest, "serviceId required")
+		return
+	}
+	tasks, err := h.docker.ListServiceTasks(serviceID)
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+
+	type TaskOut struct {
+		ID          string `json:"id"`
+		ServiceID   string `json:"serviceId"`
+		NodeID      string `json:"nodeId"`
+		Slot        int    `json:"slot"`
+		State       string `json:"state"`
+		Message     string `json:"message"`
+		ContainerID string `json:"containerId"`
+		UpdatedAt   string `json:"updatedAt"`
+	}
+	out := make([]TaskOut, 0, len(tasks))
+	for _, t := range tasks {
+		cid := ""
+		if t.Status.ContainerStatus != nil {
+			cid = t.Status.ContainerStatus.ContainerID
+			if len(cid) > 12 {
+				cid = cid[:12]
+			}
+		}
+		out = append(out, TaskOut{
+			ID:          t.ID[:min(12, len(t.ID))],
+			ServiceID:   t.ServiceID[:min(12, len(t.ServiceID))],
+			NodeID:      t.NodeID[:min(12, len(t.NodeID))],
+			Slot:        t.Slot,
+			State:       t.Status.State,
+			Message:     t.Status.Message,
+			ContainerID: cid,
+			UpdatedAt:   t.UpdatedAt.UTC().Format(time.RFC3339),
+		})
+	}
+	respond(w, http.StatusOK, map[string]any{"tasks": out, "count": len(out)})
+}
+
+// POST /api/swarm/services/{id}/scale
+// Body: { "replicas": 3 }
+func (h *Handler) SwarmScaleService(w http.ResponseWriter, r *http.Request) {
+	serviceID := r.PathValue("id")
+	if serviceID == "" {
+		respondError(w, http.StatusBadRequest, "serviceId required")
+		return
+	}
+	var body struct {
+		Replicas int `json:"replicas"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid body: "+err.Error())
+		return
+	}
+	if body.Replicas < 0 || body.Replicas > 100 {
+		respondError(w, http.StatusBadRequest, "replicas must be 0-100")
+		return
+	}
+	if err := h.docker.ScaleService(serviceID, body.Replicas); err != nil {
+		respondError(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+	log.Printf("[Swarm] Scaled service %s to %d replicas", serviceID, body.Replicas)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "replicas": body.Replicas})
+}
+
+// POST /api/swarm/services/create
+// Deploy a new GoClaw agent as a Swarm service.
+// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0, "networks": ["goclaw-net"] }
+func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Name     string   `json:"name"`
+		Image    string   `json:"image"`
+		Replicas int      `json:"replicas"`
+		Env      []string `json:"env"`
+		Port     int      `json:"port"`
+		Networks []string `json:"networks"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Name == "" || body.Image == "" {
+		respondError(w, http.StatusBadRequest, "name and image required")
+		return
+	}
+	if body.Replicas <= 0 {
+		body.Replicas = 1
+	}
+	svc, err := h.docker.CreateAgentServiceFull(dockerclient.CreateAgentServiceOpts{
+		Name:     body.Name,
+		Image:    body.Image,
+		Replicas: body.Replicas,
+		Env:      body.Env,
+		Port:     body.Port,
+		Networks: body.Networks,
+	})
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "create service: "+err.Error())
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{
+		"ok":        true,
+		"serviceId": svc.ID,
+		"name":      svc.Spec.Name,
+	})
+}
+
+// DELETE /api/swarm/services/{id}
+// Remove (stop) a swarm service.
+func (h *Handler) SwarmRemoveService(w http.ResponseWriter, r *http.Request) {
+	serviceID := r.PathValue("id")
+	if serviceID == "" {
+		respondError(w, http.StatusBadRequest, "service id required")
+		return
+	}
+	if err := h.docker.RemoveService(serviceID); err != nil {
+		respondError(w, http.StatusInternalServerError, "remove service: "+err.Error())
+		return
+	}
+	log.Printf("[Swarm] Removed service %s", serviceID)
+	respond(w, http.StatusOK, map[string]any{"ok": true})
+}
+
+// GET /api/swarm/agents
+// List all GoClaw agent services with idle time information.
+func (h *Handler) SwarmListAgents(w http.ResponseWriter, r *http.Request) {
+	services, err := h.docker.ListServices()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "list services: "+err.Error())
+		return
+	}
+
+	type AgentInfo struct {
+		ID              string    `json:"id"`
+		Name            string    `json:"name"`
+		Image           string    `json:"image"`
+		DesiredReplicas int       `json:"desiredReplicas"`
+		RunningTasks    int       `json:"runningTasks"`
+		LastActivity    time.Time `json:"lastActivity"`
+		IdleMinutes     float64   `json:"idleMinutes"`
+		IsGoClaw        bool      `json:"isGoClaw"`
+	}
+
+	var agents []AgentInfo
+	for _, svc := range services {
+		isGoClaw := svc.Spec.Labels["goclaw.agent"] == "true"
+		desired := 0
+		if svc.Spec.Mode.Replicated != nil {
+			desired = svc.Spec.Mode.Replicated.Replicas
+		}
+		running := 0
+		if svc.ServiceStatus != nil {
+			running = svc.ServiceStatus.RunningTasks
+		}
+		lastActivity, _ := h.docker.GetServiceLastActivity(svc.ID)
+		if lastActivity.IsZero() {
+			lastActivity = svc.UpdatedAt
+		}
+		idle := time.Since(lastActivity).Minutes()
+		agents = append(agents, AgentInfo{
+			ID:              svc.ID,
+			Name:            svc.Spec.Name,
+			Image:           svc.Spec.TaskTemplate.ContainerSpec.Image,
+			DesiredReplicas: desired,
+			RunningTasks:    running,
+			LastActivity:    lastActivity,
+			IdleMinutes:     idle,
+			IsGoClaw:        isGoClaw,
+		})
+	}
+	if agents == nil {
+		agents = []AgentInfo{}
+	}
+	respond(w, http.StatusOK, map[string]any{"agents": agents, "count": len(agents)})
+}
+
+
+// POST /api/swarm/shell
+// Execute a shell command on the HOST system (via nsenter into PID 1).
+// Body: { "command": "docker ps" }
+// Requires the gateway container to run with privileged: true + pid: host
+func (h *Handler) SwarmShell(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Command string `json:"command"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Command == "" {
+		respondError(w, http.StatusBadRequest, "command required")
+		return
+	}
+	// Security: reject obviously dangerous patterns in production
+	// (In a real deployment you'd add auth + command whitelisting)
+	dangerous := []string{"rm -rf /", "mkfs", "dd if=/dev/zero", ":(){:|:&};:"}
+	cmdLower := strings.ToLower(body.Command)
+	for _, d := range dangerous {
+		if strings.Contains(cmdLower, d) {
+			respondError(w, http.StatusForbidden, "dangerous command rejected")
+			return
+		}
+	}
+
+	log.Printf("[SwarmShell] Executing: %s", body.Command)
+	output, err := dockerclient.ExecOnHost(body.Command)
+	if err != nil {
+		respond(w, http.StatusOK, map[string]any{
+			"output":  output,
+			"error":   err.Error(),
+			"success": false,
+		})
+		return
+	}
+	respond(w, http.StatusOK, map[string]any{
+		"output":  output,
+		"success": true,
+	})
+}
+
+// GET /api/swarm/join-token?role=worker|manager
+// Returns the join command for adding a new node to the swarm.
+func (h *Handler) SwarmJoinToken(w http.ResponseWriter, r *http.Request) {
+	role := r.URL.Query().Get("role")
+	if role == "" {
+		role = "worker"
+	}
+	tokens, err := h.docker.GetJoinTokens()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "cannot get join tokens: "+err.Error())
+		return
+	}
+	managerAddr := h.docker.GetManagerAddr()
+
+	var token string
+	if role == "manager" {
+		token = tokens.JoinTokens.Manager
+	} else {
+		token = tokens.JoinTokens.Worker
+	}
+
+	joinCmd := fmt.Sprintf("docker swarm join --token %s %s", token, managerAddr)
+	respond(w, http.StatusOK, map[string]any{
+		"role":        role,
+		"token":       token,
+		"managerAddr": managerAddr,
+		"joinCommand": joinCmd,
+	})
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// POST /api/swarm/join-node
+// Connects to a remote host via SSH and runs "docker swarm join ..." to add it to the cluster.
+// Body: { "host": "1.2.3.4", "port": 22, "user": "root", "password": "secret", "role": "worker" }
+func (h *Handler) SwarmJoinNodeViaSSH(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Host     string `json:"host"`
+		Port     int    `json:"port"`
+		User     string `json:"user"`
+		Password string `json:"password"`
+		Role     string `json:"role"` // "worker" | "manager"
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	if body.Host == "" || body.User == "" || body.Password == "" {
+		respondError(w, http.StatusBadRequest, "host, user and password are required")
+		return
+	}
+	if body.Port == 0 {
+		body.Port = 22
+	}
+	if body.Role == "" {
+		body.Role = "worker"
+	}
+
+	// 1. Get join token from local swarm
+	tokens, err := h.docker.GetJoinTokens()
+	if err != nil {
+		respondError(w, http.StatusInternalServerError, "cannot get join tokens: "+err.Error())
+		return
+	}
+	managerAddr := h.docker.GetManagerAddr()
+	var token string
+	if body.Role == "manager" {
+		token = tokens.JoinTokens.Manager
+	} else {
+		token = tokens.JoinTokens.Worker
+	}
+	joinCmd := fmt.Sprintf("docker swarm join --token %s %s", token, managerAddr)
+
+	// 2. Dial SSH to the remote host
+	sshCfg := &ssh.ClientConfig{
+		User: body.User,
+		Auth: []ssh.AuthMethod{
+			ssh.Password(body.Password),
+		},
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(), // acceptable for internal cluster management
+		Timeout:         15 * time.Second,
+	}
+	addr := fmt.Sprintf("%s:%d", body.Host, body.Port)
+	log.Printf("[SwarmJoinNode] Dialing SSH %s as %s", addr, body.User)
+
+	client, err := ssh.Dial("tcp", addr, sshCfg)
+	if err != nil {
+		respond(w, http.StatusOK, map[string]any{
+			"ok":      false,
+			"step":    "ssh_connect",
+			"error":   fmt.Sprintf("SSH connection failed: %s", err.Error()),
+			"host":    body.Host,
+			"command": joinCmd,
+		})
+		return
+	}
+	defer client.Close()
+
+	// 3. Run docker swarm join on the remote node
+	sess, err := client.NewSession()
+	if err != nil {
+		respond(w, http.StatusOK, map[string]any{
+			"ok":    false,
+			"step":  "ssh_session",
+			"error": fmt.Sprintf("SSH session failed: %s", err.Error()),
+		})
+		return
+	}
+	defer sess.Close()
+
+	log.Printf("[SwarmJoinNode] Running on %s: %s", body.Host, joinCmd)
+	out, err := sess.CombinedOutput(joinCmd)
+	output := strings.TrimSpace(string(out))
+
+	if err != nil {
+		// Node might already be in the swarm — treat "already" as success
+		if strings.Contains(output, "already") || strings.Contains(output, "This node is already") {
+			respond(w, http.StatusOK, map[string]any{
+				"ok":      true,
+				"output":  output,
+				"note":    "node is already part of this swarm",
+				"command": joinCmd,
+			})
+			return
+		}
+		respond(w, http.StatusOK, map[string]any{
+			"ok":      false,
+			"step":    "docker_join",
+			"error":   fmt.Sprintf("docker swarm join failed: %s", err.Error()),
+			"output":  output,
+			"command": joinCmd,
+		})
+		return
+	}
+
+	log.Printf("[SwarmJoinNode] Success: %s joined as %s", body.Host, body.Role)
+
+	// Give Docker Swarm ~3 seconds to propagate the new node, then sync to DB.
+	go func() {
+		time.Sleep(3 * time.Second)
+		nodes, err := h.docker.ListNodes()
+		if err != nil {
+			log.Printf("[SwarmJoinNode] DB sync failed (ListNodes): %v", err)
+			return
+		}
+		if h.db != nil {
+			h.db.UpsertSwarmNodes(nodes)
+			log.Printf("[SwarmJoinNode] DB synced: %d nodes after join", len(nodes))
+		}
+	}()
+
+	respond(w, http.StatusOK, map[string]any{
+		"ok":      true,
+		"output":  output,
+		"host":    body.Host,
+		"role":    body.Role,
+		"command": joinCmd,
+	})
+}
+
+// POST /api/swarm/ssh-test
+// Tests SSH connectivity and checks if Docker is accessible on the remote host.
+// Body: { "host": "1.2.3.4", "port": 22, "user": "root", "password": "secret" }
+func (h *Handler) SwarmSSHTest(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		Host     string `json:"host"`
+		Port     int    `json:"port"`
+		User     string `json:"user"`
+		Password string `json:"password"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		respondError(w, http.StatusBadRequest, "invalid request body")
+		return
+	}
+	if body.Host == "" || body.User == "" || body.Password == "" {
+		respondError(w, http.StatusBadRequest, "host, user and password are required")
+		return
+	}
+	if body.Port == 0 {
+		body.Port = 22
+	}
+
+	sshCfg := &ssh.ClientConfig{
+		User: body.User,
+		Auth: []ssh.AuthMethod{
+			ssh.Password(body.Password),
+		},
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
+		Timeout:         10 * time.Second,
+	}
+	addr := fmt.Sprintf("%s:%d", body.Host, body.Port)
+	log.Printf("[SSHTest] Dialing %s as %s", addr, body.User)
+
+	client, err := ssh.Dial("tcp", addr, sshCfg)
+	if err != nil {
+		respond(w, http.StatusOK, map[string]any{
+			"ok":    false,
+			"step":  "ssh_connect",
+			"error": fmt.Sprintf("SSH connection failed: %s", err.Error()),
+		})
+		return
+	}
+	defer client.Close()
+
+	// Run a quick docker version check to see if Docker daemon is accessible
+	sess, err := client.NewSession()
+	if err != nil {
+		respond(w, http.StatusOK, map[string]any{
+			"ok":    false,
+			"step":  "ssh_session",
+			"error": fmt.Sprintf("SSH session failed: %s", err.Error()),
+		})
+		return
+	}
+	defer sess.Close()
+
+	// Use plain 'docker info' to get server version — works on all distros
+	out, _ := sess.CombinedOutput("docker info --format '{{.ServerVersion}}' 2>/dev/null || docker version --format '{{.Server.Version}}' 2>/dev/null || echo 'docker_not_found'")
+	dockerVer := strings.TrimSpace(string(out))
+	if dockerVer == "" {
+		dockerVer = "docker_not_found"
+	}
+	dockerOk := dockerVer != "docker_not_found" && !strings.Contains(dockerVer, "not found") && !strings.Contains(dockerVer, "command not found")
+
+	log.Printf("[SSHTest] %s — SSH OK, docker: %s", addr, dockerVer)
+	respond(w, http.StatusOK, map[string]any{
+		"ok":            true,
+		"sshOk":         true,
+		"dockerOk":      dockerOk,
+		"dockerVersion": dockerVer,
+		"host":          body.Host,
+	})
+}
diff --git a/gateway/internal/api/handlers_agents_test.go b/gateway/internal/api/handlers_agents_test.go
new file mode 100644
index 0000000..d47a360
--- /dev/null
+++ b/gateway/internal/api/handlers_agents_test.go
@@ -0,0 +1,334 @@
+package api
+
+import (
+	"bytes"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"git.softuniq.eu/UniqAI/GoClaw/gateway/config"
+	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
+)
+
+// mockDB implements only the methods needed for agent handler tests.
+type mockDB struct {
+	agents      []db.AgentRow
+	configs     map[int]*db.AgentConfig
+	created     []db.CreateAgentInput
+	deleted     []int
+	statusUpdates []statusUpdate
+	nextID      int
+}
+
+type statusUpdate struct {
+	agentID     int
+	status      string
+	serviceName string
+	servicePort int
+}
+
+func newMockDB() *mockDB {
+	return &mockDB{
+		configs: map[int]*db.AgentConfig{},
+		nextID:  100,
+	}
+}
+
+func (m *mockDB) ListAgents() ([]db.AgentRow, error) {
+	return m.agents, nil
+}
+
+func (m *mockDB) GetAgentByID(id int) (*db.AgentConfig, error) {
+	if cfg, ok := m.configs[id]; ok {
+		return cfg, nil
+	}
+	return nil, nil
+}
+
+func (m *mockDB) CreateAgent(in db.CreateAgentInput) (int, error) {
+	m.created = append(m.created, in)
+	id := m.nextID
+	m.nextID++
+	m.configs[id] = &db.AgentConfig{
+		ID:    id,
+		Name:  in.Name,
+		Model: in.Model,
+	}
+	return id, nil
+}
+
+func (m *mockDB) DeleteAgent(id int) error {
+	m.deleted = append(m.deleted, id)
+	delete(m.configs, id)
+	return nil
+}
+
+func (m *mockDB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+	m.statusUpdates = append(m.statusUpdates, statusUpdate{agentID, status, serviceName, servicePort})
+	if cfg, ok := m.configs[agentID]; ok {
+		cfg.ContainerStatus = status
+		cfg.ServiceName = serviceName
+		cfg.ServicePort = servicePort
+	}
+	return nil
+}
+
+func (m *mockDB) AssignServicePort(start, max int) (int, error) {
+	return start, nil
+}
+
+// ─── DB adapter: wrap mockDB to satisfy Handler which uses *db.DB ────────────
+// We use composition instead — create a handler variant that uses an interface.
+// For these tests, we bypass the Handler's db field and test the logic separately.
+
+// ─── Unit tests for new agent REST endpoints ──────────────────────────────────
+
+// TestCreateAgent_MissingDB verifies 503 when DB is nil.
+func TestCreateAgent_MissingDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil, // no DB
+	}
+	body := `{"name":"Test","model":"qwen2.5:7b"}`
+	req := httptest.NewRequest(http.MethodPost, "/api/agents", bytes.NewBufferString(body))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+
+	h.CreateAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestDeleteAgent_MissingDB verifies 503 when DB is nil.
+func TestDeleteAgent_MissingDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodDelete, "/api/agents/1", nil)
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.DeleteAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestDeleteAgent_InvalidID verifies 400 for non-numeric id.
+func TestDeleteAgent_InvalidID(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodDelete, "/api/agents/abc", nil)
+	req.SetPathValue("id", "abc")
+	w := httptest.NewRecorder()
+
+	h.DeleteAgent(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", w.Code)
+	}
+}
+
+// TestDeployAgent_MissingDB verifies 503 when DB is nil.
+func TestDeployAgent_MissingDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil)
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.DeployAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestDeployAgent_InvalidID verifies 400 for non-numeric id.
+func TestDeployAgent_InvalidID(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/deploy", nil)
+	req.SetPathValue("id", "xyz")
+	w := httptest.NewRecorder()
+
+	h.DeployAgent(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", w.Code)
+	}
+}
+
+// TestStopAgent_MissingDB verifies 503 when DB is nil.
+func TestStopAgent_MissingDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/stop", nil)
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.StopAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestScaleAgent_MissingDB verifies 503 when DB is nil.
+func TestScaleAgent_MissingDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	body := `{"replicas":3}`
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(body))
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.ScaleAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestScaleAgent_BadReplicas verifies 400 for replicas < 1.
+func TestScaleAgent_BadReplicas(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	body := `{"replicas":0}`
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(body))
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.ScaleAgent(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", w.Code)
+	}
+}
+
+// TestListAgents_NoDB verifies empty list when DB is nil.
+func TestListAgents_NoDB(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil,
+	}
+	req := httptest.NewRequest(http.MethodGet, "/api/agents", nil)
+	w := httptest.NewRecorder()
+
+	h.ListAgents(w, req)
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d", w.Code)
+	}
+	var resp map[string]any
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatal("invalid JSON:", err)
+	}
+	if _, ok := resp["note"]; !ok {
+		t.Error("expected 'note' field in response when DB is nil")
+	}
+}
+
+// TestCreateAgent_MissingName verifies 400 when name is empty.
+func TestCreateAgent_MissingName(t *testing.T) {
+	h := &Handler{
+		cfg: &config.Config{},
+		db:  nil, // will fail at DB check first, but let's also test directly
+	}
+	body := `{"model":"qwen2.5:7b"}`
+	req := httptest.NewRequest(http.MethodPost, "/api/agents", bytes.NewBufferString(body))
+	req.Header.Set("Content-Type", "application/json")
+	w := httptest.NewRecorder()
+
+	h.CreateAgent(w, req)
+	// When DB is nil => 503; that's fine, confirms routing works
+	if w.Code != http.StatusServiceUnavailable && w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400 or 503, got %d", w.Code)
+	}
+}
+
+// TestDeployAgent_SimulatedNoDB verifies simulated deploy when docker is nil.
+// We can't inject a real *db.DB without a live MySQL, so we just verify
+// that the nil-docker path returns 200 with the right fields.
+// This test verifies the handler structure is correct.
+func TestDeployAgent_DockerNil_DBNil(t *testing.T) {
+	// When db is nil, returns 503
+	h := &Handler{
+		cfg:    &config.Config{},
+		db:     nil,
+		docker: nil,
+	}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil)
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+	h.DeployAgent(w, req)
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// ─── Phase C tests ─────────────────────────────────────────────────────────────
+
+// TestListRunningAgents_NoDB verifies empty list when DB is nil.
+func TestListRunningAgents_NoDB(t *testing.T) {
+	h := &Handler{cfg: &config.Config{}, db: nil}
+	req := httptest.NewRequest(http.MethodGet, "/api/agents/running", nil)
+	w := httptest.NewRecorder()
+
+	h.ListRunningAgents(w, req)
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d", w.Code)
+	}
+	var resp map[string]any
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatal("invalid JSON:", err)
+	}
+	if cnt, _ := resp["count"].(float64); cnt != 0 {
+		t.Errorf("expected count=0, got %v", cnt)
+	}
+}
+
+// TestRestartAgent_NoDB verifies 503 when DB is nil.
+func TestRestartAgent_NoDB(t *testing.T) {
+	h := &Handler{cfg: &config.Config{}, db: nil}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/1/restart", nil)
+	req.SetPathValue("id", "1")
+	w := httptest.NewRecorder()
+
+	h.RestartAgent(w, req)
+	// RestartAgent calls DeployAgent which checks DB → 503
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503, got %d", w.Code)
+	}
+}
+
+// TestRestartAgent_InvalidID verifies 400 for non-numeric id.
+func TestRestartAgent_InvalidID(t *testing.T) {
+	h := &Handler{cfg: &config.Config{}, db: nil}
+	req := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/restart", nil)
+	req.SetPathValue("id", "xyz")
+	w := httptest.NewRecorder()
+
+	h.RestartAgent(w, req)
+	if w.Code != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", w.Code)
+	}
+}
+
+// TestSwarmManagerDeadLetter_DockerNil verifies that checkIdleAgents
+// returns early without panic when docker is nil.
+func TestSwarmManagerDeadLetter_DockerNil(t *testing.T) {
+	h := &Handler{cfg: &config.Config{}, db: nil, docker: nil}
+	sm := NewSwarmManager(h, 60*time.Second)
+	// Should not panic
+	sm.checkIdleAgents()
+}
diff --git a/gateway/internal/api/swarm_manager.go b/gateway/internal/api/swarm_manager.go
new file mode 100644
index 0000000..8f2184f
--- /dev/null
+++ b/gateway/internal/api/swarm_manager.go
@@ -0,0 +1,196 @@
+// Package api – Swarm Agent Lifecycle Manager
+//
+// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
+// (which is the Swarm manager node). It watches all agent services and
+// automatically scales them to 0 replicas after IdleTimeout minutes of no
+// activity. The orchestrator can call StartAgent / StopAgent via the REST API
+// to start/stop agents on demand.
+//
+//	Start flow:  POST /api/swarm/agents/{name}/start  → scale to N replicas (default 1)
+//	Stop flow:   POST /api/swarm/agents/{name}/stop   → scale to 0
+//	Auto-stop:   background loop checks every 60 s, scales idle agents to 0
+package api
+
+import (
+	"context"
+	"encoding/json"
+	"log"
+	"net/http"
+	"time"
+)
+
+const (
+	// IdleTimeout – how many minutes without any task updates before an agent
+	// is automatically scaled to 0.
+	defaultIdleTimeoutMinutes = 15
+	// deadLetterCheckEnabled – when true, SwarmManager reconciles DB containerStatus
+	// with actual Swarm service existence (dead-letter recovery).
+	deadLetterCheckEnabled = true
+)
+
+// SwarmManager watches agent services and auto-scales them down after idle.
+type SwarmManager struct {
+	handler *Handler
+	ticker  *time.Ticker
+	done    chan struct{}
+}
+
+// NewSwarmManager creates a manager that checks every checkInterval.
+func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
+	return &SwarmManager{
+		handler: h,
+		ticker:  time.NewTicker(checkInterval),
+		done:    make(chan struct{}),
+	}
+}
+
+// Start launches the background loop. Call in a goroutine.
+func (m *SwarmManager) Start(ctx context.Context) {
+	log.Printf("[SwarmManager] Started — idle timeout %d min",
+		defaultIdleTimeoutMinutes)
+	defer m.ticker.Stop()
+	for {
+		select {
+		case <-m.done:
+			return
+		case <-ctx.Done():
+			return
+		case <-m.ticker.C:
+			m.checkIdleAgents()
+		}
+	}
+}
+
+// Stop signals the background loop to exit.
+func (m *SwarmManager) Stop() {
+	close(m.done)
+}
+
+func (m *SwarmManager) checkIdleAgents() {
+	h := m.handler
+	if h.docker == nil {
+		return
+	}
+
+	// Build a lookup set of currently-live container/service names (both standalone + Swarm)
+	liveContainers := make(map[string]bool)
+
+	// Check standalone containers
+	if containers, err := h.docker.ListContainers(); err == nil {
+		for _, c := range containers {
+			for _, name := range c.Names {
+				// Docker container names are prefixed with "/"
+				clean := name
+				if len(clean) > 0 && clean[0] == '/' {
+					clean = clean[1:]
+				}
+				if c.Labels["goclaw.agent"] == "true" {
+					liveContainers[clean] = true
+				}
+			}
+		}
+	}
+
+	// Check Swarm services (for legacy/mixed environments)
+	services, err := h.docker.ListServices()
+	if err != nil {
+		log.Printf("[SwarmManager] list services error: %v", err)
+	}
+
+	idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
+	now := time.Now()
+
+	for _, svc := range services {
+		// Only manage services labelled as GoClaw agents
+		if svc.Spec.Labels["goclaw.agent"] != "true" {
+			continue
+		}
+		liveContainers[svc.Spec.Name] = true
+
+		// Skip already-stopped services (0 desired replicas)
+		desired := 0
+		if svc.Spec.Mode.Replicated != nil {
+			desired = svc.Spec.Mode.Replicated.Replicas
+		}
+		if desired == 0 {
+			continue
+		}
+		// Check last activity time
+		lastActivity, actErr := h.docker.GetServiceLastActivity(svc.ID)
+		if actErr != nil || lastActivity.IsZero() {
+			lastActivity = svc.UpdatedAt
+		}
+		idle := now.Sub(lastActivity)
+		if idle >= idleThreshold {
+			log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
+				svc.Spec.Name, idle.Minutes())
+			if scaleErr := h.docker.ScaleService(svc.ID, 0); scaleErr != nil {
+				log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr)
+			}
+		}
+	}
+
+	// ── Dead-letter reconciliation (Phase C) ─────────────────────────────────
+	// If an agent's DB says "running" but its container/service is gone, update
+	// the status to "error" so the UI shows the discrepancy and allows redeploy.
+	if !deadLetterCheckEnabled || h.db == nil {
+		return
+	}
+	agents, dbErr := h.db.ListAgents()
+	if dbErr != nil {
+		return
+	}
+	for _, a := range agents {
+		if a.ContainerStatus != "running" || a.ServiceName == "" {
+			continue
+		}
+		if !liveContainers[a.ServiceName] {
+			log.Printf("[SwarmManager] Dead-letter: agent %d (%q) marked running but container %q not found — setting status=error",
+				a.ID, a.Name, a.ServiceName)
+			if updateErr := h.db.UpdateContainerStatus(a.ID, "error", a.ServiceName, a.ServicePort); updateErr != nil {
+				log.Printf("[SwarmManager] UpdateContainerStatus error for agent %d: %v", a.ID, updateErr)
+			}
+		}
+	}
+}
+
+// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
+
+// POST /api/swarm/agents/{name}/start
+// Start (scale-up) a named agent service. Body: { "replicas": 1 }
+func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
+	name := r.PathValue("name")
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "agent name required")
+		return
+	}
+	var body struct {
+		Replicas int `json:"replicas"`
+	}
+	_ = json.NewDecoder(r.Body).Decode(&body)
+	if body.Replicas <= 0 {
+		body.Replicas = 1
+	}
+	if err := h.docker.ScaleService(name, body.Replicas); err != nil {
+		respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
+		return
+	}
+	log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
+}
+
+// POST /api/swarm/agents/{name}/stop
+// Stop (scale-to-0) a named agent service.
+func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
+	name := r.PathValue("name")
+	if name == "" {
+		respondError(w, http.StatusBadRequest, "agent name required")
+		return
+	}
+	if err := h.docker.ScaleService(name, 0); err != nil {
+		respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
+		return
+	}
+	log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
+	respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
+}
diff --git a/gateway/internal/db/db.go b/gateway/internal/db/db.go
index 4a9b8fe..7d1567c 100644
--- a/gateway/internal/db/db.go
+++ b/gateway/internal/db/db.go
@@ -3,6 +3,7 @@ package db
 
 import (
 	"database/sql"
+	"database/sql/driver"
 	"encoding/json"
 	"fmt"
 	"log"
@@ -20,9 +21,9 @@ type AgentConfig struct {
 	AllowedTools []string
 	Temperature  float64
 	MaxTokens    int
-	IsOrchestrator bool
-	IsSystem     bool
-	IsActive     bool
+	IsOrchestrator  bool
+	IsSystem        bool
+	IsActive        bool
 	// Container / Swarm fields (Phase A)
 	ServiceName     string
 	ServicePort     int
@@ -32,14 +33,34 @@ type AgentConfig struct {
 
 // AgentRow is a minimal agent representation for listing.
 type AgentRow struct {
-	ID          int    `json:"id"`
-	Name        string `json:"name"`
-	Role        string `json:"role"`
-	Model       string `json:"model"`
-	Description string `json:"description"`
-	IsActive    bool   `json:"isActive"`
-	IsSystem    bool   `json:"isSystem"`
-	IsOrchestrator bool `json:"isOrchestrator"`
+	ID             int    `json:"id"`
+	Name           string `json:"name"`
+	Role           string `json:"role"`
+	Model          string `json:"model"`
+	Description    string `json:"description"`
+	IsActive       bool   `json:"isActive"`
+	IsSystem       bool   `json:"isSystem"`
+	IsOrchestrator bool   `json:"isOrchestrator"`
+	// Container / Swarm fields
+	ServiceName     string `json:"serviceName"`
+	ServicePort     int    `json:"servicePort"`
+	ContainerImage  string `json:"containerImage"`
+	ContainerStatus string `json:"containerStatus"`
+}
+
+// CreateAgentInput holds the fields required to create a new agent in DB.
+type CreateAgentInput struct {
+	Name           string   `json:"name"`
+	Role           string   `json:"role"`
+	Model          string   `json:"model"`
+	Description    string   `json:"description"`
+	SystemPrompt   string   `json:"systemPrompt"`
+	Temperature    float64  `json:"temperature"`
+	MaxTokens      int      `json:"maxTokens"`
+	AllowedTools   []string `json:"allowedTools"`
+	IsSystem       bool     `json:"isSystem"`
+	IsOrchestrator bool     `json:"isOrchestrator"`
+	ContainerImage string   `json:"containerImage"`
 }
 
 type DB struct {
@@ -73,8 +94,7 @@ func (d *DB) Close() {
 // GetOrchestratorConfig loads the agent with isOrchestrator=1 from DB.
 func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) {
 	row := d.conn.QueryRow(`
-		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive,
-		       COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped')
+		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive
 		FROM agents
 		WHERE isOrchestrator = 1
 		LIMIT 1
@@ -85,8 +105,7 @@ func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) {
 // GetAgentByID loads a specific agent by ID.
 func (d *DB) GetAgentByID(id int) (*AgentConfig, error) {
 	row := d.conn.QueryRow(`
-		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive,
-		       COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped')
+		SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive
 		FROM agents
 		WHERE id = ?
 		LIMIT 1
@@ -94,10 +113,13 @@ func (d *DB) GetAgentByID(id int) (*AgentConfig, error) {
 	return scanAgentConfig(row)
 }
 
-// ListAgents returns all active agents.
+// ListAgents returns all agents with container status fields.
 func (d *DB) ListAgents() ([]AgentRow, error) {
 	rows, err := d.conn.Query(`
-		SELECT id, name, role, model, COALESCE(description,''), isActive, isSystem, isOrchestrator
+		SELECT id, name, role, model,
+		       COALESCE(description,''), isActive, isSystem, isOrchestrator,
+		       COALESCE(serviceName,''), COALESCE(servicePort,0),
+		       COALESCE(containerImage,''), COALESCE(containerStatus,'stopped')
 		FROM agents
 		ORDER BY isOrchestrator DESC, isSystem DESC, id ASC
 	`)
@@ -110,7 +132,11 @@ func (d *DB) ListAgents() ([]AgentRow, error) {
 	for rows.Next() {
 		var a AgentRow
 		var isActive, isSystem, isOrch int
-		if err := rows.Scan(&a.ID, &a.Name, &a.Role, &a.Model, &a.Description, &isActive, &isSystem, &isOrch); err != nil {
+		if err := rows.Scan(
+			&a.ID, &a.Name, &a.Role, &a.Model, &a.Description,
+			&isActive, &isSystem, &isOrch,
+			&a.ServiceName, &a.ServicePort, &a.ContainerImage, &a.ContainerStatus,
+		); err != nil {
 			continue
 		}
 		a.IsActive = isActive == 1
@@ -121,64 +147,423 @@ func (d *DB) ListAgents() ([]AgentRow, error) {
 	return agents, nil
 }
 
-// ─── Helpers ──────────────────────────────────────────────────────────────────
-
-func scanAgentConfig(row *sql.Row) (*AgentConfig, error) {
-	var cfg AgentConfig
-	var systemPrompt sql.NullString
-	var allowedToolsJSON sql.NullString
-	var temperature sql.NullFloat64
-	var maxTokens sql.NullInt64
-	var isOrch, isSystem, isActive int
-
-	err := row.Scan(
-		&cfg.ID, &cfg.Name, &cfg.Model,
-		&systemPrompt, &allowedToolsJSON,
-		&temperature, &maxTokens,
-		&isOrch, &isSystem, &isActive,
-		&cfg.ServiceName, &cfg.ServicePort, &cfg.ContainerImage, &cfg.ContainerStatus,
+// CreateAgent inserts a new agent into the DB and returns its ID.
+func (d *DB) CreateAgent(in CreateAgentInput) (int, error) {
+	if d.conn == nil {
+		return 0, fmt.Errorf("DB not connected")
+	}
+	toolsJSON := "[]"
+	if len(in.AllowedTools) > 0 {
+		b, _ := json.Marshal(in.AllowedTools)
+		toolsJSON = string(b)
+	}
+	temp := in.Temperature
+	if temp == 0 {
+		temp = 0.7
+	}
+	maxTok := in.MaxTokens
+	if maxTok == 0 {
+		maxTok = 8192
+	}
+	img := in.ContainerImage
+	if img == "" {
+		img = "goclaw-agent-worker:latest"
+	}
+	res, err := d.conn.Exec(`
+		INSERT INTO agents
+		  (name, role, model, description, systemPrompt, temperature, maxTokens,
+		   allowedTools, isActive, isSystem, isOrchestrator,
+		   containerImage, containerStatus, createdAt, updatedAt)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, ?, ?, ?, 'stopped', NOW(), NOW())
+	`,
+		in.Name, in.Role, in.Model, in.Description, in.SystemPrompt,
+		temp, maxTok, toolsJSON,
+		boolToInt(in.IsSystem), boolToInt(in.IsOrchestrator),
+		img,
 	)
+	if err != nil {
+		return 0, fmt.Errorf("insert agent: %w", err)
+	}
+	id, _ := res.LastInsertId()
+	return int(id), nil
+}
+
+func boolToInt(b bool) int {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+// DeleteAgent removes an agent record by ID (only non-system agents).
+func (d *DB) DeleteAgent(id int) error {
+	if d.conn == nil {
+		return fmt.Errorf("DB not connected")
+	}
+	res, err := d.conn.Exec(`DELETE FROM agents WHERE id = ? AND isSystem = 0`, id)
+	if err != nil {
+		return err
+	}
+	n, _ := res.RowsAffected()
+	if n == 0 {
+		return fmt.Errorf("agent %d not found or is a system agent", id)
+	}
+	return nil
+}
+
+// AssignServicePort finds the lowest free port in range [start, start+maxAgents).
+// It reads all currently used ports from DB.
+func (d *DB) AssignServicePort(start, maxAgents int) (int, error) {
+	if d.conn == nil {
+		return start, nil // offline — just return start
+	}
+	rows, err := d.conn.Query(`SELECT COALESCE(servicePort,0) FROM agents WHERE servicePort > 0`)
+	if err != nil {
+		return start, nil
+	}
+	defer rows.Close()
+
+	used := map[int]bool{}
+	for rows.Next() {
+		var p int
+		if rows.Scan(&p) == nil && p > 0 {
+			used[p] = true
+		}
+	}
+	for port := start; port < start+maxAgents; port++ {
+		if !used[port] {
+			return port, nil
+		}
+	}
+	return 0, fmt.Errorf("no free port in range %d-%d", start, start+maxAgents)
+}
+
+// ─── LLM Provider ─────────────────────────────────────────────────────────────
+
+// ProviderRow holds the active LLM provider config from DB.
+type ProviderRow struct {
+	ID       int
+	Name     string
+	BaseURL  string
+	APIKey   string // decrypted (Node.js encrypts, Go just reads raw for now)
+}
+
+// GetActiveProvider returns the active LLM provider from the llmProviders table.
+// Note: The API key is stored AES-256-GCM encrypted by the Node.js server.
+// The Go gateway reads the raw encrypted bytes but cannot decrypt them (no shared key in Go).
+// The proper flow: Node.js decrypts the key and passes it via /api/providers/reload.
+// For now, GetActiveProvider returns the stored encrypted bytes as-is (not useful for direct use).
+// Use UpdateCredentials on the LLM client instead.
+func (d *DB) GetActiveProvider() (*ProviderRow, error) {
+	var p ProviderRow
+	var apiKeyEncrypted sql.NullString
+	row := d.conn.QueryRow(`
+		SELECT id, name, baseUrl, COALESCE(apiKeyEncrypted, '')
+		FROM llmProviders
+		WHERE isActive = 1
+		LIMIT 1
+	`)
+	err := row.Scan(&p.ID, &p.Name, &p.BaseURL, &apiKeyEncrypted)
 	if err != nil {
 		return nil, err
 	}
-
-	cfg.SystemPrompt = systemPrompt.String
-	cfg.Temperature = temperature.Float64
-	if cfg.Temperature == 0 {
-		cfg.Temperature = 0.5
-	}
-	cfg.MaxTokens = int(maxTokens.Int64)
-	if cfg.MaxTokens == 0 {
-		cfg.MaxTokens = 8192
-	}
-	cfg.IsOrchestrator = isOrch == 1
-	cfg.IsSystem = isSystem == 1
-	cfg.IsActive = isActive == 1
-
-	if allowedToolsJSON.Valid && allowedToolsJSON.String != "" && allowedToolsJSON.String != "null" {
-		_ = json.Unmarshal([]byte(allowedToolsJSON.String), &cfg.AllowedTools)
-	}
-
-	return &cfg, nil
+	// We cannot decrypt the key in Go (different crypto impl from Node.js)
+	// Return empty key — the LLM client will use its env-configured key
+	p.APIKey = ""
+	return &p, nil
 }
 
-// ─── Agent Container Fields ───────────────────────────────────────────────────
-// These methods support the agent-worker container architecture where each
-// agent runs as an autonomous Docker Swarm service.
+// ─── Chat Sessions & Events ───────────────────────────────────────────────────
 
-// UpdateContainerStatus updates the container lifecycle state of an agent.
-func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+// ChatSessionRow holds one persistent chat session.
+type ChatSessionRow struct {
+	ID              int    `json:"id"`
+	SessionID       string `json:"sessionId"`
+	AgentID         int    `json:"agentId"`
+	Status          string `json:"status"` // running | done | error
+	UserMessage     string `json:"userMessage"`
+	FinalResponse   string `json:"finalResponse"`
+	Model           string `json:"model"`
+	TotalTokens     int    `json:"totalTokens"`
+	ProcessingTimeMs int64 `json:"processingTimeMs"`
+	ErrorMessage    string `json:"errorMessage"`
+	CreatedAt       string `json:"createdAt"`
+	UpdatedAt       string `json:"updatedAt"`
+}
+
+// ChatEventRow holds one event inside a session.
+type ChatEventRow struct {
+	ID          int    `json:"id"`
+	SessionID   string `json:"sessionId"`
+	Seq         int    `json:"seq"`
+	EventType   string `json:"eventType"` // thinking | tool_call | delta | done | error
+	Content     string `json:"content"`
+	ToolName    string `json:"toolName"`
+	ToolArgs    string `json:"toolArgs"`    // JSON string
+	ToolResult  string `json:"toolResult"`
+	ToolSuccess bool   `json:"toolSuccess"`
+	DurationMs  int    `json:"durationMs"`
+	Model       string `json:"model"`
+	UsageJSON   string `json:"usageJson"`   // JSON string
+	ErrorMsg    string `json:"errorMsg"`
+	CreatedAt   string `json:"createdAt"`
+}
+
+// CreateSession inserts a new running session and returns its row.
+func (d *DB) CreateSession(sessionID, userMessage string, agentID int) error {
+	if d.conn == nil {
+		return fmt.Errorf("DB not connected")
+	}
+	_, err := d.conn.Exec(`
+		INSERT INTO chatSessions (sessionId, agentId, status, userMessage)
+		VALUES (?, ?, 'running', ?)
+	`, sessionID, agentID, truncate(userMessage, 65535))
+	return err
+}
+
+// AppendEvent inserts a new event row for a session.
+// seq is auto-calculated as MAX(seq)+1 for the session.
+func (d *DB) AppendEvent(e ChatEventRow) error {
 	if d.conn == nil {
 		return nil
 	}
+	toolArgs := e.ToolArgs
+	if toolArgs == "" {
+		toolArgs = "null"
+	}
+	usageJSON := e.UsageJSON
+	if usageJSON == "" {
+		usageJSON = "null"
+	}
+	var toolSuccessVal interface{}
+	if e.EventType == "tool_call" {
+		if e.ToolSuccess {
+			toolSuccessVal = 1
+		} else {
+			toolSuccessVal = 0
+		}
+	}
 	_, err := d.conn.Exec(`
-		UPDATE agents
-		SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW()
-		WHERE id = ?
-	`, status, serviceName, servicePort, agentID)
+		INSERT INTO chatEvents
+			(sessionId, seq, eventType, content, toolName, toolArgs,
+			 toolResult, toolSuccess, durationMs, model, usageJson, errorMsg)
+		SELECT ?, COALESCE(MAX(seq),0)+1, ?, ?, ?, ?,
+			   ?, ?, ?, ?, ?, ?
+		FROM chatEvents WHERE sessionId = ?
+	`,
+		e.SessionID, e.EventType,
+		nullStr(e.Content), nullStr(e.ToolName), rawJSON(toolArgs),
+		nullStr(e.ToolResult), toolSuccessVal, nullInt(e.DurationMs),
+		nullStr(e.Model), rawJSON(usageJSON), nullStr(e.ErrorMsg),
+		e.SessionID,
+	)
+	if err != nil {
+		log.Printf("[DB] AppendEvent error: %v", err)
+	}
 	return err
 }
 
+// MarkSessionDone updates a session to done/error status.
+func (d *DB) MarkSessionDone(sessionID, status, finalResponse, model, errorMessage string, totalTokens int, processingTimeMs int64) {
+	if d.conn == nil {
+		return
+	}
+	_, err := d.conn.Exec(`
+		UPDATE chatSessions
+		SET status=?, finalResponse=?, model=?, totalTokens=?,
+		    processingTimeMs=?, errorMessage=?
+		WHERE sessionId=?
+	`, status,
+		truncate(finalResponse, 65535),
+		model,
+		totalTokens,
+		processingTimeMs,
+		truncate(errorMessage, 65535),
+		sessionID,
+	)
+	if err != nil {
+		log.Printf("[DB] MarkSessionDone error: %v", err)
+	}
+}
+
+// GetSession returns a single session by its string ID.
+func (d *DB) GetSession(sessionID string) (*ChatSessionRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	row := d.conn.QueryRow(`
+		SELECT id, sessionId, agentId, status,
+		       COALESCE(userMessage,''),
+		       COALESCE(finalResponse,''),
+		       COALESCE(model,''),
+		       COALESCE(totalTokens,0),
+		       COALESCE(processingTimeMs,0),
+		       COALESCE(errorMessage,''),
+		       createdAt, updatedAt
+		FROM chatSessions WHERE sessionId=? LIMIT 1
+	`, sessionID)
+	var s ChatSessionRow
+	err := row.Scan(&s.ID, &s.SessionID, &s.AgentID, &s.Status,
+		&s.UserMessage, &s.FinalResponse, &s.Model,
+		&s.TotalTokens, &s.ProcessingTimeMs, &s.ErrorMessage,
+		&s.CreatedAt, &s.UpdatedAt)
+	if err != nil {
+		return nil, err
+	}
+	return &s, nil
+}
+
+// GetEvents returns all events for a session with seq > afterSeq (for incremental polling).
+func (d *DB) GetEvents(sessionID string, afterSeq int) ([]ChatEventRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, sessionId, seq, eventType,
+		       COALESCE(content,''), COALESCE(toolName,''),
+		       COALESCE(CAST(toolArgs AS CHAR),'null'),
+		       COALESCE(toolResult,''),
+		       COALESCE(toolSuccess,0),
+		       COALESCE(durationMs,0),
+		       COALESCE(model,''),
+		       COALESCE(CAST(usageJson AS CHAR),'null'),
+		       COALESCE(errorMsg,''),
+		       createdAt
+		FROM chatEvents
+		WHERE sessionId=? AND seq > ?
+		ORDER BY seq ASC
+	`, sessionID, afterSeq)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var result []ChatEventRow
+	for rows.Next() {
+		var e ChatEventRow
+		var toolSuccess int
+		if err := rows.Scan(
+			&e.ID, &e.SessionID, &e.Seq, &e.EventType,
+			&e.Content, &e.ToolName, &e.ToolArgs,
+			&e.ToolResult, &toolSuccess, &e.DurationMs,
+			&e.Model, &e.UsageJSON, &e.ErrorMsg, &e.CreatedAt,
+		); err != nil {
+			continue
+		}
+		e.ToolSuccess = toolSuccess == 1
+		result = append(result, e)
+	}
+	return result, nil
+}
+
+// GetRecentSessions returns the N most recent sessions.
+func (d *DB) GetRecentSessions(limit int) ([]ChatSessionRow, error) {
+	if d.conn == nil {
+		return nil, fmt.Errorf("DB not connected")
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, sessionId, agentId, status,
+		       COALESCE(userMessage,''),
+		       COALESCE(finalResponse,''),
+		       COALESCE(model,''),
+		       COALESCE(totalTokens,0),
+		       COALESCE(processingTimeMs,0),
+		       COALESCE(errorMessage,''),
+		       createdAt, updatedAt
+		FROM chatSessions ORDER BY id DESC LIMIT ?
+	`, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var result []ChatSessionRow
+	for rows.Next() {
+		var s ChatSessionRow
+		if err := rows.Scan(&s.ID, &s.SessionID, &s.AgentID, &s.Status,
+			&s.UserMessage, &s.FinalResponse, &s.Model,
+			&s.TotalTokens, &s.ProcessingTimeMs, &s.ErrorMessage,
+			&s.CreatedAt, &s.UpdatedAt); err != nil {
+			continue
+		}
+		result = append(result, s)
+	}
+	return result, nil
+}
+
+// helper — nil for empty strings
+func nullStr(s string) interface{} {
+	if s == "" {
+		return nil
+	}
+	return s
+}
+
+// helper — nil for zero int
+func nullInt(n int) interface{} {
+	if n == 0 {
+		return nil
+	}
+	return n
+}
+
+// rawJSON wraps a JSON string so it's passed as-is to MySQL (not double-encoded)
+type rawJSON string
+
+func (r rawJSON) Value() (driver.Value, error) {
+	if r == "null" || r == "" {
+		return nil, nil
+	}
+	return string(r), nil
+}
+
+// ─── Metrics & History ────────────────────────────────────────────────────────
+
+// MetricInput holds data for a single orchestrator request metric.
+type MetricInput struct {
+	AgentID         int
+	RequestID       string
+	UserMessage     string
+	AgentResponse   string
+	InputTokens     int
+	OutputTokens    int
+	TotalTokens     int
+	ProcessingTimeMs int64
+	Status          string // "success" | "error" | "timeout"
+	ErrorMessage    string
+	ToolsCalled     []string
+	Model           string
+}
+
+// SaveMetric inserts a row into the agentMetrics table.
+// Non-fatal — logs on error but does not return one.
+func (d *DB) SaveMetric(m MetricInput) {
+	if d.conn == nil {
+		return
+	}
+	toolsJSON, _ := json.Marshal(m.ToolsCalled)
+	_, err := d.conn.Exec(`
+		INSERT INTO agentMetrics
+			(agentId, requestId, userMessage, agentResponse,
+			 inputTokens, outputTokens, totalTokens,
+			 processingTimeMs, status, errorMessage, toolsCalled, model)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+	`,
+		m.AgentID,
+		m.RequestID,
+		truncate(m.UserMessage, 65535),
+		truncate(m.AgentResponse, 65535),
+		m.InputTokens, m.OutputTokens, m.TotalTokens,
+		m.ProcessingTimeMs,
+		m.Status,
+		m.ErrorMessage,
+		string(toolsJSON),
+		m.Model,
+	)
+	if err != nil {
+		log.Printf("[DB] SaveMetric error: %v", err)
+	}
+}
+
 // HistoryInput holds data for one conversation entry.
 type HistoryInput struct {
 	AgentID        int
@@ -188,14 +573,6 @@ type HistoryInput struct {
 	Status         string // "success" | "error" | "pending"
 }
 
-// HistoryRow is a single entry from agentHistory for sliding window memory.
-type HistoryRow struct {
-	ID            int    `json:"id"`
-	UserMessage   string `json:"userMessage"`
-	AgentResponse string `json:"agentResponse"`
-	ConvID        string `json:"conversationId"`
-}
-
 // SaveHistory inserts a row into the agentHistory table.
 // Non-fatal — logs on error but does not return one.
 func (d *DB) SaveHistory(h HistoryInput) {
@@ -223,39 +600,7 @@ func (d *DB) SaveHistory(h HistoryInput) {
 	}
 }
 
-// GetAgentHistory returns the last N conversation turns for an agent, oldest first.
-func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) {
-	if d.conn == nil {
-		return nil, nil
-	}
-	rows, err := d.conn.Query(`
-		SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'')
-		FROM agentHistory
-		WHERE agentId = ?
-		ORDER BY id DESC
-		LIMIT ?
-	`, agentID, limit)
-	if err != nil {
-		return nil, err
-	}
-	defer rows.Close()
-
-	var result []HistoryRow
-	for rows.Next() {
-		var h HistoryRow
-		if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil {
-			continue
-		}
-		result = append(result, h)
-	}
-	// Reverse so oldest is first (for LLM context ordering)
-	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
-		result[i], result[j] = result[j], result[i]
-	}
-	return result, nil
-}
-
-// truncate caps a string to maxLen bytes.
+// truncate caps a string to maxLen bytes (not runes — fast path for DB limits).
 func truncate(s string, maxLen int) string {
 	if len(s) <= maxLen {
 		return s
@@ -263,6 +608,154 @@ func truncate(s string, maxLen int) string {
 	return s[:maxLen]
 }
 
+// ─── Swarm Node Persistence ───────────────────────────────────────────────────
+
+// SwarmNodeInput is the data shape that handlers pass to UpsertSwarmNodes.
+// It matches the JSON shape from handler's NodeOut struct so we can reuse it.
+type SwarmNodeInput struct {
+	ID            string            `json:"id"`
+	Hostname      string            `json:"hostname"`
+	Role          string            `json:"role"`
+	State         string            `json:"state"`
+	Availability  string            `json:"availability"`
+	IP            string            `json:"ip"`
+	CPUCores      int               `json:"cpuCores"`
+	MemTotalMB    int64             `json:"memTotalMB"`
+	DockerVersion string            `json:"dockerVersion"`
+	IsLeader      bool              `json:"isLeader"`
+	ManagerAddr   string            `json:"managerAddr"`
+	Labels        map[string]string `json:"labels"`
+}
+
+// UpsertSwarmNodes inserts or updates swarm node records in the swarmNodes table.
+// Called asynchronously from the SwarmNodes handler — never blocks the response.
+func (d *DB) UpsertSwarmNodes(nodes interface{}) {
+	if d.conn == nil {
+		return
+	}
+	// We accept interface{} to avoid circular import; use json round-trip to parse.
+	b, err := json.Marshal(nodes)
+	if err != nil {
+		return
+	}
+	var list []SwarmNodeInput
+	if err := json.Unmarshal(b, &list); err != nil {
+		return
+	}
+	for _, n := range list {
+		labelsJSON, _ := json.Marshal(n.Labels)
+		isLeader := 0
+		if n.IsLeader {
+			isLeader = 1
+		}
+		isManager := 0
+		if n.Role == "manager" {
+			isManager = 1
+		}
+		state := n.State
+		if state != "ready" && state != "down" && state != "disconnected" {
+			state = "ready"
+		}
+		avail := n.Availability
+		if avail != "active" && avail != "pause" && avail != "drain" {
+			avail = "active"
+		}
+		_, err := d.conn.Exec(`
+			INSERT INTO swarmNodes
+				(nodeId, hostname, role, state, availability, advertiseAddr,
+				 labels, engineVersion, cpuCores, memTotalMB, isManager, isLeader)
+			VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+			ON DUPLICATE KEY UPDATE
+				hostname=VALUES(hostname), role=VALUES(role),
+				state=VALUES(state), availability=VALUES(availability),
+				advertiseAddr=VALUES(advertiseAddr),
+				labels=VALUES(labels), engineVersion=VALUES(engineVersion),
+				cpuCores=VALUES(cpuCores), memTotalMB=VALUES(memTotalMB),
+				isManager=VALUES(isManager), isLeader=VALUES(isLeader),
+				lastSeenAt=CURRENT_TIMESTAMP
+		`,
+			n.ID, n.Hostname, n.Role, state, avail, n.IP,
+			string(labelsJSON), n.DockerVersion,
+			n.CPUCores, n.MemTotalMB, isManager, isLeader,
+		)
+		if err != nil {
+			log.Printf("[DB] UpsertSwarmNodes error for node %s: %v", n.ID, err)
+		}
+	}
+}
+
+// UpsertSwarmTokens stores the current swarm join tokens.
+func (d *DB) UpsertSwarmTokens(workerToken, managerToken, managerAddr string) {
+	if d.conn == nil {
+		return
+	}
+	_, err := d.conn.Exec(`
+		INSERT INTO swarmTokens (managerToken, workerToken, managerAddr)
+		VALUES (?, ?, ?)
+		ON DUPLICATE KEY UPDATE
+			managerToken=VALUES(managerToken),
+			workerToken=VALUES(workerToken),
+			managerAddr=VALUES(managerAddr)
+	`, managerToken, workerToken, managerAddr)
+	if err != nil {
+		log.Printf("[DB] UpsertSwarmTokens error: %v", err)
+	}
+}
+
+// GetSwarmTokens retrieves the stored join tokens.
+func (d *DB) GetSwarmTokens() (worker, manager, addr string, err error) {
+	if d.conn == nil {
+		err = fmt.Errorf("DB not connected")
+		return
+	}
+	row := d.conn.QueryRow(`
+		SELECT COALESCE(workerToken,''), COALESCE(managerToken,''), COALESCE(managerAddr,'')
+		FROM swarmTokens ORDER BY id DESC LIMIT 1
+	`)
+	err = row.Scan(&worker, &manager, &addr)
+	return
+}
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
+func scanAgentConfig(row *sql.Row) (*AgentConfig, error) {
+	var cfg AgentConfig
+	var systemPrompt sql.NullString
+	var allowedToolsJSON sql.NullString
+	var temperature sql.NullFloat64
+	var maxTokens sql.NullInt64
+	var isOrch, isSystem, isActive int
+
+	err := row.Scan(
+		&cfg.ID, &cfg.Name, &cfg.Model,
+		&systemPrompt, &allowedToolsJSON,
+		&temperature, &maxTokens,
+		&isOrch, &isSystem, &isActive,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	cfg.SystemPrompt = systemPrompt.String
+	cfg.Temperature = temperature.Float64
+	if cfg.Temperature == 0 {
+		cfg.Temperature = 0.5
+	}
+	cfg.MaxTokens = int(maxTokens.Int64)
+	if cfg.MaxTokens == 0 {
+		cfg.MaxTokens = 8192
+	}
+	cfg.IsOrchestrator = isOrch == 1
+	cfg.IsSystem = isSystem == 1
+	cfg.IsActive = isActive == 1
+
+	if allowedToolsJSON.Valid && allowedToolsJSON.String != "" && allowedToolsJSON.String != "null" {
+		_ = json.Unmarshal([]byte(allowedToolsJSON.String), &cfg.AllowedTools)
+	}
+
+	return &cfg, nil
+}
+
 // normalizeDSN converts mysql://user:pass@host:port/db to user:pass@tcp(host:port)/db
 func normalizeDSN(dsn string) string {
 	if !strings.HasPrefix(dsn, "mysql://") {
@@ -304,3 +797,60 @@ func normalizeDSN(dsn string) string {
 	}
 	return fmt.Sprintf("%s@tcp(%s)%s?parseTime=true&charset=utf8mb4%s", userInfo, hostPort, dbName, tlsParam)
 }
+
+// ─── Agent Container Fields ───────────────────────────────────────────────────
+// These methods support the agent-worker container architecture where each
+// agent runs as an autonomous Docker Swarm service.
+
+// UpdateContainerStatus updates the container lifecycle state of an agent.
+func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error {
+	if d.conn == nil {
+		return nil
+	}
+	_, err := d.conn.Exec(`
+		UPDATE agents
+		SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW()
+		WHERE id = ?
+	`, status, serviceName, servicePort, agentID)
+	return err
+}
+
+// HistoryRow is a single entry from agentHistory for sliding window memory.
+type HistoryRow struct {
+	ID            int    `json:"id"`
+	UserMessage   string `json:"userMessage"`
+	AgentResponse string `json:"agentResponse"`
+	ConvID        string `json:"conversationId"`
+}
+
+// GetAgentHistory returns the last N conversation turns for an agent, oldest first.
+func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) {
+	if d.conn == nil {
+		return nil, nil
+	}
+	rows, err := d.conn.Query(`
+		SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'')
+		FROM agentHistory
+		WHERE agentId = ?
+		ORDER BY id DESC
+		LIMIT ?
+	`, agentID, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var result []HistoryRow
+	for rows.Next() {
+		var h HistoryRow
+		if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil {
+			continue
+		}
+		result = append(result, h)
+	}
+	// Reverse so oldest is first (for LLM context ordering)
+	for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 {
+		result[i], result[j] = result[j], result[i]
+	}
+	return result, nil
+}
diff --git a/gateway/internal/docker/client.go b/gateway/internal/docker/client.go
index 3c8037a..797ea34 100644
--- a/gateway/internal/docker/client.go
+++ b/gateway/internal/docker/client.go
@@ -1,22 +1,25 @@
 package docker
 
 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net"
 	"net/http"
+	"os/exec"
+	"strings"
 	"time"
 )
 
-// DockerClient communicates with the Docker daemon via Unix socket or TCP.
+// DockerClient communicates with the Docker daemon via Unix socket.
 type DockerClient struct {
 	httpClient *http.Client
 	baseURL    string
 }
 
-// NewDockerClient creates a client that talks to /var/run/docker.sock.
+// NewDockerClient creates a client talking to /var/run/docker.sock.
 func NewDockerClient() *DockerClient {
 	transport := &http.Transport{
 		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
@@ -24,11 +27,13 @@ func NewDockerClient() *DockerClient {
 		},
 	}
 	return &DockerClient{
-		httpClient: &http.Client{Transport: transport, Timeout: 10 * time.Second},
-		baseURL:    "http://localhost", // host is ignored for unix socket
+		httpClient: &http.Client{Transport: transport, Timeout: 30 * time.Second},
+		baseURL:    "http://localhost",
 	}
 }
 
+// ─── HTTP helpers ─────────────────────────────────────────────────────────────
+
 func (c *DockerClient) get(path string, out interface{}) error {
 	resp, err := c.httpClient.Get(c.baseURL + path)
 	if err != nil {
@@ -42,16 +47,64 @@ func (c *DockerClient) get(path string, out interface{}) error {
 	return json.Unmarshal(body, out)
 }
 
-// ---- Types ----------------------------------------------------------------
+func (c *DockerClient) post(path string, payload interface{}, out interface{}) error {
+	b, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+	resp, err := c.httpClient.Post(c.baseURL+path, "application/json", bytes.NewReader(b))
+	if err != nil {
+		return fmt.Errorf("docker POST %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+	body, _ := io.ReadAll(resp.Body)
+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("docker POST %s: status %d: %s", path, resp.StatusCode, string(body))
+	}
+	if out != nil && len(body) > 0 {
+		return json.Unmarshal(body, out)
+	}
+	return nil
+}
+
+func (c *DockerClient) postUpdate(path string, version int, payload interface{}) error {
+	b, err := json.Marshal(payload)
+	if err != nil {
+		return err
+	}
+	url := fmt.Sprintf("%s%s?version=%d", c.baseURL, path, version)
+	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(b))
+	if err != nil {
+		return err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("docker POST(update) %s: %w", path, err)
+	}
+	defer resp.Body.Close()
+	body, _ := io.ReadAll(resp.Body)
+	if resp.StatusCode >= 400 {
+		return fmt.Errorf("docker POST(update) %s: status %d: %s", path, resp.StatusCode, string(body))
+	}
+	return nil
+}
+
+// ─── Swarm Node Types ─────────────────────────────────────────────────────────
 
 type SwarmNode struct {
-	ID          string          `json:"ID"`
+	ID          string         `json:"ID"`
 	Description NodeDescription `json:"Description"`
-	Status      NodeStatus      `json:"Status"`
+	Status      NodeStatus     `json:"Status"`
 	ManagerStatus *ManagerStatus `json:"ManagerStatus,omitempty"`
-	Spec        NodeSpec        `json:"Spec"`
-	UpdatedAt   time.Time       `json:"UpdatedAt"`
-	CreatedAt   time.Time       `json:"CreatedAt"`
+	Spec        NodeSpec       `json:"Spec"`
+	UpdatedAt   time.Time      `json:"UpdatedAt"`
+	CreatedAt   time.Time      `json:"CreatedAt"`
+	Version     VersionInfo    `json:"Version"`
+}
+
+type VersionInfo struct {
+	Index int `json:"Index"`
 }
 
 type NodeDescription struct {
@@ -82,17 +135,155 @@ type NodeStatus struct {
 }
 
 type ManagerStatus struct {
-	Addr           string `json:"Addr"`
-	Leader         bool   `json:"Leader"`
-	Reachability   string `json:"Reachability"`
+	Addr         string `json:"Addr"`
+	Leader       bool   `json:"Leader"`
+	Reachability string `json:"Reachability"`
 }
 
 type NodeSpec struct {
-	Role         string `json:"Role"`
-	Availability string `json:"Availability"`
+	Role         string            `json:"Role"`
+	Availability string            `json:"Availability"`
 	Labels       map[string]string `json:"Labels"`
 }
 
+// ─── Swarm Service Types ──────────────────────────────────────────────────────
+
+type SwarmService struct {
+	ID          string          `json:"ID"`
+	Spec        ServiceSpec     `json:"Spec"`
+	ServiceStatus *ServiceStatus `json:"ServiceStatus,omitempty"`
+	UpdatedAt   time.Time       `json:"UpdatedAt"`
+	CreatedAt   time.Time       `json:"CreatedAt"`
+	Version     VersionInfo     `json:"Version"`
+}
+
+type ServiceSpec struct {
+	Name         string            `json:"Name"`
+	Mode         ServiceMode       `json:"Mode"`
+	TaskTemplate TaskTemplate      `json:"TaskTemplate"`
+	EndpointSpec *EndpointSpec     `json:"EndpointSpec,omitempty"`
+	Labels       map[string]string `json:"Labels"`
+	Networks     []NetworkAttachment `json:"Networks,omitempty"`
+}
+
+type NetworkAttachment struct {
+	Target  string   `json:"Target"`
+	Aliases []string `json:"Aliases,omitempty"`
+}
+
+type ServiceMode struct {
+	Replicated *ReplicatedService `json:"Replicated,omitempty"`
+	Global     *struct{}          `json:"Global,omitempty"`
+}
+
+type ReplicatedService struct {
+	Replicas int `json:"Replicas"`
+}
+
+type TaskTemplate struct {
+	ContainerSpec ContainerSpec `json:"ContainerSpec"`
+	Resources     *TaskResources `json:"Resources,omitempty"`
+	Placement     *Placement    `json:"Placement,omitempty"`
+}
+
+type ContainerSpec struct {
+	Image  string   `json:"Image"`
+	Env    []string `json:"Env,omitempty"`
+	Labels map[string]string `json:"Labels,omitempty"`
+}
+
+type TaskResources struct {
+	Limits       *ResourceSpec `json:"Limits,omitempty"`
+	Reservations *ResourceSpec `json:"Reservations,omitempty"`
+}
+
+type ResourceSpec struct {
+	NanoCPUs    int64 `json:"NanoCPUs,omitempty"`
+	MemoryBytes int64 `json:"MemoryBytes,omitempty"`
+}
+
+type Placement struct {
+	Constraints []string `json:"Constraints,omitempty"`
+}
+
+type EndpointSpec struct {
+	Ports []PortConfig `json:"Ports,omitempty"`
+}
+
+type PortConfig struct {
+	Protocol      string `json:"Protocol"`
+	TargetPort    int    `json:"TargetPort"`
+	PublishedPort int    `json:"PublishedPort"`
+	PublishMode   string `json:"PublishMode"`
+}
+
+type ServiceStatus struct {
+	RunningTasks   int `json:"RunningTasks"`
+	DesiredTasks   int `json:"DesiredTasks"`
+	CompletedTasks int `json:"CompletedTasks"`
+}
+
+// ─── Swarm Task Types ─────────────────────────────────────────────────────────
+
+type SwarmTask struct {
+	ID          string      `json:"ID"`
+	ServiceID   string      `json:"ServiceID"`
+	NodeID      string      `json:"NodeID"`
+	Spec        TaskSpec    `json:"Spec"`
+	Status      TaskStatus  `json:"Status"`
+	Slot        int         `json:"Slot"`
+	UpdatedAt   time.Time   `json:"UpdatedAt"`
+	CreatedAt   time.Time   `json:"CreatedAt"`
+}
+
+type TaskSpec struct {
+	ContainerSpec ContainerSpec `json:"ContainerSpec"`
+}
+
+type TaskStatus struct {
+	Timestamp   time.Time       `json:"Timestamp"`
+	State       string          `json:"State"`
+	Message     string          `json:"Message"`
+	ContainerStatus *ContainerTaskStatus `json:"ContainerStatus,omitempty"`
+}
+
+type ContainerTaskStatus struct {
+	ContainerID string `json:"ContainerID"`
+	PID         int    `json:"PID"`
+}
+
+// ─── Swarm Info / Tokens ──────────────────────────────────────────────────────
+
+type DockerInfo struct {
+	Swarm SwarmInfo `json:"Swarm"`
+}
+
+type SwarmInfo struct {
+	NodeID           string `json:"NodeID"`
+	LocalNodeState   string `json:"LocalNodeState"`
+	ControlAvailable bool   `json:"ControlAvailable"`
+	Managers         int    `json:"Managers"`
+	Nodes            int    `json:"Nodes"`
+	RemoteManagers   []RemoteManager `json:"RemoteManagers"`
+}
+
+type RemoteManager struct {
+	NodeID string `json:"NodeID"`
+	Addr   string `json:"Addr"`
+}
+
+type SwarmSpec struct {
+	JoinTokens JoinTokens `json:"JoinTokens"`
+	ID         string     `json:"ID"`
+}
+
+type JoinTokens struct {
+	Worker  string `json:"Worker"`
+	Manager string `json:"Manager"`
+}
+
+// ─── Container types ──────────────────────────────────────────────────────────
+
 type Container struct {
 	ID      string            `json:"Id"`
 	Names   []string          `json:"Names"`
@@ -109,9 +300,9 @@ type ContainerStats struct {
 }
 
 type CPUStats struct {
-	CPUUsage    CPUUsage `json:"cpu_usage"`
-	SystemCPUUsage int64 `json:"system_cpu_usage"`
-	OnlineCPUs  int      `json:"online_cpus"`
+	CPUUsage       CPUUsage `json:"cpu_usage"`
+	SystemCPUUsage int64    `json:"system_cpu_usage"`
+	OnlineCPUs     int      `json:"online_cpus"`
 }
 
 type CPUUsage struct {
@@ -120,27 +311,14 @@ type CPUUsage struct {
 }
 
 type MemoryStats struct {
-	Usage    int64 `json:"usage"`
-	MaxUsage int64 `json:"max_usage"`
-	Limit    int64 `json:"limit"`
+	Usage    int64            `json:"usage"`
+	MaxUsage int64            `json:"max_usage"`
+	Limit    int64            `json:"limit"`
 	Stats    map[string]int64 `json:"stats"`
 }
 
-type DockerInfo struct {
-	Swarm SwarmInfo `json:"Swarm"`
-}
+// ─── Methods: Swarm info ──────────────────────────────────────────────────────
 
-type SwarmInfo struct {
-	NodeID           string `json:"NodeID"`
-	LocalNodeState   string `json:"LocalNodeState"`
-	ControlAvailable bool   `json:"ControlAvailable"`
-	Managers         int    `json:"Managers"`
-	Nodes            int    `json:"Nodes"`
-}
-
-// ---- Methods ---------------------------------------------------------------
-
-// IsSwarmActive checks if Docker Swarm is initialized.
 func (c *DockerClient) IsSwarmActive() bool {
 	var info DockerInfo
 	if err := c.get("/v1.44/info", &info); err != nil {
@@ -149,7 +327,6 @@ func (c *DockerClient) IsSwarmActive() bool {
 	return info.Swarm.LocalNodeState == "active"
 }
 
-// GetSwarmInfo returns basic swarm info.
 func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) {
 	var info DockerInfo
 	if err := c.get("/v1.44/info", &info); err != nil {
@@ -158,7 +335,27 @@ func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) {
 	return &info, nil
 }
 
-// ListNodes returns all Swarm nodes (requires manager node).
+// GetJoinTokens returns the Swarm worker and manager join tokens.
+// Requires this node to be a swarm manager.
+func (c *DockerClient) GetJoinTokens() (*SwarmSpec, error) {
+	var spec SwarmSpec
+	if err := c.get("/v1.44/swarm", &spec); err != nil {
+		return nil, err
+	}
+	return &spec, nil
+}
+
+// GetManagerAddr returns the advertise address (IP:2377) for joining this swarm.
+func (c *DockerClient) GetManagerAddr() string {
+	info, err := c.GetSwarmInfo()
+	if err != nil || len(info.Swarm.RemoteManagers) == 0 {
+		return ""
+	}
+	return info.Swarm.RemoteManagers[0].Addr
+}
+
+// ─── Methods: Nodes ───────────────────────────────────────────────────────────
+
 func (c *DockerClient) ListNodes() ([]SwarmNode, error) {
 	var nodes []SwarmNode
 	if err := c.get("/v1.44/nodes", &nodes); err != nil {
@@ -167,7 +364,323 @@ func (c *DockerClient) ListNodes() ([]SwarmNode, error) {
 	return nodes, nil
 }
 
-// ListContainers returns all running containers on this host.
+// UpdateNodeAvailability sets a node's availability (active|pause|drain).
+func (c *DockerClient) UpdateNodeAvailability(nodeID, availability string) error {
+	// First get current node spec + version
+	var node SwarmNode
+	if err := c.get("/v1.44/nodes/"+nodeID, &node); err != nil {
+		return err
+	}
+	node.Spec.Availability = availability
+	return c.postUpdate("/v1.44/nodes/"+nodeID+"/update", node.Version.Index, node.Spec)
+}
+
+// AddNodeLabel adds a label to a swarm node.
+func (c *DockerClient) AddNodeLabel(nodeID, key, value string) error {
+	var node SwarmNode
+	if err := c.get("/v1.44/nodes/"+nodeID, &node); err != nil {
+		return err
+	}
+	if node.Spec.Labels == nil {
+		node.Spec.Labels = map[string]string{}
+	}
+	node.Spec.Labels[key] = value
+	return c.postUpdate("/v1.44/nodes/"+nodeID+"/update", node.Version.Index, node.Spec)
+}
+
+// ─── Methods: Services ────────────────────────────────────────────────────────
+
+// ListServices returns all swarm services, optionally filtered by label.
+func (c *DockerClient) ListServices() ([]SwarmService, error) {
+	var services []SwarmService
+	// Include ServiceStatus so running/desired replicas are returned
+	if err := c.get("/v1.44/services?status=true", &services); err != nil {
+		return nil, err
+	}
+	return services, nil
+}
+
+// GetService returns a single service by ID or name.
+func (c *DockerClient) GetService(idOrName string) (*SwarmService, error) {
+	var svc SwarmService
+	if err := c.get("/v1.44/services/"+idOrName+"?status=true", &svc); err != nil {
+		return nil, err
+	}
+	return &svc, nil
+}
+
+// ScaleService updates the replica count for a replicated service.
+func (c *DockerClient) ScaleService(idOrName string, replicas int) error {
+	svc, err := c.GetService(idOrName)
+	if err != nil {
+		return err
+	}
+	if svc.Spec.Mode.Replicated == nil {
+		return fmt.Errorf("service %s is not in replicated mode", idOrName)
+	}
+	svc.Spec.Mode.Replicated.Replicas = replicas
+	return c.postUpdate(
+		"/v1.44/services/"+svc.ID+"/update",
+		svc.Version.Index,
+		svc.Spec,
+	)
+}
+
+// ListServiceTasks returns all tasks for a given service.
+func (c *DockerClient) ListServiceTasks(serviceID string) ([]SwarmTask, error) {
+	var tasks []SwarmTask
+	filter := fmt.Sprintf(`{"service":["%s"]}`, serviceID)
+	path := "/v1.44/tasks?filters=" + urlEncode(filter)
+	if err := c.get(path, &tasks); err != nil {
+		return nil, err
+	}
+	return tasks, nil
+}
+
+// ListAllTasks returns all swarm tasks (across services).
+func (c *DockerClient) ListAllTasks() ([]SwarmTask, error) {
+	var tasks []SwarmTask
+	if err := c.get("/v1.44/tasks", &tasks); err != nil {
+		return nil, err
+	}
+	return tasks, nil
+}
+
+// CreateAgentService deploys a new swarm service for an AI agent.
+// image: container image, name: service name, replicas: initial count,
+// env: environment variables, port: optional published port (0 = none).
+// CreateAgentServiceOpts holds options for deploying an agent Swarm service.
+type CreateAgentServiceOpts struct {
+	Name        string
+	Image       string
+	Replicas    int
+	Env         []string
+	Port        int
+	Networks    []string // overlay network names/IDs to attach
+	Labels      map[string]string
+	Constraints []string // placement constraints, e.g. ["node.role==manager"]
+}
+
+func (c *DockerClient) CreateAgentService(name, image string, replicas int, env []string, port int) (*SwarmService, error) {
+	return c.CreateAgentServiceFull(CreateAgentServiceOpts{
+		Name:     name,
+		Image:    image,
+		Replicas: replicas,
+		Env:      env,
+		Port:     port,
+	})
+}
+
+func (c *DockerClient) CreateAgentServiceFull(opts CreateAgentServiceOpts) (*SwarmService, error) {
+	labels := map[string]string{
+		"goclaw.agent": "true",
+		"goclaw.name":  opts.Name,
+	}
+	for k, v := range opts.Labels {
+		labels[k] = v
+	}
+	var placement *Placement
+	if len(opts.Constraints) > 0 {
+		placement = &Placement{Constraints: opts.Constraints}
+	}
+	spec := ServiceSpec{
+		Name: opts.Name,
+		Mode: ServiceMode{
+			Replicated: &ReplicatedService{Replicas: opts.Replicas},
+		},
+		TaskTemplate: TaskTemplate{
+			ContainerSpec: ContainerSpec{
+				Image: opts.Image,
+				Env:   opts.Env,
+			},
+			Placement: placement,
+		},
+		Labels: labels,
+	}
+	if opts.Port > 0 {
+		spec.EndpointSpec = &EndpointSpec{
+			Ports: []PortConfig{
+				{
+					Protocol:      "tcp",
+					TargetPort:    opts.Port,
+					PublishedPort: opts.Port,
+					PublishMode:   "host",
+				},
+			},
+		}
+	}
+	if len(opts.Networks) > 0 {
+		for _, net := range opts.Networks {
+			spec.Networks = append(spec.Networks, NetworkAttachment{
+				Target:  net,
+				Aliases: []string{opts.Name},
+			})
+		}
+	}
+	var created struct {
+		ID string `json:"ID"`
+	}
+	if err := c.post("/v1.44/services/create", spec, &created); err != nil {
+		return nil, err
+	}
+	return c.GetService(created.ID)
+}
+
+// RemoveService removes a swarm service by ID or name.
+func (c *DockerClient) RemoveService(idOrName string) error {
+	req, err := http.NewRequest(http.MethodDelete, c.baseURL+"/v1.44/services/"+urlEncode(idOrName), nil)
+	if err != nil {
+		return err
+	}
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("docker DELETE service %s: %w", idOrName, err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 400 {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("docker DELETE service %s: status %d: %s", idOrName, resp.StatusCode, string(body))
+	}
+	return nil
+}
+
+// GetServiceLastActivity returns the most recent task update time for a service.
+// Used to determine whether a service is idle.
+func (c *DockerClient) GetServiceLastActivity(serviceID string) (time.Time, error) {
+	tasks, err := c.ListServiceTasks(serviceID)
+	if err != nil {
+		return time.Time{}, err
+	}
+	var latest time.Time
+	for _, t := range tasks {
+		if t.UpdatedAt.After(latest) {
+			latest = t.UpdatedAt
+		}
+	}
+	return latest, nil
+}
+
+// ─── Methods: Containers ─────────────────────────────────────────────────────
+
+// RunContainerOpts holds options for running a standalone container.
+type RunContainerOpts struct {
+	Name     string
+	Image    string
+	Env      []string
+	Networks []string // bridge/overlay networks to attach
+	Port     int      // host port (also used as container port)
+	Labels   map[string]string
+}
+
+// RunContainer creates and starts a standalone Docker container (docker run equivalent).
+// Returns the container ID on success.
+func (c *DockerClient) RunContainer(opts RunContainerOpts) (string, error) {
+	labels := map[string]string{"goclaw.agent": "true"}
+	for k, v := range opts.Labels {
+		labels[k] = v
+	}
+
+	// Build port bindings: host port -> container port
+	exposedPorts := map[string]struct{}{}
+	portBindings := map[string][]map[string]string{}
+	if opts.Port > 0 {
+		key := fmt.Sprintf("%d/tcp", opts.Port)
+		exposedPorts[key] = struct{}{}
+		portBindings[key] = []map[string]string{{"HostPort": fmt.Sprintf("%d", opts.Port)}}
+	}
+
+	// Pick first network for creation; additional networks attached after
+	firstNetwork := ""
+	if len(opts.Networks) > 0 {
+		firstNetwork = opts.Networks[0]
+	}
+
+	body := map[string]any{
+		"Image":        opts.Image,
+		"Env":          opts.Env,
+		"Labels":       labels,
+		"ExposedPorts": exposedPorts,
+		"HostConfig": map[string]any{
+			"PortBindings":  portBindings,
+			"RestartPolicy": map[string]any{"Name": "unless-stopped"},
+			"NetworkMode":   firstNetwork,
+		},
+		"NetworkingConfig": map[string]any{
+			"EndpointsConfig": map[string]any{
+				firstNetwork: map[string]any{
+					"Aliases": []string{opts.Name},
+				},
+			},
+		},
+	}
+
+	var created struct {
+		ID string `json:"Id"`
+	}
+	if err := c.post(fmt.Sprintf("/v1.44/containers/create?name=%s", urlEncode(opts.Name)), body, &created); err != nil {
+		return "", fmt.Errorf("create container %s: %w", opts.Name, err)
+	}
+
+	// Start the container
+	startURL := fmt.Sprintf("/v1.44/containers/%s/start", created.ID)
+	req, err := http.NewRequest(http.MethodPost, c.baseURL+startURL, nil)
+	if err != nil {
+		return created.ID, err
+	}
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return created.ID, fmt.Errorf("start container: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 300 {
+		b, _ := io.ReadAll(resp.Body)
+		return created.ID, fmt.Errorf("start container HTTP %d: %s", resp.StatusCode, string(b))
+	}
+
+	// Attach additional networks
+	for i, net := range opts.Networks {
+		if i == 0 {
+			continue // already attached via NetworkMode
+		}
+		netBody := map[string]any{
+			"Container": created.ID,
+			"EndpointConfig": map[string]any{
+				"Aliases": []string{opts.Name},
+			},
+		}
+		_ = c.post(fmt.Sprintf("/v1.44/networks/%s/connect", urlEncode(net)), netBody, nil)
+	}
+
+	return created.ID, nil
+}
+
+// StopContainer stops and removes a standalone container by name or ID.
+func (c *DockerClient) StopContainer(nameOrID string) error {
+	// Stop
+	stopURL := fmt.Sprintf("/v1.44/containers/%s/stop", urlEncode(nameOrID))
+	req, _ := http.NewRequest(http.MethodPost, c.baseURL+stopURL, nil)
+	resp, err := c.httpClient.Do(req)
+	if err == nil {
+		resp.Body.Close()
+	}
+	// Remove (force)
+	rmURL := fmt.Sprintf("/v1.44/containers/%s?force=true", urlEncode(nameOrID))
+	req, err = http.NewRequest(http.MethodDelete, c.baseURL+rmURL, nil)
+	if err != nil {
+		return err
+	}
+	resp, err = c.httpClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode >= 400 {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("remove container HTTP %d: %s", resp.StatusCode, string(b))
+	}
+	return nil
+}
+
 func (c *DockerClient) ListContainers() ([]Container, error) {
 	var containers []Container
 	if err := c.get("/v1.44/containers/json?all=false", &containers); err != nil {
@@ -176,7 +689,6 @@ func (c *DockerClient) ListContainers() ([]Container, error) {
 	return containers, nil
 }
 
-// GetContainerStats returns one-shot stats for a container (no streaming).
 func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, error) {
 	var stats ContainerStats
 	if err := c.get(fmt.Sprintf("/v1.44/containers/%s/stats?stream=false", containerID), &stats); err != nil {
@@ -185,7 +697,69 @@ func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, e
 	return &stats, nil
 }
 
-// CalcCPUPercent computes CPU usage % from two consecutive stats snapshots.
+// ─── Host Shell execution ─────────────────────────────────────────────────────
+// The gateway runs inside a container but has /var/run/docker.sock mounted.
+// We use `docker exec` against the host PID namespace via a privileged helper,
+// OR simply run commands via the docker socket by exec-ing into the gateway
+// container's own shell with nsenter to reach PID 1 on the host.
+//
+// Approach: use `nsenter -t 1 -m -u -i -n -p -- <cmd>` via the host PID namespace.
+// This requires the container to run with --privileged or SYS_PTRACE capability
+// and PID namespace sharing. We add that to docker-compose.yml.
+//
+// Alternative (safer): exec into host via SSH or a privileged sidecar.
+// For now we use nsenter which works when pid:host and privileged: true.
+
+// ExecOnHost runs a shell command on the host via nsenter into PID 1.
+// Returns combined stdout+stderr.
+func ExecOnHost(command string) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	// Try nsenter (requires pid:host + SYS_ADMIN or privileged)
+	cmd := exec.CommandContext(ctx, "nsenter", "-t", "1", "-m", "-u", "-i", "-n", "-p", "--",
+		"sh", "-c", command)
+	var out bytes.Buffer
+	var stderr bytes.Buffer
+	cmd.Stdout = &out
+	cmd.Stderr = &stderr
+	if err := cmd.Run(); err != nil {
+		// If nsenter fails, fall back to running in container scope
+		cmd2 := exec.CommandContext(ctx, "sh", "-c", command)
+		var out2 bytes.Buffer
+		var stderr2 bytes.Buffer
+		cmd2.Stdout = &out2
+		cmd2.Stderr = &stderr2
+		if err2 := cmd2.Run(); err2 != nil {
+			combined := out2.String() + stderr2.String()
+			if combined == "" {
+				combined = err2.Error()
+			}
+			return combined, err2
+		}
+		return out2.String() + stderr2.String(), nil
+	}
+	return out.String() + stderr.String(), nil
+}
+
+// ExecDockerCLI runs `docker <args>` on the host by calling the docker socket.
+// Since we have the socket mounted, we can exec docker commands directly
+// using the docker CLI binary if available.
+func ExecDockerCLI(args ...string) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "docker", args...)
+	var out, stderr bytes.Buffer
+	cmd.Stdout = &out
+	cmd.Stderr = &stderr
+	if err := cmd.Run(); err != nil {
+		return out.String() + stderr.String(), err
+	}
+	return out.String(), nil
+}
+
+// CalcCPUPercent computes CPU% from stats snapshot.
 func CalcCPUPercent(stats *ContainerStats) float64 {
 	cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage)
 	systemDelta := float64(stats.CPUStats.SystemCPUUsage) - float64(stats.PreCPUStats.SystemCPUUsage)
@@ -198,3 +772,19 @@ func CalcCPUPercent(stats *ContainerStats) float64 {
 	}
 	return 0
 }
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
+func urlEncode(s string) string {
+	var b strings.Builder
+	for _, r := range s {
+		switch {
+		case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z', r >= '0' && r <= '9',
+			r == '-', r == '_', r == '.', r == '~':
+			b.WriteRune(r)
+		default:
+			b.WriteString(fmt.Sprintf("%%%02X", r))
+		}
+	}
+	return b.String()
+}
diff --git a/gateway/internal/llm/client.go b/gateway/internal/llm/client.go
index 1f03ac6..a21b279 100644
--- a/gateway/internal/llm/client.go
+++ b/gateway/internal/llm/client.go
@@ -2,6 +2,7 @@
 package llm
 
 import (
+	"bufio"
 	"bytes"
 	"context"
 	"encoding/json"
@@ -105,6 +106,13 @@ func NewClient(baseURL, apiKey string) *Client {
 	}
 }
 
+// UpdateCredentials updates the LLM client's base URL and API key at runtime.
+// Called when the active provider is changed via the Settings UI.
+func (c *Client) UpdateCredentials(baseURL, apiKey string) {
+	c.baseURL = strings.TrimRight(baseURL, "/")
+	c.apiKey = apiKey
+}
+
 func (c *Client) headers() map[string]string {
 	h := map[string]string{
 		"Content-Type": "application/json",
@@ -159,7 +167,86 @@ func (c *Client) ListModels(ctx context.Context) (*ModelsResponse, error) {
 	return &result, nil
 }
 
-// Chat sends a chat completion request (non-streaming).
+// ChatStream sends a streaming chat completion request (SSE).
+// It calls the callback for each chunk received.
+func (c *Client) ChatStream(ctx context.Context, req ChatRequest, onChunk func(delta string, done bool)) error {
+	req.Stream = true
+
+	body, err := json.Marshal(req)
+	if err != nil {
+		return err
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost,
+		c.baseURL+"/chat/completions", bytes.NewReader(body))
+	if err != nil {
+		return err
+	}
+	for k, v := range c.headers() {
+		httpReq.Header.Set(k, v)
+	}
+	httpReq.Header.Set("Accept", "text/event-stream")
+
+	// Use a client without timeout for streaming
+	streamClient := &http.Client{Timeout: 0}
+	resp, err := streamClient.Do(httpReq)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("ollama stream API error (%d): %s", resp.StatusCode, string(respBody))
+	}
+
+	// Parse SSE stream
+	scanner := bufio.NewScanner(resp.Body)
+	for scanner.Scan() {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		line := scanner.Text()
+		if !strings.HasPrefix(line, "data: ") {
+			continue
+		}
+		data := strings.TrimPrefix(line, "data: ")
+		if data == "[DONE]" {
+			onChunk("", true)
+			return nil
+		}
+
+		var chunk struct {
+			Choices []struct {
+				Delta struct {
+					Content string `json:"content"`
+				} `json:"delta"`
+				FinishReason *string `json:"finish_reason"`
+			} `json:"choices"`
+		}
+		if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+			continue
+		}
+		if len(chunk.Choices) > 0 {
+			delta := chunk.Choices[0].Delta.Content
+			if delta != "" {
+				onChunk(delta, false)
+			}
+			if chunk.Choices[0].FinishReason != nil && *chunk.Choices[0].FinishReason == "stop" {
+				onChunk("", true)
+				return nil
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return err
+	}
+	onChunk("", true)
+	return nil
+}
 func (c *Client) Chat(ctx context.Context, req ChatRequest) (*ChatResponse, error) {
 	req.Stream = false
 
diff --git a/gateway/internal/orchestrator/orchestrator.go b/gateway/internal/orchestrator/orchestrator.go
index 045dd98..dedd19c 100644
--- a/gateway/internal/orchestrator/orchestrator.go
+++ b/gateway/internal/orchestrator/orchestrator.go
@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"strings"
 	"time"
 
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
@@ -31,13 +32,15 @@ type ToolCallStep struct {
 	DurationMs int64  `json:"durationMs"`
 }
 
+// ChatResult is the response from the orchestrator chat.
 type ChatResult struct {
-	Success   bool           `json:"success"`
-	Response  string         `json:"response"`
-	ToolCalls []ToolCallStep `json:"toolCalls"`
-	Model     string         `json:"model"`
-	Usage     *llm.Usage     `json:"usage,omitempty"`
-	Error     string         `json:"error,omitempty"`
+	Success      bool           `json:"success"`
+	Response     string         `json:"response"`
+	ToolCalls    []ToolCallStep `json:"toolCalls"`
+	Model        string         `json:"model"`
+	ModelWarning string         `json:"modelWarning,omitempty"`
+	Usage        *llm.Usage     `json:"usage,omitempty"`
+	Error        string         `json:"error,omitempty"`
 }
 
 // OrchestratorConfig is the runtime config loaded from DB or defaults.
@@ -51,6 +54,30 @@ type OrchestratorConfig struct {
 	MaxTokens    int
 }
 
+// RetryPolicy controls how the orchestrator retries failed or empty LLM calls.
+type RetryPolicy struct {
+	// MaxLLMRetries is the number of additional attempts after a failure.
+	// Total attempts = MaxLLMRetries + 1.  Default: 3 (4 total).
+	MaxLLMRetries int
+	// InitialDelay before the first retry.  Default: 2s.
+	InitialDelay time.Duration
+	// MaxDelay caps the exponential back-off.  Default: 30s.
+	MaxDelay time.Duration
+	// RetryOnEmpty means an empty-content response is treated as a soft failure
+	// and triggers a retry.  Default: true.
+	RetryOnEmpty bool
+}
+
+// defaultRetryPolicy returns the default retry policy.
+func defaultRetryPolicy() RetryPolicy {
+	return RetryPolicy{
+		MaxLLMRetries: 3,
+		InitialDelay:  2 * time.Second,
+		MaxDelay:      30 * time.Second,
+		RetryOnEmpty:  true,
+	}
+}
+
 // ─── Default System Prompt ────────────────────────────────────────────────────
 
 const defaultSystemPrompt = `You are GoClaw Orchestrator — the main AI agent managing the GoClaw distributed AI system.
@@ -86,6 +113,7 @@ type Orchestrator struct {
 	executor    *tools.Executor
 	database    *db.DB
 	projectRoot string
+	retry       RetryPolicy
 }
 
 func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrator {
@@ -93,6 +121,7 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat
 		llmClient:   llmClient,
 		database:    database,
 		projectRoot: projectRoot,
+		retry:       defaultRetryPolicy(),
 	}
 	// Inject agent list function to avoid circular dependency
 	o.executor = tools.NewExecutor(projectRoot, o.listAgentsFn)
@@ -101,6 +130,11 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat
 	return o
 }
 
+// SetRetryPolicy overrides the default retry policy.
+func (o *Orchestrator) SetRetryPolicy(p RetryPolicy) {
+	o.retry = p
+}
+
 // GetConfig loads orchestrator config from DB, falls back to defaults.
 func (o *Orchestrator) GetConfig() *OrchestratorConfig {
 	if o.database != nil {
@@ -131,25 +165,188 @@ func (o *Orchestrator) GetConfig() *OrchestratorConfig {
 	}
 }
 
-// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
-func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
-	if maxIter <= 0 {
-		maxIter = 10
+// resolveModel checks if the desired model is available via the LLM API.
+// If not, it tries to fall back to the first available model.
+// Returns the resolved model name and a warning if fallback was used.
+func (o *Orchestrator) resolveModel(ctx context.Context, desired string) (model string, warning string) {
+	ctxShort, cancel := context.WithTimeout(ctx, 8*time.Second)
+	defer cancel()
+
+	models, err := o.llmClient.ListModels(ctxShort)
+	if err != nil || models == nil || len(models.Data) == 0 {
+		// Cannot verify — use desired model as-is
+		log.Printf("[Orchestrator] Cannot fetch model list: %v — using %q as-is", err, desired)
+		return desired, ""
+	}
+
+	// Check if desired model is available
+	for _, m := range models.Data {
+		if m.ID == desired {
+			return desired, "" // found — all good
+		}
+	}
+
+	// Desired model not in list — fall back to first available
+	fallback := models.Data[0].ID
+	warning = fmt.Sprintf("model %q not available — using %q instead", desired, fallback)
+	log.Printf("[Orchestrator] WARNING: %s", warning)
+	return fallback, warning
+}
+
+// ─── LLM call with retry ──────────────────────────────────────────────────────
+
+// llmCallResult holds one attempt's outcome.
+type llmCallResult struct {
+	resp        *llm.ChatResponse
+	usedTools   bool // whether the call was made with tools enabled
+	err         error
+	attemptNum  int
+}
+
+// callLLMWithRetry calls the LLM and retries on error or empty response.
+// It also strips tools on the second attempt if the first fails with tools.
+func (o *Orchestrator) callLLMWithRetry(
+	ctx context.Context,
+	req llm.ChatRequest,
+	model string,
+	onRetry func(attempt int, reason string), // optional event callback (may be nil)
+) llmCallResult {
+	policy := o.retry
+	delay := policy.InitialDelay
+	maxAttempts := policy.MaxLLMRetries + 1
+	hasTools := len(req.Tools) > 0
+
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		// On attempt > 1, always strip tools (avoid repeated tool-format errors)
+		useTools := hasTools && attempt == 1
+		r := req
+		if !useTools {
+			r.Tools = nil
+			r.ToolChoice = ""
+		}
+
+		resp, err := o.llmClient.Chat(ctx, r)
+
+		// ── Hard error (network, auth, etc.) ─────────────────────────
+		if err != nil {
+			reason := fmt.Sprintf("LLM error (attempt %d/%d): %v", attempt, maxAttempts, err)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{err: fmt.Errorf("LLM error after %d attempts (model: %s): %w", maxAttempts, model, err), attemptNum: attempt}
+		}
+
+		// ── Context cancelled ─────────────────────────────────────────
+		if ctx.Err() != nil {
+			return llmCallResult{err: ctx.Err(), attemptNum: attempt}
+		}
+
+		// ── Empty choices ─────────────────────────────────────────────
+		if len(resp.Choices) == 0 {
+			reason := fmt.Sprintf("empty choices (attempt %d/%d)", attempt, maxAttempts)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		content := strings.TrimSpace(resp.Choices[0].Message.Content)
+		finishReason := resp.Choices[0].FinishReason
+
+		// ── Empty content AND no tool calls — retry ───────────────────
+		if policy.RetryOnEmpty &&
+			content == "" &&
+			finishReason != "tool_calls" &&
+			len(resp.Choices[0].Message.ToolCalls) == 0 {
+
+			reason := fmt.Sprintf("empty response content (attempt %d/%d, finish_reason=%q)", attempt, maxAttempts, finishReason)
+			log.Printf("[Orchestrator] %s", reason)
+
+			if attempt < maxAttempts {
+				if onRetry != nil {
+					onRetry(attempt, reason)
+				}
+				o.sleep(ctx, delay)
+				delay = min(delay*2, policy.MaxDelay)
+				continue
+			}
+			// Exhausted retries — return what we have (even if empty)
+			log.Printf("[Orchestrator] All %d attempts exhausted — returning empty response", maxAttempts)
+			return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+		}
+
+		// ── Success ───────────────────────────────────────────────────
+		if attempt > 1 {
+			log.Printf("[Orchestrator] Succeeded on attempt %d/%d", attempt, maxAttempts)
+		}
+		return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt}
+	}
+
+	// Should not be reached
+	return llmCallResult{err: fmt.Errorf("retry loop exited unexpectedly"), attemptNum: maxAttempts}
+}
+
+// sleep waits for d, returning early if ctx is cancelled.
+func (o *Orchestrator) sleep(ctx context.Context, d time.Duration) {
+	select {
+	case <-ctx.Done():
+	case <-time.After(d):
+	}
+}
+
+// min returns the smaller of two durations.
+func min(a, b time.Duration) time.Duration {
+	if a < b {
+		return a
+	}
+	return b
+}
+
+// ─── Core loop (shared by Chat and ChatWithEvents) ────────────────────────────
+
+type loopOptions struct {
+	messages      []Message
+	overrideModel string
+	maxIter       int
+	onToolCall    func(ToolCallStep) // may be nil
+	onRetry       func(attempt int, reason string) // may be nil
+}
+
+func (o *Orchestrator) runLoop(ctx context.Context, opts loopOptions) ChatResult {
+	if opts.maxIter <= 0 {
+		opts.maxIter = 10
 	}
 
 	cfg := o.GetConfig()
 	model := cfg.Model
-	if overrideModel != "" {
-		model = overrideModel
+	if opts.overrideModel != "" {
+		model = opts.overrideModel
 	}
 
-	log.Printf("[Orchestrator] Chat started: model=%s, messages=%d", model, len(messages))
+	// Validate model against LLM API — fall back if unavailable (prevents 401/404)
+	model, modelWarning := o.resolveModel(ctx, model)
+	log.Printf("[Orchestrator] Loop started: model=%s, messages=%d, maxIter=%d, maxRetries=%d",
+		model, len(opts.messages), opts.maxIter, o.retry.MaxLLMRetries)
 
 	// Build conversation
 	conv := []llm.Message{
 		{Role: "system", Content: cfg.SystemPrompt},
 	}
-	for _, m := range messages {
+	for _, m := range opts.messages {
 		conv = append(conv, llm.Message{Role: m.Role, Content: m.Content})
 	}
 
@@ -175,7 +372,7 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 	var lastUsage *llm.Usage
 	var lastModel string
 
-	for iter := 0; iter < maxIter; iter++ {
+	for iter := 0; iter < opts.maxIter; iter++ {
 		req := llm.ChatRequest{
 			Model:       model,
 			Messages:    conv,
@@ -185,28 +382,22 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			ToolChoice:  "auto",
 		}
 
-		resp, err := o.llmClient.Chat(ctx, req)
-		if err != nil {
-			// Fallback: try without tools
-			log.Printf("[Orchestrator] LLM error with tools: %v — retrying without tools", err)
-			req.Tools = nil
-			req.ToolChoice = ""
-			resp2, err2 := o.llmClient.Chat(ctx, req)
-			if err2 != nil {
-				return ChatResult{
-					Success: false,
-					Error:   fmt.Sprintf("LLM error (model: %s): %v", model, err2),
-				}
+		// ── LLM call with retry ────────────────────────────────────
+		callRes := o.callLLMWithRetry(ctx, req, model, opts.onRetry)
+
+		if callRes.err != nil {
+			return ChatResult{
+				Success:      false,
+				ToolCalls:    toolCallSteps,
+				Model:        model,
+				ModelWarning: modelWarning,
+				Error:        callRes.err.Error(),
 			}
-			if len(resp2.Choices) > 0 {
-				finalResponse = resp2.Choices[0].Message.Content
-				lastUsage = resp2.Usage
-				lastModel = resp2.Model
-			}
-			break
 		}
 
+		resp := callRes.resp
 		if len(resp.Choices) == 0 {
+			log.Printf("[Orchestrator] No choices in response — stopping loop at iter %d", iter)
 			break
 		}
 
@@ -217,19 +408,17 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			lastModel = model
 		}
 
-		// Check if LLM wants to call tools
+		// ── Tool calls ─────────────────────────────────────────────
 		if choice.FinishReason == "tool_calls" && len(choice.Message.ToolCalls) > 0 {
 			// Add assistant message with tool calls to conversation
 			conv = append(conv, choice.Message)
 
-			// Execute each tool call
 			for _, tc := range choice.Message.ToolCalls {
 				toolName := tc.Function.Name
 				argsJSON := tc.Function.Arguments
 
 				log.Printf("[Orchestrator] Executing tool: %s args=%s", toolName, argsJSON)
 				start := time.Now()
-
 				result := o.executor.Execute(ctx, toolName, argsJSON)
 
 				step := ToolCallStep{
@@ -238,7 +427,6 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 					DurationMs: time.Since(start).Milliseconds(),
 				}
 
-				// Parse args for display
 				var argsMap any
 				_ = json.Unmarshal([]byte(argsJSON), &argsMap)
 				step.Args = argsMap
@@ -255,7 +443,10 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 
 				toolCallSteps = append(toolCallSteps, step)
 
-				// Add tool result to conversation
+				if opts.onToolCall != nil {
+					opts.onToolCall(step)
+				}
+
 				conv = append(conv, llm.Message{
 					Role:       "tool",
 					Content:    toolResultContent,
@@ -267,20 +458,70 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod
 			continue
 		}
 
-		// LLM finished — extract final response
+		// ── Final response ─────────────────────────────────────────
 		finalResponse = choice.Message.Content
 		break
 	}
 
 	return ChatResult{
-		Success:   true,
-		Response:  finalResponse,
-		ToolCalls: toolCallSteps,
-		Model:     lastModel,
-		Usage:     lastUsage,
+		Success:      true,
+		Response:     finalResponse,
+		ToolCalls:    toolCallSteps,
+		Model:        lastModel,
+		ModelWarning: modelWarning,
+		Usage:        lastUsage,
 	}
 }
 
+// ─── Public API ───────────────────────────────────────────────────────────────
+
+// Chat runs the full orchestration loop: LLM → tool calls → LLM → response.
+func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+	})
+}
+
+// ChatWithEvents runs the full orchestration loop and calls callbacks for each
+// tool execution and each retry attempt.  Used for SSE streaming and DB event logging.
+func (o *Orchestrator) ChatWithEvents(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+	})
+}
+
+// ChatWithEventsAndRetry is the full-featured variant that also reports retry
+// attempts through onRetry so they can be streamed to the client.
+func (o *Orchestrator) ChatWithEventsAndRetry(
+	ctx context.Context,
+	messages []Message,
+	overrideModel string,
+	maxIter int,
+	onToolCall func(ToolCallStep),
+	onRetry func(attempt int, reason string),
+) ChatResult {
+	return o.runLoop(ctx, loopOptions{
+		messages:      messages,
+		overrideModel: overrideModel,
+		maxIter:       maxIter,
+		onToolCall:    onToolCall,
+		onRetry:       onRetry,
+	})
+}
+
+// ─── Helpers ──────────────────────────────────────────────────────────────────
+
 // listAgentsFn is injected into the tool executor to list agents from DB.
 func (o *Orchestrator) listAgentsFn() ([]map[string]any, error) {
 	if o.database == nil {
diff --git a/gateway/internal/tools/executor.go b/gateway/internal/tools/executor.go
index 0e8ab33..2191458 100644
--- a/gateway/internal/tools/executor.go
+++ b/gateway/internal/tools/executor.go
@@ -13,6 +13,7 @@ import (
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"
 
 	"git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db"
@@ -155,15 +156,44 @@ func OrchestratorTools() []ToolDef {
 		{
 			Type: "function",
 			Function: FuncDef{
-				Name:        "delegate_to_agent",
-				Description: "Delegate a task to a specialized agent (Browser Agent, Tool Builder, Agent Compiler).",
+				Name: "delegate_to_agent",
+				Description: "Delegate a task to a specific agent container via A2A protocol. " +
+					"The agent processes the task with its own LLM and tools. " +
+					"Use async=true for fire-and-forget with callback_url, or sync (default) to wait for result.",
 				Parameters: map[string]any{
 					"type": "object",
 					"properties": map[string]any{
-						"agentId": map[string]any{"type": "number", "description": "Agent ID to delegate to"},
-						"message": map[string]any{"type": "string", "description": "Task description for the agent"},
+						"agentId":     map[string]any{"type": "number", "description": "Target agent ID"},
+						"task":        map[string]any{"type": "string", "description": "Task description / prompt for the agent"},
+						"async":       map[string]any{"type": "boolean", "description": "If true, returns task_id immediately; if false (default), waits for result"},
+						"callbackUrl": map[string]any{"type": "string", "description": "URL to POST result when async=true"},
+						"priority":    map[string]any{"type": "number", "description": "Task priority 0-10 (default 5)"},
+						"timeoutSecs": map[string]any{"type": "number", "description": "Max seconds to wait (default 120)"},
 					},
-					"required":             []string{"agentId", "message"},
+					"required":             []string{"agentId", "task"},
+					"additionalProperties": false,
+				},
+			},
+		},
+		{
+			Type: "function",
+			Function: FuncDef{
+				Name: "fanout_agents",
+				Description: "Send the SAME task to MULTIPLE agents IN PARALLEL and collect all results. " +
+					"Useful when you need different specialists to work on the same problem simultaneously. " +
+					"Returns results from all agents as an array.",
+				Parameters: map[string]any{
+					"type": "object",
+					"properties": map[string]any{
+						"agentIds": map[string]any{
+							"type":        "array",
+							"items":       map[string]any{"type": "number"},
+							"description": "List of agent IDs to send the task to (max 10)",
+						},
+						"task":        map[string]any{"type": "string", "description": "Task to send to all agents"},
+						"timeoutSecs": map[string]any{"type": "number", "description": "Max seconds per agent (default 60)"},
+					},
+					"required":             []string{"agentIds", "task"},
 					"additionalProperties": false,
 				},
 			},
@@ -226,6 +256,8 @@ func (e *Executor) Execute(ctx context.Context, toolName string, argsJSON string
 		result, execErr = e.listAgents()
 	case "delegate_to_agent":
 		result, execErr = e.delegateToAgent(ctx, args)
+	case "fanout_agents":
+		result, execErr = e.fanoutAgents(ctx, args)
 	default:
 		return ToolResult{Success: false, Error: fmt.Sprintf("unknown tool: %s", toolName), DurationMs: ms(start)}
 	}
@@ -456,57 +488,100 @@ func (e *Executor) listAgents() (any, error) {
 	return map[string]any{"agents": agents, "count": len(agents)}, nil
 }
 
+// A2ATaskRequest is the standard agent-to-agent task message format (Phase C).
+type A2ATaskRequest struct {
+	TaskID      string `json:"task_id"`
+	FromAgentID int    `json:"from_agent_id"`
+	Task        string `json:"input"`
+	CallbackURL string `json:"callback_url,omitempty"`
+	Priority    int    `json:"priority"`
+	TimeoutSecs int    `json:"timeout_secs"`
+}
+
+// delegateToAgent sends a task to an agent's container via A2A HTTP protocol.
+// Resolves the agent's service address from DB, respects priority/timeout from args.
+// Falls back with a clear message if agent is not deployed/running.
 func (e *Executor) delegateToAgent(ctx context.Context, args map[string]any) (any, error) {
 	agentIDf, _ := args["agentId"].(float64)
 	agentID := int(agentIDf)
+
 	task, _ := args["task"].(string)
 	if task == "" {
 		task, _ = args["message"].(string) // backward compat
 	}
 	if task == "" {
-		return nil, fmt.Errorf("task (or message) is required")
+		return nil, fmt.Errorf("task is required")
 	}
+
 	callbackURL, _ := args["callbackUrl"].(string)
 	async, _ := args["async"].(bool)
 
+	priority := 5
+	if pf, ok := args["priority"].(float64); ok && pf > 0 {
+		priority = int(pf)
+	}
+	timeoutSecs := 120
+	if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 {
+		timeoutSecs = int(tf)
+	}
+
 	// Resolve agent container address from DB
 	if e.database != nil {
 		cfg, err := e.database.GetAgentByID(agentID)
 		if err == nil && cfg != nil && cfg.ServicePort > 0 && cfg.ContainerStatus == "running" {
-			// Agent is deployed — call its container via overlay DNS
-			// Docker Swarm DNS: service name resolves inside overlay network
 			agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort)
-			if async {
-				return e.postAgentTask(ctx, agentURL, agentID, task, callbackURL)
+			req := A2ATaskRequest{
+				TaskID:      fmt.Sprintf("orch-%d-%d", agentID, time.Now().UnixMilli()),
+				FromAgentID: 0, // orchestrator
+				Task:        task,
+				CallbackURL: callbackURL,
+				Priority:    priority,
+				TimeoutSecs: timeoutSecs,
 			}
-			return e.postAgentChat(ctx, agentURL, agentID, task)
+			if async {
+				return e.postA2ATask(ctx, agentURL, req)
+			}
+			return e.postA2AChat(ctx, agentURL, task, timeoutSecs)
+		}
+		if e.database != nil {
+			cfg, _ := e.database.GetAgentByID(agentID)
+			status := "unknown"
+			if cfg != nil {
+				status = cfg.ContainerStatus
+				if status == "" {
+					status = "stopped"
+				}
+			}
+			return map[string]any{
+				"delegated": false,
+				"agentId":   agentID,
+				"status":    status,
+				"note": fmt.Sprintf(
+					"Agent %d container is %q. Deploy it via Web Panel (POST /api/agents/%d/deploy) then retry.",
+					agentID, status, agentID),
+			}, nil
 		}
 	}
 
-	// Fallback: agent not deployed yet — return informational response
 	return map[string]any{
-		"delegated":   false,
-		"agentId":     agentID,
-		"task":        task,
-		"note":        fmt.Sprintf("Agent %d is not running (containerStatus != running). Deploy it first via Web Panel.", agentID),
+		"delegated": false,
+		"agentId":   agentID,
+		"note":      "No database connection — cannot resolve agent address.",
 	}, nil
 }
 
-// postAgentTask POSTs to agent's /task endpoint (async, returns task_id).
-func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgentID int, task, callbackURL string) (any, error) {
-	payload, _ := json.Marshal(map[string]any{
-		"input":         task,
-		"from_agent_id": fromAgentID,
-		"callback_url":  callbackURL,
-	})
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload))
+// postA2ATask POSTs to agent's /task endpoint using A2A protocol (async).
+func (e *Executor) postA2ATask(ctx context.Context, agentURL string, req A2ATaskRequest) (any, error) {
+	payload, _ := json.Marshal(req)
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload))
 	if err != nil {
-		return nil, fmt.Errorf("delegate build request: %w", err)
+		return nil, fmt.Errorf("a2a build request: %w", err)
 	}
-	req.Header.Set("Content-Type", "application/json")
-	resp, err := e.httpClient.Do(req)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("X-GoClaw-From", "orchestrator")
+	resp, err := e.httpClient.Do(httpReq)
 	if err != nil {
-		return nil, fmt.Errorf("delegate HTTP error: %w", err)
+		return nil, fmt.Errorf("a2a task HTTP error: %w", err)
 	}
 	defer resp.Body.Close()
 	body, _ := io.ReadAll(resp.Body)
@@ -515,19 +590,27 @@ func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgent
 	return result, nil
 }
 
-// postAgentChat POSTs to agent's /chat endpoint (sync, waits for response).
-func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, task string) (any, error) {
+// postA2AChat POSTs to agent's /chat endpoint (sync, waits for LLM response).
+func (e *Executor) postA2AChat(ctx context.Context, agentURL string, task string, timeoutSecs int) (any, error) {
 	payload, _ := json.Marshal(map[string]any{
-		"messages": []map[string]string{{"role": "user", "content": task}},
+		"messages":    []map[string]string{{"role": "user", "content": task}},
+		"timeout_secs": timeoutSecs,
 	})
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload))
+	chatCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs)*time.Second)
+	defer cancel()
+
+	httpReq, err := http.NewRequestWithContext(chatCtx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload))
 	if err != nil {
-		return nil, fmt.Errorf("delegate build request: %w", err)
+		return nil, fmt.Errorf("a2a chat request: %w", err)
 	}
-	req.Header.Set("Content-Type", "application/json")
-	resp, err := e.httpClient.Do(req)
+	httpReq.Header.Set("Content-Type", "application/json")
+	httpReq.Header.Set("X-GoClaw-From", "orchestrator")
+
+	// Use a client with longer timeout for sync chats
+	client := &http.Client{Timeout: time.Duration(timeoutSecs+10) * time.Second}
+	resp, err := client.Do(httpReq)
 	if err != nil {
-		return nil, fmt.Errorf("delegate HTTP error: %w", err)
+		return nil, fmt.Errorf("a2a chat HTTP error: %w", err)
 	}
 	defer resp.Body.Close()
 	body, _ := io.ReadAll(resp.Body)
@@ -536,6 +619,107 @@ func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, ta
 	return result, nil
 }
 
+// fanoutAgents sends the same task to multiple agents in parallel and collects results.
+func (e *Executor) fanoutAgents(ctx context.Context, args map[string]any) (any, error) {
+	task, _ := args["task"].(string)
+	if task == "" {
+		return nil, fmt.Errorf("task is required")
+	}
+
+	timeoutSecs := 60
+	if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 {
+		timeoutSecs = int(tf)
+	}
+
+	// Parse agentIds array
+	rawIDs, _ := args["agentIds"].([]any)
+	if len(rawIDs) == 0 {
+		return nil, fmt.Errorf("agentIds must be a non-empty array")
+	}
+	if len(rawIDs) > 10 {
+		rawIDs = rawIDs[:10] // cap at 10
+	}
+
+	type agentResult struct {
+		AgentID   int    `json:"agentId"`
+		AgentName string `json:"agentName,omitempty"`
+		Success   bool   `json:"success"`
+		Result    any    `json:"result,omitempty"`
+		Error     string `json:"error,omitempty"`
+		Delegated bool   `json:"delegated"`
+		DurationMs int64 `json:"durationMs"`
+	}
+
+	results := make([]agentResult, len(rawIDs))
+	var wg sync.WaitGroup
+
+	fanCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs+5)*time.Second)
+	defer cancel()
+
+	for i, rawID := range rawIDs {
+		idf, _ := rawID.(float64)
+		agentID := int(idf)
+		idx := i
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			start := time.Now()
+			ar := agentResult{AgentID: agentID}
+
+			if e.database == nil {
+				ar.Error = "no database connection"
+				results[idx] = ar
+				return
+			}
+
+			cfg, err := e.database.GetAgentByID(agentID)
+			if err != nil || cfg == nil {
+				ar.Error = fmt.Sprintf("agent %d not found", agentID)
+				results[idx] = ar
+				return
+			}
+			ar.AgentName = cfg.Name
+
+			if cfg.ServicePort == 0 || cfg.ContainerStatus != "running" {
+				ar.Delegated = false
+				ar.Error = fmt.Sprintf("agent %q is %q — not running", cfg.Name, cfg.ContainerStatus)
+				results[idx] = ar
+				return
+			}
+
+			agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort)
+			res, chatErr := e.postA2AChat(fanCtx, agentURL, task, timeoutSecs)
+			ar.DurationMs = ms(start)
+			if chatErr != nil {
+				ar.Success = false
+				ar.Error = chatErr.Error()
+			} else {
+				ar.Success = true
+				ar.Delegated = true
+				ar.Result = res
+			}
+			results[idx] = ar
+		}()
+	}
+
+	wg.Wait()
+
+	succeeded := 0
+	for _, r := range results {
+		if r.Success {
+			succeeded++
+		}
+	}
+
+	return map[string]any{
+		"task":      task,
+		"total":     len(results),
+		"succeeded": succeeded,
+		"failed":    len(results) - succeeded,
+		"results":   results,
+	}, nil
+}
+
 // ─── Helpers ──────────────────────────────────────────────────────────────────
 
 func (e *Executor) resolvePath(path string) string {