From f8e0ca7d5dd133a5be6bb32d8067d63f250a7004 Mon Sep 17 00:00:00 2001 From: bboxwtf Date: Sun, 19 Apr 2026 11:40:39 +0000 Subject: [PATCH] feat(gateway): restore Phase C full agent lifecycle API - Restored Phase C gateway code (handlers, main.go, docker client, db) - Added routes: GET /api/agents/running, POST /api/agents and DELETE /api/agents/{id} (CRUD), POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop, POST /api/agents/{id}/restart, POST /api/agents/{id}/scale - Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id} even when serviceName is empty in DB - Fixed DeployAgent: handle 409 conflict by removing existing container and retrying once (idempotent deploy) - Added swarm_manager.go: background SwarmManager for dead-letter recovery - Added AGENT_NETWORK and AGENT_DB_URL config options - Updated .gitignore to exclude gateway binaries - All agents use standalone docker run (not Swarm) on bridge network Verified on prod: deploy/stop/restart cycle works correctly, /api/agents/running returns live running agents with containerStatus --- .gitignore | 5 + gateway/cmd/agent-worker/main.go | 90 +- gateway/cmd/agent-worker/main_test.go | 120 +- gateway/cmd/agent/main.go | 270 +++ gateway/cmd/gateway/main.go | 68 +- gateway/config/config.go | 22 + gateway/go.mod | 10 +- gateway/go.sum | 10 +- gateway/internal/api/handlers.go | 1537 +++++++++++++++++ gateway/internal/api/handlers_agents_test.go | 334 ++++ gateway/internal/api/swarm_manager.go | 196 +++ gateway/internal/db/db.go | 756 ++++++-- gateway/internal/docker/client.go | 670 ++++++- gateway/internal/llm/client.go | 89 +- gateway/internal/orchestrator/orchestrator.go | 329 +++- gateway/internal/tools/executor.go | 256 ++- 16 files changed, 4505 insertions(+), 257 deletions(-) create mode 100644 gateway/cmd/agent/main.go create mode 100644 gateway/internal/api/handlers_agents_test.go create mode 100644 gateway/internal/api/swarm_manager.go diff --git a/.gitignore b/.gitignore index 6f04d85..00c980f 100644 --- 
a/.gitignore +++ b/.gitignore @@ -121,3 +121,8 @@ deploy-secrets .kilo/ .manus/ AGENTS.md + +# Gateway binaries +gateway/gateway +gateway/gateway-new +gateway/agent-worker diff --git a/gateway/cmd/agent-worker/main.go b/gateway/cmd/agent-worker/main.go index 2a72a0c..d63d260 100644 --- a/gateway/cmd/agent-worker/main.go +++ b/gateway/cmd/agent-worker/main.go @@ -127,30 +127,50 @@ type AgentWorker struct { // Recent tasks ring buffer (для GET /tasks) recentMu sync.Mutex recentKeys []string + + // Rate-limiting semaphore — limits concurrent LLM calls. + // Filled with MAX_CONCURRENT_TASKS tokens; each worker acquires one before + // calling runChat() and releases it when done. + rateSem chan struct{} + // maxConcurrent is the configured concurrency limit (exported for /health). + maxConcurrent int } const ( - taskQueueDepth = 100 - maxRecentTasks = 50 - defaultMaxIter = 8 - defaultTimeout = 120 - workerGoroutines = 4 // параллельных воркеров на агента + taskQueueDepth = 100 + maxRecentTasks = 50 + defaultMaxIter = 8 + defaultTimeout = 120 + workerGoroutines = 4 // параллельных воркеров на агента + defaultMaxConcurrent = 2 // default simultaneous LLM calls per agent ) -func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client) (*AgentWorker, error) { +func newAgentWorker(agentID int, database *db.DB, llmClient *llm.Client, maxConcurrent int) (*AgentWorker, error) { cfg, err := database.GetAgentByID(agentID) if err != nil { return nil, fmt.Errorf("agent %d not found in DB: %w", agentID, err) } log.Printf("[AgentWorker] Loaded config: id=%d name=%q model=%s", cfg.ID, cfg.Name, cfg.Model) + if maxConcurrent <= 0 { + maxConcurrent = defaultMaxConcurrent + } + + // Fill the semaphore with tokens equal to the concurrency limit. 
+ sem := make(chan struct{}, maxConcurrent) + for i := 0; i < maxConcurrent; i++ { + sem <- struct{}{} + } + w := &AgentWorker{ - agentID: agentID, - cfg: cfg, - llm: llmClient, - database: database, - taskQueue: make(chan *Task, taskQueueDepth), - tasks: make(map[string]*Task), + agentID: agentID, + cfg: cfg, + llm: llmClient, + database: database, + taskQueue: make(chan *Task, taskQueueDepth), + tasks: make(map[string]*Task), + rateSem: sem, + maxConcurrent: maxConcurrent, } // Tool executor: агент использует подмножество инструментов из allowedTools w.executor = tools.NewExecutor("/app", func() ([]map[string]any, error) { @@ -234,7 +254,23 @@ func (w *AgentWorker) EnqueueTask(req TaskRequest) *Task { } // processTask выполняет задачу через LLM loop и обновляет её статус. +// Acquires a rate-limiting token before invoking the LLM to cap concurrent +// calls at w.maxConcurrent. func (w *AgentWorker) processTask(ctx context.Context, task *Task) { + // ── Rate limiting: acquire a token ─────────────────────────────────────── + // If no token is available, block until one frees up or ctx is cancelled. 
+ select { + case <-ctx.Done(): + w.tasksMu.Lock() + task.Status = TaskCancelled + task.Error = "context cancelled before execution" + w.tasksMu.Unlock() + return + case <-w.rateSem: + // acquired + } + defer func() { w.rateSem <- struct{}{} }() // release token + now := time.Now() w.tasksMu.Lock() task.Status = TaskRunning @@ -478,12 +514,17 @@ func (w *AgentWorker) postCallback(task *Task) { // ─── HTTP Handlers ──────────────────────────────────────────────────────────── func (w *AgentWorker) handleHealth(rw http.ResponseWriter, r *http.Request) { + activeSlots := w.maxConcurrent - len(w.rateSem) + rw.Header().Set("Content-Type", "application/json") json.NewEncoder(rw).Encode(map[string]any{ - "status": "ok", - "agentId": w.agentID, - "name": w.cfg.Name, - "model": w.cfg.Model, - "queueLen": len(w.taskQueue), + "status": "ok", + "agentId": w.agentID, + "name": w.cfg.Name, + "model": w.cfg.Model, + "queueLen": len(w.taskQueue), + "activeTasks": activeSlots, + "maxConcurrent": w.maxConcurrent, + "rateLimitFree": len(w.rateSem), }) } @@ -643,7 +684,16 @@ func main() { log.Fatal("[AgentWorker] DATABASE_URL env var is required") } - log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s", agentID, port, llmBaseURL) + // MAX_CONCURRENT_TASKS controls the rate-limiting semaphore (Phase C). 
+ maxConcurrent := defaultMaxConcurrent + if mcStr := os.Getenv("MAX_CONCURRENT_TASKS"); mcStr != "" { + if mc, err := strconv.Atoi(mcStr); err == nil && mc > 0 { + maxConcurrent = mc + } + } + + log.Printf("[AgentWorker] Starting: AGENT_ID=%d PORT=%s LLM=%s MAX_CONCURRENT=%d", + agentID, port, llmBaseURL, maxConcurrent) // ── DB ─────────────────────────────────────────────────────────────────── database, err := db.Connect(dbURL) @@ -656,7 +706,7 @@ func main() { llmClient := llm.NewClient(llmBaseURL, llmAPIKey) // ── Agent Worker ───────────────────────────────────────────────────────── - worker, err := newAgentWorker(agentID, database, llmClient) + worker, err := newAgentWorker(agentID, database, llmClient, maxConcurrent) if err != nil { log.Fatalf("[AgentWorker] init failed: %v", err) } @@ -724,4 +774,4 @@ func getEnvFirst(keys ...string) string { } } return "" -} \ No newline at end of file +} diff --git a/gateway/cmd/agent-worker/main_test.go b/gateway/cmd/agent-worker/main_test.go index 8245e87..3b917de 100644 --- a/gateway/cmd/agent-worker/main_test.go +++ b/gateway/cmd/agent-worker/main_test.go @@ -130,11 +130,18 @@ func TestEnqueueTask_DefaultTimeout(t *testing.T) { // ─── HTTP Handlers ──────────────────────────────────────────────────────────── func makeTestWorker() *AgentWorker { + mc := defaultMaxConcurrent + sem := make(chan struct{}, mc) + for i := 0; i < mc; i++ { + sem <- struct{}{} + } return &AgentWorker{ - agentID: 42, - cfg: mockAgentConfig(), - taskQueue: make(chan *Task, taskQueueDepth), - tasks: make(map[string]*Task), + agentID: 42, + cfg: mockAgentConfig(), + taskQueue: make(chan *Task, taskQueueDepth), + tasks: make(map[string]*Task), + rateSem: sem, + maxConcurrent: mc, } } @@ -435,4 +442,107 @@ func TestWorkerProcessesTask_WithMockLLM(t *testing.T) { if finalStatus != TaskDone { t.Errorf("expected task done, got %s", finalStatus) } -} \ No newline at end of file +} + +// ─── Phase C: Rate-limiting tests 
───────────────────────────────────────────── + +// TestRateLimiting_TokensInitialized verifies that the semaphore is filled with +// maxConcurrent tokens on worker creation. +func TestRateLimiting_TokensInitialized(t *testing.T) { + mc := 3 + sem := make(chan struct{}, mc) + for i := 0; i < mc; i++ { + sem <- struct{}{} + } + w := &AgentWorker{ + agentID: 42, + cfg: mockAgentConfig(), + taskQueue: make(chan *Task, taskQueueDepth), + tasks: make(map[string]*Task), + rateSem: sem, + maxConcurrent: mc, + } + if len(w.rateSem) != mc { + t.Errorf("expected %d tokens in semaphore, got %d", mc, len(w.rateSem)) + } + if cap(w.rateSem) != mc { + t.Errorf("expected semaphore capacity=%d, got %d", mc, cap(w.rateSem)) + } +} + +// TestRateLimiting_TokenAcquireRelease verifies that tokens can be acquired and +// released correctly (simulating what processTask does). +func TestRateLimiting_TokenAcquireRelease(t *testing.T) { + mc := 2 + sem := make(chan struct{}, mc) + for i := 0; i < mc; i++ { + sem <- struct{}{} + } + w := &AgentWorker{ + agentID: 42, + cfg: mockAgentConfig(), + taskQueue: make(chan *Task, taskQueueDepth), + tasks: make(map[string]*Task), + rateSem: sem, + maxConcurrent: mc, + } + + // Acquire both tokens + <-w.rateSem + <-w.rateSem + + if len(w.rateSem) != 0 { + t.Errorf("expected 0 free tokens after acquiring all, got %d", len(w.rateSem)) + } + + // Release one token + w.rateSem <- struct{}{} + if len(w.rateSem) != 1 { + t.Errorf("expected 1 free token after release, got %d", len(w.rateSem)) + } + + // Release second token + w.rateSem <- struct{}{} + if len(w.rateSem) != mc { + t.Errorf("expected %d free tokens after full release, got %d", mc, len(w.rateSem)) + } +} + +// TestRateLimiting_HealthShowsActiveTasks verifies the /health endpoint reports +// active task count and rate-limit info. 
+func TestRateLimiting_HealthShowsActiveTasks(t *testing.T) { + mc := 3 + sem := make(chan struct{}, mc) + for i := 0; i < mc; i++ { + sem <- struct{}{} + } + w := &AgentWorker{ + agentID: 42, + cfg: mockAgentConfig(), + taskQueue: make(chan *Task, taskQueueDepth), + tasks: make(map[string]*Task), + rateSem: sem, + maxConcurrent: mc, + } + // Simulate 1 active task (consume 1 token) + <-w.rateSem + + rr := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/health", nil) + w.handleHealth(rr, req) + + var body map[string]any + if err := json.NewDecoder(rr.Body).Decode(&body); err != nil { + t.Fatalf("invalid JSON: %v", err) + } + + if int(body["maxConcurrent"].(float64)) != mc { + t.Errorf("expected maxConcurrent=%d, got %v", mc, body["maxConcurrent"]) + } + if int(body["rateLimitFree"].(float64)) != mc-1 { + t.Errorf("expected rateLimitFree=%d, got %v", mc-1, body["rateLimitFree"]) + } + if int(body["activeTasks"].(float64)) != 1 { + t.Errorf("expected activeTasks=1, got %v", body["activeTasks"]) + } +} diff --git a/gateway/cmd/agent/main.go b/gateway/cmd/agent/main.go new file mode 100644 index 0000000..58ab376 --- /dev/null +++ b/gateway/cmd/agent/main.go @@ -0,0 +1,270 @@ +// GoClaw Agent Server — autonomous agent microservice +// +// Each agent runs as an independent container in the Docker Swarm overlay +// network. It exposes an HTTP API that the GoClaw Orchestrator can reach +// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080). 
+// +// The agent: +// - Receives task requests from the orchestrator +// - Calls the LLM via the centrally-managed GoClaw Gateway +// - Reads/writes shared state in the MySQL database +// - Reports its last-activity time so the SwarmManager can auto-stop it +// - Gracefully shuts down after IdleTimeout with no requests +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "os/signal" + "strconv" + "strings" + "syscall" + "time" +) + +// ─── Config ────────────────────────────────────────────────────────────────── + +type AgentConfig struct { + AgentID string + Port string + GatewayURL string + LLMURL string + LLMAPIKey string + DatabaseURL string + IdleTimeoutMinutes int +} + +func loadConfig() AgentConfig { + idleMin := 15 + if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" { + if n, err := strconv.Atoi(v); err == nil { + idleMin = n + } + } + port := os.Getenv("AGENT_PORT") + if port == "" { + port = "8080" + } + return AgentConfig{ + AgentID: getEnv("AGENT_ID", "unnamed-agent"), + Port: port, + GatewayURL: getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"), + LLMURL: getEnv("LLM_BASE_URL", "https://ollama.com/v1"), + LLMAPIKey: os.Getenv("LLM_API_KEY"), + DatabaseURL: os.Getenv("DATABASE_URL"), + IdleTimeoutMinutes: idleMin, + } +} + +func getEnv(key, fallback string) string { + if v := os.Getenv(key); v != "" { + return v + } + return fallback +} + +// ─── State ─────────────────────────────────────────────────────────────────── + +type Agent struct { + cfg AgentConfig + lastActivity time.Time + httpClient *http.Client +} + +func NewAgent(cfg AgentConfig) *Agent { + return &Agent{ + cfg: cfg, + lastActivity: time.Now(), + httpClient: &http.Client{Timeout: 120 * time.Second}, + } +} + +func (a *Agent) touch() { + a.lastActivity = time.Now() +} + +// ─── HTTP handlers ──────────────────────────────────────────────────────────── + +// GET /health — liveness probe +func (a *Agent) handleHealth(w 
http.ResponseWriter, r *http.Request) { + respond(w, 200, map[string]any{ + "ok": true, + "agentId": a.cfg.AgentID, + "lastActivity": a.lastActivity.Format(time.RFC3339), + "idleMinutes": time.Since(a.lastActivity).Minutes(), + }) +} + +// POST /task — receive a task from the orchestrator +// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 } +func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) { + a.touch() + var body struct { + SessionID string `json:"sessionId"` + Messages json.RawMessage `json:"messages"` + Model string `json:"model"` + MaxIter int `json:"maxIter"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + respondError(w, 400, "invalid request: "+err.Error()) + return + } + // Forward the task to the GoClaw Gateway orchestrator + gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat" + reqBody, _ := json.Marshal(map[string]any{ + "messages": body.Messages, + "model": body.Model, + "maxIter": body.MaxIter, + }) + + req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody))) + if err != nil { + respondError(w, 500, "request build error: "+err.Error()) + return + } + req.Header.Set("Content-Type", "application/json") + + resp, err := a.httpClient.Do(req) + if err != nil { + respondError(w, 502, "gateway error: "+err.Error()) + return + } + defer resp.Body.Close() + + var result map[string]any + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + respondError(w, 502, "gateway response error: "+err.Error()) + return + } + + a.touch() + respond(w, 200, map[string]any{ + "ok": true, + "agentId": a.cfg.AgentID, + "sessionId": body.SessionID, + "result": result, + }) +} + +// GET /info — agent metadata +func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) { + hostname, _ := os.Hostname() + respond(w, 200, map[string]any{ + "agentId": a.cfg.AgentID, + "hostname": hostname, + "gatewayUrl": a.cfg.GatewayURL, + 
"idleTimeout": a.cfg.IdleTimeoutMinutes, + "lastActivity": a.lastActivity.Format(time.RFC3339), + "idleMinutes": time.Since(a.lastActivity).Minutes(), + }) +} + +// ─── Idle watchdog ──────────────────────────────────────────────────────────── + +func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) { + threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for range ticker.C { + idle := time.Since(a.lastActivity) + if idle >= threshold { + log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway", + a.cfg.AgentID, idle.Minutes()) + a.selfStop() + cancel() + return + } + } +} + +// selfStop asks the GoClaw Gateway to scale this service to 0. +func (a *Agent) selfStop() { + url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID) + req, err := http.NewRequest("POST", url, nil) + if err != nil { + log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err) + return + } + resp, err := a.httpClient.Do(req) + if err != nil { + log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err) + return + } + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body)) +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +func respond(w http.ResponseWriter, status int, data any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + json.NewEncoder(w).Encode(data) +} + +func respondError(w http.ResponseWriter, status int, msg string) { + respond(w, status, map[string]any{"error": msg}) +} + +// ─── Main ───────────────────────────────────────────────────────────────────── + +func main() { + log.SetFlags(log.LstdFlags | log.Lshortfile) + + cfg := loadConfig() + agent := NewAgent(cfg) + + log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)", + cfg.AgentID, 
cfg.Port, cfg.IdleTimeoutMinutes) + log.Printf("[Agent] Gateway: %s", cfg.GatewayURL) + + // ── HTTP server ────────────────────────────────────────────────────────── + mux := http.NewServeMux() + mux.HandleFunc("GET /health", agent.handleHealth) + mux.HandleFunc("POST /task", agent.handleTask) + mux.HandleFunc("GET /info", agent.handleInfo) + + srv := &http.Server{ + Addr: ":" + cfg.Port, + Handler: mux, + ReadTimeout: 30 * time.Second, + WriteTimeout: 150 * time.Second, + IdleTimeout: 120 * time.Second, + } + + ctx, cancel := context.WithCancel(context.Background()) + + // ── Idle watchdog ──────────────────────────────────────────────────────── + go agent.runIdleWatchdog(cancel) + + // ── Graceful shutdown ──────────────────────────────────────────────────── + quit := make(chan os.Signal, 1) + signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM) + + go func() { + log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port) + if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err) + } + }() + + select { + case <-quit: + log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID) + case <-ctx.Done(): + log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID) + } + + shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer shutCancel() + if err := srv.Shutdown(shutCtx); err != nil { + log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err) + } + log.Printf("[Agent %s] Stopped.", cfg.AgentID) +} diff --git a/gateway/cmd/gateway/main.go b/gateway/cmd/gateway/main.go index 18d3c1b..145d052 100644 --- a/gateway/cmd/gateway/main.go +++ b/gateway/cmd/gateway/main.go @@ -47,10 +47,35 @@ func main() { // ── Orchestrator ───────────────────────────────────────────────────────── orch := orchestrator.New(llmClient, database, cfg.ProjectRoot) + // Apply retry policy from config + orch.SetRetryPolicy(orchestrator.RetryPolicy{ + 
MaxLLMRetries: cfg.MaxLLMRetries, + InitialDelay: time.Duration(cfg.RetryDelaySecs) * time.Second, + MaxDelay: 30 * time.Second, + RetryOnEmpty: true, + }) + log.Printf("[Gateway] LLM retry policy: maxRetries=%d, initialDelay=%ds", cfg.MaxLLMRetries, cfg.RetryDelaySecs) // ── HTTP Handlers ──────────────────────────────────────────────────────── h := api.NewHandler(cfg, llmClient, orch, database) + // ── Sync Swarm tokens to DB on startup ────────────────────────────────── + go func() { + time.Sleep(3 * time.Second) // wait for Docker daemon readiness + if database != nil { + dockerCl := h.GetDockerClient() + if tokens, err := dockerCl.GetJoinTokens(); err == nil { + addr := dockerCl.GetManagerAddr() + database.UpsertSwarmTokens( + tokens.JoinTokens.Worker, + tokens.JoinTokens.Manager, + addr, + ) + log.Printf("[Gateway] Swarm tokens synced to DB. Manager addr: %s", addr) + } + } + }() + // ── Router ─────────────────────────────────────────────────────────────── r := chi.NewRouter() @@ -76,11 +101,19 @@ func main() { r.Route("/api", func(r chi.Router) { // Orchestrator r.Post("/orchestrator/chat", h.OrchestratorChat) + r.Post("/orchestrator/stream", h.OrchestratorStream) r.Get("/orchestrator/config", h.OrchestratorConfig) - // Agents + // Agents — CRUD + Container lifecycle (Phase A-C) r.Get("/agents", h.ListAgents) + r.Get("/agents/running", h.ListRunningAgents) // Phase C: service discovery + r.Post("/agents", h.CreateAgent) r.Get("/agents/{id}", h.GetAgent) + r.Delete("/agents/{id}", h.DeleteAgent) + r.Post("/agents/{id}/deploy", h.DeployAgent) + r.Post("/agents/{id}/stop", h.StopAgent) + r.Post("/agents/{id}/scale", h.ScaleAgent) + r.Post("/agents/{id}/restart", h.RestartAgent) // Phase C: dead-letter restart // Models r.Get("/models", h.ListModels) @@ -92,8 +125,41 @@ func main() { // Nodes / Docker Swarm monitoring r.Get("/nodes", h.ListNodes) r.Get("/nodes/stats", h.NodeStats) + + // Provider config reload (called by Node.js after provider change) + 
r.Post("/providers/reload", h.ProvidersReload) + + // Persistent chat sessions (background processing, DB-backed) + r.Post("/chat/session", h.StartChatSession) + r.Get("/chat/sessions", h.ListChatSessions) + r.Get("/chat/session/{id}", h.GetChatSession) + r.Get("/chat/session/{id}/events", h.GetChatEvents) + + // ── Real Docker Swarm Management ───────────────────────────────────── + r.Get("/swarm/info", h.SwarmInfo) + r.Get("/swarm/nodes", h.SwarmNodes) + r.Post("/swarm/nodes/{id}/label", h.SwarmAddNodeLabel) + r.Post("/swarm/nodes/{id}/availability", h.SwarmSetNodeAvailability) + r.Get("/swarm/services", h.SwarmServices) + r.Post("/swarm/services/create", h.SwarmCreateService) + r.Delete("/swarm/services/{id}", h.SwarmRemoveService) + r.Get("/swarm/services/{id}/tasks", h.SwarmServiceTasks) + r.Post("/swarm/services/{id}/scale", h.SwarmScaleService) + r.Get("/swarm/join-token", h.SwarmJoinToken) + r.Post("/swarm/join-node", h.SwarmJoinNodeViaSSH) + r.Post("/swarm/ssh-test", h.SwarmSSHTest) + r.Post("/swarm/shell", h.SwarmShell) + r.Get("/swarm/agents", h.SwarmListAgents) + r.Post("/swarm/agents/{name}/start", h.SwarmStartAgent) + r.Post("/swarm/agents/{name}/stop", h.SwarmStopAgent) }) + // ── Swarm Manager: auto-stop idle agents after 15 min ──────────────────── + swarmMgr := api.NewSwarmManager(h, 60*time.Second) + managerCtx, managerCancel := context.WithCancel(context.Background()) + go swarmMgr.Start(managerCtx) + defer managerCancel() + // ── Start Server ───────────────────────────────────────────────────────── srv := &http.Server{ Addr: ":" + cfg.Port, diff --git a/gateway/config/config.go b/gateway/config/config.go index 7945659..7961f48 100644 --- a/gateway/config/config.go +++ b/gateway/config/config.go @@ -46,6 +46,22 @@ type Config struct { DefaultModel string MaxToolIterations int RequestTimeoutSecs int + + // Docker overlay network for agent containers + // AGENT_NETWORK — name of the Docker overlay/bridge network agents are attached to. 
+ // Default: goclaw-agents (a dedicated overlay network) + AgentNetwork string + + // AGENT_DB_URL — DATABASE_URL passed to agent containers. + // Useful when agents run on an overlay network and the DB hostname differs. + // Falls back to DatabaseURL if not set. + AgentDBURL string + + // LLM retry policy + // GATEWAY_MAX_LLM_RETRIES — additional attempts after a failure/empty response (default 3). + MaxLLMRetries int + // GATEWAY_RETRY_DELAY_SECS — initial delay before first retry in seconds (default 2). + RetryDelaySecs int } func Load() *Config { @@ -55,6 +71,8 @@ func Load() *Config { maxIter, _ := strconv.Atoi(getEnv("GATEWAY_MAX_TOOL_ITERATIONS", "10")) timeout, _ := strconv.Atoi(getEnv("GATEWAY_REQUEST_TIMEOUT_SECS", "120")) + maxLLMRetries, _ := strconv.Atoi(getEnv("GATEWAY_MAX_LLM_RETRIES", "3")) + retryDelaySecs, _ := strconv.Atoi(getEnv("GATEWAY_RETRY_DELAY_SECS", "2")) // Resolve LLM base URL — priority: LLM_BASE_URL > OLLAMA_BASE_URL > default cloud rawLLMURL := getEnvFirst( @@ -82,6 +100,10 @@ func Load() *Config { DefaultModel: getEnv("DEFAULT_MODEL", "qwen2.5:7b"), MaxToolIterations: maxIter, RequestTimeoutSecs: timeout, + MaxLLMRetries: maxLLMRetries, + RetryDelaySecs: retryDelaySecs, + AgentNetwork: getEnv("AGENT_NETWORK", "goclaw-agents"), + AgentDBURL: getEnv("AGENT_DB_URL", ""), } if cfg.LLMAPIKey == "" { diff --git a/gateway/go.mod b/gateway/go.mod index af39820..c9c1135 100644 --- a/gateway/go.mod +++ b/gateway/go.mod @@ -3,11 +3,15 @@ module git.softuniq.eu/UniqAI/GoClaw/gateway go 1.23.4 require ( - filippo.io/edwards25519 v1.1.0 // indirect github.com/go-chi/chi/v5 v5.2.1 github.com/go-chi/cors v1.2.1 - github.com/go-sql-driver/mysql v1.8.1 // indirect + github.com/go-sql-driver/mysql v1.8.1 github.com/google/uuid v1.6.0 - github.com/jmoiron/sqlx v1.4.0 // indirect github.com/joho/godotenv v1.5.1 + golang.org/x/crypto v0.37.0 +) + +require ( + filippo.io/edwards25519 v1.1.0 // indirect + golang.org/x/sys v0.32.0 // indirect ) diff --git 
a/gateway/go.sum b/gateway/go.sum index 807180e..6962f72 100644 --- a/gateway/go.sum +++ b/gateway/go.sum @@ -8,9 +8,11 @@ github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpv github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= -github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= -github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= +golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= +golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= diff --git a/gateway/internal/api/handlers.go b/gateway/internal/api/handlers.go index 2aa64bc..531bf98 100644 --- a/gateway/internal/api/handlers.go +++ b/gateway/internal/api/handlers.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "strconv" + "strings" "time" "git.softuniq.eu/UniqAI/GoClaw/gateway/config" @@ -16,6 +17,7 @@ import ( "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/llm" "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/orchestrator" "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/tools" + "golang.org/x/crypto/ssh" ) // Handler holds all 
dependencies for HTTP handlers. @@ -37,6 +39,11 @@ func NewHandler(cfg *config.Config, llmClient *llm.Client, orch *orchestrator.Or } } +// GetDockerClient exposes the docker client for use in main.go startup routines. +func (h *Handler) GetDockerClient() *dockerclient.DockerClient { + return h.docker +} + // ─── Health ─────────────────────────────────────────────────────────────────── // GET /health @@ -99,6 +106,277 @@ func (h *Handler) OrchestratorConfig(w http.ResponseWriter, r *http.Request) { }) } +// ─── SSE Stream ─────────────────────────────────────────────────────────────── + +// SSE event types +const ( + sseEventToolCall = "tool_call" + sseEventDelta = "delta" + sseEventDone = "done" + sseEventError = "error" + sseEventThinking = "thinking" +) + +// streamEvent is a single SSE event sent to the client. +type streamEvent struct { + Type string `json:"type"` + // For delta events + Content string `json:"content,omitempty"` + // For tool_call events + Tool string `json:"tool,omitempty"` + Args any `json:"args,omitempty"` + Result any `json:"result,omitempty"` + Success *bool `json:"success,omitempty"` + DurationMs *int64 `json:"durationMs,omitempty"` + // For done events + Model string `json:"model,omitempty"` + ModelWarning string `json:"modelWarning,omitempty"` + Usage *llm.Usage `json:"usage,omitempty"` + // For error events + Error string `json:"error,omitempty"` +} + +// writeSSE writes a single SSE event to the response writer and flushes. +func writeSSE(w http.ResponseWriter, flusher http.Flusher, event streamEvent) { + data, err := json.Marshal(event) + if err != nil { + return + } + fmt.Fprintf(w, "data: %s\n\n", data) + flusher.Flush() +} + +// POST /api/orchestrator/stream +// SSE endpoint: streams tool-call events and LLM delta tokens in real time. 
// OrchestratorStream handles POST /api/orchestrator/stream.
//
// It decodes a chat request ({messages, model, maxIter}), runs the
// orchestrator in a background goroutine and relays progress to the client as
// Server-Sent Events: one "thinking" event, one tool-call event per executed
// tool step, then either an error event or the final response split into
// small delta events followed by a done event. Every terminal path that
// reaches the end of the stream also writes a literal "data: [DONE]" frame.
//
// Validation failures are reported as ordinary JSON errors because they
// happen before the SSE headers are written. When a DB handle is configured,
// a metric row and a history row are saved in a fire-and-forget goroutine
// after the stream has been written.
func (h *Handler) OrchestratorStream(w http.ResponseWriter, r *http.Request) {
	// SSE needs incremental flushing; bail out if the ResponseWriter cannot do it.
	flusher, ok := w.(http.Flusher)
	if !ok {
		respondError(w, http.StatusInternalServerError, "streaming not supported")
		return
	}

	var req struct {
		Messages []orchestrator.Message `json:"messages"`
		Model    string                 `json:"model,omitempty"`
		MaxIter  int                    `json:"maxIter,omitempty"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		respondError(w, http.StatusBadRequest, "invalid request body: "+err.Error())
		return
	}
	if len(req.Messages) == 0 {
		respondError(w, http.StatusBadRequest, "messages array is required")
		return
	}

	// Set SSE headers
	// From here on the status line is committed (200), so later failures can
	// only be reported in-band as SSE events.
	w.Header().Set("Content-Type", "text/event-stream; charset=utf-8")
	w.Header().Set("Cache-Control", "no-cache")
	w.Header().Set("Connection", "keep-alive")
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.WriteHeader(http.StatusOK)
	flusher.Flush()

	log.Printf("[API] POST /api/orchestrator/stream — messages=%d model=%q", len(req.Messages), req.Model)

	// Extract the last user message for history/metrics storage
	userMessage := ""
	for i := len(req.Messages) - 1; i >= 0; i-- {
		if req.Messages[i].Role == "user" {
			userMessage = req.Messages[i].Content
			break
		}
	}

	// Determine orchestrator agent ID (look for isOrchestrator=1 in DB)
	orchAgentID := 1 // fallback to agent ID 1
	if h.db != nil {
		if cfg, err := h.db.GetOrchestratorConfig(); err == nil && cfg != nil {
			orchAgentID = cfg.ID
		}
	}

	startTime := time.Now()

	// The timeout context is derived from the request, so it is cancelled
	// both on deadline and when the request context ends.
	ctx, cancel := context.WithTimeout(r.Context(), time.Duration(h.cfg.RequestTimeoutSecs)*time.Second)
	defer cancel()

	// Run orchestration in a goroutine, streaming events via channel
	type toolEvent struct {
		step orchestrator.ToolCallStep
	}
	toolCh := make(chan toolEvent, 32)
	// Buffered with capacity 1 so the producer goroutine never blocks on the
	// final send.
	doneCh := make(chan orchestrator.ChatResult, 1)

	// Custom streaming orchestrator
	go func() {
		result := h.orch.ChatWithEvents(ctx, req.Messages, req.Model, req.MaxIter, func(step orchestrator.ToolCallStep) {
			toolCh <- toolEvent{step: step}
		})
		// Closing toolCh ends the drain loop below; the result is sent after.
		close(toolCh)
		doneCh <- result
	}()

	// Send thinking event
	writeSSE(w, flusher, streamEvent{Type: sseEventThinking})

	// Drain tool events
	// This loop runs until the producer closes toolCh, i.e. until the
	// orchestrator call has returned.
	for ev := range toolCh {
		// Copy into locals so the event can carry pointers to these values.
		success := ev.step.Success
		dur := ev.step.DurationMs
		writeSSE(w, flusher, streamEvent{
			Type:       sseEventToolCall,
			Tool:       ev.step.Tool,
			Args:       ev.step.Args,
			Result:     ev.step.Result,
			Success:    &success,
			DurationMs: &dur,
			Error:      ev.step.Error,
		})
	}

	// Get final result
	result := <-doneCh

	if !result.Success {
		writeSSE(w, flusher, streamEvent{Type: sseEventError, Error: result.Error})
		fmt.Fprintf(w, "data: [DONE]\n\n")
		flusher.Flush()
		// Persist error metric + history (fire-and-forget goroutine)
		if h.db != nil {
			go func() {
				reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
				h.db.SaveMetric(db.MetricInput{
					AgentID:          orchAgentID,
					RequestID:        reqID,
					UserMessage:      userMessage,
					ProcessingTimeMs: time.Since(startTime).Milliseconds(),
					Status:           "error",
					ErrorMessage:     result.Error,
					Model:            result.Model,
				})
				h.db.SaveHistory(db.HistoryInput{
					AgentID:       orchAgentID,
					UserMessage:   userMessage,
					AgentResponse: "",
					Status:        "error",
				})
			}()
		}
		return
	}

	// Stream the response in rune-safe chunks (important for UTF-8 / Cyrillic).
	// We convert to []rune first so we never split a multi-byte character.
	const runeChunkSize = 6
	runes := []rune(result.Response)
	for i := 0; i < len(runes); i += runeChunkSize {
		end := i + runeChunkSize
		if end > len(runes) {
			end = len(runes)
		}
		writeSSE(w, flusher, streamEvent{
			Type:    sseEventDelta,
			Content: string(runes[i:end]),
		})
		// Non-blocking cancellation check between chunks. Returning here
		// skips the done event, the [DONE] frame and the metric/history
		// persistence below.
		select {
		case <-ctx.Done():
			return
		default:
		}
	}

	// Send done event
	writeSSE(w, flusher, streamEvent{
		Type:         sseEventDone,
		Model:        result.Model,
		ModelWarning: result.ModelWarning,
		Usage:        result.Usage,
	})
	fmt.Fprintf(w, "data: [DONE]\n\n")
	flusher.Flush()

	// Persist metrics + history asynchronously (never blocks the response)
	if h.db != nil {
		go func() {
			reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
			// Token counts stay zero when the result carries no usage data.
			var inputTok, outputTok, totalTok int
			if result.Usage != nil {
				inputTok = result.Usage.PromptTokens
				outputTok = result.Usage.CompletionTokens
				totalTok = result.Usage.TotalTokens
			}
			toolNames := make([]string, len(result.ToolCalls))
			for i, tc := range result.ToolCalls {
				toolNames[i] = tc.Tool
			}
			h.db.SaveMetric(db.MetricInput{
				AgentID:          orchAgentID,
				RequestID:        reqID,
				UserMessage:      userMessage,
				AgentResponse:    result.Response,
				InputTokens:      inputTok,
				OutputTokens:     outputTok,
				TotalTokens:      totalTok,
				ProcessingTimeMs: time.Since(startTime).Milliseconds(),
				Status:           "success",
				ToolsCalled:      toolNames,
				Model:            result.Model,
			})
			h.db.SaveHistory(db.HistoryInput{
				AgentID:       orchAgentID,
				UserMessage:   userMessage,
				AgentResponse: result.Response,
				Status:        "success",
			})
		}()
	}
}
} +func (h *Handler) ProvidersReload(w http.ResponseWriter, r *http.Request) { + // Try to read the decrypted credentials from the request body (preferred path) + var body struct { + Name string `json:"name"` + BaseURL string `json:"baseUrl"` + APIKey string `json:"apiKey"` + ModelDefault string `json:"modelDefault"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err == nil && body.BaseURL != "" { + h.llm.UpdateCredentials(body.BaseURL, body.APIKey) + log.Printf("[API] Provider reloaded from body: %s (%s)", body.Name, body.BaseURL) + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "name": body.Name, + "baseUrl": body.BaseURL, + }) + return + } + + // Fallback: try to read from DB (key will be empty since Go can't decrypt it) + if h.db != nil { + provider, err := h.db.GetActiveProvider() + if err == nil && provider != nil { + h.llm.UpdateCredentials(provider.BaseURL, provider.APIKey) + log.Printf("[API] Provider reloaded from DB: %s (%s)", provider.Name, provider.BaseURL) + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "name": provider.Name, + "baseUrl": provider.BaseURL, + }) + return + } + if err != nil { + log.Printf("[API] ProvidersReload: DB error: %v", err) + } + } + respond(w, http.StatusOK, map[string]any{"ok": true, "note": "No provider data received"}) +} + // ─── Agents ─────────────────────────────────────────────────────────────────── // GET /api/agents @@ -135,6 +413,342 @@ func (h *Handler) GetAgent(w http.ResponseWriter, r *http.Request) { respond(w, http.StatusOK, agent) } +// POST /api/agents — create agent in DB (no container yet) +func (h *Handler) CreateAgent(w http.ResponseWriter, r *http.Request) { + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + var in db.CreateAgentInput + if err := json.NewDecoder(r.Body).Decode(&in); err != nil { + respondError(w, http.StatusBadRequest, "invalid request body: "+err.Error()) + return + } + if in.Name == "" { + 
respondError(w, http.StatusBadRequest, "name is required") + return + } + if in.Model == "" { + in.Model = "qwen2.5:7b" + } + if in.Role == "" { + in.Role = "assistant" + } + id, err := h.db.CreateAgent(in) + if err != nil { + respondError(w, http.StatusInternalServerError, "failed to create agent: "+err.Error()) + return + } + respond(w, http.StatusCreated, map[string]any{"ok": true, "id": id, "message": "Agent created. Use POST /api/agents/{id}/deploy to start the container."}) +} + +// DELETE /api/agents/{id} — delete agent from DB and remove Swarm service if running +func (h *Handler) DeleteAgent(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + id, err := strconv.Atoi(idStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid agent id") + return + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + // Try to get agent and stop its container/service if running + agent, _ := h.db.GetAgentByID(id) + if agent != nil && agent.ServiceName != "" && h.docker != nil { + // Try standalone container first + if err := h.docker.StopContainer(agent.ServiceName); err != nil { + // Fallback: Swarm service + svcs, svcErr := h.docker.ListServices() + if svcErr == nil { + for _, svc := range svcs { + if svc.Spec.Name == agent.ServiceName { + if rmErr := h.docker.RemoveService(svc.ID); rmErr != nil { + log.Printf("[AgentDeploy] warn: remove service %s: %v", svc.Spec.Name, rmErr) + } + break + } + } + } + } + } + if err := h.db.DeleteAgent(id); err != nil { + respondError(w, http.StatusInternalServerError, "failed to delete agent: "+err.Error()) + return + } + respond(w, http.StatusOK, map[string]any{"ok": true, "deleted": id}) +} + +// POST /api/agents/{id}/deploy — deploy agent as a standalone Docker container +func (h *Handler) DeployAgent(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + id, err := strconv.Atoi(idStr) + if err != nil { + respondError(w, 
http.StatusBadRequest, "invalid agent id") + return + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + + agent, err := h.db.GetAgentByID(id) + if err != nil { + respondError(w, http.StatusNotFound, "agent not found") + return + } + + // Assign a free port from pool 8100-8999 + port, err := h.db.AssignServicePort(8100, 900) + if err != nil { + respondError(w, http.StatusConflict, "no free port available: "+err.Error()) + return + } + + svcName := fmt.Sprintf("goclaw-agent-%d", id) + image := agent.ContainerImage + if image == "" { + image = "goclaw-agent-worker:latest" + } + + // Build env vars for the agent container + // Note: agent-worker reads AGENT_PORT (not PORT) for its HTTP listen address + envVars := []string{ + fmt.Sprintf("AGENT_ID=%d", id), + fmt.Sprintf("AGENT_PORT=%d", port), + fmt.Sprintf("PORT=%d", port), // alias for compatibility + } + // Pass through gateway env vars for DB and LLM. + // AGENT_DB_URL overrides DATABASE_URL for agent containers + // (useful when agents run on overlay network with different DB hostname/IP). 
+ agentDBURL := h.cfg.AgentDBURL + if agentDBURL == "" { + // fallback: replace short "db" alias with "goclaw-db" for overlay DNS + agentDBURL = strings.ReplaceAll(h.cfg.DatabaseURL, "tcp(db:", "tcp(goclaw-db:") + } + if agentDBURL != "" { + envVars = append(envVars, "DATABASE_URL="+agentDBURL) + } + if llmURL := h.cfg.LLMBaseURL; llmURL != "" { + envVars = append(envVars, "LLM_BASE_URL="+llmURL) + } + if apiKey := h.cfg.LLMAPIKey; apiKey != "" { + envVars = append(envVars, "LLM_API_KEY="+apiKey) + } + + // Update DB status to "deploying" before creating service + _ = h.db.UpdateContainerStatus(id, "deploying", svcName, port) + + if h.docker == nil { + // Docker not available (dev mode) — just update status + _ = h.db.UpdateContainerStatus(id, "running", svcName, port) + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "note": "Docker not available — simulated deploy", + "serviceName": svcName, + "servicePort": port, + }) + return + } + + // Use configurable network (AGENT_NETWORK env var, default: "goclaw-agents") + agentNetwork := "goclaw-agents" + if h.cfg != nil && h.cfg.AgentNetwork != "" { + agentNetwork = h.cfg.AgentNetwork + } + + // Deploy agent as a standalone container (docker run) so it can share + // the same bridge/overlay network as goclaw-db and goclaw-gateway. 
+ runOpts := dockerclient.RunContainerOpts{ + Name: svcName, + Image: image, + Env: envVars, + Networks: []string{agentNetwork}, + Port: port, + Labels: map[string]string{ + "goclaw.agent": "true", + "goclaw.agent.id": fmt.Sprintf("%d", id), + }, + } + containerID, err := h.docker.RunContainer(runOpts) + if err != nil { + // If container already exists (409 Conflict), remove it and retry once + if strings.Contains(err.Error(), "409") || strings.Contains(err.Error(), "already in use") { + log.Printf("[AgentDeploy] container %s already exists, removing and retrying", svcName) + _ = h.docker.StopContainer(svcName) + containerID, err = h.docker.RunContainer(runOpts) + } + } + if err != nil { + _ = h.db.UpdateContainerStatus(id, "error", svcName, port) + respondError(w, http.StatusInternalServerError, "failed to deploy agent container: "+err.Error()) + return + } + + _ = h.db.UpdateContainerStatus(id, "running", svcName, port) + log.Printf("[AgentDeploy] Agent %d deployed as container %s (id=%s) on port %d", id, svcName, containerID, port) + + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "serviceId": containerID, + "serviceName": svcName, + "servicePort": port, + "image": image, + "agentId": id, + }) +} + +// POST /api/agents/{id}/stop — stop and remove the agent container +func (h *Handler) StopAgent(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + id, err := strconv.Atoi(idStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid agent id") + return + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + agent, err := h.db.GetAgentByID(id) + if err != nil { + respondError(w, http.StatusNotFound, "agent not found") + return + } + // Use serviceName from DB if available, otherwise derive from agent ID + svcName := agent.ServiceName + if svcName == "" { + svcName = fmt.Sprintf("goclaw-agent-%d", id) + } + if h.docker != nil { + // Try to stop as standalone container 
first (always try by canonical name too) + _ = h.docker.StopContainer(svcName) + // Also try Swarm service fallback + svcs, svcErr := h.docker.ListServices() + if svcErr == nil { + for _, svc := range svcs { + if svc.Spec.Name == svcName { + _ = h.docker.RemoveService(svc.ID) + break + } + } + } + } + _ = h.db.UpdateContainerStatus(id, "stopped", "", 0) + respond(w, http.StatusOK, map[string]any{"ok": true, "stopped": svcName}) +} + +// POST /api/agents/{id}/scale — scale replicas of agent Swarm service +func (h *Handler) ScaleAgent(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + id, err := strconv.Atoi(idStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid agent id") + return + } + var req struct { + Replicas int `json:"replicas"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Replicas < 1 { + respondError(w, http.StatusBadRequest, "replicas must be >= 1") + return + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + agent, err := h.db.GetAgentByID(id) + if err != nil { + respondError(w, http.StatusNotFound, "agent not found") + return + } + if agent.ServiceName == "" { + respondError(w, http.StatusBadRequest, "agent has no deployed service — deploy first") + return + } + if h.docker != nil { + svcs, err := h.docker.ListServices() + if err == nil { + for _, svc := range svcs { + if svc.Spec.Name == agent.ServiceName { + if err := h.docker.ScaleService(svc.ID, req.Replicas); err != nil { + respondError(w, http.StatusInternalServerError, "scale error: "+err.Error()) + return + } + break + } + } + } + } + respond(w, http.StatusOK, map[string]any{"ok": true, "serviceName": agent.ServiceName, "replicas": req.Replicas}) +} + +// GET /api/agents/running — service discovery: only agents with containerStatus=running +// Used by agents for peer-discovery (A2A protocol, Phase C). 
+func (h *Handler) ListRunningAgents(w http.ResponseWriter, r *http.Request) { + if h.db == nil { + respond(w, http.StatusOK, map[string]any{"agents": []any{}, "count": 0}) + return + } + all, err := h.db.ListAgents() + if err != nil { + respondError(w, http.StatusInternalServerError, "failed to list agents: "+err.Error()) + return + } + running := make([]db.AgentRow, 0, len(all)) + for _, a := range all { + if a.ContainerStatus == "running" && a.ServicePort > 0 { + running = append(running, a) + } + } + respond(w, http.StatusOK, map[string]any{"agents": running, "count": len(running)}) +} + +// POST /api/agents/{id}/restart — restart a stopped/error agent (dead-letter recovery). +// Removes the old Swarm service (if any) and re-deploys. +func (h *Handler) RestartAgent(w http.ResponseWriter, r *http.Request) { + idStr := r.PathValue("id") + id, err := strconv.Atoi(idStr) + if err != nil { + respondError(w, http.StatusBadRequest, "invalid agent id") + return + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + agent, err := h.db.GetAgentByID(id) + if err != nil { + respondError(w, http.StatusNotFound, "agent not found") + return + } + // Stop existing container/service if present + if agent.ServiceName != "" && h.docker != nil { + // Try standalone container first + if err := h.docker.StopContainer(agent.ServiceName); err != nil { + // Fallback: Swarm service + svcs, lsErr := h.docker.ListServices() + if lsErr == nil { + for _, svc := range svcs { + if svc.Spec.Name == agent.ServiceName { + if rmErr := h.docker.RemoveService(svc.ID); rmErr != nil { + log.Printf("[AgentRestart] remove old service %s: %v", svc.Spec.Name, rmErr) + } + break + } + } + } + } + // Reset DB status to stopped so DeployAgent can redeploy + h.db.UpdateContainerStatus(id, "stopped", "", 0) + } + // Delegate to DeployAgent logic by calling it directly + // We reuse the same request, just changing the path param + r2 := 
// StartChatSession handles POST /api/chat/session.
//
// It creates a session row in the DB, starts the orchestrator in a background
// goroutine and returns {"sessionId": "...", "status": "running"} right away.
// Progress is written to the DB as a sequence of events ("thinking",
// "tool_call", "delta", "done" or "error"), which the client is expected to
// poll.
//
// Request body: {messages, model?, maxIter?, sessionId?}. A missing sessionId
// is generated from the current time; a non-positive maxIter defaults to 10.
//
// Responses: 503 when no DB is configured, 400 on a malformed body or empty
// messages, 500 when the session row cannot be created, 200 otherwise.
func (h *Handler) StartChatSession(w http.ResponseWriter, r *http.Request) {
	if h.db == nil {
		respondError(w, http.StatusServiceUnavailable, "DB not connected — persistent sessions unavailable")
		return
	}

	var req struct {
		Messages  []orchestrator.Message `json:"messages"`
		Model     string                 `json:"model,omitempty"`
		MaxIter   int                    `json:"maxIter,omitempty"`
		SessionID string                 `json:"sessionId,omitempty"` // client can supply its own ID
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		respondError(w, http.StatusBadRequest, "invalid body: "+err.Error())
		return
	}
	if len(req.Messages) == 0 {
		respondError(w, http.StatusBadRequest, "messages array is required")
		return
	}

	// Use client-supplied ID or generate one
	sessionID := req.SessionID
	if sessionID == "" {
		sessionID = fmt.Sprintf("cs-%d", time.Now().UnixNano())
	}

	// Extract last user message for storage
	userMessage := ""
	for i := len(req.Messages) - 1; i >= 0; i-- {
		if req.Messages[i].Role == "user" {
			userMessage = req.Messages[i].Content
			break
		}
	}

	// Resolve orchestrator agent ID
	// Falls back to agent 1 when no orchestrator config can be loaded.
	orchAgentID := 1
	if cfg, err := h.db.GetOrchestratorConfig(); err == nil && cfg != nil {
		orchAgentID = cfg.ID
	}

	// Create session row in DB
	if err := h.db.CreateSession(sessionID, userMessage, orchAgentID); err != nil {
		respondError(w, http.StatusInternalServerError, "failed to create session: "+err.Error())
		return
	}

	maxIter := req.MaxIter
	if maxIter <= 0 {
		maxIter = 10
	}
	model := req.Model

	// Snapshot messages + config for the goroutine
	messages := req.Messages

	// Launch orchestration in a fully detached goroutine.
	// This goroutine runs independently — survives HTTP disconnect.
	go func() {
		startTime := time.Now()

		// Append initial "thinking" event
		_ = h.db.AppendEvent(db.ChatEventRow{
			SessionID: sessionID,
			EventType: "thinking",
		})

		// Deliberately rooted in context.Background() rather than the request
		// context, so only the configured timeout can cancel the run.
		ctx, cancel := context.WithTimeout(context.Background(),
			time.Duration(h.cfg.RequestTimeoutSecs)*time.Second)
		defer cancel()

		result := h.orch.ChatWithEventsAndRetry(ctx, messages, model, maxIter,
			// onToolCall — store each tool execution as an event
			func(step orchestrator.ToolCallStep) {
				// Marshal errors are ignored; the event is stored with
				// whatever JSON could be produced.
				argsJSON, _ := json.Marshal(step.Args)
				resultStr := ""
				if step.Result != nil {
					b, _ := json.Marshal(step.Result)
					resultStr = string(b)
				}
				_ = h.db.AppendEvent(db.ChatEventRow{
					SessionID:   sessionID,
					EventType:   "tool_call",
					ToolName:    step.Tool,
					ToolArgs:    string(argsJSON),
					ToolResult:  resultStr,
					ToolSuccess: step.Success,
					DurationMs:  int(step.DurationMs),
					ErrorMsg:    step.Error,
				})
			},
			// onRetry — emit a "thinking" event so the client sees retry progress
			func(attempt int, reason string) {
				msg := fmt.Sprintf("⟳ Retry %d: %s", attempt, reason)
				log.Printf("[Orchestrator] %s", msg)
				_ = h.db.AppendEvent(db.ChatEventRow{
					SessionID: sessionID,
					EventType: "thinking",
					Content:   msg,
				})
			},
		)

		processingMs := time.Since(startTime).Milliseconds()

		if !result.Success {
			// Failure path: record the error event and close the session.
			// No legacy metric/history rows are written in this case.
			_ = h.db.AppendEvent(db.ChatEventRow{
				SessionID: sessionID,
				EventType: "error",
				ErrorMsg:  result.Error,
			})
			h.db.MarkSessionDone(sessionID, "error", "", result.Model, result.Error, 0, processingMs)
			return
		}

		// Append full response as a single delta (client will display it)
		_ = h.db.AppendEvent(db.ChatEventRow{
			SessionID: sessionID,
			EventType: "delta",
			Content:   result.Response,
		})

		// Append done event
		totalTok := 0
		usageStr := "null" // stored as JSON null when the result has no usage data
		if result.Usage != nil {
			totalTok = result.Usage.TotalTokens
			b, _ := json.Marshal(result.Usage)
			usageStr = string(b)
		}
		_ = h.db.AppendEvent(db.ChatEventRow{
			SessionID: sessionID,
			EventType: "done",
			Model:     result.Model,
			UsageJSON: usageStr,
		})

		h.db.MarkSessionDone(sessionID, "done", result.Response, result.Model, "", totalTok, processingMs)

		// Also save to legacy metrics/history tables
		reqID := fmt.Sprintf("orch-%d", time.Now().UnixNano())
		toolNames := make([]string, len(result.ToolCalls))
		for i, tc := range result.ToolCalls {
			toolNames[i] = tc.Tool
		}
		inputTok, outputTok := 0, 0
		if result.Usage != nil {
			inputTok = result.Usage.PromptTokens
			outputTok = result.Usage.CompletionTokens
		}
		h.db.SaveMetric(db.MetricInput{
			AgentID:          orchAgentID,
			RequestID:        reqID,
			UserMessage:      userMessage,
			AgentResponse:    result.Response,
			InputTokens:      inputTok,
			OutputTokens:     outputTok,
			TotalTokens:      totalTok,
			ProcessingTimeMs: processingMs,
			Status:           "success",
			ToolsCalled:      toolNames,
			Model:            result.Model,
		})
		h.db.SaveHistory(db.HistoryInput{
			AgentID:       orchAgentID,
			UserMessage:   userMessage,
			AgentResponse: result.Response,
			Status:        "success",
		})
	}()

	// Respond immediately; the goroutine above keeps running after this returns.
	respond(w, http.StatusOK, map[string]any{
		"sessionId": sessionID,
		"status":    "running",
	})
}
*http.Request) { + sessionID := r.PathValue("id") + if sessionID == "" { + respondError(w, http.StatusBadRequest, "sessionId required") + return + } + afterSeq := 0 + if v := r.URL.Query().Get("after"); v != "" { + fmt.Sscanf(v, "%d", &afterSeq) + } + if h.db == nil { + respondError(w, http.StatusServiceUnavailable, "DB not connected") + return + } + events, err := h.db.GetEvents(sessionID, afterSeq) + if err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + // Also return session status so client knows when to stop polling + var status string + if sess, err := h.db.GetSession(sessionID); err == nil { + status = sess.Status + } + respond(w, http.StatusOK, map[string]any{ + "sessionId": sessionID, + "status": status, + "events": events, + }) +} + +// GET /api/chat/sessions?limit=N +func (h *Handler) ListChatSessions(w http.ResponseWriter, r *http.Request) { + if h.db == nil { + respond(w, http.StatusOK, map[string]any{"sessions": []any{}}) + return + } + limit := 50 + if v := r.URL.Query().Get("limit"); v != "" { + fmt.Sscanf(v, "%d", &limit) + } + sessions, err := h.db.GetRecentSessions(limit) + if err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + respond(w, http.StatusOK, map[string]any{"sessions": sessions}) +} + +// ─── Real Docker Swarm Management ───────────────────────────────────────────── + +// GET /api/swarm/info +// Returns swarm status, node count, join tokens, and manager address. 
+func (h *Handler) SwarmInfo(w http.ResponseWriter, r *http.Request) { + info, err := h.docker.GetSwarmInfo() + if err != nil { + respondError(w, http.StatusInternalServerError, "docker info error: "+err.Error()) + return + } + tokens, err := h.docker.GetJoinTokens() + if err != nil { + tokens = nil + } + managerAddr := h.docker.GetManagerAddr() + + result := map[string]any{ + "nodeId": info.Swarm.NodeID, + "localNodeState": info.Swarm.LocalNodeState, + "isManager": info.Swarm.ControlAvailable, + "managers": info.Swarm.Managers, + "nodes": info.Swarm.Nodes, + "managerAddr": managerAddr, + } + if tokens != nil { + result["joinTokens"] = map[string]string{ + "worker": tokens.JoinTokens.Worker, + "manager": tokens.JoinTokens.Manager, + } + } + respond(w, http.StatusOK, result) +} + +// GET /api/swarm/nodes +// Returns all swarm nodes with their live status, labels, and resource info. +func (h *Handler) SwarmNodes(w http.ResponseWriter, r *http.Request) { + nodes, err := h.docker.ListNodes() + if err != nil { + respondError(w, http.StatusInternalServerError, "list nodes: "+err.Error()) + return + } + + type NodeOut struct { + ID string `json:"id"` + Hostname string `json:"hostname"` + Role string `json:"role"` + State string `json:"state"` + Availability string `json:"availability"` + IP string `json:"ip"` + OS string `json:"os"` + Arch string `json:"arch"` + CPUCores int `json:"cpuCores"` + MemTotalMB int64 `json:"memTotalMB"` + DockerVersion string `json:"dockerVersion"` + IsLeader bool `json:"isLeader"` + ManagerAddr string `json:"managerAddr,omitempty"` + Labels map[string]string `json:"labels"` + UpdatedAt string `json:"updatedAt"` + } + + out := make([]NodeOut, 0, len(nodes)) + for _, n := range nodes { + no := NodeOut{ + ID: n.ID[:min(12, len(n.ID))], + Hostname: n.Description.Hostname, + Role: n.Spec.Role, + State: n.Status.State, + Availability: n.Spec.Availability, + IP: n.Status.Addr, + OS: n.Description.Platform.OS, + Arch: 
n.Description.Platform.Architecture, + CPUCores: int(n.Description.Resources.NanoCPUs / 1e9), + MemTotalMB: n.Description.Resources.MemoryBytes / (1024 * 1024), + DockerVersion: n.Description.Engine.EngineVersion, + Labels: n.Spec.Labels, + UpdatedAt: n.UpdatedAt.UTC().Format(time.RFC3339), + } + if no.Labels == nil { + no.Labels = map[string]string{} + } + if n.ManagerStatus != nil { + no.IsLeader = n.ManagerStatus.Leader + no.ManagerAddr = n.ManagerStatus.Addr + } + out = append(out, no) + } + + // Persist/update nodes in DB for history + if h.db != nil { + go h.db.UpsertSwarmNodes(out) + } + + respond(w, http.StatusOK, map[string]any{ + "nodes": out, + "count": len(out), + "fetchedAt": time.Now().UTC().Format(time.RFC3339), + }) +} + +// POST /api/swarm/nodes/{id}/label +// Body: { "key": "gpu", "value": "true" } +func (h *Handler) SwarmAddNodeLabel(w http.ResponseWriter, r *http.Request) { + nodeID := r.PathValue("id") + if nodeID == "" { + respondError(w, http.StatusBadRequest, "nodeId required") + return + } + var body struct { + Key string `json:"key"` + Value string `json:"value"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Key == "" { + respondError(w, http.StatusBadRequest, "key required in body") + return + } + if err := h.docker.AddNodeLabel(nodeID, body.Key, body.Value); err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + respond(w, http.StatusOK, map[string]any{"ok": true}) +} + +// POST /api/swarm/nodes/{id}/availability +// Body: { "availability": "active|pause|drain" } +func (h *Handler) SwarmSetNodeAvailability(w http.ResponseWriter, r *http.Request) { + nodeID := r.PathValue("id") + var body struct { + Availability string `json:"availability"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + respondError(w, http.StatusBadRequest, "invalid body") + return + } + if body.Availability != "active" && body.Availability != "pause" && body.Availability != "drain" { 
+ respondError(w, http.StatusBadRequest, "availability must be active|pause|drain") + return + } + if err := h.docker.UpdateNodeAvailability(nodeID, body.Availability); err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + respond(w, http.StatusOK, map[string]any{"ok": true}) +} + +// GET /api/swarm/services +// Returns all swarm services with replica counts and task status. +func (h *Handler) SwarmServices(w http.ResponseWriter, r *http.Request) { + services, err := h.docker.ListServices() + if err != nil { + respondError(w, http.StatusInternalServerError, "list services: "+err.Error()) + return + } + + type ServiceOut struct { + ID string `json:"id"` + Name string `json:"name"` + Image string `json:"image"` + Mode string `json:"mode"` // replicated | global + DesiredReplicas int `json:"desiredReplicas"` + RunningTasks int `json:"runningTasks"` + DesiredTasks int `json:"desiredTasks"` + Labels map[string]string `json:"labels"` + UpdatedAt string `json:"updatedAt"` + Ports []string `json:"ports"` + IsGoClaw bool `json:"isGoClaw"` // goclaw.agent label present + } + + out := make([]ServiceOut, 0, len(services)) + for _, svc := range services { + mode := "replicated" + desired := 0 + if svc.Spec.Mode.Replicated != nil { + desired = svc.Spec.Mode.Replicated.Replicas + } else if svc.Spec.Mode.Global != nil { + mode = "global" + } + running, desiredT := 0, 0 + if svc.ServiceStatus != nil { + running = svc.ServiceStatus.RunningTasks + desiredT = svc.ServiceStatus.DesiredTasks + } + var ports []string + if svc.Spec.EndpointSpec != nil { + for _, p := range svc.Spec.EndpointSpec.Ports { + if p.PublishedPort > 0 { + ports = append(ports, fmt.Sprintf("%d:%d/%s", p.PublishedPort, p.TargetPort, p.Protocol)) + } + } + } + labels := svc.Spec.Labels + if labels == nil { + labels = map[string]string{} + } + _, isGoClaw := labels["goclaw.agent"] + out = append(out, ServiceOut{ + ID: svc.ID[:min(12, len(svc.ID))], + Name: svc.Spec.Name, + Image: 
svc.Spec.TaskTemplate.ContainerSpec.Image, + Mode: mode, + DesiredReplicas: desired, + RunningTasks: running, + DesiredTasks: desiredT, + Labels: labels, + UpdatedAt: svc.UpdatedAt.UTC().Format(time.RFC3339), + Ports: ports, + IsGoClaw: isGoClaw, + }) + } + respond(w, http.StatusOK, map[string]any{"services": out, "count": len(out)}) +} + +// GET /api/swarm/services/{id}/tasks +// Returns all tasks for a service (shows which node each replica runs on). +func (h *Handler) SwarmServiceTasks(w http.ResponseWriter, r *http.Request) { + serviceID := r.PathValue("id") + if serviceID == "" { + respondError(w, http.StatusBadRequest, "serviceId required") + return + } + tasks, err := h.docker.ListServiceTasks(serviceID) + if err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + + type TaskOut struct { + ID string `json:"id"` + ServiceID string `json:"serviceId"` + NodeID string `json:"nodeId"` + Slot int `json:"slot"` + State string `json:"state"` + Message string `json:"message"` + ContainerID string `json:"containerId"` + UpdatedAt string `json:"updatedAt"` + } + out := make([]TaskOut, 0, len(tasks)) + for _, t := range tasks { + cid := "" + if t.Status.ContainerStatus != nil { + cid = t.Status.ContainerStatus.ContainerID + if len(cid) > 12 { + cid = cid[:12] + } + } + out = append(out, TaskOut{ + ID: t.ID[:min(12, len(t.ID))], + ServiceID: t.ServiceID[:min(12, len(t.ServiceID))], + NodeID: t.NodeID[:min(12, len(t.NodeID))], + Slot: t.Slot, + State: t.Status.State, + Message: t.Status.Message, + ContainerID: cid, + UpdatedAt: t.UpdatedAt.UTC().Format(time.RFC3339), + }) + } + respond(w, http.StatusOK, map[string]any{"tasks": out, "count": len(out)}) +} + +// POST /api/swarm/services/{id}/scale +// Body: { "replicas": 3 } +func (h *Handler) SwarmScaleService(w http.ResponseWriter, r *http.Request) { + serviceID := r.PathValue("id") + if serviceID == "" { + respondError(w, http.StatusBadRequest, "serviceId required") + return + } + var 
body struct { + Replicas int `json:"replicas"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + respondError(w, http.StatusBadRequest, "invalid body: "+err.Error()) + return + } + if body.Replicas < 0 || body.Replicas > 100 { + respondError(w, http.StatusBadRequest, "replicas must be 0-100") + return + } + if err := h.docker.ScaleService(serviceID, body.Replicas); err != nil { + respondError(w, http.StatusInternalServerError, err.Error()) + return + } + log.Printf("[Swarm] Scaled service %s to %d replicas", serviceID, body.Replicas) + respond(w, http.StatusOK, map[string]any{"ok": true, "replicas": body.Replicas}) +} + +// POST /api/swarm/services/create +// Deploy a new GoClaw agent as a Swarm service. +// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0, "networks": ["goclaw-net"] } +func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) { + var body struct { + Name string `json:"name"` + Image string `json:"image"` + Replicas int `json:"replicas"` + Env []string `json:"env"` + Port int `json:"port"` + Networks []string `json:"networks"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Name == "" || body.Image == "" { + respondError(w, http.StatusBadRequest, "name and image required") + return + } + if body.Replicas <= 0 { + body.Replicas = 1 + } + svc, err := h.docker.CreateAgentServiceFull(dockerclient.CreateAgentServiceOpts{ + Name: body.Name, + Image: body.Image, + Replicas: body.Replicas, + Env: body.Env, + Port: body.Port, + Networks: body.Networks, + }) + if err != nil { + respondError(w, http.StatusInternalServerError, "create service: "+err.Error()) + return + } + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "serviceId": svc.ID, + "name": svc.Spec.Name, + }) +} + +// DELETE /api/swarm/services/{id} +// Remove (stop) a swarm service. 
+func (h *Handler) SwarmRemoveService(w http.ResponseWriter, r *http.Request) { + serviceID := r.PathValue("id") + if serviceID == "" { + respondError(w, http.StatusBadRequest, "service id required") + return + } + if err := h.docker.RemoveService(serviceID); err != nil { + respondError(w, http.StatusInternalServerError, "remove service: "+err.Error()) + return + } + log.Printf("[Swarm] Removed service %s", serviceID) + respond(w, http.StatusOK, map[string]any{"ok": true}) +} + +// GET /api/swarm/agents +// List all GoClaw agent services with idle time information. +func (h *Handler) SwarmListAgents(w http.ResponseWriter, r *http.Request) { + services, err := h.docker.ListServices() + if err != nil { + respondError(w, http.StatusInternalServerError, "list services: "+err.Error()) + return + } + + type AgentInfo struct { + ID string `json:"id"` + Name string `json:"name"` + Image string `json:"image"` + DesiredReplicas int `json:"desiredReplicas"` + RunningTasks int `json:"runningTasks"` + LastActivity time.Time `json:"lastActivity"` + IdleMinutes float64 `json:"idleMinutes"` + IsGoClaw bool `json:"isGoClaw"` + } + + var agents []AgentInfo + for _, svc := range services { + isGoClaw := svc.Spec.Labels["goclaw.agent"] == "true" + desired := 0 + if svc.Spec.Mode.Replicated != nil { + desired = svc.Spec.Mode.Replicated.Replicas + } + running := 0 + if svc.ServiceStatus != nil { + running = svc.ServiceStatus.RunningTasks + } + lastActivity, _ := h.docker.GetServiceLastActivity(svc.ID) + if lastActivity.IsZero() { + lastActivity = svc.UpdatedAt + } + idle := time.Since(lastActivity).Minutes() + agents = append(agents, AgentInfo{ + ID: svc.ID, + Name: svc.Spec.Name, + Image: svc.Spec.TaskTemplate.ContainerSpec.Image, + DesiredReplicas: desired, + RunningTasks: running, + LastActivity: lastActivity, + IdleMinutes: idle, + IsGoClaw: isGoClaw, + }) + } + if agents == nil { + agents = []AgentInfo{} + } + respond(w, http.StatusOK, map[string]any{"agents": agents, "count": 
len(agents)}) +} + + +// POST /api/swarm/shell +// Execute a shell command on the HOST system (via nsenter into PID 1). +// Body: { "command": "docker ps" } +// Requires the gateway container to run with privileged: true + pid: host +func (h *Handler) SwarmShell(w http.ResponseWriter, r *http.Request) { + var body struct { + Command string `json:"command"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Command == "" { + respondError(w, http.StatusBadRequest, "command required") + return + } + // Security: reject obviously dangerous patterns in production + // (In a real deployment you'd add auth + command whitelisting) + dangerous := []string{"rm -rf /", "mkfs", "dd if=/dev/zero", ":(){:|:&};:"} + cmdLower := strings.ToLower(body.Command) + for _, d := range dangerous { + if strings.Contains(cmdLower, d) { + respondError(w, http.StatusForbidden, "dangerous command rejected") + return + } + } + + log.Printf("[SwarmShell] Executing: %s", body.Command) + output, err := dockerclient.ExecOnHost(body.Command) + if err != nil { + respond(w, http.StatusOK, map[string]any{ + "output": output, + "error": err.Error(), + "success": false, + }) + return + } + respond(w, http.StatusOK, map[string]any{ + "output": output, + "success": true, + }) +} + +// GET /api/swarm/join-token?role=worker|manager +// Returns the join command for adding a new node to the swarm. 
+func (h *Handler) SwarmJoinToken(w http.ResponseWriter, r *http.Request) { + role := r.URL.Query().Get("role") + if role == "" { + role = "worker" + } + tokens, err := h.docker.GetJoinTokens() + if err != nil { + respondError(w, http.StatusInternalServerError, "cannot get join tokens: "+err.Error()) + return + } + managerAddr := h.docker.GetManagerAddr() + + var token string + if role == "manager" { + token = tokens.JoinTokens.Manager + } else { + token = tokens.JoinTokens.Worker + } + + joinCmd := fmt.Sprintf("docker swarm join --token %s %s", token, managerAddr) + respond(w, http.StatusOK, map[string]any{ + "role": role, + "token": token, + "managerAddr": managerAddr, + "joinCommand": joinCmd, + }) +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// POST /api/swarm/join-node +// Connects to a remote host via SSH and runs "docker swarm join ..." to add it to the cluster. +// Body: { "host": "1.2.3.4", "port": 22, "user": "root", "password": "secret", "role": "worker" } +func (h *Handler) SwarmJoinNodeViaSSH(w http.ResponseWriter, r *http.Request) { + var body struct { + Host string `json:"host"` + Port int `json:"port"` + User string `json:"user"` + Password string `json:"password"` + Role string `json:"role"` // "worker" | "manager" + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + respondError(w, http.StatusBadRequest, "invalid request body") + return + } + if body.Host == "" || body.User == "" || body.Password == "" { + respondError(w, http.StatusBadRequest, "host, user and password are required") + return + } + if body.Port == 0 { + body.Port = 22 + } + if body.Role == "" { + body.Role = "worker" + } + + // 1. 
Get join token from local swarm + tokens, err := h.docker.GetJoinTokens() + if err != nil { + respondError(w, http.StatusInternalServerError, "cannot get join tokens: "+err.Error()) + return + } + managerAddr := h.docker.GetManagerAddr() + var token string + if body.Role == "manager" { + token = tokens.JoinTokens.Manager + } else { + token = tokens.JoinTokens.Worker + } + joinCmd := fmt.Sprintf("docker swarm join --token %s %s", token, managerAddr) + + // 2. Dial SSH to the remote host + sshCfg := &ssh.ClientConfig{ + User: body.User, + Auth: []ssh.AuthMethod{ + ssh.Password(body.Password), + }, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), // acceptable for internal cluster management + Timeout: 15 * time.Second, + } + addr := fmt.Sprintf("%s:%d", body.Host, body.Port) + log.Printf("[SwarmJoinNode] Dialing SSH %s as %s", addr, body.User) + + client, err := ssh.Dial("tcp", addr, sshCfg) + if err != nil { + respond(w, http.StatusOK, map[string]any{ + "ok": false, + "step": "ssh_connect", + "error": fmt.Sprintf("SSH connection failed: %s", err.Error()), + "host": body.Host, + "command": joinCmd, + }) + return + } + defer client.Close() + + // 3. 
Run docker swarm join on the remote node + sess, err := client.NewSession() + if err != nil { + respond(w, http.StatusOK, map[string]any{ + "ok": false, + "step": "ssh_session", + "error": fmt.Sprintf("SSH session failed: %s", err.Error()), + }) + return + } + defer sess.Close() + + log.Printf("[SwarmJoinNode] Running on %s: %s", body.Host, joinCmd) + out, err := sess.CombinedOutput(joinCmd) + output := strings.TrimSpace(string(out)) + + if err != nil { + // Node might already be in the swarm — treat "already" as success + if strings.Contains(output, "already") || strings.Contains(output, "This node is already") { + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "output": output, + "note": "node is already part of this swarm", + "command": joinCmd, + }) + return + } + respond(w, http.StatusOK, map[string]any{ + "ok": false, + "step": "docker_join", + "error": fmt.Sprintf("docker swarm join failed: %s", err.Error()), + "output": output, + "command": joinCmd, + }) + return + } + + log.Printf("[SwarmJoinNode] Success: %s joined as %s", body.Host, body.Role) + + // Give Docker Swarm ~3 seconds to propagate the new node, then sync to DB. + go func() { + time.Sleep(3 * time.Second) + nodes, err := h.docker.ListNodes() + if err != nil { + log.Printf("[SwarmJoinNode] DB sync failed (ListNodes): %v", err) + return + } + if h.db != nil { + h.db.UpsertSwarmNodes(nodes) + log.Printf("[SwarmJoinNode] DB synced: %d nodes after join", len(nodes)) + } + }() + + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "output": output, + "host": body.Host, + "role": body.Role, + "command": joinCmd, + }) +} + +// POST /api/swarm/ssh-test +// Tests SSH connectivity and checks if Docker is accessible on the remote host. 
+// Body: { "host": "1.2.3.4", "port": 22, "user": "root", "password": "secret" } +func (h *Handler) SwarmSSHTest(w http.ResponseWriter, r *http.Request) { + var body struct { + Host string `json:"host"` + Port int `json:"port"` + User string `json:"user"` + Password string `json:"password"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + respondError(w, http.StatusBadRequest, "invalid request body") + return + } + if body.Host == "" || body.User == "" || body.Password == "" { + respondError(w, http.StatusBadRequest, "host, user and password are required") + return + } + if body.Port == 0 { + body.Port = 22 + } + + sshCfg := &ssh.ClientConfig{ + User: body.User, + Auth: []ssh.AuthMethod{ + ssh.Password(body.Password), + }, + HostKeyCallback: ssh.InsecureIgnoreHostKey(), + Timeout: 10 * time.Second, + } + addr := fmt.Sprintf("%s:%d", body.Host, body.Port) + log.Printf("[SSHTest] Dialing %s as %s", addr, body.User) + + client, err := ssh.Dial("tcp", addr, sshCfg) + if err != nil { + respond(w, http.StatusOK, map[string]any{ + "ok": false, + "step": "ssh_connect", + "error": fmt.Sprintf("SSH connection failed: %s", err.Error()), + }) + return + } + defer client.Close() + + // Run a quick docker version check to see if Docker daemon is accessible + sess, err := client.NewSession() + if err != nil { + respond(w, http.StatusOK, map[string]any{ + "ok": false, + "step": "ssh_session", + "error": fmt.Sprintf("SSH session failed: %s", err.Error()), + }) + return + } + defer sess.Close() + + // Use plain 'docker info' to get server version — works on all distros + out, _ := sess.CombinedOutput("docker info --format '{{.ServerVersion}}' 2>/dev/null || docker version --format '{{.Server.Version}}' 2>/dev/null || echo 'docker_not_found'") + dockerVer := strings.TrimSpace(string(out)) + if dockerVer == "" { + dockerVer = "docker_not_found" + } + dockerOk := dockerVer != "docker_not_found" && !strings.Contains(dockerVer, "not found") && 
!strings.Contains(dockerVer, "command not found") + + log.Printf("[SSHTest] %s — SSH OK, docker: %s", addr, dockerVer) + respond(w, http.StatusOK, map[string]any{ + "ok": true, + "sshOk": true, + "dockerOk": dockerOk, + "dockerVersion": dockerVer, + "host": body.Host, + }) +} diff --git a/gateway/internal/api/handlers_agents_test.go b/gateway/internal/api/handlers_agents_test.go new file mode 100644 index 0000000..d47a360 --- /dev/null +++ b/gateway/internal/api/handlers_agents_test.go @@ -0,0 +1,334 @@ +package api + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "git.softuniq.eu/UniqAI/GoClaw/gateway/config" + "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db" +) + +// mockDB implements only the methods needed for agent handler tests. +type mockDB struct { + agents []db.AgentRow + configs map[int]*db.AgentConfig + created []db.CreateAgentInput + deleted []int + statusUpdates []statusUpdate + nextID int +} + +type statusUpdate struct { + agentID int + status string + serviceName string + servicePort int +} + +func newMockDB() *mockDB { + return &mockDB{ + configs: map[int]*db.AgentConfig{}, + nextID: 100, + } +} + +func (m *mockDB) ListAgents() ([]db.AgentRow, error) { + return m.agents, nil +} + +func (m *mockDB) GetAgentByID(id int) (*db.AgentConfig, error) { + if cfg, ok := m.configs[id]; ok { + return cfg, nil + } + return nil, nil +} + +func (m *mockDB) CreateAgent(in db.CreateAgentInput) (int, error) { + m.created = append(m.created, in) + id := m.nextID + m.nextID++ + m.configs[id] = &db.AgentConfig{ + ID: id, + Name: in.Name, + Model: in.Model, + } + return id, nil +} + +func (m *mockDB) DeleteAgent(id int) error { + m.deleted = append(m.deleted, id) + delete(m.configs, id) + return nil +} + +func (m *mockDB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error { + m.statusUpdates = append(m.statusUpdates, statusUpdate{agentID, status, serviceName, servicePort}) + if cfg, 
ok := m.configs[agentID]; ok { + cfg.ContainerStatus = status + cfg.ServiceName = serviceName + cfg.ServicePort = servicePort + } + return nil +} + +func (m *mockDB) AssignServicePort(start, max int) (int, error) { + return start, nil +} + +// ─── DB adapter: wrap mockDB to satisfy Handler which uses *db.DB ──────────── +// We use composition instead — create a handler variant that uses an interface. +// For these tests, we bypass the Handler's db field and test the logic separately. + +// ─── Unit tests for new agent REST endpoints ────────────────────────────────── + +// TestCreateAgent_MissingDB verifies 503 when DB is nil. +func TestCreateAgent_MissingDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, // no DB + } + body := `{"name":"Test","model":"qwen2.5:7b"}` + req := httptest.NewRequest(http.MethodPost, "/api/agents", bytes.NewBufferString(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + h.CreateAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestDeleteAgent_MissingDB verifies 503 when DB is nil. +func TestDeleteAgent_MissingDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodDelete, "/api/agents/1", nil) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.DeleteAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestDeleteAgent_InvalidID verifies 400 for non-numeric id. +func TestDeleteAgent_InvalidID(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodDelete, "/api/agents/abc", nil) + req.SetPathValue("id", "abc") + w := httptest.NewRecorder() + + h.DeleteAgent(w, req) + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +// TestDeployAgent_MissingDB verifies 503 when DB is nil. 
+func TestDeployAgent_MissingDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.DeployAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestDeployAgent_InvalidID verifies 400 for non-numeric id. +func TestDeployAgent_InvalidID(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/deploy", nil) + req.SetPathValue("id", "xyz") + w := httptest.NewRecorder() + + h.DeployAgent(w, req) + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +// TestStopAgent_MissingDB verifies 503 when DB is nil. +func TestStopAgent_MissingDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/stop", nil) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.StopAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestScaleAgent_MissingDB verifies 503 when DB is nil. +func TestScaleAgent_MissingDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + body := `{"replicas":3}` + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(body)) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.ScaleAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestScaleAgent_BadReplicas verifies 400 for replicas < 1. 
+func TestScaleAgent_BadReplicas(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + body := `{"replicas":0}` + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/scale", bytes.NewBufferString(body)) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.ScaleAgent(w, req) + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +// TestListAgents_NoDB verifies empty list when DB is nil. +func TestListAgents_NoDB(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, + } + req := httptest.NewRequest(http.MethodGet, "/api/agents", nil) + w := httptest.NewRecorder() + + h.ListAgents(w, req) + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + var resp map[string]any + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatal("invalid JSON:", err) + } + if _, ok := resp["note"]; !ok { + t.Error("expected 'note' field in response when DB is nil") + } +} + +// TestCreateAgent_MissingName verifies 400 when name is empty. +func TestCreateAgent_MissingName(t *testing.T) { + h := &Handler{ + cfg: &config.Config{}, + db: nil, // will fail at DB check first, but let's also test directly + } + body := `{"model":"qwen2.5:7b"}` + req := httptest.NewRequest(http.MethodPost, "/api/agents", bytes.NewBufferString(body)) + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + + h.CreateAgent(w, req) + // When DB is nil => 503; that's fine, confirms routing works + if w.Code != http.StatusServiceUnavailable && w.Code != http.StatusBadRequest { + t.Errorf("expected 400 or 503, got %d", w.Code) + } +} + +// TestDeployAgent_SimulatedNoDB verifies simulated deploy when docker is nil. +// We can't inject a real *db.DB without a live MySQL, so we just verify +// that the nil-docker path returns 200 with the right fields. +// This test verifies the handler structure is correct. 
+func TestDeployAgent_DockerNil_DBNil(t *testing.T) { + // When db is nil, returns 503 + h := &Handler{ + cfg: &config.Config{}, + db: nil, + docker: nil, + } + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/deploy", nil) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + h.DeployAgent(w, req) + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// ─── Phase C tests ───────────────────────────────────────────────────────────── + +// TestListRunningAgents_NoDB verifies empty list when DB is nil. +func TestListRunningAgents_NoDB(t *testing.T) { + h := &Handler{cfg: &config.Config{}, db: nil} + req := httptest.NewRequest(http.MethodGet, "/api/agents/running", nil) + w := httptest.NewRecorder() + + h.ListRunningAgents(w, req) + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } + var resp map[string]any + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatal("invalid JSON:", err) + } + if cnt, _ := resp["count"].(float64); cnt != 0 { + t.Errorf("expected count=0, got %v", cnt) + } +} + +// TestRestartAgent_NoDB verifies 503 when DB is nil. +func TestRestartAgent_NoDB(t *testing.T) { + h := &Handler{cfg: &config.Config{}, db: nil} + req := httptest.NewRequest(http.MethodPost, "/api/agents/1/restart", nil) + req.SetPathValue("id", "1") + w := httptest.NewRecorder() + + h.RestartAgent(w, req) + // RestartAgent calls DeployAgent which checks DB → 503 + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +// TestRestartAgent_InvalidID verifies 400 for non-numeric id. 
+func TestRestartAgent_InvalidID(t *testing.T) { + h := &Handler{cfg: &config.Config{}, db: nil} + req := httptest.NewRequest(http.MethodPost, "/api/agents/xyz/restart", nil) + req.SetPathValue("id", "xyz") + w := httptest.NewRecorder() + + h.RestartAgent(w, req) + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +// TestSwarmManagerDeadLetter_DockerNil verifies that checkIdleAgents +// returns early without panic when docker is nil. +func TestSwarmManagerDeadLetter_DockerNil(t *testing.T) { + h := &Handler{cfg: &config.Config{}, db: nil, docker: nil} + sm := NewSwarmManager(h, 60*time.Second) + // Should not panic + sm.checkIdleAgents() +} diff --git a/gateway/internal/api/swarm_manager.go b/gateway/internal/api/swarm_manager.go new file mode 100644 index 0000000..8f2184f --- /dev/null +++ b/gateway/internal/api/swarm_manager.go @@ -0,0 +1,196 @@ +// Package api – Swarm Agent Lifecycle Manager +// +// The SwarmManager runs as a background goroutine inside the GoClaw Gateway +// (which is the Swarm manager node). It watches all agent services and +// automatically scales them to 0 replicas after IdleTimeout minutes of no +// activity. The orchestrator can call StartAgent / StopAgent via the REST API +// to start/stop agents on demand. +// +// Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1) +// Stop flow: POST /api/swarm/agents/{name}/stop → scale to 0 +// Auto-stop: background loop checks every 60 s, scales idle agents to 0 +package api + +import ( + "context" + "encoding/json" + "log" + "net/http" + "time" +) + +const ( + // IdleTimeout – how many minutes without any task updates before an agent + // is automatically scaled to 0. + defaultIdleTimeoutMinutes = 15 + // deadLetterCheckEnabled – when true, SwarmManager reconciles DB containerStatus + // with actual Swarm service existence (dead-letter recovery). 
+ deadLetterCheckEnabled = true +) + +// SwarmManager watches agent services and auto-scales them down after idle. +type SwarmManager struct { + handler *Handler + ticker *time.Ticker + done chan struct{} +} + +// NewSwarmManager creates a manager that checks every checkInterval. +func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager { + return &SwarmManager{ + handler: h, + ticker: time.NewTicker(checkInterval), + done: make(chan struct{}), + } +} + +// Start launches the background loop. Call in a goroutine. +func (m *SwarmManager) Start(ctx context.Context) { + log.Printf("[SwarmManager] Started — idle timeout %d min", + defaultIdleTimeoutMinutes) + defer m.ticker.Stop() + for { + select { + case <-m.done: + return + case <-ctx.Done(): + return + case <-m.ticker.C: + m.checkIdleAgents() + } + } +} + +// Stop signals the background loop to exit. +func (m *SwarmManager) Stop() { + close(m.done) +} + +func (m *SwarmManager) checkIdleAgents() { + h := m.handler + if h.docker == nil { + return + } + + // Build a lookup set of currently-live container/service names (both standalone + Swarm) + liveContainers := make(map[string]bool) + + // Check standalone containers + if containers, err := h.docker.ListContainers(); err == nil { + for _, c := range containers { + for _, name := range c.Names { + // Docker container names are prefixed with "/" + clean := name + if len(clean) > 0 && clean[0] == '/' { + clean = clean[1:] + } + if c.Labels["goclaw.agent"] == "true" { + liveContainers[clean] = true + } + } + } + } + + // Check Swarm services (for legacy/mixed environments) + services, err := h.docker.ListServices() + if err != nil { + log.Printf("[SwarmManager] list services error: %v", err) + } + + idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute + now := time.Now() + + for _, svc := range services { + // Only manage services labelled as GoClaw agents + if svc.Spec.Labels["goclaw.agent"] != "true" { + continue + } + 
liveContainers[svc.Spec.Name] = true + + // Skip already-stopped services (0 desired replicas) + desired := 0 + if svc.Spec.Mode.Replicated != nil { + desired = svc.Spec.Mode.Replicated.Replicas + } + if desired == 0 { + continue + } + // Check last activity time + lastActivity, actErr := h.docker.GetServiceLastActivity(svc.ID) + if actErr != nil || lastActivity.IsZero() { + lastActivity = svc.UpdatedAt + } + idle := now.Sub(lastActivity) + if idle >= idleThreshold { + log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0", + svc.Spec.Name, idle.Minutes()) + if scaleErr := h.docker.ScaleService(svc.ID, 0); scaleErr != nil { + log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr) + } + } + } + + // ── Dead-letter reconciliation (Phase C) ───────────────────────────────── + // If an agent's DB says "running" but its container/service is gone, update + // the status to "error" so the UI shows the discrepancy and allows redeploy. + if !deadLetterCheckEnabled || h.db == nil { + return + } + agents, dbErr := h.db.ListAgents() + if dbErr != nil { + return + } + for _, a := range agents { + if a.ContainerStatus != "running" || a.ServiceName == "" { + continue + } + if !liveContainers[a.ServiceName] { + log.Printf("[SwarmManager] Dead-letter: agent %d (%q) marked running but container %q not found — setting status=error", + a.ID, a.Name, a.ServiceName) + if updateErr := h.db.UpdateContainerStatus(a.ID, "error", a.ServiceName, a.ServicePort); updateErr != nil { + log.Printf("[SwarmManager] UpdateContainerStatus error for agent %d: %v", a.ID, updateErr) + } + } + } +} + +// ─── HTTP Handlers for agent lifecycle ──────────────────────────────────────── + +// POST /api/swarm/agents/{name}/start +// Start (scale-up) a named agent service. 
Body: { "replicas": 1 } +func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + respondError(w, http.StatusBadRequest, "agent name required") + return + } + var body struct { + Replicas int `json:"replicas"` + } + _ = json.NewDecoder(r.Body).Decode(&body) + if body.Replicas <= 0 { + body.Replicas = 1 + } + if err := h.docker.ScaleService(name, body.Replicas); err != nil { + respondError(w, http.StatusInternalServerError, "start agent: "+err.Error()) + return + } + log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas) + respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas}) +} + +// POST /api/swarm/agents/{name}/stop +// Stop (scale-to-0) a named agent service. +func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + respondError(w, http.StatusBadRequest, "agent name required") + return + } + if err := h.docker.ScaleService(name, 0); err != nil { + respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error()) + return + } + log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name) + respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0}) +} diff --git a/gateway/internal/db/db.go b/gateway/internal/db/db.go index 4a9b8fe..7d1567c 100644 --- a/gateway/internal/db/db.go +++ b/gateway/internal/db/db.go @@ -3,6 +3,7 @@ package db import ( "database/sql" + "database/sql/driver" "encoding/json" "fmt" "log" @@ -20,9 +21,9 @@ type AgentConfig struct { AllowedTools []string Temperature float64 MaxTokens int - IsOrchestrator bool - IsSystem bool - IsActive bool + IsOrchestrator bool + IsSystem bool + IsActive bool // Container / Swarm fields (Phase A) ServiceName string ServicePort int @@ -32,14 +33,34 @@ type AgentConfig struct { // AgentRow is a minimal agent representation for listing. 
type AgentRow struct { - ID int `json:"id"` - Name string `json:"name"` - Role string `json:"role"` - Model string `json:"model"` - Description string `json:"description"` - IsActive bool `json:"isActive"` - IsSystem bool `json:"isSystem"` - IsOrchestrator bool `json:"isOrchestrator"` + ID int `json:"id"` + Name string `json:"name"` + Role string `json:"role"` + Model string `json:"model"` + Description string `json:"description"` + IsActive bool `json:"isActive"` + IsSystem bool `json:"isSystem"` + IsOrchestrator bool `json:"isOrchestrator"` + // Container / Swarm fields + ServiceName string `json:"serviceName"` + ServicePort int `json:"servicePort"` + ContainerImage string `json:"containerImage"` + ContainerStatus string `json:"containerStatus"` +} + +// CreateAgentInput holds the fields required to create a new agent in DB. +type CreateAgentInput struct { + Name string `json:"name"` + Role string `json:"role"` + Model string `json:"model"` + Description string `json:"description"` + SystemPrompt string `json:"systemPrompt"` + Temperature float64 `json:"temperature"` + MaxTokens int `json:"maxTokens"` + AllowedTools []string `json:"allowedTools"` + IsSystem bool `json:"isSystem"` + IsOrchestrator bool `json:"isOrchestrator"` + ContainerImage string `json:"containerImage"` } type DB struct { @@ -73,8 +94,7 @@ func (d *DB) Close() { // GetOrchestratorConfig loads the agent with isOrchestrator=1 from DB. 
func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) { row := d.conn.QueryRow(` - SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive, - COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped') + SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive FROM agents WHERE isOrchestrator = 1 LIMIT 1 @@ -85,8 +105,7 @@ func (d *DB) GetOrchestratorConfig() (*AgentConfig, error) { // GetAgentByID loads a specific agent by ID. func (d *DB) GetAgentByID(id int) (*AgentConfig, error) { row := d.conn.QueryRow(` - SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive, - COALESCE(serviceName,''), COALESCE(servicePort,0), COALESCE(containerImage,'goclaw-agent-worker:latest'), COALESCE(containerStatus,'stopped') + SELECT id, name, model, systemPrompt, allowedTools, temperature, maxTokens, isOrchestrator, isSystem, isActive FROM agents WHERE id = ? LIMIT 1 @@ -94,10 +113,13 @@ func (d *DB) GetAgentByID(id int) (*AgentConfig, error) { return scanAgentConfig(row) } -// ListAgents returns all active agents. +// ListAgents returns all agents with container status fields. 
func (d *DB) ListAgents() ([]AgentRow, error) { rows, err := d.conn.Query(` - SELECT id, name, role, model, COALESCE(description,''), isActive, isSystem, isOrchestrator + SELECT id, name, role, model, + COALESCE(description,''), isActive, isSystem, isOrchestrator, + COALESCE(serviceName,''), COALESCE(servicePort,0), + COALESCE(containerImage,''), COALESCE(containerStatus,'stopped') FROM agents ORDER BY isOrchestrator DESC, isSystem DESC, id ASC `) @@ -110,7 +132,11 @@ func (d *DB) ListAgents() ([]AgentRow, error) { for rows.Next() { var a AgentRow var isActive, isSystem, isOrch int - if err := rows.Scan(&a.ID, &a.Name, &a.Role, &a.Model, &a.Description, &isActive, &isSystem, &isOrch); err != nil { + if err := rows.Scan( + &a.ID, &a.Name, &a.Role, &a.Model, &a.Description, + &isActive, &isSystem, &isOrch, + &a.ServiceName, &a.ServicePort, &a.ContainerImage, &a.ContainerStatus, + ); err != nil { continue } a.IsActive = isActive == 1 @@ -121,64 +147,423 @@ func (d *DB) ListAgents() ([]AgentRow, error) { return agents, nil } -// ─── Helpers ────────────────────────────────────────────────────────────────── - -func scanAgentConfig(row *sql.Row) (*AgentConfig, error) { - var cfg AgentConfig - var systemPrompt sql.NullString - var allowedToolsJSON sql.NullString - var temperature sql.NullFloat64 - var maxTokens sql.NullInt64 - var isOrch, isSystem, isActive int - - err := row.Scan( - &cfg.ID, &cfg.Name, &cfg.Model, - &systemPrompt, &allowedToolsJSON, - &temperature, &maxTokens, - &isOrch, &isSystem, &isActive, - &cfg.ServiceName, &cfg.ServicePort, &cfg.ContainerImage, &cfg.ContainerStatus, +// CreateAgent inserts a new agent into the DB and returns its ID. 
+func (d *DB) CreateAgent(in CreateAgentInput) (int, error) { + if d.conn == nil { + return 0, fmt.Errorf("DB not connected") + } + toolsJSON := "[]" + if len(in.AllowedTools) > 0 { + b, _ := json.Marshal(in.AllowedTools) + toolsJSON = string(b) + } + temp := in.Temperature + if temp == 0 { + temp = 0.7 + } + maxTok := in.MaxTokens + if maxTok == 0 { + maxTok = 8192 + } + img := in.ContainerImage + if img == "" { + img = "goclaw-agent-worker:latest" + } + res, err := d.conn.Exec(` + INSERT INTO agents + (name, role, model, description, systemPrompt, temperature, maxTokens, + allowedTools, isActive, isSystem, isOrchestrator, + containerImage, containerStatus, createdAt, updatedAt) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1, ?, ?, ?, 'stopped', NOW(), NOW()) + `, + in.Name, in.Role, in.Model, in.Description, in.SystemPrompt, + temp, maxTok, toolsJSON, + boolToInt(in.IsSystem), boolToInt(in.IsOrchestrator), + img, ) + if err != nil { + return 0, fmt.Errorf("insert agent: %w", err) + } + id, _ := res.LastInsertId() + return int(id), nil +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} + +// DeleteAgent removes an agent record by ID (only non-system agents). +func (d *DB) DeleteAgent(id int) error { + if d.conn == nil { + return fmt.Errorf("DB not connected") + } + res, err := d.conn.Exec(`DELETE FROM agents WHERE id = ? AND isSystem = 0`, id) + if err != nil { + return err + } + n, _ := res.RowsAffected() + if n == 0 { + return fmt.Errorf("agent %d not found or is a system agent", id) + } + return nil +} + +// AssignServicePort finds the lowest free port in range [start, start+maxAgents). +// It reads all currently used ports from DB. 
+func (d *DB) AssignServicePort(start, maxAgents int) (int, error) { + if d.conn == nil { + return start, nil // offline — just return start + } + rows, err := d.conn.Query(`SELECT COALESCE(servicePort,0) FROM agents WHERE servicePort > 0`) + if err != nil { + return start, nil + } + defer rows.Close() + + used := map[int]bool{} + for rows.Next() { + var p int + if rows.Scan(&p) == nil && p > 0 { + used[p] = true + } + } + for port := start; port < start+maxAgents; port++ { + if !used[port] { + return port, nil + } + } + return 0, fmt.Errorf("no free port in range %d-%d", start, start+maxAgents) +} + +// ─── LLM Provider ───────────────────────────────────────────────────────────── + +// ProviderRow holds the active LLM provider config from DB. +type ProviderRow struct { + ID int + Name string + BaseURL string + APIKey string // left empty by GetActiveProvider (stored key is encrypted by Node.js; Go cannot decrypt it) +} + +// GetActiveProvider returns the active LLM provider from the llmProviders table. +// Note: The API key is stored AES-256-GCM encrypted by the Node.js server. +// The Go gateway reads the raw encrypted bytes but cannot decrypt them (no shared key in Go). +// The proper flow: Node.js decrypts the key and passes it via /api/providers/reload. +// For now, GetActiveProvider discards the encrypted value and always returns an empty APIKey. +// Use UpdateCredentials on the LLM client instead.
+func (d *DB) GetActiveProvider() (*ProviderRow, error) { + var p ProviderRow + var apiKeyEncrypted sql.NullString + row := d.conn.QueryRow(` + SELECT id, name, baseUrl, COALESCE(apiKeyEncrypted, '') + FROM llmProviders + WHERE isActive = 1 + LIMIT 1 + `) + err := row.Scan(&p.ID, &p.Name, &p.BaseURL, &apiKeyEncrypted) if err != nil { return nil, err } - - cfg.SystemPrompt = systemPrompt.String - cfg.Temperature = temperature.Float64 - if cfg.Temperature == 0 { - cfg.Temperature = 0.5 - } - cfg.MaxTokens = int(maxTokens.Int64) - if cfg.MaxTokens == 0 { - cfg.MaxTokens = 8192 - } - cfg.IsOrchestrator = isOrch == 1 - cfg.IsSystem = isSystem == 1 - cfg.IsActive = isActive == 1 - - if allowedToolsJSON.Valid && allowedToolsJSON.String != "" && allowedToolsJSON.String != "null" { - _ = json.Unmarshal([]byte(allowedToolsJSON.String), &cfg.AllowedTools) - } - - return &cfg, nil + // We cannot decrypt the key in Go (different crypto impl from Node.js) + // Return empty key — the LLM client will use its env-configured key + p.APIKey = "" + return &p, nil } -// ─── Agent Container Fields ─────────────────────────────────────────────────── -// These methods support the agent-worker container architecture where each -// agent runs as an autonomous Docker Swarm service. +// ─── Chat Sessions & Events ─────────────────────────────────────────────────── -// UpdateContainerStatus updates the container lifecycle state of an agent. -func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error { +// ChatSessionRow holds one persistent chat session. 
+type ChatSessionRow struct { + ID int `json:"id"` + SessionID string `json:"sessionId"` + AgentID int `json:"agentId"` + Status string `json:"status"` // running | done | error + UserMessage string `json:"userMessage"` + FinalResponse string `json:"finalResponse"` + Model string `json:"model"` + TotalTokens int `json:"totalTokens"` + ProcessingTimeMs int64 `json:"processingTimeMs"` + ErrorMessage string `json:"errorMessage"` + CreatedAt string `json:"createdAt"` + UpdatedAt string `json:"updatedAt"` +} + +// ChatEventRow holds one event inside a session. +type ChatEventRow struct { + ID int `json:"id"` + SessionID string `json:"sessionId"` + Seq int `json:"seq"` + EventType string `json:"eventType"` // thinking | tool_call | delta | done | error + Content string `json:"content"` + ToolName string `json:"toolName"` + ToolArgs string `json:"toolArgs"` // JSON string + ToolResult string `json:"toolResult"` + ToolSuccess bool `json:"toolSuccess"` + DurationMs int `json:"durationMs"` + Model string `json:"model"` + UsageJSON string `json:"usageJson"` // JSON string + ErrorMsg string `json:"errorMsg"` + CreatedAt string `json:"createdAt"` +} + +// CreateSession inserts a new session with status 'running'; it returns only an error. +func (d *DB) CreateSession(sessionID, userMessage string, agentID int) error { + if d.conn == nil { + return fmt.Errorf("DB not connected") + } + _, err := d.conn.Exec(` + INSERT INTO chatSessions (sessionId, agentId, status, userMessage) + VALUES (?, ?, 'running', ?) + `, sessionID, agentID, truncate(userMessage, 65535)) + return err +} + +// AppendEvent inserts a new event row for a session. +// seq is auto-calculated as MAX(seq)+1 for the session.
+func (d *DB) AppendEvent(e ChatEventRow) error { if d.conn == nil { return nil } + toolArgs := e.ToolArgs + if toolArgs == "" { + toolArgs = "null" + } + usageJSON := e.UsageJSON + if usageJSON == "" { + usageJSON = "null" + } + var toolSuccessVal interface{} + if e.EventType == "tool_call" { + if e.ToolSuccess { + toolSuccessVal = 1 + } else { + toolSuccessVal = 0 + } + } _, err := d.conn.Exec(` - UPDATE agents - SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW() - WHERE id = ? - `, status, serviceName, servicePort, agentID) + INSERT INTO chatEvents + (sessionId, seq, eventType, content, toolName, toolArgs, + toolResult, toolSuccess, durationMs, model, usageJson, errorMsg) + SELECT ?, COALESCE(MAX(seq),0)+1, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ? + FROM chatEvents WHERE sessionId = ? + `, + e.SessionID, e.EventType, + nullStr(e.Content), nullStr(e.ToolName), rawJSON(toolArgs), + nullStr(e.ToolResult), toolSuccessVal, nullInt(e.DurationMs), + nullStr(e.Model), rawJSON(usageJSON), nullStr(e.ErrorMsg), + e.SessionID, + ) + if err != nil { + log.Printf("[DB] AppendEvent error: %v", err) + } return err } +// MarkSessionDone updates a session to done/error status. +func (d *DB) MarkSessionDone(sessionID, status, finalResponse, model, errorMessage string, totalTokens int, processingTimeMs int64) { + if d.conn == nil { + return + } + _, err := d.conn.Exec(` + UPDATE chatSessions + SET status=?, finalResponse=?, model=?, totalTokens=?, + processingTimeMs=?, errorMessage=? + WHERE sessionId=? + `, status, + truncate(finalResponse, 65535), + model, + totalTokens, + processingTimeMs, + truncate(errorMessage, 65535), + sessionID, + ) + if err != nil { + log.Printf("[DB] MarkSessionDone error: %v", err) + } +} + +// GetSession returns a single session by its string ID. 
+func (d *DB) GetSession(sessionID string) (*ChatSessionRow, error) { + if d.conn == nil { + return nil, fmt.Errorf("DB not connected") + } + row := d.conn.QueryRow(` + SELECT id, sessionId, agentId, status, + COALESCE(userMessage,''), + COALESCE(finalResponse,''), + COALESCE(model,''), + COALESCE(totalTokens,0), + COALESCE(processingTimeMs,0), + COALESCE(errorMessage,''), + createdAt, updatedAt + FROM chatSessions WHERE sessionId=? LIMIT 1 + `, sessionID) + var s ChatSessionRow + err := row.Scan(&s.ID, &s.SessionID, &s.AgentID, &s.Status, + &s.UserMessage, &s.FinalResponse, &s.Model, + &s.TotalTokens, &s.ProcessingTimeMs, &s.ErrorMessage, + &s.CreatedAt, &s.UpdatedAt) + if err != nil { + return nil, err + } + return &s, nil +} + +// GetEvents returns all events for a session with seq > afterSeq (for incremental polling). +func (d *DB) GetEvents(sessionID string, afterSeq int) ([]ChatEventRow, error) { + if d.conn == nil { + return nil, fmt.Errorf("DB not connected") + } + rows, err := d.conn.Query(` + SELECT id, sessionId, seq, eventType, + COALESCE(content,''), COALESCE(toolName,''), + COALESCE(CAST(toolArgs AS CHAR),'null'), + COALESCE(toolResult,''), + COALESCE(toolSuccess,0), + COALESCE(durationMs,0), + COALESCE(model,''), + COALESCE(CAST(usageJson AS CHAR),'null'), + COALESCE(errorMsg,''), + createdAt + FROM chatEvents + WHERE sessionId=? AND seq > ? + ORDER BY seq ASC + `, sessionID, afterSeq) + if err != nil { + return nil, err + } + defer rows.Close() + + var result []ChatEventRow + for rows.Next() { + var e ChatEventRow + var toolSuccess int + if err := rows.Scan( + &e.ID, &e.SessionID, &e.Seq, &e.EventType, + &e.Content, &e.ToolName, &e.ToolArgs, + &e.ToolResult, &toolSuccess, &e.DurationMs, + &e.Model, &e.UsageJSON, &e.ErrorMsg, &e.CreatedAt, + ); err != nil { + continue + } + e.ToolSuccess = toolSuccess == 1 + result = append(result, e) + } + return result, nil +} + +// GetRecentSessions returns the N most recent sessions. 
+func (d *DB) GetRecentSessions(limit int) ([]ChatSessionRow, error) { + if d.conn == nil { + return nil, fmt.Errorf("DB not connected") + } + rows, err := d.conn.Query(` + SELECT id, sessionId, agentId, status, + COALESCE(userMessage,''), + COALESCE(finalResponse,''), + COALESCE(model,''), + COALESCE(totalTokens,0), + COALESCE(processingTimeMs,0), + COALESCE(errorMessage,''), + createdAt, updatedAt + FROM chatSessions ORDER BY id DESC LIMIT ? + `, limit) + if err != nil { + return nil, err + } + defer rows.Close() + var result []ChatSessionRow + for rows.Next() { + var s ChatSessionRow + if err := rows.Scan(&s.ID, &s.SessionID, &s.AgentID, &s.Status, + &s.UserMessage, &s.FinalResponse, &s.Model, + &s.TotalTokens, &s.ProcessingTimeMs, &s.ErrorMessage, + &s.CreatedAt, &s.UpdatedAt); err != nil { + continue + } + result = append(result, s) + } + return result, nil +} + +// helper — nil for empty strings +func nullStr(s string) interface{} { + if s == "" { + return nil + } + return s +} + +// helper — nil for zero int +func nullInt(n int) interface{} { + if n == 0 { + return nil + } + return n +} + +// rawJSON wraps a JSON string so it's passed as-is to MySQL (not double-encoded) +type rawJSON string + +func (r rawJSON) Value() (driver.Value, error) { + if r == "null" || r == "" { + return nil, nil + } + return string(r), nil +} + +// ─── Metrics & History ──────────────────────────────────────────────────────── + +// MetricInput holds data for a single orchestrator request metric. +type MetricInput struct { + AgentID int + RequestID string + UserMessage string + AgentResponse string + InputTokens int + OutputTokens int + TotalTokens int + ProcessingTimeMs int64 + Status string // "success" | "error" | "timeout" + ErrorMessage string + ToolsCalled []string + Model string +} + +// SaveMetric inserts a row into the agentMetrics table. +// Non-fatal — logs on error but does not return one. 
+func (d *DB) SaveMetric(m MetricInput) { + if d.conn == nil { + return + } + toolsJSON, _ := json.Marshal(m.ToolsCalled) + _, err := d.conn.Exec(` + INSERT INTO agentMetrics + (agentId, requestId, userMessage, agentResponse, + inputTokens, outputTokens, totalTokens, + processingTimeMs, status, errorMessage, toolsCalled, model) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `, + m.AgentID, + m.RequestID, + truncate(m.UserMessage, 65535), + truncate(m.AgentResponse, 65535), + m.InputTokens, m.OutputTokens, m.TotalTokens, + m.ProcessingTimeMs, + m.Status, + m.ErrorMessage, + string(toolsJSON), + m.Model, + ) + if err != nil { + log.Printf("[DB] SaveMetric error: %v", err) + } +} + // HistoryInput holds data for one conversation entry. type HistoryInput struct { AgentID int @@ -188,14 +573,6 @@ type HistoryInput struct { Status string // "success" | "error" | "pending" } -// HistoryRow is a single entry from agentHistory for sliding window memory. -type HistoryRow struct { - ID int `json:"id"` - UserMessage string `json:"userMessage"` - AgentResponse string `json:"agentResponse"` - ConvID string `json:"conversationId"` -} - // SaveHistory inserts a row into the agentHistory table. // Non-fatal — logs on error but does not return one. func (d *DB) SaveHistory(h HistoryInput) { @@ -223,39 +600,7 @@ func (d *DB) SaveHistory(h HistoryInput) { } } -// GetAgentHistory returns the last N conversation turns for an agent, oldest first. -func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) { - if d.conn == nil { - return nil, nil - } - rows, err := d.conn.Query(` - SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'') - FROM agentHistory - WHERE agentId = ? - ORDER BY id DESC - LIMIT ? 
- `, agentID, limit) - if err != nil { - return nil, err - } - defer rows.Close() - - var result []HistoryRow - for rows.Next() { - var h HistoryRow - if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil { - continue - } - result = append(result, h) - } - // Reverse so oldest is first (for LLM context ordering) - for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 { - result[i], result[j] = result[j], result[i] - } - return result, nil -} - -// truncate caps a string to maxLen bytes. +// truncate caps a string to maxLen bytes (not runes — fast path for DB limits). func truncate(s string, maxLen int) string { if len(s) <= maxLen { return s @@ -263,6 +608,154 @@ func truncate(s string, maxLen int) string { return s[:maxLen] } +// ─── Swarm Node Persistence ─────────────────────────────────────────────────── + +// SwarmNodeInput is the data shape that handlers pass to UpsertSwarmNodes. +// It matches the JSON shape from handler's NodeOut struct so we can reuse it. +type SwarmNodeInput struct { + ID string `json:"id"` + Hostname string `json:"hostname"` + Role string `json:"role"` + State string `json:"state"` + Availability string `json:"availability"` + IP string `json:"ip"` + CPUCores int `json:"cpuCores"` + MemTotalMB int64 `json:"memTotalMB"` + DockerVersion string `json:"dockerVersion"` + IsLeader bool `json:"isLeader"` + ManagerAddr string `json:"managerAddr"` + Labels map[string]string `json:"labels"` +} + +// UpsertSwarmNodes inserts or updates swarm node records in the swarmNodes table. +// Called asynchronously from the SwarmNodes handler — never blocks the response. +func (d *DB) UpsertSwarmNodes(nodes interface{}) { + if d.conn == nil { + return + } + // We accept interface{} to avoid circular import; use json round-trip to parse. 
+ b, err := json.Marshal(nodes) + if err != nil { + return + } + var list []SwarmNodeInput + if err := json.Unmarshal(b, &list); err != nil { + return + } + for _, n := range list { + labelsJSON, _ := json.Marshal(n.Labels) + isLeader := 0 + if n.IsLeader { + isLeader = 1 + } + isManager := 0 + if n.Role == "manager" { + isManager = 1 + } + state := n.State + if state != "ready" && state != "down" && state != "disconnected" { + state = "ready" + } + avail := n.Availability + if avail != "active" && avail != "pause" && avail != "drain" { + avail = "active" + } + _, err := d.conn.Exec(` + INSERT INTO swarmNodes + (nodeId, hostname, role, state, availability, advertiseAddr, + labels, engineVersion, cpuCores, memTotalMB, isManager, isLeader) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE + hostname=VALUES(hostname), role=VALUES(role), + state=VALUES(state), availability=VALUES(availability), + advertiseAddr=VALUES(advertiseAddr), + labels=VALUES(labels), engineVersion=VALUES(engineVersion), + cpuCores=VALUES(cpuCores), memTotalMB=VALUES(memTotalMB), + isManager=VALUES(isManager), isLeader=VALUES(isLeader), + lastSeenAt=CURRENT_TIMESTAMP + `, + n.ID, n.Hostname, n.Role, state, avail, n.IP, + string(labelsJSON), n.DockerVersion, + n.CPUCores, n.MemTotalMB, isManager, isLeader, + ) + if err != nil { + log.Printf("[DB] UpsertSwarmNodes error for node %s: %v", n.ID, err) + } + } +} + +// UpsertSwarmTokens stores the current swarm join tokens. +func (d *DB) UpsertSwarmTokens(workerToken, managerToken, managerAddr string) { + if d.conn == nil { + return + } + _, err := d.conn.Exec(` + INSERT INTO swarmTokens (managerToken, workerToken, managerAddr) + VALUES (?, ?, ?) 
+ ON DUPLICATE KEY UPDATE + managerToken=VALUES(managerToken), + workerToken=VALUES(workerToken), + managerAddr=VALUES(managerAddr) + `, managerToken, workerToken, managerAddr) + if err != nil { + log.Printf("[DB] UpsertSwarmTokens error: %v", err) + } +} + +// GetSwarmTokens retrieves the stored join tokens. +func (d *DB) GetSwarmTokens() (worker, manager, addr string, err error) { + if d.conn == nil { + err = fmt.Errorf("DB not connected") + return + } + row := d.conn.QueryRow(` + SELECT COALESCE(workerToken,''), COALESCE(managerToken,''), COALESCE(managerAddr,'') + FROM swarmTokens ORDER BY id DESC LIMIT 1 + `) + err = row.Scan(&worker, &manager, &addr) + return +} + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +func scanAgentConfig(row *sql.Row) (*AgentConfig, error) { + var cfg AgentConfig + var systemPrompt sql.NullString + var allowedToolsJSON sql.NullString + var temperature sql.NullFloat64 + var maxTokens sql.NullInt64 + var isOrch, isSystem, isActive int + + err := row.Scan( + &cfg.ID, &cfg.Name, &cfg.Model, + &systemPrompt, &allowedToolsJSON, + &temperature, &maxTokens, + &isOrch, &isSystem, &isActive, + ) + if err != nil { + return nil, err + } + + cfg.SystemPrompt = systemPrompt.String + cfg.Temperature = temperature.Float64 + if cfg.Temperature == 0 { + cfg.Temperature = 0.5 + } + cfg.MaxTokens = int(maxTokens.Int64) + if cfg.MaxTokens == 0 { + cfg.MaxTokens = 8192 + } + cfg.IsOrchestrator = isOrch == 1 + cfg.IsSystem = isSystem == 1 + cfg.IsActive = isActive == 1 + + if allowedToolsJSON.Valid && allowedToolsJSON.String != "" && allowedToolsJSON.String != "null" { + _ = json.Unmarshal([]byte(allowedToolsJSON.String), &cfg.AllowedTools) + } + + return &cfg, nil +} + // normalizeDSN converts mysql://user:pass@host:port/db to user:pass@tcp(host:port)/db func normalizeDSN(dsn string) string { if !strings.HasPrefix(dsn, "mysql://") { @@ -304,3 +797,60 @@ func normalizeDSN(dsn string) string { } return 
fmt.Sprintf("%s@tcp(%s)%s?parseTime=true&charset=utf8mb4%s", userInfo, hostPort, dbName, tlsParam) } + +// ─── Agent Container Fields ─────────────────────────────────────────────────── +// These methods support the agent-worker container architecture where each +// agent runs as an autonomous Docker Swarm service. + +// UpdateContainerStatus updates the container lifecycle state of an agent. +func (d *DB) UpdateContainerStatus(agentID int, status, serviceName string, servicePort int) error { + if d.conn == nil { + return nil + } + _, err := d.conn.Exec(` + UPDATE agents + SET containerStatus = ?, serviceName = ?, servicePort = ?, updatedAt = NOW() + WHERE id = ? + `, status, serviceName, servicePort, agentID) + return err +} + +// HistoryRow is a single entry from agentHistory for sliding window memory. +type HistoryRow struct { + ID int `json:"id"` + UserMessage string `json:"userMessage"` + AgentResponse string `json:"agentResponse"` + ConvID string `json:"conversationId"` +} + +// GetAgentHistory returns the last N conversation turns for an agent, oldest first. +func (d *DB) GetAgentHistory(agentID, limit int) ([]HistoryRow, error) { + if d.conn == nil { + return nil, nil + } + rows, err := d.conn.Query(` + SELECT id, userMessage, COALESCE(agentResponse,''), COALESCE(conversationId,'') + FROM agentHistory + WHERE agentId = ? + ORDER BY id DESC + LIMIT ? 
+ `, agentID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + var result []HistoryRow + for rows.Next() { + var h HistoryRow + if err := rows.Scan(&h.ID, &h.UserMessage, &h.AgentResponse, &h.ConvID); err != nil { + continue + } + result = append(result, h) + } + // Reverse so oldest is first (for LLM context ordering) + for i, j := 0, len(result)-1; i < j; i, j = i+1, j-1 { + result[i], result[j] = result[j], result[i] + } + return result, nil +} diff --git a/gateway/internal/docker/client.go b/gateway/internal/docker/client.go index 3c8037a..797ea34 100644 --- a/gateway/internal/docker/client.go +++ b/gateway/internal/docker/client.go @@ -1,22 +1,25 @@ package docker import ( + "bytes" "context" "encoding/json" "fmt" "io" "net" "net/http" + "os/exec" + "strings" "time" ) -// DockerClient communicates with the Docker daemon via Unix socket or TCP. +// DockerClient communicates with the Docker daemon via Unix socket. type DockerClient struct { httpClient *http.Client baseURL string } -// NewDockerClient creates a client that talks to /var/run/docker.sock. +// NewDockerClient creates a client talking to /var/run/docker.sock. 
func NewDockerClient() *DockerClient { transport := &http.Transport{ DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { @@ -24,11 +27,13 @@ func NewDockerClient() *DockerClient { }, } return &DockerClient{ - httpClient: &http.Client{Transport: transport, Timeout: 10 * time.Second}, - baseURL: "http://localhost", // host is ignored for unix socket + httpClient: &http.Client{Transport: transport, Timeout: 30 * time.Second}, + baseURL: "http://localhost", } } +// ─── HTTP helpers ───────────────────────────────────────────────────────────── + func (c *DockerClient) get(path string, out interface{}) error { resp, err := c.httpClient.Get(c.baseURL + path) if err != nil { @@ -42,16 +47,64 @@ func (c *DockerClient) get(path string, out interface{}) error { return json.Unmarshal(body, out) } -// ---- Types ---------------------------------------------------------------- +func (c *DockerClient) post(path string, payload interface{}, out interface{}) error { + b, err := json.Marshal(payload) + if err != nil { + return err + } + resp, err := c.httpClient.Post(c.baseURL+path, "application/json", bytes.NewReader(b)) + if err != nil { + return fmt.Errorf("docker POST %s: %w", path, err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode >= 400 { + return fmt.Errorf("docker POST %s: status %d: %s", path, resp.StatusCode, string(body)) + } + if out != nil && len(body) > 0 { + return json.Unmarshal(body, out) + } + return nil +} + +func (c *DockerClient) postUpdate(path string, version int, payload interface{}) error { + b, err := json.Marshal(payload) + if err != nil { + return err + } + url := fmt.Sprintf("%s%s?version=%d", c.baseURL, path, version) + req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(b)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + resp, err := c.httpClient.Do(req) + if err != nil { + return fmt.Errorf("docker POST(update) %s: %w", path, err) + } + 
defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode >= 400 { + return fmt.Errorf("docker POST(update) %s: status %d: %s", path, resp.StatusCode, string(body)) + } + return nil +} + +// ─── Swarm Node Types ───────────────────────────────────────────────────────── type SwarmNode struct { - ID string `json:"ID"` + ID string `json:"ID"` Description NodeDescription `json:"Description"` - Status NodeStatus `json:"Status"` + Status NodeStatus `json:"Status"` ManagerStatus *ManagerStatus `json:"ManagerStatus,omitempty"` - Spec NodeSpec `json:"Spec"` - UpdatedAt time.Time `json:"UpdatedAt"` - CreatedAt time.Time `json:"CreatedAt"` + Spec NodeSpec `json:"Spec"` + UpdatedAt time.Time `json:"UpdatedAt"` + CreatedAt time.Time `json:"CreatedAt"` + Version VersionInfo `json:"Version"` +} + +type VersionInfo struct { + Index int `json:"Index"` } type NodeDescription struct { @@ -82,17 +135,155 @@ type NodeStatus struct { } type ManagerStatus struct { - Addr string `json:"Addr"` - Leader bool `json:"Leader"` - Reachability string `json:"Reachability"` + Addr string `json:"Addr"` + Leader bool `json:"Leader"` + Reachability string `json:"Reachability"` } type NodeSpec struct { - Role string `json:"Role"` - Availability string `json:"Availability"` + Role string `json:"Role"` + Availability string `json:"Availability"` Labels map[string]string `json:"Labels"` } +// ─── Swarm Service Types ────────────────────────────────────────────────────── + +type SwarmService struct { + ID string `json:"ID"` + Spec ServiceSpec `json:"Spec"` + ServiceStatus *ServiceStatus `json:"ServiceStatus,omitempty"` + UpdatedAt time.Time `json:"UpdatedAt"` + CreatedAt time.Time `json:"CreatedAt"` + Version VersionInfo `json:"Version"` +} + +type ServiceSpec struct { + Name string `json:"Name"` + Mode ServiceMode `json:"Mode"` + TaskTemplate TaskTemplate `json:"TaskTemplate"` + EndpointSpec *EndpointSpec `json:"EndpointSpec,omitempty"` + Labels map[string]string `json:"Labels"` + 
Networks []NetworkAttachment `json:"Networks,omitempty"` +} + +type NetworkAttachment struct { + Target string `json:"Target"` + Aliases []string `json:"Aliases,omitempty"` +} + +type ServiceMode struct { + Replicated *ReplicatedService `json:"Replicated,omitempty"` + Global *struct{} `json:"Global,omitempty"` +} + +type ReplicatedService struct { + Replicas int `json:"Replicas"` +} + +type TaskTemplate struct { + ContainerSpec ContainerSpec `json:"ContainerSpec"` + Resources *TaskResources `json:"Resources,omitempty"` + Placement *Placement `json:"Placement,omitempty"` +} + +type ContainerSpec struct { + Image string `json:"Image"` + Env []string `json:"Env,omitempty"` + Labels map[string]string `json:"Labels,omitempty"` +} + +type TaskResources struct { + Limits *ResourceSpec `json:"Limits,omitempty"` + Reservations *ResourceSpec `json:"Reservations,omitempty"` +} + +type ResourceSpec struct { + NanoCPUs int64 `json:"NanoCPUs,omitempty"` + MemoryBytes int64 `json:"MemoryBytes,omitempty"` +} + +type Placement struct { + Constraints []string `json:"Constraints,omitempty"` +} + +type EndpointSpec struct { + Ports []PortConfig `json:"Ports,omitempty"` +} + +type PortConfig struct { + Protocol string `json:"Protocol"` + TargetPort int `json:"TargetPort"` + PublishedPort int `json:"PublishedPort"` + PublishMode string `json:"PublishMode"` +} + +type ServiceStatus struct { + RunningTasks int `json:"RunningTasks"` + DesiredTasks int `json:"DesiredTasks"` + CompletedTasks int `json:"CompletedTasks"` +} + +// ─── Swarm Task Types ───────────────────────────────────────────────────────── + +type SwarmTask struct { + ID string `json:"ID"` + ServiceID string `json:"ServiceID"` + NodeID string `json:"NodeID"` + Spec TaskSpec `json:"Spec"` + Status TaskStatus `json:"Status"` + Slot int `json:"Slot"` + UpdatedAt time.Time `json:"UpdatedAt"` + CreatedAt time.Time `json:"CreatedAt"` +} + +type TaskSpec struct { + ContainerSpec ContainerSpec `json:"ContainerSpec"` +} + +type 
TaskStatus struct { + Timestamp time.Time `json:"Timestamp"` + State string `json:"State"` + Message string `json:"Message"` + ContainerStatus *ContainerTaskStatus `json:"ContainerStatus,omitempty"` +} + +type ContainerTaskStatus struct { + ContainerID string `json:"ContainerID"` + PID int `json:"PID"` +} + +// ─── Swarm Info / Tokens ────────────────────────────────────────────────────── + +type DockerInfo struct { + Swarm SwarmInfo `json:"Swarm"` +} + +type SwarmInfo struct { + NodeID string `json:"NodeID"` + LocalNodeState string `json:"LocalNodeState"` + ControlAvailable bool `json:"ControlAvailable"` + Managers int `json:"Managers"` + Nodes int `json:"Nodes"` + RemoteManagers []RemoteManager `json:"RemoteManagers"` +} + +type RemoteManager struct { + NodeID string `json:"NodeID"` + Addr string `json:"Addr"` +} + +type SwarmSpec struct { + JoinTokens JoinTokens `json:"JoinTokens"` + ID string `json:"ID"` +} + +type JoinTokens struct { + Worker string `json:"Worker"` + Manager string `json:"Manager"` +} + +// ─── Container types ────────────────────────────────────────────────────────── + type Container struct { ID string `json:"Id"` Names []string `json:"Names"` @@ -109,9 +300,9 @@ type ContainerStats struct { } type CPUStats struct { - CPUUsage CPUUsage `json:"cpu_usage"` - SystemCPUUsage int64 `json:"system_cpu_usage"` - OnlineCPUs int `json:"online_cpus"` + CPUUsage CPUUsage `json:"cpu_usage"` + SystemCPUUsage int64 `json:"system_cpu_usage"` + OnlineCPUs int `json:"online_cpus"` } type CPUUsage struct { @@ -120,27 +311,14 @@ type CPUUsage struct { } type MemoryStats struct { - Usage int64 `json:"usage"` - MaxUsage int64 `json:"max_usage"` - Limit int64 `json:"limit"` + Usage int64 `json:"usage"` + MaxUsage int64 `json:"max_usage"` + Limit int64 `json:"limit"` Stats map[string]int64 `json:"stats"` } -type DockerInfo struct { - Swarm SwarmInfo `json:"Swarm"` -} +// ─── Methods: Swarm info ────────────────────────────────────────────────────── -type SwarmInfo 
struct { - NodeID string `json:"NodeID"` - LocalNodeState string `json:"LocalNodeState"` - ControlAvailable bool `json:"ControlAvailable"` - Managers int `json:"Managers"` - Nodes int `json:"Nodes"` -} - -// ---- Methods --------------------------------------------------------------- - -// IsSwarmActive checks if Docker Swarm is initialized. func (c *DockerClient) IsSwarmActive() bool { var info DockerInfo if err := c.get("/v1.44/info", &info); err != nil { @@ -149,7 +327,6 @@ func (c *DockerClient) IsSwarmActive() bool { return info.Swarm.LocalNodeState == "active" } -// GetSwarmInfo returns basic swarm info. func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) { var info DockerInfo if err := c.get("/v1.44/info", &info); err != nil { @@ -158,7 +335,27 @@ func (c *DockerClient) GetSwarmInfo() (*DockerInfo, error) { return &info, nil } -// ListNodes returns all Swarm nodes (requires manager node). +// GetJoinTokens returns the Swarm worker and manager join tokens. +// Requires this node to be a swarm manager. +func (c *DockerClient) GetJoinTokens() (*SwarmSpec, error) { + var spec SwarmSpec + if err := c.get("/v1.44/swarm", &spec); err != nil { + return nil, err + } + return &spec, nil +} + +// GetManagerAddr returns the advertise address (IP:2377) for joining this swarm. +func (c *DockerClient) GetManagerAddr() string { + info, err := c.GetSwarmInfo() + if err != nil || len(info.Swarm.RemoteManagers) == 0 { + return "" + } + return info.Swarm.RemoteManagers[0].Addr +} + +// ─── Methods: Nodes ─────────────────────────────────────────────────────────── + func (c *DockerClient) ListNodes() ([]SwarmNode, error) { var nodes []SwarmNode if err := c.get("/v1.44/nodes", &nodes); err != nil { @@ -167,7 +364,323 @@ func (c *DockerClient) ListNodes() ([]SwarmNode, error) { return nodes, nil } -// ListContainers returns all running containers on this host. +// UpdateNodeAvailability sets a node's availability (active|pause|drain). 
+func (c *DockerClient) UpdateNodeAvailability(nodeID, availability string) error { + // First get current node spec + version + var node SwarmNode + if err := c.get("/v1.44/nodes/"+nodeID, &node); err != nil { + return err + } + node.Spec.Availability = availability + return c.postUpdate("/v1.44/nodes/"+nodeID+"/update", node.Version.Index, node.Spec) +} + +// AddNodeLabel adds a label to a swarm node. +func (c *DockerClient) AddNodeLabel(nodeID, key, value string) error { + var node SwarmNode + if err := c.get("/v1.44/nodes/"+nodeID, &node); err != nil { + return err + } + if node.Spec.Labels == nil { + node.Spec.Labels = map[string]string{} + } + node.Spec.Labels[key] = value + return c.postUpdate("/v1.44/nodes/"+nodeID+"/update", node.Version.Index, node.Spec) +} + +// ─── Methods: Services ──────────────────────────────────────────────────────── + +// ListServices returns all swarm services, optionally filtered by label. +func (c *DockerClient) ListServices() ([]SwarmService, error) { + var services []SwarmService + // Include ServiceStatus so running/desired replicas are returned + if err := c.get("/v1.44/services?status=true", &services); err != nil { + return nil, err + } + return services, nil +} + +// GetService returns a single service by ID or name. +func (c *DockerClient) GetService(idOrName string) (*SwarmService, error) { + var svc SwarmService + if err := c.get("/v1.44/services/"+idOrName+"?status=true", &svc); err != nil { + return nil, err + } + return &svc, nil +} + +// ScaleService updates the replica count for a replicated service. 
+func (c *DockerClient) ScaleService(idOrName string, replicas int) error { + svc, err := c.GetService(idOrName) + if err != nil { + return err + } + if svc.Spec.Mode.Replicated == nil { + return fmt.Errorf("service %s is not in replicated mode", idOrName) + } + svc.Spec.Mode.Replicated.Replicas = replicas + return c.postUpdate( + "/v1.44/services/"+svc.ID+"/update", + svc.Version.Index, + svc.Spec, + ) +} + +// ListServiceTasks returns all tasks for a given service. +func (c *DockerClient) ListServiceTasks(serviceID string) ([]SwarmTask, error) { + var tasks []SwarmTask + filter := fmt.Sprintf(`{"service":["%s"]}`, serviceID) + path := "/v1.44/tasks?filters=" + urlEncode(filter) + if err := c.get(path, &tasks); err != nil { + return nil, err + } + return tasks, nil +} + +// ListAllTasks returns all swarm tasks (across services). +func (c *DockerClient) ListAllTasks() ([]SwarmTask, error) { + var tasks []SwarmTask + if err := c.get("/v1.44/tasks", &tasks); err != nil { + return nil, err + } + return tasks, nil +} + +// CreateAgentService deploys a new swarm service for an AI agent. +// image: container image, name: service name, replicas: initial count, +// env: environment variables, port: optional published port (0 = none). +// CreateAgentServiceOpts holds options for deploying an agent Swarm service. +type CreateAgentServiceOpts struct { + Name string + Image string + Replicas int + Env []string + Port int + Networks []string // overlay network names/IDs to attach + Labels map[string]string + Constraints []string // placement constraints, e.g. 
["node.role==manager"] +} + +func (c *DockerClient) CreateAgentService(name, image string, replicas int, env []string, port int) (*SwarmService, error) { + return c.CreateAgentServiceFull(CreateAgentServiceOpts{ + Name: name, + Image: image, + Replicas: replicas, + Env: env, + Port: port, + }) +} + +func (c *DockerClient) CreateAgentServiceFull(opts CreateAgentServiceOpts) (*SwarmService, error) { + labels := map[string]string{ + "goclaw.agent": "true", + "goclaw.name": opts.Name, + } + for k, v := range opts.Labels { + labels[k] = v + } + var placement *Placement + if len(opts.Constraints) > 0 { + placement = &Placement{Constraints: opts.Constraints} + } + spec := ServiceSpec{ + Name: opts.Name, + Mode: ServiceMode{ + Replicated: &ReplicatedService{Replicas: opts.Replicas}, + }, + TaskTemplate: TaskTemplate{ + ContainerSpec: ContainerSpec{ + Image: opts.Image, + Env: opts.Env, + }, + Placement: placement, + }, + Labels: labels, + } + if opts.Port > 0 { + spec.EndpointSpec = &EndpointSpec{ + Ports: []PortConfig{ + { + Protocol: "tcp", + TargetPort: opts.Port, + PublishedPort: opts.Port, + PublishMode: "host", + }, + }, + } + } + if len(opts.Networks) > 0 { + for _, net := range opts.Networks { + spec.Networks = append(spec.Networks, NetworkAttachment{ + Target: net, + Aliases: []string{opts.Name}, + }) + } + } + var created struct { + ID string `json:"ID"` + } + if err := c.post("/v1.44/services/create", spec, &created); err != nil { + return nil, err + } + return c.GetService(created.ID) +} + +// RemoveService removes a swarm service by ID or name. 
+func (c *DockerClient) RemoveService(idOrName string) error { + req, err := http.NewRequest(http.MethodDelete, c.baseURL+"/v1.44/services/"+urlEncode(idOrName), nil) + if err != nil { + return err + } + resp, err := c.httpClient.Do(req) + if err != nil { + return fmt.Errorf("docker DELETE service %s: %w", idOrName, err) + } + defer resp.Body.Close() + if resp.StatusCode >= 400 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("docker DELETE service %s: status %d: %s", idOrName, resp.StatusCode, string(body)) + } + return nil +} + +// GetServiceLastActivity returns the most recent task update time for a service. +// Used to determine whether a service is idle. +func (c *DockerClient) GetServiceLastActivity(serviceID string) (time.Time, error) { + tasks, err := c.ListServiceTasks(serviceID) + if err != nil { + return time.Time{}, err + } + var latest time.Time + for _, t := range tasks { + if t.UpdatedAt.After(latest) { + latest = t.UpdatedAt + } + } + return latest, nil +} + +// ─── Methods: Containers ───────────────────────────────────────────────────── + +// RunContainerOpts holds options for running a standalone container. +type RunContainerOpts struct { + Name string + Image string + Env []string + Networks []string // bridge/overlay networks to attach + Port int // host port (also used as container port) + Labels map[string]string +} + +// RunContainer creates and starts a standalone Docker container (docker run equivalent). +// Returns the container ID on success. 
+func (c *DockerClient) RunContainer(opts RunContainerOpts) (string, error) { + labels := map[string]string{"goclaw.agent": "true"} + for k, v := range opts.Labels { + labels[k] = v + } + + // Build port bindings: host port -> container port + exposedPorts := map[string]struct{}{} + portBindings := map[string][]map[string]string{} + if opts.Port > 0 { + key := fmt.Sprintf("%d/tcp", opts.Port) + exposedPorts[key] = struct{}{} + portBindings[key] = []map[string]string{{"HostPort": fmt.Sprintf("%d", opts.Port)}} + } + + // Pick first network for creation; additional networks attached after + firstNetwork := "" + if len(opts.Networks) > 0 { + firstNetwork = opts.Networks[0] + } + + body := map[string]any{ + "Image": opts.Image, + "Env": opts.Env, + "Labels": labels, + "ExposedPorts": exposedPorts, + "HostConfig": map[string]any{ + "PortBindings": portBindings, + "RestartPolicy": map[string]any{"Name": "unless-stopped"}, + "NetworkMode": firstNetwork, + }, + "NetworkingConfig": map[string]any{ + "EndpointsConfig": map[string]any{ + firstNetwork: map[string]any{ + "Aliases": []string{opts.Name}, + }, + }, + }, + } + + var created struct { + ID string `json:"Id"` + } + if err := c.post(fmt.Sprintf("/v1.44/containers/create?name=%s", urlEncode(opts.Name)), body, &created); err != nil { + return "", fmt.Errorf("create container %s: %w", opts.Name, err) + } + + // Start the container + startURL := fmt.Sprintf("/v1.44/containers/%s/start", created.ID) + req, err := http.NewRequest(http.MethodPost, c.baseURL+startURL, nil) + if err != nil { + return created.ID, err + } + resp, err := c.httpClient.Do(req) + if err != nil { + return created.ID, fmt.Errorf("start container: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + b, _ := io.ReadAll(resp.Body) + return created.ID, fmt.Errorf("start container HTTP %d: %s", resp.StatusCode, string(b)) + } + + // Attach additional networks + for i, net := range opts.Networks { + if i == 0 { + continue // already 
attached via NetworkMode + } + netBody := map[string]any{ + "Container": created.ID, + "EndpointConfig": map[string]any{ + "Aliases": []string{opts.Name}, + }, + } + _ = c.post(fmt.Sprintf("/v1.44/networks/%s/connect", urlEncode(net)), netBody, nil) + } + + return created.ID, nil +} + +// StopContainer stops and removes a standalone container by name or ID. +func (c *DockerClient) StopContainer(nameOrID string) error { + // Stop + stopURL := fmt.Sprintf("/v1.44/containers/%s/stop", urlEncode(nameOrID)) + req, _ := http.NewRequest(http.MethodPost, c.baseURL+stopURL, nil) + resp, err := c.httpClient.Do(req) + if err == nil { + resp.Body.Close() + } + // Remove (force) + rmURL := fmt.Sprintf("/v1.44/containers/%s?force=true", urlEncode(nameOrID)) + req, err = http.NewRequest(http.MethodDelete, c.baseURL+rmURL, nil) + if err != nil { + return err + } + resp, err = c.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode >= 400 { + b, _ := io.ReadAll(resp.Body) + return fmt.Errorf("remove container HTTP %d: %s", resp.StatusCode, string(b)) + } + return nil +} + func (c *DockerClient) ListContainers() ([]Container, error) { var containers []Container if err := c.get("/v1.44/containers/json?all=false", &containers); err != nil { @@ -176,7 +689,6 @@ func (c *DockerClient) ListContainers() ([]Container, error) { return containers, nil } -// GetContainerStats returns one-shot stats for a container (no streaming). func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, error) { var stats ContainerStats if err := c.get(fmt.Sprintf("/v1.44/containers/%s/stats?stream=false", containerID), &stats); err != nil { @@ -185,7 +697,69 @@ func (c *DockerClient) GetContainerStats(containerID string) (*ContainerStats, e return &stats, nil } -// CalcCPUPercent computes CPU usage % from two consecutive stats snapshots. 
// ─── Host Shell execution ─────────────────────────────────────────────────────
// The gateway runs inside a container but has /var/run/docker.sock mounted.
// To run a command on the host itself we enter the namespaces of the host's
// PID 1 with nsenter:
//
//	nsenter -t 1 -m -u -i -n -p -- sh -c <command>
//
// This only reaches the host when the container shares the host PID namespace
// (pid: host) and is privileged enough for nsenter to succeed (privileged: true
// in docker-compose.yml).
//
// Alternative (safer): exec into the host via SSH or a privileged sidecar.

// ExecOnHost runs a shell command on the host via nsenter into PID 1 and
// returns stdout followed by stderr. The whole call is limited to 30 seconds.
//
// When nsenter itself is unusable (binary missing, or it reports its own
// failure on stderr) the command is run inside the gateway container instead.
// A command that was started on the host and exited non-zero is NOT re-run in
// the container: its output is returned together with the error, so side
// effects happen at most once. On error the returned string is the captured
// output, or the error text when nothing was captured.
//
// SECURITY: command is passed to `sh -c` verbatim. Never pass untrusted input.
func ExecOnHost(command string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Try nsenter (requires pid:host + SYS_ADMIN or privileged)
	stdout, stderr, err := runCaptured(ctx, "nsenter", "-t", "1", "-m", "-u", "-i", "-n", "-p", "--",
		"sh", "-c", command)
	if err == nil {
		return stdout + stderr, nil
	}

	// Timed out: the fallback would start on an already expired context, so
	// report the timeout and keep whatever the host run printed.
	if ctxErr := ctx.Err(); ctxErr != nil {
		return outputOrError(stdout+stderr, ctxErr), ctxErr
	}

	// The command ran on the host and failed on its own; do not execute it twice.
	if !nsenterUnavailable(err, stderr) {
		return outputOrError(stdout+stderr, err), err
	}

	// nsenter could not be used at all: fall back to container scope.
	stdout, stderr, err = runCaptured(ctx, "sh", "-c", command)
	if err != nil {
		return outputOrError(stdout+stderr, err), err
	}
	return stdout + stderr, nil
}

// runCaptured runs name with args under ctx and returns its stdout and stderr
// as separate strings together with the error from Run.
func runCaptured(ctx context.Context, name string, args ...string) (stdout, stderr string, err error) {
	var outBuf, errBuf bytes.Buffer
	cmd := exec.CommandContext(ctx, name, args...)
	cmd.Stdout = &outBuf
	cmd.Stderr = &errBuf
	err = cmd.Run()
	return outBuf.String(), errBuf.String(), err
}

// nsenterUnavailable reports whether a failed nsenter run never reached the
// user's command. That is the case when the process did not produce an exit
// status at all (e.g. binary not found), or when nsenter printed its own
// diagnostic. The "nsenter:" stderr prefix is a heuristic based on how the
// tool labels its error messages.
func nsenterUnavailable(runErr error, stderr string) bool {
	if _, exited := runErr.(*exec.ExitError); !exited {
		return true
	}
	return strings.HasPrefix(strings.TrimSpace(stderr), "nsenter:")
}

// outputOrError returns output, or the error text when no output was captured.
func outputOrError(output string, err error) string {
	if output == "" {
		return err.Error()
	}
	return output
}

// ExecDockerCLI runs `docker ` on the host by calling the docker socket.
+// Since we have the socket mounted, we can exec docker commands directly +// using the docker CLI binary if available. +func ExecDockerCLI(args ...string) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "docker", args...) + var out, stderr bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return out.String() + stderr.String(), err + } + return out.String(), nil +} + +// CalcCPUPercent computes CPU% from stats snapshot. func CalcCPUPercent(stats *ContainerStats) float64 { cpuDelta := float64(stats.CPUStats.CPUUsage.TotalUsage) - float64(stats.PreCPUStats.CPUUsage.TotalUsage) systemDelta := float64(stats.CPUStats.SystemCPUUsage) - float64(stats.PreCPUStats.SystemCPUUsage) @@ -198,3 +772,19 @@ func CalcCPUPercent(stats *ContainerStats) float64 { } return 0 } + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +func urlEncode(s string) string { + var b strings.Builder + for _, r := range s { + switch { + case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z', r >= '0' && r <= '9', + r == '-', r == '_', r == '.', r == '~': + b.WriteRune(r) + default: + b.WriteString(fmt.Sprintf("%%%02X", r)) + } + } + return b.String() +} diff --git a/gateway/internal/llm/client.go b/gateway/internal/llm/client.go index 1f03ac6..a21b279 100644 --- a/gateway/internal/llm/client.go +++ b/gateway/internal/llm/client.go @@ -2,6 +2,7 @@ package llm import ( + "bufio" "bytes" "context" "encoding/json" @@ -105,6 +106,13 @@ func NewClient(baseURL, apiKey string) *Client { } } +// UpdateCredentials updates the LLM client's base URL and API key at runtime. +// Called when the active provider is changed via the Settings UI. 
+func (c *Client) UpdateCredentials(baseURL, apiKey string) { + c.baseURL = strings.TrimRight(baseURL, "/") + c.apiKey = apiKey +} + func (c *Client) headers() map[string]string { h := map[string]string{ "Content-Type": "application/json", @@ -159,7 +167,86 @@ func (c *Client) ListModels(ctx context.Context) (*ModelsResponse, error) { return &result, nil } -// Chat sends a chat completion request (non-streaming). +// ChatStream sends a streaming chat completion request (SSE). +// It calls the callback for each chunk received. +func (c *Client) ChatStream(ctx context.Context, req ChatRequest, onChunk func(delta string, done bool)) error { + req.Stream = true + + body, err := json.Marshal(req) + if err != nil { + return err + } + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, + c.baseURL+"/chat/completions", bytes.NewReader(body)) + if err != nil { + return err + } + for k, v := range c.headers() { + httpReq.Header.Set(k, v) + } + httpReq.Header.Set("Accept", "text/event-stream") + + // Use a client without timeout for streaming + streamClient := &http.Client{Timeout: 0} + resp, err := streamClient.Do(httpReq) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("ollama stream API error (%d): %s", resp.StatusCode, string(respBody)) + } + + // Parse SSE stream + scanner := bufio.NewScanner(resp.Body) + for scanner.Scan() { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + line := scanner.Text() + if !strings.HasPrefix(line, "data: ") { + continue + } + data := strings.TrimPrefix(line, "data: ") + if data == "[DONE]" { + onChunk("", true) + return nil + } + + var chunk struct { + Choices []struct { + Delta struct { + Content string `json:"content"` + } `json:"delta"` + FinishReason *string `json:"finish_reason"` + } `json:"choices"` + } + if err := json.Unmarshal([]byte(data), &chunk); err != nil { + continue + } + if 
len(chunk.Choices) > 0 { + delta := chunk.Choices[0].Delta.Content + if delta != "" { + onChunk(delta, false) + } + if chunk.Choices[0].FinishReason != nil && *chunk.Choices[0].FinishReason == "stop" { + onChunk("", true) + return nil + } + } + } + if err := scanner.Err(); err != nil { + return err + } + onChunk("", true) + return nil +} func (c *Client) Chat(ctx context.Context, req ChatRequest) (*ChatResponse, error) { req.Stream = false diff --git a/gateway/internal/orchestrator/orchestrator.go b/gateway/internal/orchestrator/orchestrator.go index 045dd98..dedd19c 100644 --- a/gateway/internal/orchestrator/orchestrator.go +++ b/gateway/internal/orchestrator/orchestrator.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "log" + "strings" "time" "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db" @@ -31,13 +32,15 @@ type ToolCallStep struct { DurationMs int64 `json:"durationMs"` } +// ChatResult is the response from the orchestrator chat. type ChatResult struct { - Success bool `json:"success"` - Response string `json:"response"` - ToolCalls []ToolCallStep `json:"toolCalls"` - Model string `json:"model"` - Usage *llm.Usage `json:"usage,omitempty"` - Error string `json:"error,omitempty"` + Success bool `json:"success"` + Response string `json:"response"` + ToolCalls []ToolCallStep `json:"toolCalls"` + Model string `json:"model"` + ModelWarning string `json:"modelWarning,omitempty"` + Usage *llm.Usage `json:"usage,omitempty"` + Error string `json:"error,omitempty"` } // OrchestratorConfig is the runtime config loaded from DB or defaults. @@ -51,6 +54,30 @@ type OrchestratorConfig struct { MaxTokens int } +// RetryPolicy controls how the orchestrator retries failed or empty LLM calls. +type RetryPolicy struct { + // MaxLLMRetries is the number of additional attempts after a failure. + // Total attempts = MaxLLMRetries + 1. Default: 3 (4 total). + MaxLLMRetries int + // InitialDelay before the first retry. Default: 2s. 
+ InitialDelay time.Duration + // MaxDelay caps the exponential back-off. Default: 30s. + MaxDelay time.Duration + // RetryOnEmpty means an empty-content response is treated as a soft failure + // and triggers a retry. Default: true. + RetryOnEmpty bool +} + +// defaultRetryPolicy returns the default retry policy. +func defaultRetryPolicy() RetryPolicy { + return RetryPolicy{ + MaxLLMRetries: 3, + InitialDelay: 2 * time.Second, + MaxDelay: 30 * time.Second, + RetryOnEmpty: true, + } +} + // ─── Default System Prompt ──────────────────────────────────────────────────── const defaultSystemPrompt = `You are GoClaw Orchestrator — the main AI agent managing the GoClaw distributed AI system. @@ -86,6 +113,7 @@ type Orchestrator struct { executor *tools.Executor database *db.DB projectRoot string + retry RetryPolicy } func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrator { @@ -93,6 +121,7 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat llmClient: llmClient, database: database, projectRoot: projectRoot, + retry: defaultRetryPolicy(), } // Inject agent list function to avoid circular dependency o.executor = tools.NewExecutor(projectRoot, o.listAgentsFn) @@ -101,6 +130,11 @@ func New(llmClient *llm.Client, database *db.DB, projectRoot string) *Orchestrat return o } +// SetRetryPolicy overrides the default retry policy. +func (o *Orchestrator) SetRetryPolicy(p RetryPolicy) { + o.retry = p +} + // GetConfig loads orchestrator config from DB, falls back to defaults. func (o *Orchestrator) GetConfig() *OrchestratorConfig { if o.database != nil { @@ -131,25 +165,188 @@ func (o *Orchestrator) GetConfig() *OrchestratorConfig { } } -// Chat runs the full orchestration loop: LLM → tool calls → LLM → response. 
-func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult { - if maxIter <= 0 { - maxIter = 10 +// resolveModel checks if the desired model is available via the LLM API. +// If not, it tries to fall back to the first available model. +// Returns the resolved model name and a warning if fallback was used. +func (o *Orchestrator) resolveModel(ctx context.Context, desired string) (model string, warning string) { + ctxShort, cancel := context.WithTimeout(ctx, 8*time.Second) + defer cancel() + + models, err := o.llmClient.ListModels(ctxShort) + if err != nil || models == nil || len(models.Data) == 0 { + // Cannot verify — use desired model as-is + log.Printf("[Orchestrator] Cannot fetch model list: %v — using %q as-is", err, desired) + return desired, "" + } + + // Check if desired model is available + for _, m := range models.Data { + if m.ID == desired { + return desired, "" // found — all good + } + } + + // Desired model not in list — fall back to first available + fallback := models.Data[0].ID + warning = fmt.Sprintf("model %q not available — using %q instead", desired, fallback) + log.Printf("[Orchestrator] WARNING: %s", warning) + return fallback, warning +} + +// ─── LLM call with retry ────────────────────────────────────────────────────── + +// llmCallResult holds one attempt's outcome. +type llmCallResult struct { + resp *llm.ChatResponse + usedTools bool // whether the call was made with tools enabled + err error + attemptNum int +} + +// callLLMWithRetry calls the LLM and retries on error or empty response. +// It also strips tools on the second attempt if the first fails with tools. 
+func (o *Orchestrator) callLLMWithRetry( + ctx context.Context, + req llm.ChatRequest, + model string, + onRetry func(attempt int, reason string), // optional event callback (may be nil) +) llmCallResult { + policy := o.retry + delay := policy.InitialDelay + maxAttempts := policy.MaxLLMRetries + 1 + hasTools := len(req.Tools) > 0 + + for attempt := 1; attempt <= maxAttempts; attempt++ { + // On attempt > 1, always strip tools (avoid repeated tool-format errors) + useTools := hasTools && attempt == 1 + r := req + if !useTools { + r.Tools = nil + r.ToolChoice = "" + } + + resp, err := o.llmClient.Chat(ctx, r) + + // ── Hard error (network, auth, etc.) ───────────────────────── + if err != nil { + reason := fmt.Sprintf("LLM error (attempt %d/%d): %v", attempt, maxAttempts, err) + log.Printf("[Orchestrator] %s", reason) + + if attempt < maxAttempts { + if onRetry != nil { + onRetry(attempt, reason) + } + o.sleep(ctx, delay) + delay = min(delay*2, policy.MaxDelay) + continue + } + return llmCallResult{err: fmt.Errorf("LLM error after %d attempts (model: %s): %w", maxAttempts, model, err), attemptNum: attempt} + } + + // ── Context cancelled ───────────────────────────────────────── + if ctx.Err() != nil { + return llmCallResult{err: ctx.Err(), attemptNum: attempt} + } + + // ── Empty choices ───────────────────────────────────────────── + if len(resp.Choices) == 0 { + reason := fmt.Sprintf("empty choices (attempt %d/%d)", attempt, maxAttempts) + log.Printf("[Orchestrator] %s", reason) + + if attempt < maxAttempts { + if onRetry != nil { + onRetry(attempt, reason) + } + o.sleep(ctx, delay) + delay = min(delay*2, policy.MaxDelay) + continue + } + return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt} + } + + content := strings.TrimSpace(resp.Choices[0].Message.Content) + finishReason := resp.Choices[0].FinishReason + + // ── Empty content AND no tool calls — retry ─────────────────── + if policy.RetryOnEmpty && + content == "" && + finishReason != 
"tool_calls" && + len(resp.Choices[0].Message.ToolCalls) == 0 { + + reason := fmt.Sprintf("empty response content (attempt %d/%d, finish_reason=%q)", attempt, maxAttempts, finishReason) + log.Printf("[Orchestrator] %s", reason) + + if attempt < maxAttempts { + if onRetry != nil { + onRetry(attempt, reason) + } + o.sleep(ctx, delay) + delay = min(delay*2, policy.MaxDelay) + continue + } + // Exhausted retries — return what we have (even if empty) + log.Printf("[Orchestrator] All %d attempts exhausted — returning empty response", maxAttempts) + return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt} + } + + // ── Success ─────────────────────────────────────────────────── + if attempt > 1 { + log.Printf("[Orchestrator] Succeeded on attempt %d/%d", attempt, maxAttempts) + } + return llmCallResult{resp: resp, usedTools: useTools, attemptNum: attempt} + } + + // Should not be reached + return llmCallResult{err: fmt.Errorf("retry loop exited unexpectedly"), attemptNum: maxAttempts} +} + +// sleep waits for d, returning early if ctx is cancelled. +func (o *Orchestrator) sleep(ctx context.Context, d time.Duration) { + select { + case <-ctx.Done(): + case <-time.After(d): + } +} + +// min returns the smaller of two durations. 
+func min(a, b time.Duration) time.Duration { + if a < b { + return a + } + return b +} + +// ─── Core loop (shared by Chat and ChatWithEvents) ──────────────────────────── + +type loopOptions struct { + messages []Message + overrideModel string + maxIter int + onToolCall func(ToolCallStep) // may be nil + onRetry func(attempt int, reason string) // may be nil +} + +func (o *Orchestrator) runLoop(ctx context.Context, opts loopOptions) ChatResult { + if opts.maxIter <= 0 { + opts.maxIter = 10 } cfg := o.GetConfig() model := cfg.Model - if overrideModel != "" { - model = overrideModel + if opts.overrideModel != "" { + model = opts.overrideModel } - log.Printf("[Orchestrator] Chat started: model=%s, messages=%d", model, len(messages)) + // Validate model against LLM API — fall back if unavailable (prevents 401/404) + model, modelWarning := o.resolveModel(ctx, model) + log.Printf("[Orchestrator] Loop started: model=%s, messages=%d, maxIter=%d, maxRetries=%d", + model, len(opts.messages), opts.maxIter, o.retry.MaxLLMRetries) // Build conversation conv := []llm.Message{ {Role: "system", Content: cfg.SystemPrompt}, } - for _, m := range messages { + for _, m := range opts.messages { conv = append(conv, llm.Message{Role: m.Role, Content: m.Content}) } @@ -175,7 +372,7 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod var lastUsage *llm.Usage var lastModel string - for iter := 0; iter < maxIter; iter++ { + for iter := 0; iter < opts.maxIter; iter++ { req := llm.ChatRequest{ Model: model, Messages: conv, @@ -185,28 +382,22 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod ToolChoice: "auto", } - resp, err := o.llmClient.Chat(ctx, req) - if err != nil { - // Fallback: try without tools - log.Printf("[Orchestrator] LLM error with tools: %v — retrying without tools", err) - req.Tools = nil - req.ToolChoice = "" - resp2, err2 := o.llmClient.Chat(ctx, req) - if err2 != nil { - return ChatResult{ - Success: false, 
- Error: fmt.Sprintf("LLM error (model: %s): %v", model, err2), - } + // ── LLM call with retry ──────────────────────────────────── + callRes := o.callLLMWithRetry(ctx, req, model, opts.onRetry) + + if callRes.err != nil { + return ChatResult{ + Success: false, + ToolCalls: toolCallSteps, + Model: model, + ModelWarning: modelWarning, + Error: callRes.err.Error(), } - if len(resp2.Choices) > 0 { - finalResponse = resp2.Choices[0].Message.Content - lastUsage = resp2.Usage - lastModel = resp2.Model - } - break } + resp := callRes.resp if len(resp.Choices) == 0 { + log.Printf("[Orchestrator] No choices in response — stopping loop at iter %d", iter) break } @@ -217,19 +408,17 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod lastModel = model } - // Check if LLM wants to call tools + // ── Tool calls ───────────────────────────────────────────── if choice.FinishReason == "tool_calls" && len(choice.Message.ToolCalls) > 0 { // Add assistant message with tool calls to conversation conv = append(conv, choice.Message) - // Execute each tool call for _, tc := range choice.Message.ToolCalls { toolName := tc.Function.Name argsJSON := tc.Function.Arguments log.Printf("[Orchestrator] Executing tool: %s args=%s", toolName, argsJSON) start := time.Now() - result := o.executor.Execute(ctx, toolName, argsJSON) step := ToolCallStep{ @@ -238,7 +427,6 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod DurationMs: time.Since(start).Milliseconds(), } - // Parse args for display var argsMap any _ = json.Unmarshal([]byte(argsJSON), &argsMap) step.Args = argsMap @@ -255,7 +443,10 @@ func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideMod toolCallSteps = append(toolCallSteps, step) - // Add tool result to conversation + if opts.onToolCall != nil { + opts.onToolCall(step) + } + conv = append(conv, llm.Message{ Role: "tool", Content: toolResultContent, @@ -267,20 +458,70 @@ func (o *Orchestrator) 
Chat(ctx context.Context, messages []Message, overrideMod continue } - // LLM finished — extract final response + // ── Final response ───────────────────────────────────────── finalResponse = choice.Message.Content break } return ChatResult{ - Success: true, - Response: finalResponse, - ToolCalls: toolCallSteps, - Model: lastModel, - Usage: lastUsage, + Success: true, + Response: finalResponse, + ToolCalls: toolCallSteps, + Model: lastModel, + ModelWarning: modelWarning, + Usage: lastUsage, } } +// ─── Public API ─────────────────────────────────────────────────────────────── + +// Chat runs the full orchestration loop: LLM → tool calls → LLM → response. +func (o *Orchestrator) Chat(ctx context.Context, messages []Message, overrideModel string, maxIter int) ChatResult { + return o.runLoop(ctx, loopOptions{ + messages: messages, + overrideModel: overrideModel, + maxIter: maxIter, + }) +} + +// ChatWithEvents runs the full orchestration loop and calls callbacks for each +// tool execution and each retry attempt. Used for SSE streaming and DB event logging. +func (o *Orchestrator) ChatWithEvents( + ctx context.Context, + messages []Message, + overrideModel string, + maxIter int, + onToolCall func(ToolCallStep), +) ChatResult { + return o.runLoop(ctx, loopOptions{ + messages: messages, + overrideModel: overrideModel, + maxIter: maxIter, + onToolCall: onToolCall, + }) +} + +// ChatWithEventsAndRetry is the full-featured variant that also reports retry +// attempts through onRetry so they can be streamed to the client. 
+func (o *Orchestrator) ChatWithEventsAndRetry( + ctx context.Context, + messages []Message, + overrideModel string, + maxIter int, + onToolCall func(ToolCallStep), + onRetry func(attempt int, reason string), +) ChatResult { + return o.runLoop(ctx, loopOptions{ + messages: messages, + overrideModel: overrideModel, + maxIter: maxIter, + onToolCall: onToolCall, + onRetry: onRetry, + }) +} + +// ─── Helpers ────────────────────────────────────────────────────────────────── + // listAgentsFn is injected into the tool executor to list agents from DB. func (o *Orchestrator) listAgentsFn() ([]map[string]any, error) { if o.database == nil { diff --git a/gateway/internal/tools/executor.go b/gateway/internal/tools/executor.go index 0e8ab33..2191458 100644 --- a/gateway/internal/tools/executor.go +++ b/gateway/internal/tools/executor.go @@ -13,6 +13,7 @@ import ( "os/exec" "path/filepath" "strings" + "sync" "time" "git.softuniq.eu/UniqAI/GoClaw/gateway/internal/db" @@ -155,15 +156,44 @@ func OrchestratorTools() []ToolDef { { Type: "function", Function: FuncDef{ - Name: "delegate_to_agent", - Description: "Delegate a task to a specialized agent (Browser Agent, Tool Builder, Agent Compiler).", + Name: "delegate_to_agent", + Description: "Delegate a task to a specific agent container via A2A protocol. " + + "The agent processes the task with its own LLM and tools. 
" + + "Use async=true for fire-and-forget with callback_url, or sync (default) to wait for result.", Parameters: map[string]any{ "type": "object", "properties": map[string]any{ - "agentId": map[string]any{"type": "number", "description": "Agent ID to delegate to"}, - "message": map[string]any{"type": "string", "description": "Task description for the agent"}, + "agentId": map[string]any{"type": "number", "description": "Target agent ID"}, + "task": map[string]any{"type": "string", "description": "Task description / prompt for the agent"}, + "async": map[string]any{"type": "boolean", "description": "If true, returns task_id immediately; if false (default), waits for result"}, + "callbackUrl": map[string]any{"type": "string", "description": "URL to POST result when async=true"}, + "priority": map[string]any{"type": "number", "description": "Task priority 0-10 (default 5)"}, + "timeoutSecs": map[string]any{"type": "number", "description": "Max seconds to wait (default 120)"}, }, - "required": []string{"agentId", "message"}, + "required": []string{"agentId", "task"}, + "additionalProperties": false, + }, + }, + }, + { + Type: "function", + Function: FuncDef{ + Name: "fanout_agents", + Description: "Send the SAME task to MULTIPLE agents IN PARALLEL and collect all results. " + + "Useful when you need different specialists to work on the same problem simultaneously. 
" + + "Returns results from all agents as an array.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "agentIds": map[string]any{ + "type": "array", + "items": map[string]any{"type": "number"}, + "description": "List of agent IDs to send the task to (max 10)", + }, + "task": map[string]any{"type": "string", "description": "Task to send to all agents"}, + "timeoutSecs": map[string]any{"type": "number", "description": "Max seconds per agent (default 60)"}, + }, + "required": []string{"agentIds", "task"}, "additionalProperties": false, }, }, @@ -226,6 +256,8 @@ func (e *Executor) Execute(ctx context.Context, toolName string, argsJSON string result, execErr = e.listAgents() case "delegate_to_agent": result, execErr = e.delegateToAgent(ctx, args) + case "fanout_agents": + result, execErr = e.fanoutAgents(ctx, args) default: return ToolResult{Success: false, Error: fmt.Sprintf("unknown tool: %s", toolName), DurationMs: ms(start)} } @@ -456,57 +488,100 @@ func (e *Executor) listAgents() (any, error) { return map[string]any{"agents": agents, "count": len(agents)}, nil } +// A2ATaskRequest is the standard agent-to-agent task message format (Phase C). +type A2ATaskRequest struct { + TaskID string `json:"task_id"` + FromAgentID int `json:"from_agent_id"` + Task string `json:"input"` + CallbackURL string `json:"callback_url,omitempty"` + Priority int `json:"priority"` + TimeoutSecs int `json:"timeout_secs"` +} + +// delegateToAgent sends a task to an agent's container via A2A HTTP protocol. +// Resolves the agent's service address from DB, respects priority/timeout from args. +// Falls back with a clear message if agent is not deployed/running. 
func (e *Executor) delegateToAgent(ctx context.Context, args map[string]any) (any, error) { agentIDf, _ := args["agentId"].(float64) agentID := int(agentIDf) + task, _ := args["task"].(string) if task == "" { task, _ = args["message"].(string) // backward compat } if task == "" { - return nil, fmt.Errorf("task (or message) is required") + return nil, fmt.Errorf("task is required") } + callbackURL, _ := args["callbackUrl"].(string) async, _ := args["async"].(bool) + priority := 5 + if pf, ok := args["priority"].(float64); ok && pf > 0 { + priority = int(pf) + } + timeoutSecs := 120 + if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 { + timeoutSecs = int(tf) + } + // Resolve agent container address from DB if e.database != nil { cfg, err := e.database.GetAgentByID(agentID) if err == nil && cfg != nil && cfg.ServicePort > 0 && cfg.ContainerStatus == "running" { - // Agent is deployed — call its container via overlay DNS - // Docker Swarm DNS: service name resolves inside overlay network agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort) - if async { - return e.postAgentTask(ctx, agentURL, agentID, task, callbackURL) + req := A2ATaskRequest{ + TaskID: fmt.Sprintf("orch-%d-%d", agentID, time.Now().UnixMilli()), + FromAgentID: 0, // orchestrator + Task: task, + CallbackURL: callbackURL, + Priority: priority, + TimeoutSecs: timeoutSecs, } - return e.postAgentChat(ctx, agentURL, agentID, task) + if async { + return e.postA2ATask(ctx, agentURL, req) + } + return e.postA2AChat(ctx, agentURL, task, timeoutSecs) + } + if e.database != nil { + cfg, _ := e.database.GetAgentByID(agentID) + status := "unknown" + if cfg != nil { + status = cfg.ContainerStatus + if status == "" { + status = "stopped" + } + } + return map[string]any{ + "delegated": false, + "agentId": agentID, + "status": status, + "note": fmt.Sprintf( + "Agent %d container is %q. 
Deploy it via Web Panel (POST /api/agents/%d/deploy) then retry.", + agentID, status, agentID), + }, nil } } - // Fallback: agent not deployed yet — return informational response return map[string]any{ - "delegated": false, - "agentId": agentID, - "task": task, - "note": fmt.Sprintf("Agent %d is not running (containerStatus != running). Deploy it first via Web Panel.", agentID), + "delegated": false, + "agentId": agentID, + "note": "No database connection — cannot resolve agent address.", }, nil } -// postAgentTask POSTs to agent's /task endpoint (async, returns task_id). -func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgentID int, task, callbackURL string) (any, error) { - payload, _ := json.Marshal(map[string]any{ - "input": task, - "from_agent_id": fromAgentID, - "callback_url": callbackURL, - }) - req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload)) +// postA2ATask POSTs to agent's /task endpoint using A2A protocol (async). 
+func (e *Executor) postA2ATask(ctx context.Context, agentURL string, req A2ATaskRequest) (any, error) { + payload, _ := json.Marshal(req) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/task", bytes.NewReader(payload)) if err != nil { - return nil, fmt.Errorf("delegate build request: %w", err) + return nil, fmt.Errorf("a2a build request: %w", err) } - req.Header.Set("Content-Type", "application/json") - resp, err := e.httpClient.Do(req) + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("X-GoClaw-From", "orchestrator") + resp, err := e.httpClient.Do(httpReq) if err != nil { - return nil, fmt.Errorf("delegate HTTP error: %w", err) + return nil, fmt.Errorf("a2a task HTTP error: %w", err) } defer resp.Body.Close() body, _ := io.ReadAll(resp.Body) @@ -515,19 +590,27 @@ func (e *Executor) postAgentTask(ctx context.Context, agentURL string, fromAgent return result, nil } -// postAgentChat POSTs to agent's /chat endpoint (sync, waits for response). -func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, task string) (any, error) { +// postA2AChat POSTs to agent's /chat endpoint (sync, waits for LLM response). 
+func (e *Executor) postA2AChat(ctx context.Context, agentURL string, task string, timeoutSecs int) (any, error) { payload, _ := json.Marshal(map[string]any{ - "messages": []map[string]string{{"role": "user", "content": task}}, + "messages": []map[string]string{{"role": "user", "content": task}}, + "timeout_secs": timeoutSecs, }) - req, err := http.NewRequestWithContext(ctx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload)) + chatCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs)*time.Second) + defer cancel() + + httpReq, err := http.NewRequestWithContext(chatCtx, http.MethodPost, agentURL+"/chat", bytes.NewReader(payload)) if err != nil { - return nil, fmt.Errorf("delegate build request: %w", err) + return nil, fmt.Errorf("a2a chat request: %w", err) } - req.Header.Set("Content-Type", "application/json") - resp, err := e.httpClient.Do(req) + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("X-GoClaw-From", "orchestrator") + + // Use a client with longer timeout for sync chats + client := &http.Client{Timeout: time.Duration(timeoutSecs+10) * time.Second} + resp, err := client.Do(httpReq) if err != nil { - return nil, fmt.Errorf("delegate HTTP error: %w", err) + return nil, fmt.Errorf("a2a chat HTTP error: %w", err) } defer resp.Body.Close() body, _ := io.ReadAll(resp.Body) @@ -536,6 +619,107 @@ func (e *Executor) postAgentChat(ctx context.Context, agentURL string, _ int, ta return result, nil } +// fanoutAgents sends the same task to multiple agents in parallel and collects results. 
+func (e *Executor) fanoutAgents(ctx context.Context, args map[string]any) (any, error) { + task, _ := args["task"].(string) + if task == "" { + return nil, fmt.Errorf("task is required") + } + + timeoutSecs := 60 + if tf, ok := args["timeoutSecs"].(float64); ok && tf > 0 { + timeoutSecs = int(tf) + } + + // Parse agentIds array + rawIDs, _ := args["agentIds"].([]any) + if len(rawIDs) == 0 { + return nil, fmt.Errorf("agentIds must be a non-empty array") + } + if len(rawIDs) > 10 { + rawIDs = rawIDs[:10] // cap at 10 + } + + type agentResult struct { + AgentID int `json:"agentId"` + AgentName string `json:"agentName,omitempty"` + Success bool `json:"success"` + Result any `json:"result,omitempty"` + Error string `json:"error,omitempty"` + Delegated bool `json:"delegated"` + DurationMs int64 `json:"durationMs"` + } + + results := make([]agentResult, len(rawIDs)) + var wg sync.WaitGroup + + fanCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSecs+5)*time.Second) + defer cancel() + + for i, rawID := range rawIDs { + idf, _ := rawID.(float64) + agentID := int(idf) + idx := i + wg.Add(1) + go func() { + defer wg.Done() + start := time.Now() + ar := agentResult{AgentID: agentID} + + if e.database == nil { + ar.Error = "no database connection" + results[idx] = ar + return + } + + cfg, err := e.database.GetAgentByID(agentID) + if err != nil || cfg == nil { + ar.Error = fmt.Sprintf("agent %d not found", agentID) + results[idx] = ar + return + } + ar.AgentName = cfg.Name + + if cfg.ServicePort == 0 || cfg.ContainerStatus != "running" { + ar.Delegated = false + ar.Error = fmt.Sprintf("agent %q is %q — not running", cfg.Name, cfg.ContainerStatus) + results[idx] = ar + return + } + + agentURL := fmt.Sprintf("http://%s:%d", cfg.ServiceName, cfg.ServicePort) + res, chatErr := e.postA2AChat(fanCtx, agentURL, task, timeoutSecs) + ar.DurationMs = ms(start) + if chatErr != nil { + ar.Success = false + ar.Error = chatErr.Error() + } else { + ar.Success = true + ar.Delegated 
= true + ar.Result = res + } + results[idx] = ar + }() + } + + wg.Wait() + + succeeded := 0 + for _, r := range results { + if r.Success { + succeeded++ + } + } + + return map[string]any{ + "task": task, + "total": len(results), + "succeeded": succeeded, + "failed": len(results) - succeeded, + "results": results, + }, nil +} + // ─── Helpers ────────────────────────────────────────────────────────────────── func (e *Executor) resolvePath(path string) string {