feat(swarm): autonomous agent containers, Swarm Manager with auto-stop, /nodes UI overhaul
## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence
## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
* GET /health — liveness probe with idle time info
* POST /task — receives task, forwards to Gateway orchestrator
* GET /info — agent metadata (id, hostname, gateway url)
* Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
* Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
* Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES
## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
* SwarmManager goroutine checks every 60s
* Scales idle GoClaw agent services to 0 replicas after 15 min
* Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
* GET /api/swarm/agents — list agents with idleMinutes
* POST /api/swarm/agents/{name}/start — scale up agent
* POST /api/swarm/agents/{name}/stop — scale to 0
* DELETE /api/swarm/services/{id} — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel
## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection
## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent
## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
This commit is contained in:
270
gateway/cmd/agent/main.go
Normal file
270
gateway/cmd/agent/main.go
Normal file
@@ -0,0 +1,270 @@
|
||||
// GoClaw Agent Server — autonomous agent microservice
|
||||
//
|
||||
// Each agent runs as an independent container in the Docker Swarm overlay
|
||||
// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
|
||||
// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
|
||||
//
|
||||
// The agent:
|
||||
// - Receives task requests from the orchestrator
|
||||
// - Calls the LLM via the centrally-managed GoClaw Gateway
|
||||
// - Reads/writes shared state in the MySQL database
|
||||
// - Reports its last-activity time so the SwarmManager can auto-stop it
|
||||
// - Gracefully shuts down after IdleTimeout with no requests
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
type AgentConfig struct {
|
||||
AgentID string
|
||||
Port string
|
||||
GatewayURL string
|
||||
LLMURL string
|
||||
LLMAPIKey string
|
||||
DatabaseURL string
|
||||
IdleTimeoutMinutes int
|
||||
}
|
||||
|
||||
func loadConfig() AgentConfig {
|
||||
idleMin := 15
|
||||
if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil {
|
||||
idleMin = n
|
||||
}
|
||||
}
|
||||
port := os.Getenv("AGENT_PORT")
|
||||
if port == "" {
|
||||
port = "8080"
|
||||
}
|
||||
return AgentConfig{
|
||||
AgentID: getEnv("AGENT_ID", "unnamed-agent"),
|
||||
Port: port,
|
||||
GatewayURL: getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
|
||||
LLMURL: getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
|
||||
LLMAPIKey: os.Getenv("LLM_API_KEY"),
|
||||
DatabaseURL: os.Getenv("DATABASE_URL"),
|
||||
IdleTimeoutMinutes: idleMin,
|
||||
}
|
||||
}
|
||||
|
||||
func getEnv(key, fallback string) string {
|
||||
if v := os.Getenv(key); v != "" {
|
||||
return v
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
// ─── State ───────────────────────────────────────────────────────────────────
|
||||
|
||||
type Agent struct {
|
||||
cfg AgentConfig
|
||||
lastActivity time.Time
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
func NewAgent(cfg AgentConfig) *Agent {
|
||||
return &Agent{
|
||||
cfg: cfg,
|
||||
lastActivity: time.Now(),
|
||||
httpClient: &http.Client{Timeout: 120 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
func (a *Agent) touch() {
|
||||
a.lastActivity = time.Now()
|
||||
}
|
||||
|
||||
// ─── HTTP handlers ────────────────────────────────────────────────────────────
|
||||
|
||||
// GET /health — liveness probe
|
||||
func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
respond(w, 200, map[string]any{
|
||||
"ok": true,
|
||||
"agentId": a.cfg.AgentID,
|
||||
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
||||
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
||||
})
|
||||
}
|
||||
|
||||
// POST /task — receive a task from the orchestrator
|
||||
// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
|
||||
func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
|
||||
a.touch()
|
||||
var body struct {
|
||||
SessionID string `json:"sessionId"`
|
||||
Messages json.RawMessage `json:"messages"`
|
||||
Model string `json:"model"`
|
||||
MaxIter int `json:"maxIter"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
respondError(w, 400, "invalid request: "+err.Error())
|
||||
return
|
||||
}
|
||||
// Forward the task to the GoClaw Gateway orchestrator
|
||||
gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
|
||||
reqBody, _ := json.Marshal(map[string]any{
|
||||
"messages": body.Messages,
|
||||
"model": body.Model,
|
||||
"maxIter": body.MaxIter,
|
||||
})
|
||||
|
||||
req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
|
||||
if err != nil {
|
||||
respondError(w, 500, "request build error: "+err.Error())
|
||||
return
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := a.httpClient.Do(req)
|
||||
if err != nil {
|
||||
respondError(w, 502, "gateway error: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var result map[string]any
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
respondError(w, 502, "gateway response error: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
a.touch()
|
||||
respond(w, 200, map[string]any{
|
||||
"ok": true,
|
||||
"agentId": a.cfg.AgentID,
|
||||
"sessionId": body.SessionID,
|
||||
"result": result,
|
||||
})
|
||||
}
|
||||
|
||||
// GET /info — agent metadata
|
||||
func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
|
||||
hostname, _ := os.Hostname()
|
||||
respond(w, 200, map[string]any{
|
||||
"agentId": a.cfg.AgentID,
|
||||
"hostname": hostname,
|
||||
"gatewayUrl": a.cfg.GatewayURL,
|
||||
"idleTimeout": a.cfg.IdleTimeoutMinutes,
|
||||
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
||||
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
||||
})
|
||||
}
|
||||
|
||||
// ─── Idle watchdog ────────────────────────────────────────────────────────────
|
||||
|
||||
func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
|
||||
threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
idle := time.Since(a.lastActivity)
|
||||
if idle >= threshold {
|
||||
log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
|
||||
a.cfg.AgentID, idle.Minutes())
|
||||
a.selfStop()
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// selfStop asks the GoClaw Gateway to scale this service to 0.
|
||||
func (a *Agent) selfStop() {
|
||||
url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
|
||||
req, err := http.NewRequest("POST", url, nil)
|
||||
if err != nil {
|
||||
log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
|
||||
return
|
||||
}
|
||||
resp, err := a.httpClient.Do(req)
|
||||
if err != nil {
|
||||
log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
|
||||
return
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func respond(w http.ResponseWriter, status int, data any) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(status)
|
||||
json.NewEncoder(w).Encode(data)
|
||||
}
|
||||
|
||||
func respondError(w http.ResponseWriter, status int, msg string) {
|
||||
respond(w, status, map[string]any{"error": msg})
|
||||
}
|
||||
|
||||
// ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func main() {
|
||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||
|
||||
cfg := loadConfig()
|
||||
agent := NewAgent(cfg)
|
||||
|
||||
log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
|
||||
cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
|
||||
log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)
|
||||
|
||||
// ── HTTP server ──────────────────────────────────────────────────────────
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("GET /health", agent.handleHealth)
|
||||
mux.HandleFunc("POST /task", agent.handleTask)
|
||||
mux.HandleFunc("GET /info", agent.handleInfo)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: ":" + cfg.Port,
|
||||
Handler: mux,
|
||||
ReadTimeout: 30 * time.Second,
|
||||
WriteTimeout: 150 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// ── Idle watchdog ────────────────────────────────────────────────────────
|
||||
go agent.runIdleWatchdog(cancel)
|
||||
|
||||
// ── Graceful shutdown ────────────────────────────────────────────────────
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
go func() {
|
||||
log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
|
||||
}
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-quit:
|
||||
log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
|
||||
case <-ctx.Done():
|
||||
log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
|
||||
}
|
||||
|
||||
shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer shutCancel()
|
||||
if err := srv.Shutdown(shutCtx); err != nil {
|
||||
log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
|
||||
}
|
||||
log.Printf("[Agent %s] Stopped.", cfg.AgentID)
|
||||
}
|
||||
@@ -135,12 +135,22 @@ func main() {
|
||||
r.Post("/swarm/nodes/{id}/availability", h.SwarmSetNodeAvailability)
|
||||
r.Get("/swarm/services", h.SwarmServices)
|
||||
r.Post("/swarm/services/create", h.SwarmCreateService)
|
||||
r.Delete("/swarm/services/{id}", h.SwarmRemoveService)
|
||||
r.Get("/swarm/services/{id}/tasks", h.SwarmServiceTasks)
|
||||
r.Post("/swarm/services/{id}/scale", h.SwarmScaleService)
|
||||
r.Get("/swarm/join-token", h.SwarmJoinToken)
|
||||
r.Post("/swarm/shell", h.SwarmShell)
|
||||
r.Get("/swarm/agents", h.SwarmListAgents)
|
||||
r.Post("/swarm/agents/{name}/start", h.SwarmStartAgent)
|
||||
r.Post("/swarm/agents/{name}/stop", h.SwarmStopAgent)
|
||||
})
|
||||
|
||||
// ── Swarm Manager: auto-stop idle agents after 15 min ────────────────────
|
||||
swarmMgr := api.NewSwarmManager(h, 60*time.Second)
|
||||
managerCtx, managerCancel := context.WithCancel(context.Background())
|
||||
go swarmMgr.Start(managerCtx)
|
||||
defer managerCancel()
|
||||
|
||||
// ── Start Server ─────────────────────────────────────────────────────────
|
||||
srv := &http.Server{
|
||||
Addr: ":" + cfg.Port,
|
||||
|
||||
Reference in New Issue
Block a user