feat(swarm): autonomous agent containers, Swarm Manager with auto-stop, /nodes UI overhaul
## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence
## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
* GET /health — liveness probe with idle time info
* POST /task — receives task, forwards to Gateway orchestrator
* GET /info — agent metadata (id, hostname, gateway url)
* Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
* Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
* Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES
## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
* SwarmManager goroutine checks every 60s
* Scales idle GoClaw agent services to 0 replicas after 15 min
* Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
* GET /api/swarm/agents — list agents with idleMinutes
* POST /api/swarm/agents/{name}/start — scale up agent
* POST /api/swarm/agents/{name}/stop — scale to 0
* DELETE /api/swarm/services/{id} — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel
## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection
## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent
## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
This commit is contained in:
270
gateway/cmd/agent/main.go
Normal file
270
gateway/cmd/agent/main.go
Normal file
@@ -0,0 +1,270 @@
|
||||
// GoClaw Agent Server — autonomous agent microservice
|
||||
//
|
||||
// Each agent runs as an independent container in the Docker Swarm overlay
|
||||
// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
|
||||
// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
|
||||
//
|
||||
// The agent:
|
||||
// - Receives task requests from the orchestrator
|
||||
// - Calls the LLM via the centrally-managed GoClaw Gateway
|
||||
// - Reads/writes shared state in the MySQL database
|
||||
// - Reports its last-activity time so the SwarmManager can auto-stop it
|
||||
// - Gracefully shuts down after IdleTimeout with no requests
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ─── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
type AgentConfig struct {
|
||||
AgentID string
|
||||
Port string
|
||||
GatewayURL string
|
||||
LLMURL string
|
||||
LLMAPIKey string
|
||||
DatabaseURL string
|
||||
IdleTimeoutMinutes int
|
||||
}
|
||||
|
||||
func loadConfig() AgentConfig {
|
||||
idleMin := 15
|
||||
if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil {
|
||||
idleMin = n
|
||||
}
|
||||
}
|
||||
port := os.Getenv("AGENT_PORT")
|
||||
if port == "" {
|
||||
port = "8080"
|
||||
}
|
||||
return AgentConfig{
|
||||
AgentID: getEnv("AGENT_ID", "unnamed-agent"),
|
||||
Port: port,
|
||||
GatewayURL: getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
|
||||
LLMURL: getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
|
||||
LLMAPIKey: os.Getenv("LLM_API_KEY"),
|
||||
DatabaseURL: os.Getenv("DATABASE_URL"),
|
||||
IdleTimeoutMinutes: idleMin,
|
||||
}
|
||||
}
|
||||
|
||||
func getEnv(key, fallback string) string {
|
||||
if v := os.Getenv(key); v != "" {
|
||||
return v
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
// ─── State ───────────────────────────────────────────────────────────────────
|
||||
|
||||
type Agent struct {
|
||||
cfg AgentConfig
|
||||
lastActivity time.Time
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
func NewAgent(cfg AgentConfig) *Agent {
|
||||
return &Agent{
|
||||
cfg: cfg,
|
||||
lastActivity: time.Now(),
|
||||
httpClient: &http.Client{Timeout: 120 * time.Second},
|
||||
}
|
||||
}
|
||||
|
||||
func (a *Agent) touch() {
|
||||
a.lastActivity = time.Now()
|
||||
}
|
||||
|
||||
// ─── HTTP handlers ────────────────────────────────────────────────────────────
|
||||
|
||||
// GET /health — liveness probe
|
||||
func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
respond(w, 200, map[string]any{
|
||||
"ok": true,
|
||||
"agentId": a.cfg.AgentID,
|
||||
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
||||
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
||||
})
|
||||
}
|
||||
|
||||
// POST /task — receive a task from the orchestrator
|
||||
// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
|
||||
func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
|
||||
a.touch()
|
||||
var body struct {
|
||||
SessionID string `json:"sessionId"`
|
||||
Messages json.RawMessage `json:"messages"`
|
||||
Model string `json:"model"`
|
||||
MaxIter int `json:"maxIter"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
respondError(w, 400, "invalid request: "+err.Error())
|
||||
return
|
||||
}
|
||||
// Forward the task to the GoClaw Gateway orchestrator
|
||||
gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
|
||||
reqBody, _ := json.Marshal(map[string]any{
|
||||
"messages": body.Messages,
|
||||
"model": body.Model,
|
||||
"maxIter": body.MaxIter,
|
||||
})
|
||||
|
||||
req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
|
||||
if err != nil {
|
||||
respondError(w, 500, "request build error: "+err.Error())
|
||||
return
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := a.httpClient.Do(req)
|
||||
if err != nil {
|
||||
respondError(w, 502, "gateway error: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var result map[string]any
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
respondError(w, 502, "gateway response error: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
a.touch()
|
||||
respond(w, 200, map[string]any{
|
||||
"ok": true,
|
||||
"agentId": a.cfg.AgentID,
|
||||
"sessionId": body.SessionID,
|
||||
"result": result,
|
||||
})
|
||||
}
|
||||
|
||||
// GET /info — agent metadata
|
||||
func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
|
||||
hostname, _ := os.Hostname()
|
||||
respond(w, 200, map[string]any{
|
||||
"agentId": a.cfg.AgentID,
|
||||
"hostname": hostname,
|
||||
"gatewayUrl": a.cfg.GatewayURL,
|
||||
"idleTimeout": a.cfg.IdleTimeoutMinutes,
|
||||
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
||||
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
||||
})
|
||||
}
|
||||
|
||||
// ─── Idle watchdog ────────────────────────────────────────────────────────────
|
||||
|
||||
func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
|
||||
threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
idle := time.Since(a.lastActivity)
|
||||
if idle >= threshold {
|
||||
log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
|
||||
a.cfg.AgentID, idle.Minutes())
|
||||
a.selfStop()
|
||||
cancel()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// selfStop asks the GoClaw Gateway to scale this service to 0.
|
||||
func (a *Agent) selfStop() {
|
||||
url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
|
||||
req, err := http.NewRequest("POST", url, nil)
|
||||
if err != nil {
|
||||
log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
|
||||
return
|
||||
}
|
||||
resp, err := a.httpClient.Do(req)
|
||||
if err != nil {
|
||||
log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
|
||||
return
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
resp.Body.Close()
|
||||
log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func respond(w http.ResponseWriter, status int, data any) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(status)
|
||||
json.NewEncoder(w).Encode(data)
|
||||
}
|
||||
|
||||
func respondError(w http.ResponseWriter, status int, msg string) {
|
||||
respond(w, status, map[string]any{"error": msg})
|
||||
}
|
||||
|
||||
// ─── Main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func main() {
|
||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||
|
||||
cfg := loadConfig()
|
||||
agent := NewAgent(cfg)
|
||||
|
||||
log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
|
||||
cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
|
||||
log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)
|
||||
|
||||
// ── HTTP server ──────────────────────────────────────────────────────────
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("GET /health", agent.handleHealth)
|
||||
mux.HandleFunc("POST /task", agent.handleTask)
|
||||
mux.HandleFunc("GET /info", agent.handleInfo)
|
||||
|
||||
srv := &http.Server{
|
||||
Addr: ":" + cfg.Port,
|
||||
Handler: mux,
|
||||
ReadTimeout: 30 * time.Second,
|
||||
WriteTimeout: 150 * time.Second,
|
||||
IdleTimeout: 120 * time.Second,
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// ── Idle watchdog ────────────────────────────────────────────────────────
|
||||
go agent.runIdleWatchdog(cancel)
|
||||
|
||||
// ── Graceful shutdown ────────────────────────────────────────────────────
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
go func() {
|
||||
log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
|
||||
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
|
||||
}
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-quit:
|
||||
log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
|
||||
case <-ctx.Done():
|
||||
log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
|
||||
}
|
||||
|
||||
shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer shutCancel()
|
||||
if err := srv.Shutdown(shutCtx); err != nil {
|
||||
log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
|
||||
}
|
||||
log.Printf("[Agent %s] Stopped.", cfg.AgentID)
|
||||
}
|
||||
@@ -135,12 +135,22 @@ func main() {
|
||||
r.Post("/swarm/nodes/{id}/availability", h.SwarmSetNodeAvailability)
|
||||
r.Get("/swarm/services", h.SwarmServices)
|
||||
r.Post("/swarm/services/create", h.SwarmCreateService)
|
||||
r.Delete("/swarm/services/{id}", h.SwarmRemoveService)
|
||||
r.Get("/swarm/services/{id}/tasks", h.SwarmServiceTasks)
|
||||
r.Post("/swarm/services/{id}/scale", h.SwarmScaleService)
|
||||
r.Get("/swarm/join-token", h.SwarmJoinToken)
|
||||
r.Post("/swarm/shell", h.SwarmShell)
|
||||
r.Get("/swarm/agents", h.SwarmListAgents)
|
||||
r.Post("/swarm/agents/{name}/start", h.SwarmStartAgent)
|
||||
r.Post("/swarm/agents/{name}/stop", h.SwarmStopAgent)
|
||||
})
|
||||
|
||||
// ── Swarm Manager: auto-stop idle agents after 15 min ────────────────────
|
||||
swarmMgr := api.NewSwarmManager(h, 60*time.Second)
|
||||
managerCtx, managerCancel := context.WithCancel(context.Background())
|
||||
go swarmMgr.Start(managerCtx)
|
||||
defer managerCancel()
|
||||
|
||||
// ── Start Server ─────────────────────────────────────────────────────────
|
||||
srv := &http.Server{
|
||||
Addr: ":" + cfg.Port,
|
||||
|
||||
@@ -1223,7 +1223,7 @@ func (h *Handler) SwarmScaleService(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
// POST /api/swarm/services/create
|
||||
// Deploy a new GoClaw agent as a Swarm service.
|
||||
// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0 }
|
||||
// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0, "networks": ["goclaw-net"] }
|
||||
func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
|
||||
var body struct {
|
||||
Name string `json:"name"`
|
||||
@@ -1231,6 +1231,7 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
|
||||
Replicas int `json:"replicas"`
|
||||
Env []string `json:"env"`
|
||||
Port int `json:"port"`
|
||||
Networks []string `json:"networks"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Name == "" || body.Image == "" {
|
||||
respondError(w, http.StatusBadRequest, "name and image required")
|
||||
@@ -1239,7 +1240,14 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
|
||||
if body.Replicas <= 0 {
|
||||
body.Replicas = 1
|
||||
}
|
||||
svc, err := h.docker.CreateAgentService(body.Name, body.Image, body.Replicas, body.Env, body.Port)
|
||||
svc, err := h.docker.CreateAgentServiceFull(dockerclient.CreateAgentServiceOpts{
|
||||
Name: body.Name,
|
||||
Image: body.Image,
|
||||
Replicas: body.Replicas,
|
||||
Env: body.Env,
|
||||
Port: body.Port,
|
||||
Networks: body.Networks,
|
||||
})
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "create service: "+err.Error())
|
||||
return
|
||||
@@ -1251,6 +1259,76 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
|
||||
})
|
||||
}
|
||||
|
||||
// DELETE /api/swarm/services/{id}
|
||||
// Remove (stop) a swarm service.
|
||||
func (h *Handler) SwarmRemoveService(w http.ResponseWriter, r *http.Request) {
|
||||
serviceID := r.PathValue("id")
|
||||
if serviceID == "" {
|
||||
respondError(w, http.StatusBadRequest, "service id required")
|
||||
return
|
||||
}
|
||||
if err := h.docker.RemoveService(serviceID); err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "remove service: "+err.Error())
|
||||
return
|
||||
}
|
||||
log.Printf("[Swarm] Removed service %s", serviceID)
|
||||
respond(w, http.StatusOK, map[string]any{"ok": true})
|
||||
}
|
||||
|
||||
// GET /api/swarm/agents
|
||||
// List all GoClaw agent services with idle time information.
|
||||
func (h *Handler) SwarmListAgents(w http.ResponseWriter, r *http.Request) {
|
||||
services, err := h.docker.ListServices()
|
||||
if err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "list services: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
type AgentInfo struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Image string `json:"image"`
|
||||
DesiredReplicas int `json:"desiredReplicas"`
|
||||
RunningTasks int `json:"runningTasks"`
|
||||
LastActivity time.Time `json:"lastActivity"`
|
||||
IdleMinutes float64 `json:"idleMinutes"`
|
||||
IsGoClaw bool `json:"isGoClaw"`
|
||||
}
|
||||
|
||||
var agents []AgentInfo
|
||||
for _, svc := range services {
|
||||
isGoClaw := svc.Spec.Labels["goclaw.agent"] == "true"
|
||||
desired := 0
|
||||
if svc.Spec.Mode.Replicated != nil {
|
||||
desired = svc.Spec.Mode.Replicated.Replicas
|
||||
}
|
||||
running := 0
|
||||
if svc.ServiceStatus != nil {
|
||||
running = svc.ServiceStatus.RunningTasks
|
||||
}
|
||||
lastActivity, _ := h.docker.GetServiceLastActivity(svc.ID)
|
||||
if lastActivity.IsZero() {
|
||||
lastActivity = svc.UpdatedAt
|
||||
}
|
||||
idle := time.Since(lastActivity).Minutes()
|
||||
agents = append(agents, AgentInfo{
|
||||
ID: svc.ID,
|
||||
Name: svc.Spec.Name,
|
||||
Image: svc.Spec.TaskTemplate.ContainerSpec.Image,
|
||||
DesiredReplicas: desired,
|
||||
RunningTasks: running,
|
||||
LastActivity: lastActivity,
|
||||
IdleMinutes: idle,
|
||||
IsGoClaw: isGoClaw,
|
||||
})
|
||||
}
|
||||
if agents == nil {
|
||||
agents = []AgentInfo{}
|
||||
}
|
||||
respond(w, http.StatusOK, map[string]any{"agents": agents, "count": len(agents)})
|
||||
}
|
||||
|
||||
|
||||
// POST /api/swarm/shell
|
||||
// Execute a shell command on the HOST system (via nsenter into PID 1).
|
||||
// Body: { "command": "docker ps" }
|
||||
|
||||
142
gateway/internal/api/swarm_manager.go
Normal file
142
gateway/internal/api/swarm_manager.go
Normal file
@@ -0,0 +1,142 @@
|
||||
// Package api – Swarm Agent Lifecycle Manager
|
||||
//
|
||||
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
|
||||
// (which is the Swarm manager node). It watches all agent services and
|
||||
// automatically scales them to 0 replicas after IdleTimeout minutes of no
|
||||
// activity. The orchestrator can call StartAgent / StopAgent via the REST API
|
||||
// to start/stop agents on demand.
|
||||
//
|
||||
// Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
|
||||
// Stop flow: POST /api/swarm/agents/{name}/stop → scale to 0
|
||||
// Auto-stop: background loop checks every 60 s, scales idle agents to 0
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
const (
|
||||
// IdleTimeout – how many minutes without any task updates before an agent
|
||||
// is automatically scaled to 0.
|
||||
defaultIdleTimeoutMinutes = 15
|
||||
)
|
||||
|
||||
// SwarmManager watches agent services and auto-scales them down after idle.
|
||||
type SwarmManager struct {
|
||||
handler *Handler
|
||||
ticker *time.Ticker
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
// NewSwarmManager creates a manager that checks every checkInterval.
|
||||
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
|
||||
return &SwarmManager{
|
||||
handler: h,
|
||||
ticker: time.NewTicker(checkInterval),
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start launches the background loop. Call in a goroutine.
|
||||
func (m *SwarmManager) Start(ctx context.Context) {
|
||||
log.Printf("[SwarmManager] Started — idle timeout %d min, check every %s",
|
||||
defaultIdleTimeoutMinutes, m.ticker)
|
||||
defer m.ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-m.done:
|
||||
return
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-m.ticker.C:
|
||||
m.checkIdleAgents()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Stop signals the background loop to exit.
|
||||
func (m *SwarmManager) Stop() {
|
||||
close(m.done)
|
||||
}
|
||||
|
||||
func (m *SwarmManager) checkIdleAgents() {
|
||||
services, err := m.handler.docker.ListServices()
|
||||
if err != nil {
|
||||
log.Printf("[SwarmManager] list services error: %v", err)
|
||||
return
|
||||
}
|
||||
idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
|
||||
now := time.Now()
|
||||
for _, svc := range services {
|
||||
// Only manage services labelled as GoClaw agents
|
||||
if svc.Spec.Labels["goclaw.agent"] != "true" {
|
||||
continue
|
||||
}
|
||||
// Skip already-stopped services (0 desired replicas)
|
||||
desired := 0
|
||||
if svc.Spec.Mode.Replicated != nil {
|
||||
desired = svc.Spec.Mode.Replicated.Replicas
|
||||
}
|
||||
if desired == 0 {
|
||||
continue
|
||||
}
|
||||
// Check last activity time
|
||||
lastActivity, err := m.handler.docker.GetServiceLastActivity(svc.ID)
|
||||
if err != nil || lastActivity.IsZero() {
|
||||
lastActivity = svc.UpdatedAt
|
||||
}
|
||||
idle := now.Sub(lastActivity)
|
||||
if idle >= idleThreshold {
|
||||
log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
|
||||
svc.Spec.Name, idle.Minutes())
|
||||
if err := m.handler.docker.ScaleService(svc.ID, 0); err != nil {
|
||||
log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
|
||||
|
||||
// POST /api/swarm/agents/{name}/start
|
||||
// Start (scale-up) a named agent service. Body: { "replicas": 1 }
|
||||
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
|
||||
name := r.PathValue("name")
|
||||
if name == "" {
|
||||
respondError(w, http.StatusBadRequest, "agent name required")
|
||||
return
|
||||
}
|
||||
var body struct {
|
||||
Replicas int `json:"replicas"`
|
||||
}
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
if body.Replicas <= 0 {
|
||||
body.Replicas = 1
|
||||
}
|
||||
if err := h.docker.ScaleService(name, body.Replicas); err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
|
||||
return
|
||||
}
|
||||
log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
|
||||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
|
||||
}
|
||||
|
||||
// POST /api/swarm/agents/{name}/stop
|
||||
// Stop (scale-to-0) a named agent service.
|
||||
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
|
||||
name := r.PathValue("name")
|
||||
if name == "" {
|
||||
respondError(w, http.StatusBadRequest, "agent name required")
|
||||
return
|
||||
}
|
||||
if err := h.docker.ScaleService(name, 0); err != nil {
|
||||
respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
|
||||
return
|
||||
}
|
||||
log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
|
||||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
|
||||
}
|
||||
@@ -158,11 +158,17 @@ type SwarmService struct {
|
||||
}
|
||||
|
||||
type ServiceSpec struct {
|
||||
Name string `json:"Name"`
|
||||
Mode ServiceMode `json:"Mode"`
|
||||
TaskTemplate TaskTemplate `json:"TaskTemplate"`
|
||||
EndpointSpec *EndpointSpec `json:"EndpointSpec,omitempty"`
|
||||
Labels map[string]string `json:"Labels"`
|
||||
Name string `json:"Name"`
|
||||
Mode ServiceMode `json:"Mode"`
|
||||
TaskTemplate TaskTemplate `json:"TaskTemplate"`
|
||||
EndpointSpec *EndpointSpec `json:"EndpointSpec,omitempty"`
|
||||
Labels map[string]string `json:"Labels"`
|
||||
Networks []NetworkAttachment `json:"Networks,omitempty"`
|
||||
}
|
||||
|
||||
type NetworkAttachment struct {
|
||||
Target string `json:"Target"`
|
||||
Aliases []string `json:"Aliases,omitempty"`
|
||||
}
|
||||
|
||||
type ServiceMode struct {
|
||||
@@ -443,34 +449,67 @@ func (c *DockerClient) ListAllTasks() ([]SwarmTask, error) {
|
||||
// CreateAgentService deploys a new swarm service for an AI agent.
|
||||
// image: container image, name: service name, replicas: initial count,
|
||||
// env: environment variables, port: optional published port (0 = none).
|
||||
// CreateAgentServiceOpts holds options for deploying an agent Swarm service.
|
||||
type CreateAgentServiceOpts struct {
|
||||
Name string
|
||||
Image string
|
||||
Replicas int
|
||||
Env []string
|
||||
Port int
|
||||
Networks []string // overlay network names/IDs to attach
|
||||
Labels map[string]string
|
||||
}
|
||||
|
||||
func (c *DockerClient) CreateAgentService(name, image string, replicas int, env []string, port int) (*SwarmService, error) {
|
||||
return c.CreateAgentServiceFull(CreateAgentServiceOpts{
|
||||
Name: name,
|
||||
Image: image,
|
||||
Replicas: replicas,
|
||||
Env: env,
|
||||
Port: port,
|
||||
})
|
||||
}
|
||||
|
||||
func (c *DockerClient) CreateAgentServiceFull(opts CreateAgentServiceOpts) (*SwarmService, error) {
|
||||
labels := map[string]string{
|
||||
"goclaw.agent": "true",
|
||||
"goclaw.name": opts.Name,
|
||||
}
|
||||
for k, v := range opts.Labels {
|
||||
labels[k] = v
|
||||
}
|
||||
spec := ServiceSpec{
|
||||
Name: name,
|
||||
Name: opts.Name,
|
||||
Mode: ServiceMode{
|
||||
Replicated: &ReplicatedService{Replicas: replicas},
|
||||
Replicated: &ReplicatedService{Replicas: opts.Replicas},
|
||||
},
|
||||
TaskTemplate: TaskTemplate{
|
||||
ContainerSpec: ContainerSpec{
|
||||
Image: image,
|
||||
Env: env,
|
||||
Image: opts.Image,
|
||||
Env: opts.Env,
|
||||
},
|
||||
},
|
||||
Labels: map[string]string{
|
||||
"goclaw.agent": "true",
|
||||
"goclaw.name": name,
|
||||
},
|
||||
Labels: labels,
|
||||
}
|
||||
if port > 0 {
|
||||
if opts.Port > 0 {
|
||||
spec.EndpointSpec = &EndpointSpec{
|
||||
Ports: []PortConfig{
|
||||
{
|
||||
Protocol: "tcp",
|
||||
TargetPort: port,
|
||||
TargetPort: opts.Port,
|
||||
PublishMode: "ingress",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
if len(opts.Networks) > 0 {
|
||||
for _, net := range opts.Networks {
|
||||
spec.Networks = append(spec.Networks, NetworkAttachment{
|
||||
Target: net,
|
||||
Aliases: []string{opts.Name},
|
||||
})
|
||||
}
|
||||
}
|
||||
var created struct {
|
||||
ID string `json:"ID"`
|
||||
}
|
||||
@@ -480,6 +519,40 @@ func (c *DockerClient) CreateAgentService(name, image string, replicas int, env
|
||||
return c.GetService(created.ID)
|
||||
}
|
||||
|
||||
// RemoveService removes a swarm service by ID or name.
|
||||
func (c *DockerClient) RemoveService(idOrName string) error {
|
||||
req, err := http.NewRequest(http.MethodDelete, c.baseURL+"/v1.44/services/"+urlEncode(idOrName), nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("docker DELETE service %s: %w", idOrName, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 400 {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("docker DELETE service %s: status %d: %s", idOrName, resp.StatusCode, string(body))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetServiceLastActivity returns the most recent task update time for a service.
|
||||
// Used to determine whether a service is idle.
|
||||
func (c *DockerClient) GetServiceLastActivity(serviceID string) (time.Time, error) {
|
||||
tasks, err := c.ListServiceTasks(serviceID)
|
||||
if err != nil {
|
||||
return time.Time{}, err
|
||||
}
|
||||
var latest time.Time
|
||||
for _, t := range tasks {
|
||||
if t.UpdatedAt.After(latest) {
|
||||
latest = t.UpdatedAt
|
||||
}
|
||||
}
|
||||
return latest, nil
|
||||
}
|
||||
|
||||
// ─── Methods: Containers ─────────────────────────────────────────────────────
|
||||
|
||||
func (c *DockerClient) ListContainers() ([]Container, error) {
|
||||
|
||||
Reference in New Issue
Block a user