feat(swarm): autonomous agent containers, Swarm Manager with auto-stop, /nodes UI overhaul

## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence

## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
  * GET /health — liveness probe with idle time info
  * POST /task — receives task, forwards to Gateway orchestrator
  * GET /info  — agent metadata (id, hostname, gateway url)
  * Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
  * Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
  * Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES

## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
  * SwarmManager goroutine checks every 60s
  * Scales idle GoClaw agent services to 0 replicas after 15 min
  * Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
  * GET  /api/swarm/agents           — list agents with idleMinutes
  * POST /api/swarm/agents/{name}/start — scale up agent
  * POST /api/swarm/agents/{name}/stop  — scale to 0
  * DELETE /api/swarm/services/{id}     — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel

## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection

## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent

## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
This commit is contained in:
bboxwtf
2026-03-21 20:37:21 +00:00
parent 12b8332b2f
commit a8a8ea1ee2
9 changed files with 1168 additions and 194 deletions

View File

@@ -1223,7 +1223,7 @@ func (h *Handler) SwarmScaleService(w http.ResponseWriter, r *http.Request) {
// POST /api/swarm/services/create
// Deploy a new GoClaw agent as a Swarm service.
// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0 }
// Body: { "name": "agent-researcher", "image": "goclaw-gateway:latest", "replicas": 2, "env": ["KEY=val"], "port": 0, "networks": ["goclaw-net"] }
func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
var body struct {
Name string `json:"name"`
@@ -1231,6 +1231,7 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
Replicas int `json:"replicas"`
Env []string `json:"env"`
Port int `json:"port"`
Networks []string `json:"networks"`
}
if err := json.NewDecoder(r.Body).Decode(&body); err != nil || body.Name == "" || body.Image == "" {
respondError(w, http.StatusBadRequest, "name and image required")
@@ -1239,7 +1240,14 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
if body.Replicas <= 0 {
body.Replicas = 1
}
svc, err := h.docker.CreateAgentService(body.Name, body.Image, body.Replicas, body.Env, body.Port)
svc, err := h.docker.CreateAgentServiceFull(dockerclient.CreateAgentServiceOpts{
Name: body.Name,
Image: body.Image,
Replicas: body.Replicas,
Env: body.Env,
Port: body.Port,
Networks: body.Networks,
})
if err != nil {
respondError(w, http.StatusInternalServerError, "create service: "+err.Error())
return
@@ -1251,6 +1259,76 @@ func (h *Handler) SwarmCreateService(w http.ResponseWriter, r *http.Request) {
})
}
// DELETE /api/swarm/services/{id}
// SwarmRemoveService removes the swarm service identified by the {id} path
// segment. It answers 400 when the id is empty, 500 when the Docker client
// reports an error, and {"ok": true} once the service has been removed.
func (h *Handler) SwarmRemoveService(w http.ResponseWriter, r *http.Request) {
	id := r.PathValue("id")
	if id == "" {
		respondError(w, http.StatusBadRequest, "service id required")
		return
	}

	err := h.docker.RemoveService(id)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "remove service: "+err.Error())
		return
	}

	log.Printf("[Swarm] Removed service %s", id)
	respond(w, http.StatusOK, map[string]any{"ok": true})
}
// GET /api/swarm/agents
// SwarmListAgents returns every swarm service together with idle-time
// information. Each entry carries an isGoClaw flag that is true when the
// service has the label goclaw.agent=true. The response has the shape
// {"agents": [...], "count": N}; "agents" is always a JSON array.
func (h *Handler) SwarmListAgents(w http.ResponseWriter, r *http.Request) {
	services, err := h.docker.ListServices()
	if err != nil {
		respondError(w, http.StatusInternalServerError, "list services: "+err.Error())
		return
	}

	// AgentInfo is the per-service JSON payload of this endpoint.
	type AgentInfo struct {
		ID              string    `json:"id"`
		Name            string    `json:"name"`
		Image           string    `json:"image"`
		DesiredReplicas int       `json:"desiredReplicas"`
		RunningTasks    int       `json:"runningTasks"`
		LastActivity    time.Time `json:"lastActivity"`
		IdleMinutes     float64   `json:"idleMinutes"`
		IsGoClaw        bool      `json:"isGoClaw"`
	}

	// Allocating up front keeps the slice non-nil, so an empty result is
	// encoded as [] rather than null.
	agents := make([]AgentInfo, 0, len(services))
	for _, svc := range services {
		goclaw := svc.Spec.Labels["goclaw.agent"] == "true"

		// Services that are not in replicated mode report 0 desired replicas.
		var replicas int
		if rep := svc.Spec.Mode.Replicated; rep != nil {
			replicas = rep.Replicas
		}

		var runningTasks int
		if status := svc.ServiceStatus; status != nil {
			runningTasks = status.RunningTasks
		}

		// The lookup error is deliberately ignored: when no activity
		// timestamp is available the service's own UpdatedAt is used.
		seen, _ := h.docker.GetServiceLastActivity(svc.ID)
		if seen.IsZero() {
			seen = svc.UpdatedAt
		}

		agents = append(agents, AgentInfo{
			ID:              svc.ID,
			Name:            svc.Spec.Name,
			Image:           svc.Spec.TaskTemplate.ContainerSpec.Image,
			DesiredReplicas: replicas,
			RunningTasks:    runningTasks,
			LastActivity:    seen,
			IdleMinutes:     time.Since(seen).Minutes(),
			IsGoClaw:        goclaw,
		})
	}

	respond(w, http.StatusOK, map[string]any{"agents": agents, "count": len(agents)})
}
// POST /api/swarm/shell
// Execute a shell command on the HOST system (via nsenter into PID 1).
// Body: { "command": "docker ps" }

View File

@@ -0,0 +1,142 @@
// Package api Swarm Agent Lifecycle Manager
//
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
// (which is the Swarm manager node). It watches all agent services and
// automatically scales them to 0 replicas after defaultIdleTimeoutMinutes
// minutes of no activity. The orchestrator can call the SwarmStartAgent /
// SwarmStopAgent REST handlers to start/stop agents on demand.
//
// Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
// Stop flow: POST /api/swarm/agents/{name}/stop → scale to 0
// Auto-stop: background loop checks every 60 s, scales idle agents to 0
package api
import (
"context"
"encoding/json"
"log"
"net/http"
"time"
)
const (
	// defaultIdleTimeoutMinutes is how many minutes may pass without any task
	// updates before an agent is automatically scaled to 0.
	defaultIdleTimeoutMinutes = 15
)

// SwarmManager watches agent services and auto-scales them down after idle.
type SwarmManager struct {
	handler *Handler
	// interval is the period between idle checks. It is stored separately
	// because a time.Ticker does not expose its period, and Start logs it.
	interval time.Duration
	ticker   *time.Ticker
	done     chan struct{}
}

// NewSwarmManager creates a manager that checks every checkInterval.
//
// time.NewTicker panics on a non-positive duration, so a checkInterval <= 0
// falls back to one minute instead of crashing the caller.
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
	if checkInterval <= 0 {
		checkInterval = time.Minute
	}
	return &SwarmManager{
		handler:  h,
		interval: checkInterval,
		ticker:   time.NewTicker(checkInterval),
		done:     make(chan struct{}),
	}
}

// Start runs the background loop and blocks until ctx is cancelled or Stop is
// called. Call in a goroutine. The ticker is stopped when the loop exits.
func (m *SwarmManager) Start(ctx context.Context) {
	// Log the configured interval, not the ticker: a *time.Ticker has no
	// String method and would be rendered as a bad-verb pointer dump.
	log.Printf("[SwarmManager] Started — idle timeout %d min, check every %s",
		defaultIdleTimeoutMinutes, m.interval)
	defer m.ticker.Stop()
	for {
		select {
		case <-m.done:
			return
		case <-ctx.Done():
			return
		case <-m.ticker.C:
			m.checkIdleAgents()
		}
	}
}

// Stop signals the background loop to exit by closing the done channel.
// It must be called at most once: closing an already-closed channel panics.
func (m *SwarmManager) Stop() {
	close(m.done)
}
// checkIdleAgents performs one sweep over the swarm services. Every service
// labelled goclaw.agent=true that still has a non-zero desired replica count
// and has been idle for at least defaultIdleTimeoutMinutes is scaled to 0.
// Failures are logged and never abort the sweep of the remaining services.
func (m *SwarmManager) checkIdleAgents() {
	services, err := m.handler.docker.ListServices()
	if err != nil {
		log.Printf("[SwarmManager] list services error: %v", err)
		return
	}

	const threshold = defaultIdleTimeoutMinutes * time.Minute
	// One timestamp for the whole sweep so all services are judged against
	// the same instant.
	now := time.Now()

	for _, svc := range services {
		// Services without the GoClaw agent label are not managed here.
		if svc.Spec.Labels["goclaw.agent"] != "true" {
			continue
		}

		// Nothing to do for services that are not replicated or are already
		// scaled down to zero.
		replicated := svc.Spec.Mode.Replicated
		if replicated == nil || replicated.Replicas == 0 {
			continue
		}

		// Fall back to the service's own UpdatedAt when the activity lookup
		// fails or yields no timestamp.
		last, lookupErr := m.handler.docker.GetServiceLastActivity(svc.ID)
		if lookupErr != nil || last.IsZero() {
			last = svc.UpdatedAt
		}

		idleFor := now.Sub(last)
		if idleFor < threshold {
			continue
		}

		log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
			svc.Spec.Name, idleFor.Minutes())
		scaleErr := m.handler.docker.ScaleService(svc.ID, 0)
		if scaleErr != nil {
			log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr)
		}
	}
}
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────

// POST /api/swarm/agents/{name}/start
// SwarmStartAgent scales the named agent service up. The optional JSON body
// { "replicas": N } selects the replica count; a missing, malformed or
// non-positive value results in 1 replica.
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
	agent := r.PathValue("name")
	if agent == "" {
		respondError(w, http.StatusBadRequest, "agent name required")
		return
	}

	var req struct {
		Replicas int `json:"replicas"`
	}
	// The body is optional, so a decode failure is deliberately ignored and
	// the default replica count below applies.
	_ = json.NewDecoder(r.Body).Decode(&req)

	replicas := req.Replicas
	if replicas < 1 {
		replicas = 1
	}

	err := h.docker.ScaleService(agent, replicas)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
		return
	}

	log.Printf("[Swarm] Agent '%s' started with %d replica(s)", agent, replicas)
	respond(w, http.StatusOK, map[string]any{"ok": true, "name": agent, "replicas": replicas})
}
// POST /api/swarm/agents/{name}/stop
// SwarmStopAgent scales the named agent service down to 0 replicas. It answers
// 400 when the name is empty and 500 when the Docker client reports an error.
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
	agent := r.PathValue("name")
	if agent == "" {
		respondError(w, http.StatusBadRequest, "agent name required")
		return
	}

	err := h.docker.ScaleService(agent, 0)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
		return
	}

	log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", agent)
	respond(w, http.StatusOK, map[string]any{"ok": true, "name": agent, "replicas": 0})
}