Files
GoClaw/gateway/internal/api/swarm_manager.go
bboxwtf a8a8ea1ee2 feat(swarm): autonomous agent containers, Swarm Manager with auto-stop, /nodes UI overhaul
## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence

## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
  * GET /health — liveness probe with idle time info
  * POST /task — receives task, forwards to Gateway orchestrator
  * GET /info  — agent metadata (id, hostname, gateway url)
  * Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
  * Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
  * Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES

## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
  * SwarmManager goroutine checks every 60s
  * Scales idle GoClaw agent services to 0 replicas after 15 min
  * Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
  * GET  /api/swarm/agents           — list agents with idleMinutes
  * POST /api/swarm/agents/{name}/start — scale up agent
  * POST /api/swarm/agents/{name}/stop  — scale to 0
  * DELETE /api/swarm/services/{id}     — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel

## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection

## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent

## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
2026-03-21 20:37:21 +00:00

143 lines
4.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package api — Swarm Agent Lifecycle Manager.
//
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
// (which is the Swarm manager node). It watches all agent services and
// automatically scales them to 0 replicas after IdleTimeout minutes of no
// activity. The orchestrator can call StartAgent / StopAgent via the REST API
// to start/stop agents on demand.
//
// Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
// Stop flow: POST /api/swarm/agents/{name}/stop → scale to 0
// Auto-stop: background loop checks every 60 s, scales idle agents to 0
package api
import (
"context"
"encoding/json"
"log"
"net/http"
"time"
)
// defaultIdleTimeoutMinutes is the number of minutes an agent service may go
// without any recorded activity before the idle sweep scales it to 0 replicas.
const defaultIdleTimeoutMinutes = 15
// SwarmManager watches agent services and auto-scales them down after idle.
//
// Create one with NewSwarmManager, run Start in its own goroutine, and end
// the loop either by cancelling the context passed to Start or by calling
// Stop.
type SwarmManager struct {
	handler  *Handler      // handler whose docker client performs every Swarm call
	ticker   *time.Ticker  // paces the idle sweeps; stopped when Start returns
	interval time.Duration // period of ticker, kept so it can be logged
	done     chan struct{} // closed by Stop to make Start return
}

// NewSwarmManager creates a manager that runs an idle sweep every
// checkInterval once Start is called.
//
// time.NewTicker panics on a non-positive duration, so a checkInterval of
// zero or less is replaced by 60 seconds instead of crashing the caller.
// The ticker is created here and released when Start returns.
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
	const fallbackInterval = 60 * time.Second
	if checkInterval <= 0 {
		checkInterval = fallbackInterval
	}
	return &SwarmManager{
		handler:  h,
		ticker:   time.NewTicker(checkInterval),
		interval: checkInterval,
		done:     make(chan struct{}),
	}
}

// Start runs the background loop and blocks until ctx is cancelled or Stop is
// called, so call it in its own goroutine. Every tick triggers one
// checkIdleAgents sweep; the ticker is stopped on the way out.
func (m *SwarmManager) Start(ctx context.Context) {
	// Log the stored interval: *time.Ticker has no String method, so passing
	// the ticker itself to %s prints struct internals instead of a duration.
	log.Printf("[SwarmManager] Started — idle timeout %d min, check every %s",
		defaultIdleTimeoutMinutes, m.interval)
	defer m.ticker.Stop()
	for {
		select {
		case <-m.done:
			return
		case <-ctx.Done():
			return
		case <-m.ticker.C:
			m.checkIdleAgents()
		}
	}
}

// Stop signals the background loop to exit. Calling it again after the first
// call is a no-op rather than a "close of closed channel" panic. The guard is
// not synchronized, so Stop must not be called from several goroutines at the
// same time.
func (m *SwarmManager) Stop() {
	select {
	case <-m.done:
		// Already closed by an earlier Stop; nothing left to do.
	default:
		close(m.done)
	}
}
// checkIdleAgents performs a single idle sweep over the Swarm services.
//
// Services without the label goclaw.agent=true, and services that have no
// replicated mode or already have 0 desired replicas, are left alone. For
// every remaining service the last activity time comes from
// GetServiceLastActivity, falling back to the service's UpdatedAt when that
// call fails or returns the zero time. A service idle for at least
// defaultIdleTimeoutMinutes is scaled to 0 replicas. Errors are only logged:
// a failed listing aborts the sweep, a failed scale-down does not.
func (m *SwarmManager) checkIdleAgents() {
	services, err := m.handler.docker.ListServices()
	if err != nil {
		log.Printf("[SwarmManager] list services error: %v", err)
		return
	}

	const maxIdle = defaultIdleTimeoutMinutes * time.Minute
	sweepStart := time.Now()

	for _, service := range services {
		// Anything not labelled as a GoClaw agent is outside our remit.
		if service.Spec.Labels["goclaw.agent"] != "true" {
			continue
		}

		// Nothing to stop when the service has no running replicas requested.
		replicated := service.Spec.Mode.Replicated
		if replicated == nil || replicated.Replicas == 0 {
			continue
		}

		// Prefer the task-derived timestamp; keep UpdatedAt when it is
		// unavailable or unset.
		lastSeen := service.UpdatedAt
		if ts, err := m.handler.docker.GetServiceLastActivity(service.ID); err == nil && !ts.IsZero() {
			lastSeen = ts
		}

		idleFor := sweepStart.Sub(lastSeen)
		if idleFor < maxIdle {
			continue
		}

		log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
			service.Spec.Name, idleFor.Minutes())
		scaleErr := m.handler.docker.ScaleService(service.ID, 0)
		if scaleErr != nil {
			log.Printf("[SwarmManager] scale-to-0 error for %s: %v", service.Spec.Name, scaleErr)
		}
	}
}
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
// SwarmStartAgent handles POST /api/swarm/agents/{name}/start.
//
// It scales the service named by the {name} path value up to the replica
// count given in the optional JSON body ({ "replicas": N }). A count that
// ends up zero or negative, including when the body is missing or cannot be
// decoded, is replaced by 1. An empty name is reported with
// http.StatusBadRequest and a scaling failure with
// http.StatusInternalServerError; on success the reply is http.StatusOK with
// the keys "ok", "name" and "replicas".
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
	agent := r.PathValue("name")
	if agent == "" {
		respondError(w, http.StatusBadRequest, "agent name required")
		return
	}

	// The body is optional, so a decode error is deliberately discarded and
	// the default below takes over whenever no usable count was provided.
	var req struct {
		Replicas int `json:"replicas"`
	}
	_ = json.NewDecoder(r.Body).Decode(&req)

	replicas := req.Replicas
	if replicas < 1 {
		replicas = 1
	}

	err := h.docker.ScaleService(agent, replicas)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
		return
	}

	log.Printf("[Swarm] Agent '%s' started with %d replica(s)", agent, replicas)
	respond(w, http.StatusOK, map[string]any{"ok": true, "name": agent, "replicas": replicas})
}
// SwarmStopAgent handles POST /api/swarm/agents/{name}/stop.
//
// It scales the service named by the {name} path value down to 0 replicas.
// An empty name is reported with http.StatusBadRequest and a scaling failure
// with http.StatusInternalServerError; on success the reply is http.StatusOK
// with the keys "ok", "name" and "replicas" (always 0).
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
	agent := r.PathValue("name")
	if agent == "" {
		respondError(w, http.StatusBadRequest, "agent name required")
		return
	}

	// Stopping an agent means requesting zero replicas of its service.
	const stopped = 0
	err := h.docker.ScaleService(agent, stopped)
	if err != nil {
		respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
		return
	}

	log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", agent)
	respond(w, http.StatusOK, map[string]any{"ok": true, "name": agent, "replicas": stopped})
}