## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence
## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
* GET /health — liveness probe with idle time info
* POST /task — receives task, forwards to Gateway orchestrator
* GET /info — agent metadata (id, hostname, gateway url)
* Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
* Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
* Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES
## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
* SwarmManager goroutine checks every 60s
* Scales idle GoClaw agent services to 0 replicas after 15 min
* Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
* GET /api/swarm/agents — list agents with idleMinutes
* POST /api/swarm/agents/{name}/start — scale up agent
* POST /api/swarm/agents/{name}/stop — scale to 0
* DELETE /api/swarm/services/{id} — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel
## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection
## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent
## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
143 lines
4.3 KiB
Go
143 lines
4.3 KiB
Go
// Package api – Swarm Agent Lifecycle Manager
//
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
// (which is the Swarm manager node). It watches all agent services and
// automatically scales them to 0 replicas after defaultIdleTimeoutMinutes
// minutes without activity. The orchestrator can start and stop agents on
// demand through the REST API (handlers SwarmStartAgent and SwarmStopAgent).
//
//	Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
//	Stop flow:  POST /api/swarm/agents/{name}/stop  → scale to 0
//	Auto-stop:  background loop checks every 60 s, scales idle agents to 0
package api
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"log"
|
||
"net/http"
|
||
"time"
|
||
)
|
||
|
||
// defaultIdleTimeoutMinutes is the number of minutes an agent may go without
// any task updates before it is automatically scaled to 0 replicas.
const defaultIdleTimeoutMinutes = 15
|
||
|
||
// SwarmManager watches agent services and auto-scales them down after idle.
|
||
type SwarmManager struct {
|
||
handler *Handler
|
||
ticker *time.Ticker
|
||
done chan struct{}
|
||
}
|
||
|
||
// NewSwarmManager creates a manager that checks every checkInterval.
|
||
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
|
||
return &SwarmManager{
|
||
handler: h,
|
||
ticker: time.NewTicker(checkInterval),
|
||
done: make(chan struct{}),
|
||
}
|
||
}
|
||
|
||
// Start launches the background loop. Call in a goroutine.
|
||
func (m *SwarmManager) Start(ctx context.Context) {
|
||
log.Printf("[SwarmManager] Started — idle timeout %d min, check every %s",
|
||
defaultIdleTimeoutMinutes, m.ticker)
|
||
defer m.ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-m.done:
|
||
return
|
||
case <-ctx.Done():
|
||
return
|
||
case <-m.ticker.C:
|
||
m.checkIdleAgents()
|
||
}
|
||
}
|
||
}
|
||
|
||
// Stop signals the background loop to exit.
|
||
func (m *SwarmManager) Stop() {
|
||
close(m.done)
|
||
}
|
||
|
||
func (m *SwarmManager) checkIdleAgents() {
|
||
services, err := m.handler.docker.ListServices()
|
||
if err != nil {
|
||
log.Printf("[SwarmManager] list services error: %v", err)
|
||
return
|
||
}
|
||
idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
|
||
now := time.Now()
|
||
for _, svc := range services {
|
||
// Only manage services labelled as GoClaw agents
|
||
if svc.Spec.Labels["goclaw.agent"] != "true" {
|
||
continue
|
||
}
|
||
// Skip already-stopped services (0 desired replicas)
|
||
desired := 0
|
||
if svc.Spec.Mode.Replicated != nil {
|
||
desired = svc.Spec.Mode.Replicated.Replicas
|
||
}
|
||
if desired == 0 {
|
||
continue
|
||
}
|
||
// Check last activity time
|
||
lastActivity, err := m.handler.docker.GetServiceLastActivity(svc.ID)
|
||
if err != nil || lastActivity.IsZero() {
|
||
lastActivity = svc.UpdatedAt
|
||
}
|
||
idle := now.Sub(lastActivity)
|
||
if idle >= idleThreshold {
|
||
log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
|
||
svc.Spec.Name, idle.Minutes())
|
||
if err := m.handler.docker.ScaleService(svc.ID, 0); err != nil {
|
||
log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, err)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
|
||
|
||
// POST /api/swarm/agents/{name}/start
|
||
// Start (scale-up) a named agent service. Body: { "replicas": 1 }
|
||
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
|
||
name := r.PathValue("name")
|
||
if name == "" {
|
||
respondError(w, http.StatusBadRequest, "agent name required")
|
||
return
|
||
}
|
||
var body struct {
|
||
Replicas int `json:"replicas"`
|
||
}
|
||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||
if body.Replicas <= 0 {
|
||
body.Replicas = 1
|
||
}
|
||
if err := h.docker.ScaleService(name, body.Replicas); err != nil {
|
||
respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
|
||
return
|
||
}
|
||
log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
|
||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
|
||
}
|
||
|
||
// POST /api/swarm/agents/{name}/stop
|
||
// Stop (scale-to-0) a named agent service.
|
||
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
|
||
name := r.PathValue("name")
|
||
if name == "" {
|
||
respondError(w, http.StatusBadRequest, "agent name required")
|
||
return
|
||
}
|
||
if err := h.docker.ScaleService(name, 0); err != nil {
|
||
respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
|
||
return
|
||
}
|
||
log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
|
||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
|
||
}
|