Files
GoClaw/gateway/internal/api/swarm_manager.go
bboxwtf f8e0ca7d5d feat(gateway): restore Phase C full agent lifecycle API
- Restored Phase C gateway code (handlers, main.go, docker client, db)
- Added routes: GET /api/agents/running, POST /api/agents (CRUD),
  POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop,
  POST /api/agents/{id}/restart, POST /api/agents/{id}/scale
- Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id}
  even when serviceName is empty in DB
- Fixed DeployAgent: handle 409 conflict by removing existing container
  and retrying once (idempotent deploy)
- Added swarm_manager.go: background SwarmManager for dead-letter recovery
- Added AGENT_NETWORK and AGENT_DB_URL config options
- Updated .gitignore to exclude gateway binaries
- All agents use standalone docker run (not Swarm) on bridge network

Verified on prod: deploy/stop/restart cycle works correctly,
/api/agents/running returns live running agents with containerStatus
2026-04-19 11:40:39 +00:00

197 lines
6.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package api Swarm Agent Lifecycle Manager
//
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
// (which is the Swarm manager node). It watches all agent services and
// automatically scales them to 0 replicas after IdleTimeout minutes of no
// activity. The orchestrator can call StartAgent / StopAgent via the REST API
// to start/stop agents on demand.
//
// Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
// Stop flow: POST /api/swarm/agents/{name}/stop → scale to 0
// Auto-stop: background loop checks every 60 s, scales idle agents to 0
package api
import (
"context"
"encoding/json"
"log"
"net/http"
"time"
)
const (
// IdleTimeout how many minutes without any task updates before an agent
// is automatically scaled to 0.
defaultIdleTimeoutMinutes = 15
// deadLetterCheckEnabled when true, SwarmManager reconciles DB containerStatus
// with actual Swarm service existence (dead-letter recovery).
deadLetterCheckEnabled = true
)
// SwarmManager watches agent services and auto-scales them down after idle.
type SwarmManager struct {
handler *Handler
ticker *time.Ticker
done chan struct{}
}
// NewSwarmManager creates a manager that checks every checkInterval.
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
return &SwarmManager{
handler: h,
ticker: time.NewTicker(checkInterval),
done: make(chan struct{}),
}
}
// Start launches the background loop. Call in a goroutine.
func (m *SwarmManager) Start(ctx context.Context) {
log.Printf("[SwarmManager] Started — idle timeout %d min",
defaultIdleTimeoutMinutes)
defer m.ticker.Stop()
for {
select {
case <-m.done:
return
case <-ctx.Done():
return
case <-m.ticker.C:
m.checkIdleAgents()
}
}
}
// Stop signals the background loop to exit.
func (m *SwarmManager) Stop() {
close(m.done)
}
func (m *SwarmManager) checkIdleAgents() {
h := m.handler
if h.docker == nil {
return
}
// Build a lookup set of currently-live container/service names (both standalone + Swarm)
liveContainers := make(map[string]bool)
// Check standalone containers
if containers, err := h.docker.ListContainers(); err == nil {
for _, c := range containers {
for _, name := range c.Names {
// Docker container names are prefixed with "/"
clean := name
if len(clean) > 0 && clean[0] == '/' {
clean = clean[1:]
}
if c.Labels["goclaw.agent"] == "true" {
liveContainers[clean] = true
}
}
}
}
// Check Swarm services (for legacy/mixed environments)
services, err := h.docker.ListServices()
if err != nil {
log.Printf("[SwarmManager] list services error: %v", err)
}
idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
now := time.Now()
for _, svc := range services {
// Only manage services labelled as GoClaw agents
if svc.Spec.Labels["goclaw.agent"] != "true" {
continue
}
liveContainers[svc.Spec.Name] = true
// Skip already-stopped services (0 desired replicas)
desired := 0
if svc.Spec.Mode.Replicated != nil {
desired = svc.Spec.Mode.Replicated.Replicas
}
if desired == 0 {
continue
}
// Check last activity time
lastActivity, actErr := h.docker.GetServiceLastActivity(svc.ID)
if actErr != nil || lastActivity.IsZero() {
lastActivity = svc.UpdatedAt
}
idle := now.Sub(lastActivity)
if idle >= idleThreshold {
log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
svc.Spec.Name, idle.Minutes())
if scaleErr := h.docker.ScaleService(svc.ID, 0); scaleErr != nil {
log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr)
}
}
}
// ── Dead-letter reconciliation (Phase C) ─────────────────────────────────
// If an agent's DB says "running" but its container/service is gone, update
// the status to "error" so the UI shows the discrepancy and allows redeploy.
if !deadLetterCheckEnabled || h.db == nil {
return
}
agents, dbErr := h.db.ListAgents()
if dbErr != nil {
return
}
for _, a := range agents {
if a.ContainerStatus != "running" || a.ServiceName == "" {
continue
}
if !liveContainers[a.ServiceName] {
log.Printf("[SwarmManager] Dead-letter: agent %d (%q) marked running but container %q not found — setting status=error",
a.ID, a.Name, a.ServiceName)
if updateErr := h.db.UpdateContainerStatus(a.ID, "error", a.ServiceName, a.ServicePort); updateErr != nil {
log.Printf("[SwarmManager] UpdateContainerStatus error for agent %d: %v", a.ID, updateErr)
}
}
}
}
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
// POST /api/swarm/agents/{name}/start
// Start (scale-up) a named agent service. Body: { "replicas": 1 }
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
name := r.PathValue("name")
if name == "" {
respondError(w, http.StatusBadRequest, "agent name required")
return
}
var body struct {
Replicas int `json:"replicas"`
}
_ = json.NewDecoder(r.Body).Decode(&body)
if body.Replicas <= 0 {
body.Replicas = 1
}
if err := h.docker.ScaleService(name, body.Replicas); err != nil {
respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
return
}
log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
}
// POST /api/swarm/agents/{name}/stop
// Stop (scale-to-0) a named agent service.
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
name := r.PathValue("name")
if name == "" {
respondError(w, http.StatusBadRequest, "agent name required")
return
}
if err := h.docker.ScaleService(name, 0); err != nil {
respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
return
}
log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
}