- Restored Phase C gateway code (handlers, main.go, docker client, db)
- Added routes: GET /api/agents/running, POST /api/agents (CRUD),
POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop,
POST /api/agents/{id}/restart, POST /api/agents/{id}/scale
- Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id}
even when serviceName is empty in DB
- Fixed DeployAgent: handle 409 conflict by removing existing container
and retrying once (idempotent deploy)
- Added swarm_manager.go: background SwarmManager for dead-letter recovery
- Added AGENT_NETWORK and AGENT_DB_URL config options
- Updated .gitignore to exclude gateway binaries
- All agents use standalone docker run (not Swarm) on bridge network
Verified on prod: deploy/stop/restart cycle works correctly,
/api/agents/running returns live running agents with containerStatus
197 lines
6.1 KiB
Go
197 lines
6.1 KiB
Go
// Package api – Swarm Agent Lifecycle Manager
//
// The SwarmManager runs as a background goroutine inside the GoClaw Gateway
// (which is the Swarm manager node). It watches all agent services and
// automatically scales them to 0 replicas after defaultIdleTimeoutMinutes
// minutes of no activity. The orchestrator can start/stop agents on demand
// through the REST API, served by SwarmStartAgent / SwarmStopAgent.
//
// Each pass of the background loop also performs dead-letter reconciliation:
// an agent whose DB record says "running" but whose container/service no
// longer exists is marked "error".
//
//	Start flow: POST /api/swarm/agents/{name}/start → scale to N replicas (default 1)
//	Stop flow:  POST /api/swarm/agents/{name}/stop  → scale to 0
//	Auto-stop:  background loop checks every 60 s, scales idle agents to 0
|
||
package api
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"log"
|
||
"net/http"
|
||
"time"
|
||
)
|
||
|
||
const (
	// defaultIdleTimeoutMinutes is the number of minutes an agent may go
	// without any task updates before it is automatically scaled to 0.
	defaultIdleTimeoutMinutes = 15

	// deadLetterCheckEnabled switches dead-letter recovery on: when true, the
	// SwarmManager reconciles the DB containerStatus with whether the agent's
	// container/service actually exists.
	deadLetterCheckEnabled = true
)
|
||
|
||
// SwarmManager watches agent services and auto-scales them down after idle.
|
||
type SwarmManager struct {
|
||
handler *Handler
|
||
ticker *time.Ticker
|
||
done chan struct{}
|
||
}
|
||
|
||
// NewSwarmManager creates a manager that checks every checkInterval.
|
||
func NewSwarmManager(h *Handler, checkInterval time.Duration) *SwarmManager {
|
||
return &SwarmManager{
|
||
handler: h,
|
||
ticker: time.NewTicker(checkInterval),
|
||
done: make(chan struct{}),
|
||
}
|
||
}
|
||
|
||
// Start launches the background loop. Call in a goroutine.
|
||
func (m *SwarmManager) Start(ctx context.Context) {
|
||
log.Printf("[SwarmManager] Started — idle timeout %d min",
|
||
defaultIdleTimeoutMinutes)
|
||
defer m.ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-m.done:
|
||
return
|
||
case <-ctx.Done():
|
||
return
|
||
case <-m.ticker.C:
|
||
m.checkIdleAgents()
|
||
}
|
||
}
|
||
}
|
||
|
||
// Stop signals the background loop to exit.
|
||
func (m *SwarmManager) Stop() {
|
||
close(m.done)
|
||
}
|
||
|
||
func (m *SwarmManager) checkIdleAgents() {
|
||
h := m.handler
|
||
if h.docker == nil {
|
||
return
|
||
}
|
||
|
||
// Build a lookup set of currently-live container/service names (both standalone + Swarm)
|
||
liveContainers := make(map[string]bool)
|
||
|
||
// Check standalone containers
|
||
if containers, err := h.docker.ListContainers(); err == nil {
|
||
for _, c := range containers {
|
||
for _, name := range c.Names {
|
||
// Docker container names are prefixed with "/"
|
||
clean := name
|
||
if len(clean) > 0 && clean[0] == '/' {
|
||
clean = clean[1:]
|
||
}
|
||
if c.Labels["goclaw.agent"] == "true" {
|
||
liveContainers[clean] = true
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Check Swarm services (for legacy/mixed environments)
|
||
services, err := h.docker.ListServices()
|
||
if err != nil {
|
||
log.Printf("[SwarmManager] list services error: %v", err)
|
||
}
|
||
|
||
idleThreshold := time.Duration(defaultIdleTimeoutMinutes) * time.Minute
|
||
now := time.Now()
|
||
|
||
for _, svc := range services {
|
||
// Only manage services labelled as GoClaw agents
|
||
if svc.Spec.Labels["goclaw.agent"] != "true" {
|
||
continue
|
||
}
|
||
liveContainers[svc.Spec.Name] = true
|
||
|
||
// Skip already-stopped services (0 desired replicas)
|
||
desired := 0
|
||
if svc.Spec.Mode.Replicated != nil {
|
||
desired = svc.Spec.Mode.Replicated.Replicas
|
||
}
|
||
if desired == 0 {
|
||
continue
|
||
}
|
||
// Check last activity time
|
||
lastActivity, actErr := h.docker.GetServiceLastActivity(svc.ID)
|
||
if actErr != nil || lastActivity.IsZero() {
|
||
lastActivity = svc.UpdatedAt
|
||
}
|
||
idle := now.Sub(lastActivity)
|
||
if idle >= idleThreshold {
|
||
log.Printf("[SwarmManager] Agent '%s' idle for %.1f min → scaling to 0",
|
||
svc.Spec.Name, idle.Minutes())
|
||
if scaleErr := h.docker.ScaleService(svc.ID, 0); scaleErr != nil {
|
||
log.Printf("[SwarmManager] scale-to-0 error for %s: %v", svc.Spec.Name, scaleErr)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ── Dead-letter reconciliation (Phase C) ─────────────────────────────────
|
||
// If an agent's DB says "running" but its container/service is gone, update
|
||
// the status to "error" so the UI shows the discrepancy and allows redeploy.
|
||
if !deadLetterCheckEnabled || h.db == nil {
|
||
return
|
||
}
|
||
agents, dbErr := h.db.ListAgents()
|
||
if dbErr != nil {
|
||
return
|
||
}
|
||
for _, a := range agents {
|
||
if a.ContainerStatus != "running" || a.ServiceName == "" {
|
||
continue
|
||
}
|
||
if !liveContainers[a.ServiceName] {
|
||
log.Printf("[SwarmManager] Dead-letter: agent %d (%q) marked running but container %q not found — setting status=error",
|
||
a.ID, a.Name, a.ServiceName)
|
||
if updateErr := h.db.UpdateContainerStatus(a.ID, "error", a.ServiceName, a.ServicePort); updateErr != nil {
|
||
log.Printf("[SwarmManager] UpdateContainerStatus error for agent %d: %v", a.ID, updateErr)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── HTTP Handlers for agent lifecycle ────────────────────────────────────────
|
||
|
||
// POST /api/swarm/agents/{name}/start
|
||
// Start (scale-up) a named agent service. Body: { "replicas": 1 }
|
||
func (h *Handler) SwarmStartAgent(w http.ResponseWriter, r *http.Request) {
|
||
name := r.PathValue("name")
|
||
if name == "" {
|
||
respondError(w, http.StatusBadRequest, "agent name required")
|
||
return
|
||
}
|
||
var body struct {
|
||
Replicas int `json:"replicas"`
|
||
}
|
||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||
if body.Replicas <= 0 {
|
||
body.Replicas = 1
|
||
}
|
||
if err := h.docker.ScaleService(name, body.Replicas); err != nil {
|
||
respondError(w, http.StatusInternalServerError, "start agent: "+err.Error())
|
||
return
|
||
}
|
||
log.Printf("[Swarm] Agent '%s' started with %d replica(s)", name, body.Replicas)
|
||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": body.Replicas})
|
||
}
|
||
|
||
// POST /api/swarm/agents/{name}/stop
|
||
// Stop (scale-to-0) a named agent service.
|
||
func (h *Handler) SwarmStopAgent(w http.ResponseWriter, r *http.Request) {
|
||
name := r.PathValue("name")
|
||
if name == "" {
|
||
respondError(w, http.StatusBadRequest, "agent name required")
|
||
return
|
||
}
|
||
if err := h.docker.ScaleService(name, 0); err != nil {
|
||
respondError(w, http.StatusInternalServerError, "stop agent: "+err.Error())
|
||
return
|
||
}
|
||
log.Printf("[Swarm] Agent '%s' stopped (scaled to 0)", name)
|
||
respond(w, http.StatusOK, map[string]any{"ok": true, "name": name, "replicas": 0})
|
||
}
|