Files
GoClaw/gateway/cmd/agent/main.go
bboxwtf a8a8ea1ee2 feat(swarm): autonomous agent containers, Swarm Manager with auto-stop, /nodes UI overhaul
## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence

## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
  * GET /health — liveness probe with idle time info
  * POST /task — receives task, forwards to Gateway orchestrator
  * GET /info  — agent metadata (id, hostname, gateway url)
  * Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
  * Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
  * Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES

## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
  * SwarmManager goroutine checks every 60s
  * Scales idle GoClaw agent services to 0 replicas after 15 min
  * Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
  * GET  /api/swarm/agents           — list agents with idleMinutes
  * POST /api/swarm/agents/{name}/start — scale up agent
  * POST /api/swarm/agents/{name}/stop  — scale to 0
  * DELETE /api/swarm/services/{id}     — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel

## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection

## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent

## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
2026-03-21 20:37:21 +00:00

271 lines
8.8 KiB
Go

// GoClaw Agent Server — autonomous agent microservice
//
// Each agent runs as an independent container in the Docker Swarm overlay
// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
//
// The agent:
// - Receives task requests from the orchestrator
// - Calls the LLM via the centrally-managed GoClaw Gateway
// - Is handed the shared MySQL database's URL via DATABASE_URL (this file only loads it into AgentConfig; no queries are issued here)
// - Reports its last-activity time so the SwarmManager can auto-stop it
// - Gracefully shuts down after IdleTimeout with no requests
package main
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
)
// ─── Config ──────────────────────────────────────────────────────────────────
// defaultIdleTimeoutMinutes is the idle timeout used when
// IDLE_TIMEOUT_MINUTES is unset or does not hold a positive integer.
const defaultIdleTimeoutMinutes = 15

// AgentConfig holds the agent's runtime settings. loadConfig fills every
// field from the process environment.
type AgentConfig struct {
	// AgentID comes from AGENT_ID (default "unnamed-agent").
	AgentID string
	// Port is the HTTP listen port from AGENT_PORT (default "8080").
	Port string
	// GatewayURL is the gateway base URL from GATEWAY_URL, stored without
	// trailing slashes.
	GatewayURL string
	// LLMURL comes from LLM_BASE_URL (default "https://ollama.com/v1").
	LLMURL string
	// LLMAPIKey comes from LLM_API_KEY; empty when unset.
	LLMAPIKey string
	// DatabaseURL comes from DATABASE_URL; empty when unset.
	DatabaseURL string
	// IdleTimeoutMinutes comes from IDLE_TIMEOUT_MINUTES and is always > 0
	// when produced by loadConfig.
	IdleTimeoutMinutes int
}

// loadConfig reads the agent configuration from environment variables,
// substituting defaults for anything unset.
//
// IDLE_TIMEOUT_MINUTES must be a positive integer. A non-numeric, zero or
// negative value is logged and replaced by defaultIdleTimeoutMinutes, because
// a non-positive timeout would make the idle check true on its first
// evaluation and shut a freshly started agent down.
func loadConfig() AgentConfig {
	idleMin := defaultIdleTimeoutMinutes
	if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
		n, err := strconv.Atoi(v)
		switch {
		case err != nil:
			log.Printf("[Agent] ignoring IDLE_TIMEOUT_MINUTES=%q (not an integer); using %d", v, idleMin)
		case n <= 0:
			log.Printf("[Agent] ignoring IDLE_TIMEOUT_MINUTES=%q (must be > 0); using %d", v, idleMin)
		default:
			idleMin = n
		}
	}

	// Endpoint paths are appended to the base URL with a leading "/", so strip
	// trailing slashes here to avoid producing "//api/..." request paths.
	gateway := strings.TrimRight(getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"), "/")

	return AgentConfig{
		AgentID:            getEnv("AGENT_ID", "unnamed-agent"),
		Port:               getEnv("AGENT_PORT", "8080"),
		GatewayURL:         gateway,
		LLMURL:             getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
		LLMAPIKey:          os.Getenv("LLM_API_KEY"),
		DatabaseURL:        os.Getenv("DATABASE_URL"),
		IdleTimeoutMinutes: idleMin,
	}
}

// getEnv returns the value of the environment variable key, or fallback when
// the variable is unset or empty.
func getEnv(key, fallback string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return fallback
}
// ─── State ───────────────────────────────────────────────────────────────────
// Agent is the state shared by the HTTP handlers and the idle watchdog.
//
// NOTE(review): lastActivity is assigned by touch (called from the /task
// handler) and read by the health/info handlers and the watchdog goroutine
// with no synchronization visible in this file — confirm whether it needs a
// mutex or an atomic.
type Agent struct {
	// httpClient is the single client used for all outbound gateway calls.
	httpClient *http.Client
	// lastActivity is the time of the most recent touch.
	lastActivity time.Time
	// cfg is the configuration the agent was constructed with.
	cfg AgentConfig
}
// NewAgent builds an Agent for cfg. The activity clock starts at construction
// time, and all outbound HTTP calls share one client with a 120-second
// timeout.
func NewAgent(cfg AgentConfig) *Agent {
	client := &http.Client{Timeout: 120 * time.Second}

	agent := new(Agent)
	agent.cfg = cfg
	agent.httpClient = client
	agent.lastActivity = time.Now()
	return agent
}
// touch records the current time as the agent's most recent activity.
func (a *Agent) touch() {
	now := time.Now()
	a.lastActivity = now
}
// ─── HTTP handlers ────────────────────────────────────────────────────────────
// handleHealth serves GET /health, the liveness probe.
//
// It always answers 200 with the agent ID, the last-activity timestamp in
// RFC 3339 form and the idle time in fractional minutes. It does not call
// touch, so probing does not reset the idle clock.
func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
	payload := map[string]any{"ok": true}
	payload["agentId"] = a.cfg.AgentID
	payload["lastActivity"] = a.lastActivity.Format(time.RFC3339)
	payload["idleMinutes"] = time.Since(a.lastActivity).Minutes()
	respond(w, http.StatusOK, payload)
}
// handleTask serves POST /task: it accepts a task from the orchestrator and
// relays it to the gateway's /api/orchestrator/chat endpoint.
//
// Request body:
//
//	{ "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
//
// Only messages, model and maxIter are forwarded; sessionId is echoed back in
// the reply. The idle clock is touched on entry and again after a successful
// round trip.
//
// Responses:
//   - 200 with {ok, agentId, sessionId, result} when the gateway answers 2xx
//     with a JSON object.
//   - 400 when the request body is not valid JSON.
//   - 500 when the outbound request cannot be built.
//   - 502 when the gateway is unreachable, answers with a non-2xx status, or
//     returns a body that does not decode as a JSON object.
func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
	a.touch()

	var body struct {
		SessionID string          `json:"sessionId"`
		Messages  json.RawMessage `json:"messages"`
		Model     string          `json:"model"`
		MaxIter   int             `json:"maxIter"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		respondError(w, 400, "invalid request: "+err.Error())
		return
	}

	// Forward the task to the GoClaw Gateway orchestrator.
	gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
	reqBody, err := json.Marshal(map[string]any{
		"messages": body.Messages,
		"model":    body.Model,
		"maxIter":  body.MaxIter,
	})
	if err != nil {
		respondError(w, 500, "request build error: "+err.Error())
		return
	}

	// Tie the outbound call to the inbound request's context so it is
	// abandoned if the caller goes away.
	req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
	if err != nil {
		respondError(w, 500, "request build error: "+err.Error())
		return
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := a.httpClient.Do(req)
	if err != nil {
		respondError(w, 502, "gateway error: "+err.Error())
		return
	}
	defer resp.Body.Close()

	// A gateway failure must not be reported to the caller as ok:true just
	// because its error body happens to be valid JSON.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Best-effort: the snippet only enriches the error message, so a
		// read error is deliberately ignored and the size is capped.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		respondError(w, 502, fmt.Sprintf("gateway returned status %d: %s",
			resp.StatusCode, strings.TrimSpace(string(snippet))))
		return
	}

	var result map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		respondError(w, 502, "gateway response error: "+err.Error())
		return
	}

	a.touch()
	respond(w, 200, map[string]any{
		"ok":        true,
		"agentId":   a.cfg.AgentID,
		"sessionId": body.SessionID,
		"result":    result,
	})
}
// handleInfo serves GET /info and returns the agent's metadata: ID, hostname,
// gateway URL, configured idle timeout (minutes), last-activity timestamp
// (RFC 3339) and current idle time in fractional minutes. Always answers 200.
func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
	// The error is ignored on purpose; the field then carries whatever name
	// os.Hostname returned alongside it.
	host, _ := os.Hostname()

	info := make(map[string]any, 6)
	info["agentId"] = a.cfg.AgentID
	info["hostname"] = host
	info["gatewayUrl"] = a.cfg.GatewayURL
	info["idleTimeout"] = a.cfg.IdleTimeoutMinutes
	info["lastActivity"] = a.lastActivity.Format(time.RFC3339)
	info["idleMinutes"] = time.Since(a.lastActivity).Minutes()
	respond(w, http.StatusOK, info)
}
// ─── Idle watchdog ────────────────────────────────────────────────────────────
// runIdleWatchdog checks every 30 seconds how long the agent has been idle.
// Once the idle time reaches the configured IdleTimeoutMinutes it asks the
// gateway to stop this agent (selfStop), invokes cancel, and returns. It
// blocks until then, so callers run it in its own goroutine.
func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
	limit := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute

	tick := time.NewTicker(30 * time.Second)
	defer tick.Stop()

	for {
		<-tick.C

		idleFor := time.Since(a.lastActivity)
		if idleFor < limit {
			continue
		}

		log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
			a.cfg.AgentID, idleFor.Minutes())
		a.selfStop()
		cancel()
		return
	}
}
// selfStop POSTs to the gateway's /api/swarm/agents/{AgentID}/stop endpoint
// to request that this agent's service be stopped. It is best-effort: any
// failure is logged and the method returns normally; on success the response
// status and body are logged.
func (a *Agent) selfStop() {
	id := a.cfg.AgentID
	endpoint := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, id)

	req, err := http.NewRequest(http.MethodPost, endpoint, nil)
	if err != nil {
		log.Printf("[Agent %s] selfStop error building request: %v", id, err)
		return
	}

	resp, err := a.httpClient.Do(req)
	if err != nil {
		log.Printf("[Agent %s] selfStop error: %v", id, err)
		return
	}
	defer resp.Body.Close()

	// The body is only used for the log line, so a read error is ignored and
	// whatever was read is printed.
	payload, _ := io.ReadAll(resp.Body)
	log.Printf("[Agent %s] selfStop response %d: %s", id, resp.StatusCode, string(payload))
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
// respond writes data to w as a JSON document with the given HTTP status and
// a Content-Type of application/json. The body is the JSON encoding of data
// followed by a newline.
//
// The payload is encoded before any header is written. If encoding fails,
// the error is logged and the client receives a 500 with a generic JSON error
// object instead of the requested status with an empty body. A failed write
// to the client is logged.
func respond(w http.ResponseWriter, status int, data any) {
	payload, err := json.Marshal(data)
	if err != nil {
		log.Printf("respond: encoding %T response: %v", data, err)
		status = http.StatusInternalServerError
		payload = []byte(`{"error":"internal error: response could not be encoded"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	if _, err := w.Write(append(payload, '\n')); err != nil {
		log.Printf("respond: writing response: %v", err)
	}
}
// respondError sends msg to the client as the JSON object {"error": msg}
// with the given HTTP status.
func respondError(w http.ResponseWriter, status int, msg string) {
	payload := map[string]any{"error": msg}
	respond(w, status, payload)
}
// ─── Main ─────────────────────────────────────────────────────────────────────
// main loads the configuration, starts the HTTP server and the idle watchdog,
// then blocks until either SIGINT/SIGTERM arrives or the watchdog cancels the
// stop context. It then shuts the server down, allowing up to 10 seconds for
// Shutdown to complete. A server error other than http.ErrServerClosed
// terminates the process via log.Fatalf.
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	cfg := loadConfig()
	agent := NewAgent(cfg)
	log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
		cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
	log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)

	// Routes and server.
	routes := http.NewServeMux()
	routes.HandleFunc("GET /health", agent.handleHealth)
	routes.HandleFunc("POST /task", agent.handleTask)
	routes.HandleFunc("GET /info", agent.handleInfo)

	server := &http.Server{
		Addr:         ":" + cfg.Port,
		Handler:      routes,
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 150 * time.Second,
		IdleTimeout:  120 * time.Second,
	}

	// The watchdog calls stop once the agent has been idle for too long.
	stopCtx, stop := context.WithCancel(context.Background())
	go agent.runIdleWatchdog(stop)

	// OS signals that trigger the same shutdown path.
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
		err := server.ListenAndServe()
		if err == nil || err == http.ErrServerClosed {
			return
		}
		log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
	}()

	select {
	case <-stopCtx.Done():
		log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
	case <-sigCh:
		log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
	}

	drainCtx, release := context.WithTimeout(context.Background(), 10*time.Second)
	defer release()
	if err := server.Shutdown(drainCtx); err != nil {
		log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
	}
	log.Printf("[Agent %s] Stopped.", cfg.AgentID)
}