## 1. Fix /nodes Swarm Status Display
- Add SwarmStatusBanner component: clear green/red/loading state
- Shows nodeId, managerAddr, isManager badge
- Error state explains what to check (docker.sock mount)
- Header now shows 'swarm unreachable — check gateway' vs 'active'
- swarmOk now checks nodeId presence, not just data existence
## 2. Autonomous Agent Container
- New docker/Dockerfile.agent — builds Go agent binary from gateway/cmd/agent/
- New gateway/cmd/agent/main.go — standalone HTTP microservice:
* GET /health — liveness probe with idle time info
* POST /task — receives task, forwards to Gateway orchestrator
* GET /info — agent metadata (id, hostname, gateway url)
* Idle watchdog: calls /api/swarm/agents/{name}/stop after IdleTimeoutMinutes
* Connects to Swarm overlay network (goclaw-net) → reaches DB/Gateway by DNS
* Env: AGENT_ID, GATEWAY_URL, DATABASE_URL, IDLE_TIMEOUT_MINUTES
## 3. Swarm Manager Agent (auto-stop after 15min idle)
- New gateway/internal/api/swarm_manager.go:
* SwarmManager goroutine checks every 60s
* Scales idle GoClaw agent services to 0 replicas after 15 min
* Tracks lastActivity from task UpdatedAt timestamps
- New REST endpoints in gateway:
* GET /api/swarm/agents — list agents with idleMinutes
* POST /api/swarm/agents/{name}/start — scale up agent
* POST /api/swarm/agents/{name}/stop — scale to 0
* DELETE /api/swarm/services/{id} — remove service permanently
- SwarmManager started as background goroutine in main.go with context cancel
## 4. Docker Client Enhancements
- Added NetworkAttachment type and Networks field to ServiceSpec
- CreateAgentServiceFull(opts) — supports overlay networks, custom labels
- CreateAgentService() delegates to CreateAgentServiceFull for backward compat
- RemoveService(id) — DELETE /v1.44/services/{id}
- GetServiceLastActivity(id) — finds latest task UpdatedAt for idle detection
## 5. tRPC & Gateway Proxy
- New functions: removeSwarmService, listSwarmAgents, startSwarmAgent, stopSwarmAgent
- SwarmAgentInfo type with idleMinutes, lastActivity, desiredReplicas
- createAgentService now accepts networks[] parameter
- New tRPC endpoints: nodes.removeService, nodes.listAgents, nodes.startAgent, nodes.stopAgent
## 6. Nodes.tsx UI Overhaul
- SwarmStatusBanner component at top — no more silent 'connecting…'
- New 'Agents' tab with AgentManagerRow: idle time, auto-stop warning, start/stop/remove buttons
- IdleColor coding: green < 5m, yellow 5-10m, red 10m+ with countdown to auto-stop
- ServiceRow: added Remove button with confirmation dialog
- RemoveConfirmDialog component
- DeployAgentDialog: added overlay networks field, default env includes GATEWAY_URL
- All queries refetch after agent start/stop/remove
271 lines
8.8 KiB
Go
271 lines
8.8 KiB
Go
// GoClaw Agent Server — autonomous agent microservice
|
|
//
|
|
// Each agent runs as an independent container in the Docker Swarm overlay
|
|
// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
|
|
// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
|
|
//
|
|
// The agent:
|
|
// - Receives task requests from the orchestrator
|
|
// - Calls the LLM via the centrally-managed GoClaw Gateway
|
|
// - Reads/writes shared state in the MySQL database
|
|
// - Reports its last-activity time so the SwarmManager can auto-stop it
|
|
// - Gracefully shuts down after IdleTimeout with no requests
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
)
|
|
|
|
// ─── Config ──────────────────────────────────────────────────────────────────
|
|
|
|
type AgentConfig struct {
|
|
AgentID string
|
|
Port string
|
|
GatewayURL string
|
|
LLMURL string
|
|
LLMAPIKey string
|
|
DatabaseURL string
|
|
IdleTimeoutMinutes int
|
|
}
|
|
|
|
func loadConfig() AgentConfig {
|
|
idleMin := 15
|
|
if v := os.Getenv("IDLE_TIMEOUT_MINUTES"); v != "" {
|
|
if n, err := strconv.Atoi(v); err == nil {
|
|
idleMin = n
|
|
}
|
|
}
|
|
port := os.Getenv("AGENT_PORT")
|
|
if port == "" {
|
|
port = "8080"
|
|
}
|
|
return AgentConfig{
|
|
AgentID: getEnv("AGENT_ID", "unnamed-agent"),
|
|
Port: port,
|
|
GatewayURL: getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
|
|
LLMURL: getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
|
|
LLMAPIKey: os.Getenv("LLM_API_KEY"),
|
|
DatabaseURL: os.Getenv("DATABASE_URL"),
|
|
IdleTimeoutMinutes: idleMin,
|
|
}
|
|
}
|
|
|
|
func getEnv(key, fallback string) string {
|
|
if v := os.Getenv(key); v != "" {
|
|
return v
|
|
}
|
|
return fallback
|
|
}
|
|
|
|
// ─── State ───────────────────────────────────────────────────────────────────
|
|
|
|
type Agent struct {
|
|
cfg AgentConfig
|
|
lastActivity time.Time
|
|
httpClient *http.Client
|
|
}
|
|
|
|
func NewAgent(cfg AgentConfig) *Agent {
|
|
return &Agent{
|
|
cfg: cfg,
|
|
lastActivity: time.Now(),
|
|
httpClient: &http.Client{Timeout: 120 * time.Second},
|
|
}
|
|
}
|
|
|
|
func (a *Agent) touch() {
|
|
a.lastActivity = time.Now()
|
|
}
|
|
|
|
// ─── HTTP handlers ────────────────────────────────────────────────────────────
|
|
|
|
// GET /health — liveness probe
|
|
func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
|
|
respond(w, 200, map[string]any{
|
|
"ok": true,
|
|
"agentId": a.cfg.AgentID,
|
|
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
|
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
|
})
|
|
}
|
|
|
|
// POST /task — receive a task from the orchestrator
|
|
// Body: { "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
|
|
func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
|
|
a.touch()
|
|
var body struct {
|
|
SessionID string `json:"sessionId"`
|
|
Messages json.RawMessage `json:"messages"`
|
|
Model string `json:"model"`
|
|
MaxIter int `json:"maxIter"`
|
|
}
|
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
|
respondError(w, 400, "invalid request: "+err.Error())
|
|
return
|
|
}
|
|
// Forward the task to the GoClaw Gateway orchestrator
|
|
gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
|
|
reqBody, _ := json.Marshal(map[string]any{
|
|
"messages": body.Messages,
|
|
"model": body.Model,
|
|
"maxIter": body.MaxIter,
|
|
})
|
|
|
|
req, err := http.NewRequestWithContext(r.Context(), "POST", gatewayURL, strings.NewReader(string(reqBody)))
|
|
if err != nil {
|
|
respondError(w, 500, "request build error: "+err.Error())
|
|
return
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
resp, err := a.httpClient.Do(req)
|
|
if err != nil {
|
|
respondError(w, 502, "gateway error: "+err.Error())
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
var result map[string]any
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
respondError(w, 502, "gateway response error: "+err.Error())
|
|
return
|
|
}
|
|
|
|
a.touch()
|
|
respond(w, 200, map[string]any{
|
|
"ok": true,
|
|
"agentId": a.cfg.AgentID,
|
|
"sessionId": body.SessionID,
|
|
"result": result,
|
|
})
|
|
}
|
|
|
|
// GET /info — agent metadata
|
|
func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
|
|
hostname, _ := os.Hostname()
|
|
respond(w, 200, map[string]any{
|
|
"agentId": a.cfg.AgentID,
|
|
"hostname": hostname,
|
|
"gatewayUrl": a.cfg.GatewayURL,
|
|
"idleTimeout": a.cfg.IdleTimeoutMinutes,
|
|
"lastActivity": a.lastActivity.Format(time.RFC3339),
|
|
"idleMinutes": time.Since(a.lastActivity).Minutes(),
|
|
})
|
|
}
|
|
|
|
// ─── Idle watchdog ────────────────────────────────────────────────────────────
|
|
|
|
func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
|
|
threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
|
|
ticker := time.NewTicker(30 * time.Second)
|
|
defer ticker.Stop()
|
|
for range ticker.C {
|
|
idle := time.Since(a.lastActivity)
|
|
if idle >= threshold {
|
|
log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
|
|
a.cfg.AgentID, idle.Minutes())
|
|
a.selfStop()
|
|
cancel()
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// selfStop asks the GoClaw Gateway to scale this service to 0.
|
|
func (a *Agent) selfStop() {
|
|
url := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
|
|
req, err := http.NewRequest("POST", url, nil)
|
|
if err != nil {
|
|
log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
|
|
return
|
|
}
|
|
resp, err := a.httpClient.Do(req)
|
|
if err != nil {
|
|
log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
|
|
return
|
|
}
|
|
body, _ := io.ReadAll(resp.Body)
|
|
resp.Body.Close()
|
|
log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
|
|
}
|
|
|
|
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
|
|
func respond(w http.ResponseWriter, status int, data any) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(status)
|
|
json.NewEncoder(w).Encode(data)
|
|
}
|
|
|
|
func respondError(w http.ResponseWriter, status int, msg string) {
|
|
respond(w, status, map[string]any{"error": msg})
|
|
}
|
|
|
|
// ─── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
func main() {
|
|
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
|
|
|
cfg := loadConfig()
|
|
agent := NewAgent(cfg)
|
|
|
|
log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
|
|
cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
|
|
log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)
|
|
|
|
// ── HTTP server ──────────────────────────────────────────────────────────
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("GET /health", agent.handleHealth)
|
|
mux.HandleFunc("POST /task", agent.handleTask)
|
|
mux.HandleFunc("GET /info", agent.handleInfo)
|
|
|
|
srv := &http.Server{
|
|
Addr: ":" + cfg.Port,
|
|
Handler: mux,
|
|
ReadTimeout: 30 * time.Second,
|
|
WriteTimeout: 150 * time.Second,
|
|
IdleTimeout: 120 * time.Second,
|
|
}
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
// ── Idle watchdog ────────────────────────────────────────────────────────
|
|
go agent.runIdleWatchdog(cancel)
|
|
|
|
// ── Graceful shutdown ────────────────────────────────────────────────────
|
|
quit := make(chan os.Signal, 1)
|
|
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
|
|
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
|
log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case <-quit:
|
|
log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
|
|
case <-ctx.Done():
|
|
log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
|
|
}
|
|
|
|
shutCtx, shutCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer shutCancel()
|
|
if err := srv.Shutdown(shutCtx); err != nil {
|
|
log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
|
|
}
|
|
log.Printf("[Agent %s] Stopped.", cfg.AgentID)
|
|
}
|