Files
GoClaw/gateway/cmd/agent/main.go
bboxwtf f8e0ca7d5d feat(gateway): restore Phase C full agent lifecycle API
- Restored Phase C gateway code (handlers, main.go, docker client, db)
- Added routes: GET /api/agents/running, POST /api/agents (CRUD),
  POST /api/agents/{id}/deploy, POST /api/agents/{id}/stop,
  POST /api/agents/{id}/restart, POST /api/agents/{id}/scale
- Fixed StopAgent: always try to stop by canonical name goclaw-agent-{id}
  even when serviceName is empty in DB
- Fixed DeployAgent: handle 409 conflict by removing existing container
  and retrying once (idempotent deploy)
- Added swarm_manager.go: background SwarmManager for dead-letter recovery
- Added AGENT_NETWORK and AGENT_DB_URL config options
- Updated .gitignore to exclude gateway binaries
- All agents use standalone docker run (not Swarm) on bridge network

Verified on prod: deploy/stop/restart cycle works correctly,
/api/agents/running returns live running agents with containerStatus
2026-04-19 11:40:39 +00:00

271 lines
8.8 KiB
Go

// GoClaw Agent Server — autonomous agent microservice
//
// Each agent runs as an independent container in the Docker Swarm overlay
// network. It exposes an HTTP API that the GoClaw Orchestrator can reach
// via the Swarm DNS name (e.g. http://goclaw-agent-researcher:8080).
//
// The agent:
// - Receives task requests from the orchestrator
// - Calls the LLM via the centrally-managed GoClaw Gateway
// - Reads/writes shared state in the MySQL database
// - Reports its last-activity time so the SwarmManager can auto-stop it
// - Gracefully shuts down after IDLE_TIMEOUT_MINUTES (default 15) without a
//   /task request; /health and /info calls do not reset the idle timer
package main
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"
)
// ─── Config ──────────────────────────────────────────────────────────────────
// AgentConfig is the agent's runtime configuration. loadConfig fills it from
// the environment variables named next to each field.
type AgentConfig struct {
	// Identity and listen address.
	AgentID string // AGENT_ID
	Port    string // AGENT_PORT; TCP port the HTTP server binds to

	// Upstream endpoints and credentials.
	GatewayURL  string // GATEWAY_URL; base URL of the GoClaw Gateway
	LLMURL      string // LLM_BASE_URL
	LLMAPIKey   string // LLM_API_KEY
	DatabaseURL string // DATABASE_URL

	// IdleTimeoutMinutes (IDLE_TIMEOUT_MINUTES) is how long the agent may go
	// without activity before the idle watchdog stops it.
	IdleTimeoutMinutes int
}
// loadConfig builds the agent configuration from environment variables,
// substituting a default for every setting that is unset or empty.
//
// IDLE_TIMEOUT_MINUTES must be a positive integer. A malformed or non-positive
// value is logged and replaced by the 15-minute default: a zero or negative
// timeout would satisfy the idle watchdog's "idle >= threshold" check on its
// very first tick and stop the agent almost immediately.
//
// LLM_API_KEY and DATABASE_URL have no default and may come back empty.
func loadConfig() AgentConfig {
	const defaultIdleMinutes = 15

	idleMin := defaultIdleMinutes
	if raw := os.Getenv("IDLE_TIMEOUT_MINUTES"); raw != "" {
		n, err := strconv.Atoi(raw)
		switch {
		case err != nil:
			log.Printf("[Agent] ignoring IDLE_TIMEOUT_MINUTES=%q: %v (using %d)",
				raw, err, defaultIdleMinutes)
		case n <= 0:
			log.Printf("[Agent] ignoring non-positive IDLE_TIMEOUT_MINUTES=%d (using %d)",
				n, defaultIdleMinutes)
		default:
			idleMin = n
		}
	}

	return AgentConfig{
		AgentID:            getEnv("AGENT_ID", "unnamed-agent"),
		Port:               getEnv("AGENT_PORT", "8080"),
		GatewayURL:         getEnv("GATEWAY_URL", "http://goclaw-gateway:18789"),
		LLMURL:             getEnv("LLM_BASE_URL", "https://ollama.com/v1"),
		LLMAPIKey:          os.Getenv("LLM_API_KEY"),
		DatabaseURL:        os.Getenv("DATABASE_URL"),
		IdleTimeoutMinutes: idleMin,
	}
}
func getEnv(key, fallback string) string {
if v := os.Getenv(key); v != "" {
return v
}
return fallback
}
// ─── State ───────────────────────────────────────────────────────────────────
// Agent is the runtime state of one agent process: its configuration, the HTTP
// client used for outbound gateway calls, and the time of the most recent task
// activity.
//
// lastActivity is written by handleTask and read by the /health and /info
// handlers and by the idle watchdog. net/http runs handlers on their own
// goroutines and the watchdog is a goroutine too, so all access goes through
// touch and lastActive, which hold mu.
type Agent struct {
	cfg        AgentConfig
	httpClient *http.Client

	mu           sync.Mutex // guards lastActivity
	lastActivity time.Time
}

// NewAgent returns an Agent for cfg. Its activity clock starts at the time of
// the call and its outbound HTTP client gives up after 120 seconds.
func NewAgent(cfg AgentConfig) *Agent {
	return &Agent{
		cfg:          cfg,
		lastActivity: time.Now(),
		httpClient:   &http.Client{Timeout: 120 * time.Second},
	}
}

// touch records the current time as the agent's most recent activity.
func (a *Agent) touch() {
	a.mu.Lock()
	a.lastActivity = time.Now()
	a.mu.Unlock()
}

// lastActive returns the time of the most recent activity recorded by touch
// (or the construction time if touch has not been called yet).
func (a *Agent) lastActive() time.Time {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.lastActivity
}

// ─── HTTP handlers ────────────────────────────────────────────────────────────

// handleHealth serves GET /health, the liveness probe. It reports the agent
// ID, the last-activity timestamp (RFC 3339) and the minutes elapsed since
// then. It does not reset the idle timer.
func (a *Agent) handleHealth(w http.ResponseWriter, r *http.Request) {
	// Read once so the timestamp and the idle duration describe the same instant.
	last := a.lastActive()
	respond(w, http.StatusOK, map[string]any{
		"ok":           true,
		"agentId":      a.cfg.AgentID,
		"lastActivity": last.Format(time.RFC3339),
		"idleMinutes":  time.Since(last).Minutes(),
	})
}

// handleTask serves POST /task: it receives a task from the orchestrator and
// forwards it to the gateway's /api/orchestrator/chat endpoint.
//
// Request body:
//
//	{ "sessionId": "abc", "messages": [...], "model": "qwen2.5:7b", "maxIter": 5 }
//
// messages, model and maxIter are forwarded; sessionId is only echoed back in
// the reply. Responses:
//
//	200  {"ok": true, "agentId", "sessionId", "result": <gateway JSON object>}
//	400  the request body is not valid JSON
//	500  the outbound request could not be built
//	502  the gateway was unreachable, answered with a non-2xx status, or
//	     returned a body that is not a JSON object
//
// The idle timer is reset when the request arrives and again after a
// successful gateway round trip.
func (a *Agent) handleTask(w http.ResponseWriter, r *http.Request) {
	a.touch()

	var body struct {
		SessionID string          `json:"sessionId"`
		Messages  json.RawMessage `json:"messages"`
		Model     string          `json:"model"`
		MaxIter   int             `json:"maxIter"`
	}
	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
		respondError(w, http.StatusBadRequest, "invalid request: "+err.Error())
		return
	}

	// Forward the task to the GoClaw Gateway orchestrator.
	reqBody, err := json.Marshal(map[string]any{
		"messages": body.Messages,
		"model":    body.Model,
		"maxIter":  body.MaxIter,
	})
	if err != nil {
		respondError(w, http.StatusInternalServerError, "request encode error: "+err.Error())
		return
	}

	gatewayURL := a.cfg.GatewayURL + "/api/orchestrator/chat"
	// Tie the outbound call to the inbound request so a client disconnect
	// cancels the gateway call as well.
	req, err := http.NewRequestWithContext(r.Context(), http.MethodPost, gatewayURL, strings.NewReader(string(reqBody)))
	if err != nil {
		respondError(w, http.StatusInternalServerError, "request build error: "+err.Error())
		return
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := a.httpClient.Do(req)
	if err != nil {
		respondError(w, http.StatusBadGateway, "gateway error: "+err.Error())
		return
	}
	defer resp.Body.Close()

	// A gateway failure that happens to carry a JSON body must not be
	// reported to the caller as a successful task.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Best effort: the snippet only enriches the error text, so a read
		// error here is deliberately ignored.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		respondError(w, http.StatusBadGateway,
			fmt.Sprintf("gateway returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(detail))))
		return
	}

	var result map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		respondError(w, http.StatusBadGateway, "gateway response error: "+err.Error())
		return
	}

	a.touch()
	respond(w, http.StatusOK, map[string]any{
		"ok":        true,
		"agentId":   a.cfg.AgentID,
		"sessionId": body.SessionID,
		"result":    result,
	})
}

// handleInfo serves GET /info with agent metadata: ID, OS hostname, gateway
// URL, configured idle timeout (minutes), last-activity timestamp (RFC 3339)
// and minutes idle. It does not reset the idle timer.
func (a *Agent) handleInfo(w http.ResponseWriter, r *http.Request) {
	// A hostname lookup failure just leaves the field empty.
	hostname, _ := os.Hostname()
	last := a.lastActive()
	respond(w, http.StatusOK, map[string]any{
		"agentId":      a.cfg.AgentID,
		"hostname":     hostname,
		"gatewayUrl":   a.cfg.GatewayURL,
		"idleTimeout":  a.cfg.IdleTimeoutMinutes,
		"lastActivity": last.Format(time.RFC3339),
		"idleMinutes":  time.Since(last).Minutes(),
	})
}

// ─── Idle watchdog ────────────────────────────────────────────────────────────

// runIdleWatchdog checks every 30 seconds how long the agent has been idle.
// Once the idle time reaches cfg.IdleTimeoutMinutes it calls selfStop, then
// cancel, and returns. It blocks until that happens, so run it on its own
// goroutine.
func (a *Agent) runIdleWatchdog(cancel context.CancelFunc) {
	threshold := time.Duration(a.cfg.IdleTimeoutMinutes) * time.Minute
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		idle := time.Since(a.lastActive())
		if idle < threshold {
			continue
		}
		log.Printf("[Agent %s] Idle for %.1f min — requesting self-stop via gateway",
			a.cfg.AgentID, idle.Minutes())
		a.selfStop()
		cancel()
		return
	}
}
// selfStop asks the GoClaw Gateway to scale this service to 0 by sending
// POST {GatewayURL}/api/swarm/agents/{AgentID}/stop, and logs the gateway's
// status code and reply. It is best effort: every failure is logged and
// otherwise ignored, and nothing is returned to the caller.
//
// NOTE(review): confirm the gateway still serves this /api/swarm/... route.
func (a *Agent) selfStop() {
	// Upper bound on how much of the gateway's reply is read for the log
	// line, so a large or runaway body cannot balloon memory or the log.
	const maxLoggedBody = 4 << 10

	endpoint := fmt.Sprintf("%s/api/swarm/agents/%s/stop", a.cfg.GatewayURL, a.cfg.AgentID)
	req, err := http.NewRequest("POST", endpoint, nil)
	if err != nil {
		log.Printf("[Agent %s] selfStop error building request: %v", a.cfg.AgentID, err)
		return
	}

	resp, err := a.httpClient.Do(req)
	if err != nil {
		log.Printf("[Agent %s] selfStop error: %v", a.cfg.AgentID, err)
		return
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxLoggedBody))
	if err != nil {
		// Still log the status below, with whatever part of the body arrived.
		log.Printf("[Agent %s] selfStop error reading response: %v", a.cfg.AgentID, err)
	}
	log.Printf("[Agent %s] selfStop response %d: %s", a.cfg.AgentID, resp.StatusCode, string(body))
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
func respond(w http.ResponseWriter, status int, data any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
json.NewEncoder(w).Encode(data)
}
// respondError sends a JSON error response of the form {"error": msg} with the
// given HTTP status code.
func respondError(w http.ResponseWriter, status int, msg string) {
	payload := map[string]any{"error": msg}
	respond(w, status, payload)
}
// ─── Main ─────────────────────────────────────────────────────────────────────
// main loads the configuration, starts the HTTP server and the idle watchdog,
// then waits for either SIGINT/SIGTERM or the watchdog's cancellation before
// shutting the server down with a 10-second grace period.
func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)

	cfg := loadConfig()
	agent := NewAgent(cfg)
	log.Printf("[Agent] %s starting on port %s (idle timeout: %d min)",
		cfg.AgentID, cfg.Port, cfg.IdleTimeoutMinutes)
	log.Printf("[Agent] Gateway: %s", cfg.GatewayURL)

	// Routes and server.
	routes := http.NewServeMux()
	routes.HandleFunc("GET /health", agent.handleHealth)
	routes.HandleFunc("POST /task", agent.handleTask)
	routes.HandleFunc("GET /info", agent.handleInfo)

	server := &http.Server{
		Addr:         ":" + cfg.Port,
		Handler:      routes,
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 150 * time.Second,
		IdleTimeout:  120 * time.Second,
	}

	// The watchdog cancels idleCtx once the agent has been idle for too long.
	idleCtx, stopForIdle := context.WithCancel(context.Background())
	go agent.runIdleWatchdog(stopForIdle)

	// OS signals that trigger a graceful shutdown.
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

	// Serve in the background so this goroutine can wait for a stop condition.
	go func() {
		log.Printf("[Agent %s] Listening on :%s", cfg.AgentID, cfg.Port)
		err := server.ListenAndServe()
		if err == nil || err == http.ErrServerClosed {
			// ErrServerClosed is the normal result of Shutdown.
			return
		}
		log.Fatalf("[Agent %s] Server error: %v", cfg.AgentID, err)
	}()

	select {
	case <-signals:
		log.Printf("[Agent %s] Signal received — shutting down", cfg.AgentID)
	case <-idleCtx.Done():
		log.Printf("[Agent %s] Context cancelled — shutting down", cfg.AgentID)
	}

	// Give in-flight requests up to 10 seconds to finish.
	graceCtx, releaseGrace := context.WithTimeout(context.Background(), 10*time.Second)
	defer releaseGrace()
	if err := server.Shutdown(graceCtx); err != nil {
		log.Printf("[Agent %s] Shutdown error: %v", cfg.AgentID, err)
	}
	log.Printf("[Agent %s] Stopped.", cfg.AgentID)
}