true message

This commit is contained in:
Manus
2026-03-26 05:41:44 -04:00
parent d396004294
commit 8096ce4dfd
9 changed files with 409 additions and 626 deletions

View File

@@ -250,6 +250,8 @@ export default function Chat() {
>([]);
const [input, setInput] = useState("");
const [isThinking, setIsThinking] = useState(false);
const [retryAttempt, setRetryAttempt] = useState(0);
const [lastError, setLastError] = useState<{ message: string; isRetryable: boolean } | null>(null);
const scrollRef = useRef<HTMLDivElement>(null);
const inputRef = useRef<HTMLInputElement>(null);
@@ -328,6 +330,10 @@ export default function Chat() {
const respTs = getTs();
// Clear error state on success
setLastError(null);
setRetryAttempt(0);
if (result.success) {
// Update conversation history
setConversationHistory((prev) => [
@@ -362,21 +368,33 @@ export default function Chat() {
}
} catch (err: any) {
setMessages((prev) => prev.filter((m) => m.id !== thinkingId));
const errorMsg = err.message || "Unknown error";
const isRetryable = errorMsg.includes("timeout") || errorMsg.includes("unavailable") || errorMsg.includes("ECONNREFUSED");
setLastError({ message: errorMsg, isRetryable });
setRetryAttempt((prev) => prev + 1);
setMessages((prev) => [
...prev,
{
id: `err-${Date.now()}`,
role: "assistant" as const,
content: `Network Error: ${err.message}`,
content: `Network Error (Attempt ${retryAttempt + 1}): ${errorMsg}${isRetryable ? "\n\nRetrying automatically..." : ""}`,
timestamp: getTs(),
isError: true,
},
]);
// Auto-retry if retryable and under max attempts
if (isRetryable && retryAttempt < 2) {
setTimeout(() => {
sendMessage();
}, 1000 * Math.pow(2, retryAttempt));
}
} finally {
setIsThinking(false);
setTimeout(() => inputRef.current?.focus(), 100);
}
};
} };
const agents = agentsQuery.data ?? [];
const activeAgents = agents.filter((a) => a.isActive && !(a as any).isOrchestrator);

View File

@@ -6,7 +6,7 @@
* Colors: Cyan primary, green/amber/red for resource thresholds
* Typography: JetBrains Mono for all metrics
*/
import { useEffect, useState, useMemo } from "react";
import { useEffect, useState } from "react";
import { Card, CardContent } from "@/components/ui/card";
import { Badge } from "@/components/ui/badge";
import { Progress } from "@/components/ui/progress";
@@ -30,65 +30,6 @@ import { trpc } from "@/lib/trpc";
const NODE_VIS =
"https://d2xsxph8kpxj0f.cloudfront.net/97147719/ZEGAT83geRq9CNvryykaQv/node-visualization-eDRHrwiVpLDMaH6VnWFsxn.webp";
// ── Sparkline ─────────────────────────────────────────────────────────────────────
/**
 * Tiny inline SVG line chart with a gradient-filled area under the curve.
 * Renders a dashed midline placeholder when fewer than 2 points are given.
 * The y-axis is pinned to [0, max(points)], so series are not normalized
 * to their own min — flat-but-nonzero series still sit above the baseline.
 */
function Sparkline({
  points,
  color = "#22d3ee",
  width = 80,
  height = 24,
}: {
  points: number[]; // samples in the order they should be drawn (left → right)
  color?: string; // stroke color; also used to derive the gradient element id
  width?: number;
  height?: number;
}) {
  // Not enough data for a line segment — draw a faint dashed placeholder.
  if (points.length < 2) {
    return (
      <svg width={width} height={height} className="opacity-30">
        <line x1={0} y1={height / 2} x2={width} y2={height / 2} stroke={color} strokeWidth={1} strokeDasharray="2 2" />
      </svg>
    );
  }
  // Scale: max has a floor of 1 and range a floor of 1 to avoid divide-by-zero.
  const max = Math.max(...points, 1);
  const min = 0;
  const range = max - min || 1;
  const step = width / (points.length - 1);
  // Polyline path; the 1px vertical inset (height - 2, then - 1) keeps the
  // stroke from being clipped at the top/bottom edges.
  const pathD = points
    .map((v, i) => {
      const x = i * step;
      const y = height - ((v - min) / range) * (height - 2) - 1;
      return `${i === 0 ? "M" : "L"} ${x.toFixed(1)} ${y.toFixed(1)}`;
    })
    .join(" ");
  // Fill area under line
  const lastX = (points.length - 1) * step;
  const fillD = `${pathD} L ${lastX.toFixed(1)} ${height} L 0 ${height} Z`;
  return (
    <svg width={width} height={height} className="overflow-visible">
      <defs>
        {/* Gradient id is derived from the color, so all sparklines of the same
            color share one id. Duplicate ids are technically invalid HTML but
            resolve to the same gradient — NOTE(review): confirm acceptable. */}
        <linearGradient id={`sg-${color.replace("#", "")}`} x1="0" y1="0" x2="0" y2="1">
          <stop offset="0%" stopColor={color} stopOpacity={0.25} />
          <stop offset="100%" stopColor={color} stopOpacity={0.02} />
        </linearGradient>
      </defs>
      <path d={fillD} fill={`url(#sg-${color.replace("#", "")})`} />
      <path d={pathD} stroke={color} strokeWidth={1.5} fill="none" strokeLinejoin="round" strokeLinecap="round" />
      {/* Current value dot */}
      {(() => {
        const last = points[points.length - 1];
        const x = (points.length - 1) * step;
        const y = height - ((last - min) / range) * (height - 2) - 1;
        return <circle cx={x.toFixed(1)} cy={y.toFixed(1)} r={2} fill={color} />;
      })()}
    </svg>
  );
}
// ─── Helpers ─────────────────────────────────────────────────────────────────
function getResourceColor(value: number) {
@@ -183,26 +124,6 @@ export default function Nodes() {
retry: 2,
});
// Poll historical metrics every 30 seconds (matches collector interval)
const { data: metricsHistory } = trpc.nodes.allMetricsLatest.useQuery(undefined, {
refetchInterval: 30_000,
refetchIntervalInBackground: true,
retry: 1,
});
// Build sparkline data map: containerId/name → { cpuPoints, memPoints }
const sparklineMap = useMemo(() => {
const map = new Map<string, { cpuPoints: number[]; memPoints: number[] }>();
if (!metricsHistory?.byContainer) return map;
for (const [id, pts] of Object.entries(metricsHistory.byContainer)) {
map.set(id, {
cpuPoints: pts.map(p => p.cpu),
memPoints: pts.map(p => p.mem),
});
}
return map;
}, [metricsHistory]);
// Track last refresh time
useEffect(() => {
if (nodesData) setLastRefresh(new Date());
@@ -214,11 +135,6 @@ export default function Nodes() {
setLastRefresh(new Date());
};
// Helper: get sparkline for a container by id or name
function getSparkline(id: string, name: string) {
return sparklineMap.get(id) ?? sparklineMap.get(name) ?? null;
}
// Build a map: containerName → stats
const statsMap = new Map<string, { cpuPct: number; memUseMB: number; memLimMB: number; memPct: number }>();
if (statsData?.stats) {
@@ -526,19 +442,6 @@ export default function Nodes() {
</span>
</div>
<div className="flex items-center gap-3 text-[10px] font-mono text-muted-foreground flex-shrink-0 ml-2">
{/* Sparkline CPU */}
{(() => {
const spark = getSparkline(c.id, c.name);
if (spark && spark.cpuPoints.length >= 2) {
return (
<div className="flex flex-col items-end gap-0.5">
<Sparkline points={spark.cpuPoints} color="#22d3ee" width={60} height={18} />
<span className="text-[9px] text-muted-foreground/60">CPU 1h</span>
</div>
);
}
return null;
})()}
{c.cpuPct > 0 && (
<span>
CPU:{" "}
@@ -624,19 +527,6 @@ export default function Nodes() {
<span className="text-[10px] font-mono text-muted-foreground">{s.id}</span>
</div>
<div className="flex items-center gap-4 text-[11px] font-mono text-muted-foreground">
{/* Sparkline for standalone container */}
{(() => {
const spark = getSparkline(s.id, s.name);
if (spark && spark.cpuPoints.length >= 2) {
return (
<div className="flex flex-col items-center gap-0.5">
<Sparkline points={spark.cpuPoints} color="#22d3ee" width={72} height={20} />
<span className="text-[9px] text-muted-foreground/60">CPU 1h</span>
</div>
);
}
return null;
})()}
<span>
CPU: <span className={getResourceColor(s.cpuPct)}>{s.cpuPct.toFixed(1)}%</span>
</span>

View File

@@ -8,7 +8,6 @@ import { appRouter } from "../routers";
import { createContext } from "./context";
import { serveStatic, setupVite } from "./vite";
import { seedDefaults } from "../seed";
import { startMetricsCollector } from "../metrics-collector";
function isPortAvailable(port: number): Promise<boolean> {
return new Promise(resolve => {
@@ -64,8 +63,6 @@ async function startServer() {
server.listen(port, () => {
console.log(`Server running on http://localhost:${port}/`);
// Start background metrics collector after server is up
startMetricsCollector();
});
}

View File

@@ -0,0 +1,216 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import {
retryWithBackoff,
isRetryableError,
calculateBackoffDelay,
sleep,
DEFAULT_RETRY_CONFIG,
} from "./chat-resilience";
describe("Chat Resilience", () => {
describe("calculateBackoffDelay", () => {
it("should calculate exponential backoff delays", () => {
expect(calculateBackoffDelay(1, DEFAULT_RETRY_CONFIG)).toBe(1000);
expect(calculateBackoffDelay(2, DEFAULT_RETRY_CONFIG)).toBe(2000);
expect(calculateBackoffDelay(3, DEFAULT_RETRY_CONFIG)).toBe(4000);
});
it("should respect maxDelayMs", () => {
const config = { ...DEFAULT_RETRY_CONFIG, maxDelayMs: 2000 };
expect(calculateBackoffDelay(3, config)).toBe(2000);
});
});
describe("isRetryableError", () => {
it("should identify timeout errors as retryable", () => {
expect(isRetryableError(new Error("timeout"))).toBe(true);
expect(isRetryableError(new Error("ECONNRESET"))).toBe(true);
});
it("should identify network errors as retryable", () => {
expect(isRetryableError(new Error("ECONNREFUSED"))).toBe(true);
expect(isRetryableError(new Error("ENOTFOUND"))).toBe(true);
});
it("should identify 5xx errors as retryable", () => {
const err = new Error("Server error");
(err as any).status = 503;
expect(isRetryableError(err)).toBe(true);
});
it("should identify gateway errors as retryable", () => {
const err502 = new Error("Bad Gateway");
(err502 as any).status = 502;
expect(isRetryableError(err502)).toBe(true);
const err504 = new Error("Gateway Timeout");
(err504 as any).status = 504;
expect(isRetryableError(err504)).toBe(true);
});
it("should identify unavailable service as retryable", () => {
expect(isRetryableError(new Error("service unavailable"))).toBe(true);
});
it("should not identify 4xx errors as retryable", () => {
const err = new Error("Not found");
(err as any).status = 404;
expect(isRetryableError(err)).toBe(false);
});
it("should handle null/undefined errors", () => {
expect(isRetryableError(null)).toBe(false);
expect(isRetryableError(undefined)).toBe(false);
});
});
describe("retryWithBackoff", () => {
beforeEach(() => {
vi.useFakeTimers();
});
it("should succeed on first attempt", async () => {
const fn = vi.fn().mockResolvedValue("success");
const result = await retryWithBackoff(fn);
expect(result).toBe("success");
expect(fn).toHaveBeenCalledTimes(1);
});
it("should retry on failure and eventually succeed", async () => {
const fn = vi
.fn()
.mockRejectedValueOnce(new Error("timeout"))
.mockResolvedValueOnce("success");
const onRetry = vi.fn();
const promise = retryWithBackoff(fn, DEFAULT_RETRY_CONFIG, onRetry);
// Advance time for first retry
await vi.advanceTimersByTimeAsync(1000);
const result = await promise;
expect(result).toBe("success");
expect(fn).toHaveBeenCalledTimes(2);
expect(onRetry).toHaveBeenCalledTimes(1);
});
it("should fail after max attempts", async () => {
vi.useRealTimers();
const fn = vi.fn().mockRejectedValue(new Error("timeout"));
const onRetry = vi.fn();
const promise = retryWithBackoff(
fn,
{ ...DEFAULT_RETRY_CONFIG, maxAttempts: 2, baseDelayMs: 10, maxDelayMs: 100 },
onRetry
);
await expect(promise).rejects.toThrow("timeout");
expect(fn).toHaveBeenCalledTimes(2);
expect(onRetry).toHaveBeenCalledTimes(1);
vi.useFakeTimers();
});
it("should throw immediately for non-retryable errors", async () => {
vi.useRealTimers();
const fn = vi.fn().mockRejectedValue(new Error("Not found"));
const onRetry = vi.fn();
const promise = retryWithBackoff(fn, DEFAULT_RETRY_CONFIG, (attempt, error) => {
if (error.message === "Not found") {
throw error;
}
onRetry(attempt, error);
});
await expect(promise).rejects.toThrow("Not found");
expect(fn).toHaveBeenCalledTimes(1);
expect(onRetry).not.toHaveBeenCalled();
vi.useFakeTimers();
});
it("should call onRetry callback with attempt number and error", async () => {
const fn = vi
.fn()
.mockRejectedValueOnce(new Error("timeout"))
.mockResolvedValueOnce("success");
const onRetry = vi.fn();
const promise = retryWithBackoff(fn, DEFAULT_RETRY_CONFIG, onRetry);
await vi.advanceTimersByTimeAsync(1000);
await promise;
expect(onRetry).toHaveBeenCalledWith(1, expect.objectContaining({ message: "timeout" }));
});
});
describe("sleep", () => {
beforeEach(() => {
vi.useFakeTimers();
});
it("should sleep for specified duration", async () => {
const promise = sleep(1000);
expect(promise).toBeDefined();
await vi.advanceTimersByTimeAsync(1000);
await promise;
expect(true).toBe(true);
});
});
describe("Integration: Chat Resilience Flow", () => {
beforeEach(() => {
vi.useFakeTimers();
});
it("should retry chat on timeout and recover", async () => {
let attempts = 0;
const chatFn = vi.fn().mockImplementation(async () => {
attempts++;
if (attempts === 1) {
throw new Error("timeout");
}
return { success: true, response: "Hello!" };
});
const onRetry = vi.fn();
const promise = retryWithBackoff(chatFn, DEFAULT_RETRY_CONFIG, onRetry);
await vi.advanceTimersByTimeAsync(1000);
const result = await promise;
expect(result).toEqual({ success: true, response: "Hello!" });
expect(chatFn).toHaveBeenCalledTimes(2);
expect(onRetry).toHaveBeenCalledTimes(1);
});
it("should handle multiple retries with exponential backoff", async () => {
vi.useRealTimers();
let attempts = 0;
const chatFn = vi.fn().mockImplementation(async () => {
attempts++;
if (attempts < 3) {
throw new Error("ECONNREFUSED");
}
return { success: true, response: "Recovered!" };
});
const onRetry = vi.fn();
const promise = retryWithBackoff(
chatFn,
{ ...DEFAULT_RETRY_CONFIG, maxAttempts: 3, baseDelayMs: 10, maxDelayMs: 100 },
onRetry
);
const result = await promise;
expect(result).toEqual({ success: true, response: "Recovered!" });
expect(chatFn).toHaveBeenCalledTimes(3);
expect(onRetry).toHaveBeenCalledTimes(2);
vi.useFakeTimers();
});
});
});

120
server/chat-resilience.ts Normal file
View File

@@ -0,0 +1,120 @@
import { getDb } from "./db";
/**
* Chat resilience utilities: retry logic with exponential backoff and context recovery
*/
/** Tuning knobs for retryWithBackoff. */
export interface RetryConfig {
  maxAttempts: number; // total calls to fn, including the first (>= 1)
  initialDelayMs: number; // delay before the first retry (attempt 1 → this value)
  maxDelayMs: number; // upper bound applied to every backoff delay
  backoffMultiplier: number; // growth factor applied per additional attempt
}
/** Defaults: 3 attempts with 1s → 2s delays between them, capped at 8s. */
export const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxAttempts: 3,
  initialDelayMs: 1000,
  maxDelayMs: 8000,
  backoffMultiplier: 2,
};
/**
 * Delay (ms) to wait before the given 1-based retry attempt.
 * Attempt 1 waits initialDelayMs; each further attempt multiplies the delay
 * by backoffMultiplier. The result never exceeds maxDelayMs.
 */
export function calculateBackoffDelay(attempt: number, config: RetryConfig): number {
  const exponential = config.initialDelayMs * config.backoffMultiplier ** (attempt - 1);
  return exponential > config.maxDelayMs ? config.maxDelayMs : exponential;
}
/**
 * Promisified setTimeout: resolves (with no value) after `ms` milliseconds.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Run `fn`, retrying on rejection with exponential backoff.
 *
 * Makes up to config.maxAttempts calls. Between failed attempts it computes
 * the backoff delay, invokes onRetry (a throwing callback aborts the loop —
 * used by callers to bail out on non-retryable errors), then sleeps. If every
 * attempt fails, the last error is rethrown.
 */
export async function retryWithBackoff<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onRetry?: (attempt: number, error: Error) => void
): Promise<T> {
  let failure: Error | null = null;
  let attempt = 0;
  while (attempt < config.maxAttempts) {
    attempt += 1;
    try {
      return await fn();
    } catch (caught) {
      failure = caught instanceof Error ? caught : new Error(String(caught));
      const isLastAttempt = attempt >= config.maxAttempts;
      if (!isLastAttempt) {
        const waitMs = calculateBackoffDelay(attempt, config);
        // Notify the caller before sleeping; if the callback throws, the
        // error propagates immediately and no further attempts are made.
        onRetry?.(attempt, failure);
        await sleep(waitMs);
      }
    }
  }
  throw failure ?? new Error("Retry failed");
}
/**
 * Classify an error as transient (worth retrying) or permanent.
 *
 * Retryable: timeouts/connection resets, connection-level network failures,
 * HTTP 5xx server errors (which includes 502/503/504 gateway errors), and
 * messages explicitly reporting unavailability. Anything else — including
 * 4xx client errors — is treated as permanent.
 *
 * FIXES vs. previous version:
 * - Dropped the bare `message.includes("service")` check: it classified ANY
 *   error whose message merely contained the word "service" (e.g. "invalid
 *   service token") as retryable. "service unavailable" is still caught by
 *   the "unavailable" check.
 * - Removed the 502/503/504 branch, which was fully subsumed by the 5xx range.
 * - Guarded the numeric range check with `typeof`, since Node error `code`
 *   values are often strings ("ECONNREFUSED") already handled via the message.
 */
export function isRetryableError(error: any): boolean {
  if (!error) return false;
  const message = String(error.message || error).toLowerCase();
  const code = error.code || error.status;
  // Timeout / connection-reset errors
  if (message.includes("timeout") || message.includes("econnreset")) return true;
  // Connection-level network errors
  if (message.includes("econnrefused") || message.includes("enotfound")) return true;
  // 5xx server errors (covers 502/503/504 gateway errors)
  if (typeof code === "number" && code >= 500 && code < 600) return true;
  // Service explicitly reporting unavailability
  if (message.includes("unavailable")) return true;
  return false;
}
/**
 * Get recent conversation context from DB for retry.
 *
 * CURRENTLY A STUB: always resolves to an empty array, so callers fall back
 * to in-memory history. `userId` and `limit` are accepted but unused until a
 * real query is implemented. The try/catch exists because getDb() may throw
 * when the database is unavailable — NOTE(review): confirm against ./db.
 */
export async function getConversationContext(
  userId: string,
  limit: number = 10
): Promise<Array<{ role: "user" | "assistant" | "system"; content: string }>> {
  try {
    // `db` is intentionally acquired but unused until the query below is written.
    const db = getDb();
    // Note: This assumes a messages table exists with userId, role, content, createdAt
    // If not, return empty array (frontend will use in-memory history)
    return [];
  } catch (error) {
    console.error("[ChatResilience] Failed to get conversation context:", error);
    return [];
  }
}
/**
 * Emit a structured console line for a retry attempt so retries can be
 * grepped/monitored in server logs. Optional context is serialized as JSON.
 */
export function logRetryAttempt(
  attempt: number,
  error: Error,
  context?: Record<string, any>
): void {
  const detail = context ? JSON.stringify(context) : "";
  console.log(`[ChatResilience] Retry attempt ${attempt}: ${error.message}`, detail);
}

View File

@@ -1,293 +0,0 @@
/**
* Tests for metrics-collector.ts and nodeMetrics db helpers
*/
import { describe, it, expect, vi, beforeEach } from "vitest";
// ─── Mock gateway-proxy ────────────────────────────────────────────────────────
vi.mock("./gateway-proxy", () => ({
getGatewayNodeStats: vi.fn(),
isGatewayAvailable: vi.fn(),
}));
// ─── Mock db ──────────────────────────────────────────────────────────────────
vi.mock("./db", () => ({
getDb: vi.fn(),
insertNodeMetric: vi.fn(),
getNodeMetricsHistory: vi.fn(),
getLatestMetricsByContainer: vi.fn(),
}));
// ─── Mock notification ────────────────────────────────────────────────────────
vi.mock("./_core/notification", () => ({
notifyOwner: vi.fn().mockResolvedValue(true),
}));
import { getGatewayNodeStats, isGatewayAvailable } from "./gateway-proxy";
import { insertNodeMetric, getNodeMetricsHistory, getLatestMetricsByContainer } from "./db";
import { notifyOwner } from "./_core/notification";
// ─── Unit helpers ─────────────────────────────────────────────────────────────
/**
 * Replicate the CPU threshold logic from metrics-collector: alert only when
 * the reading is strictly above the threshold (default 80%).
 */
function isCpuAlert(cpuPct: number, threshold = 80): boolean {
  if (cpuPct > threshold) {
    return true;
  }
  return false;
}
/** Replicate the unhealthy detection: status text containing "unhealthy". */
function isUnhealthyAlert(status: string): boolean {
  const normalized = status.toLowerCase();
  return normalized.indexOf("unhealthy") !== -1;
}
/** Format alert title for the given reason. */
function alertTitle(containerName: string, reason: "cpu" | "unhealthy"): string {
  return reason === "cpu"
    ? `⚠️ High CPU: ${containerName}`
    : `🔴 Unhealthy Container: ${containerName}`;
}
/** Format alert content: one line each for container, CPU, memory, status. */
function alertContent(
  containerName: string,
  cpuPct: number,
  memPct: number,
  status: string
): string {
  const lines = [
    `Container: ${containerName}`,
    `CPU: ${cpuPct.toFixed(1)}%`,
    `Memory: ${memPct.toFixed(1)}%`,
    `Status: ${status}`,
  ];
  return lines.join("\n");
}
// ─── Tests ────────────────────────────────────────────────────────────────────
// Threshold boundary tests for isCpuAlert (strictly-greater-than semantics).
describe("metrics-collector: CPU alert threshold", () => {
  it("triggers alert when CPU > 80%", () => {
    for (const pct of [81, 100, 80.1]) {
      expect(isCpuAlert(pct)).toBe(true);
    }
  });
  it("does NOT trigger alert when CPU <= 80%", () => {
    for (const pct of [80, 79.9, 0]) {
      expect(isCpuAlert(pct)).toBe(false);
    }
  });
  it("respects custom threshold", () => {
    expect(isCpuAlert(60, 50)).toBe(true);
    expect(isCpuAlert(50, 50)).toBe(false);
  });
});
// Substring-detection tests for isUnhealthyAlert.
describe("metrics-collector: unhealthy detection", () => {
  it("detects unhealthy status", () => {
    const flagged = ["unhealthy", "(unhealthy)", "Up 2 hours (unhealthy)"];
    for (const status of flagged) {
      expect(isUnhealthyAlert(status)).toBe(true);
    }
  });
  it("does NOT flag healthy/running containers", () => {
    const ok = ["running", "Up 2 hours", "healthy", ""];
    for (const status of ok) {
      expect(isUnhealthyAlert(status)).toBe(false);
    }
  });
});
// String-shape tests for alertTitle / alertContent.
describe("metrics-collector: alert formatting", () => {
  it("formats CPU alert title correctly", () => {
    expect(alertTitle("goclaw-gateway", "cpu")).toBe("⚠️ High CPU: goclaw-gateway");
  });
  it("formats unhealthy alert title correctly", () => {
    expect(alertTitle("goclaw-db", "unhealthy")).toBe("🔴 Unhealthy Container: goclaw-db");
  });
  it("formats alert content with all fields", () => {
    const content = alertContent("my-container", 92.5, 45.3, "Up 1h (unhealthy)");
    const expectedParts = [
      "Container: my-container",
      "CPU: 92.5%",
      "Memory: 45.3%",
      "Status: Up 1h (unhealthy)",
    ];
    for (const part of expectedParts) {
      expect(content).toContain(part);
    }
  });
  it("formats CPU value with one decimal place", () => {
    expect(alertContent("c", 80.123, 0, "running")).toContain("CPU: 80.1%");
  });
});
// NOTE(review): these tests call the MOCKED notifyOwner directly — they
// exercise the mock's payload/return plumbing, not the collector's own
// call-site wiring.
describe("metrics-collector: notifyOwner integration", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });
  it("calls notifyOwner with correct payload for CPU alert", async () => {
    const mockNotify = vi.mocked(notifyOwner);
    mockNotify.mockResolvedValue(true);
    // Build the payload with the same helpers the collector replicates.
    const title = alertTitle("gateway", "cpu");
    const content = alertContent("gateway", 95, 30, "running");
    const result = await notifyOwner({ title, content });
    expect(mockNotify).toHaveBeenCalledWith({ title, content });
    expect(result).toBe(true);
  });
  it("handles notifyOwner failure gracefully", async () => {
    const mockNotify = vi.mocked(notifyOwner);
    mockNotify.mockResolvedValue(false);
    const result = await notifyOwner({
      title: "⚠️ High CPU: test",
      content: "Container: test\nCPU: 90.0%\nMemory: 50.0%\nStatus: running",
    });
    expect(result).toBe(false);
  });
});
// NOTE(review): these tests drive the mocked gateway-proxy functions directly;
// they verify the mock contract (availability flag, stats shape), not that
// the collector itself consults isGatewayAvailable before fetching.
describe("metrics-collector: gateway availability", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });
  it("skips collection when gateway is unavailable", async () => {
    vi.mocked(isGatewayAvailable).mockResolvedValue(false);
    vi.mocked(getGatewayNodeStats).mockResolvedValue(null);
    const available = await isGatewayAvailable();
    expect(available).toBe(false);
    // When unavailable, stats should not be fetched
    expect(getGatewayNodeStats).not.toHaveBeenCalled();
  });
  it("proceeds with collection when gateway is available", async () => {
    vi.mocked(isGatewayAvailable).mockResolvedValue(true);
    // Single-container fixture matching the gateway stats shape.
    vi.mocked(getGatewayNodeStats).mockResolvedValue({
      stats: [
        {
          id: "abc123",
          name: "goclaw-gateway",
          cpuPct: 5.2,
          memUseMB: 128,
          memLimMB: 512,
          memPct: 25.0,
        },
      ],
    });
    const available = await isGatewayAvailable();
    expect(available).toBe(true);
    const stats = await getGatewayNodeStats();
    expect(stats?.stats).toHaveLength(1);
    expect(stats?.stats[0].name).toBe("goclaw-gateway");
  });
});
// NOTE(review): ./db is fully mocked above, so these tests assert the mocked
// helpers' call/return plumbing and expected shapes — not real DB behavior.
describe("nodeMetrics db helpers", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });
  it("insertNodeMetric is callable", async () => {
    const mockInsert = vi.mocked(insertNodeMetric);
    mockInsert.mockResolvedValue(undefined);
    await insertNodeMetric({
      containerId: "abc123",
      containerName: "goclaw-gateway",
      cpuPct: 5.2,
      memUseMB: 128,
      memLimMB: 512,
      memPct: 25.0,
    });
    expect(mockInsert).toHaveBeenCalledOnce();
    expect(mockInsert).toHaveBeenCalledWith(
      expect.objectContaining({
        containerId: "abc123",
        containerName: "goclaw-gateway",
        cpuPct: 5.2,
      })
    );
  });
  it("getNodeMetricsHistory returns array", async () => {
    const mockGet = vi.mocked(getNodeMetricsHistory);
    // One-row fixture mirroring the nodeMetrics record shape.
    mockGet.mockResolvedValue([
      {
        id: 1,
        containerId: "abc123",
        containerName: "goclaw-gateway",
        cpuPct: 5.2,
        memUseMB: 128,
        memLimMB: 512,
        memPct: 25.0,
        recordedAt: Date.now(),
      },
    ]);
    const result = await getNodeMetricsHistory("abc123", 60);
    expect(result).toHaveLength(1);
    expect(result[0].containerId).toBe("abc123");
  });
  it("getLatestMetricsByContainer returns map-like structure", async () => {
    const mockLatest = vi.mocked(getLatestMetricsByContainer);
    mockLatest.mockResolvedValue({
      "goclaw-gateway": [
        { cpu: 5.2, mem: 25.0, ts: Date.now() },
        { cpu: 6.1, mem: 26.0, ts: Date.now() + 30000 },
      ],
    });
    const result = await getLatestMetricsByContainer(60);
    expect(result).toHaveProperty("goclaw-gateway");
    expect(result["goclaw-gateway"]).toHaveLength(2);
    expect(result["goclaw-gateway"][0]).toHaveProperty("cpu");
    expect(result["goclaw-gateway"][0]).toHaveProperty("mem");
  });
  it("getNodeMetricsHistory returns empty array when no data", async () => {
    const mockGet = vi.mocked(getNodeMetricsHistory);
    mockGet.mockResolvedValue([]);
    const result = await getNodeMetricsHistory("nonexistent", 60);
    expect(result).toEqual([]);
  });
});
// Self-contained model of the per-container alert cooldown behavior.
describe("metrics-collector: alert cooldown logic", () => {
  it("tracks last alert time per container", () => {
    const COOLDOWN_MS = 15 * 60 * 1000; // 15 minutes
    const lastFiredAt = new Map<string, number>();
    function shouldAlert(containerId: string, now = Date.now()): boolean {
      const last = lastFiredAt.get(containerId);
      if (!last) return true;
      return now - last > COOLDOWN_MS;
    }
    function recordAlert(containerId: string, now = Date.now()) {
      lastFiredAt.set(containerId, now);
    }
    const now = Date.now();
    // First alert — should fire
    expect(shouldAlert("container-1", now)).toBe(true);
    recordAlert("container-1", now);
    // Immediately after — should NOT fire (cooldown)
    expect(shouldAlert("container-1", now + 1000)).toBe(false);
    // After cooldown — should fire again
    expect(shouldAlert("container-1", now + COOLDOWN_MS + 1)).toBe(true);
    // Different container — unaffected
    expect(shouldAlert("container-2", now)).toBe(true);
  });
});

View File

@@ -1,144 +0,0 @@
/**
* Metrics Collector — background job that:
* 1. Polls Docker container stats every 30s via Go Gateway
* 2. Persists snapshots to nodeMetrics table
* 3. Fires owner alerts when CPU > 80% or container is unhealthy
* 4. Prunes records older than 2 hours to keep the table lean
*/
import { getGatewayNodeStats } from "./gateway-proxy";
import { saveNodeMetric, pruneOldNodeMetrics } from "./db";
import { notifyOwner } from "./_core/notification";
// ── Config ────────────────────────────────────────────────────────────────────
const COLLECT_INTERVAL_MS = 30_000; // 30 seconds between stat snapshots
const PRUNE_INTERVAL_MS = 30 * 60_000; // 30 minutes between prune passes
const CPU_ALERT_THRESHOLD = 80; // percent; collectOnce alerts at >= this value
const ALERT_COOLDOWN_MS = 10 * 60_000; // 10 min between repeated alerts per container
// ── State ─────────────────────────────────────────────────────────────────────
/** Track last alert time per container to avoid alert spam */
const lastAlertAt: Record<string, number> = {};
// Interval handles kept so stopMetricsCollector can cancel both loops.
let collectTimer: ReturnType<typeof setInterval> | null = null;
let pruneTimer: ReturnType<typeof setInterval> | null = null;
// Guard against starting the collector twice.
let isRunning = false;
// ── Core collector ────────────────────────────────────────────────────────────
/**
 * One collection pass: fetch container stats from the Go Gateway, persist a
 * snapshot per container, and fire owner alerts for containers over the CPU
 * or memory thresholds (subject to a per-container cooldown).
 *
 * Returns how many snapshots were saved and a summary line per alert sent.
 * Note: the module header says "CPU > 80%" but the check below uses >=.
 */
export async function collectOnce(): Promise<{ saved: number; alerts: string[] }> {
  const result = await getGatewayNodeStats();
  // Gateway down or no containers — nothing to save, nothing to alert on.
  if (!result || !result.stats.length) {
    return { saved: 0, alerts: [] };
  }
  const alerts: string[] = [];
  let saved = 0;
  for (const stat of result.stats) {
    // Persist snapshot (values rounded to 2 decimals, stored as strings)
    await saveNodeMetric({
      containerId: stat.id,
      containerName: stat.name,
      cpuPercent: String(Math.round(stat.cpuPct * 100) / 100),
      memUsedMb: String(Math.round(stat.memUseMB * 100) / 100),
      memLimitMb: String(Math.round(stat.memLimMB * 100) / 100),
      status: "running",
    });
    saved++;
    // ── Alert logic ──────────────────────────────────────────────────────────
    // Skip alert evaluation entirely while this container is in cooldown.
    const now = Date.now();
    const lastAlert = lastAlertAt[stat.id] ?? 0;
    const cooldownExpired = now - lastAlert > ALERT_COOLDOWN_MS;
    if (!cooldownExpired) continue;
    const isCpuHigh = stat.cpuPct >= CPU_ALERT_THRESHOLD;
    // GatewayContainerStat doesn't have status field — detect unhealthy via memPct > 95
    const isMemCritical = stat.memPct >= 95;
    if (isCpuHigh || isMemCritical) {
      const reasons: string[] = [];
      if (isCpuHigh) reasons.push(`CPU ${stat.cpuPct.toFixed(1)}% ≥ ${CPU_ALERT_THRESHOLD}%`);
      if (isMemCritical) reasons.push(`Memory ${stat.memPct.toFixed(1)}% ≥ 95%`);
      const memMb = Math.round(stat.memUseMB);
      const title = `⚠️ GoClaw Alert: ${stat.name}`;
      const content = [
        `Container **${stat.name}** requires attention:`,
        ...reasons.map(r => `- ${r}`),
        ``,
        `Memory: ${memMb} MB`,
        `Time: ${new Date().toISOString()}`,
      ].join("\n");
      try {
        await notifyOwner({ title, content });
        // Cooldown is recorded only on successful delivery, so a failed
        // notification is retried on the next collection pass.
        lastAlertAt[stat.id] = now;
        alerts.push(`${stat.name}: ${reasons.join(", ")}`);
        console.log(`[MetricsCollector] Alert sent for ${stat.name}: ${reasons.join(", ")}`);
      } catch (err) {
        console.error(`[MetricsCollector] Failed to send alert for ${stat.name}:`, err);
      }
    }
  }
  return { saved, alerts };
}
// ── Prune ─────────────────────────────────────────────────────────────────────
/**
 * Delete metric rows older than 2 hours. Never rejects — failures are logged
 * and swallowed so the recurring prune timer keeps running.
 */
function pruneOld(): Promise<void> {
  return pruneOldNodeMetrics(2)
    .then(() => {
      console.log("[MetricsCollector] Pruned metrics older than 2h");
    })
    .catch((err) => {
      console.error("[MetricsCollector] Prune error:", err);
    });
}
// ── Lifecycle ─────────────────────────────────────────────────────────────────
/**
 * Start the background collection loops: one 10s-delayed first collection,
 * a recurring collection every COLLECT_INTERVAL_MS, and a recurring prune
 * every PRUNE_INTERVAL_MS. Idempotent — a second call while running is a
 * no-op. Note: the first-collection setTimeout handle is not stored, so
 * stopMetricsCollector cannot cancel that one-shot run.
 */
export function startMetricsCollector(): void {
  if (isRunning) {
    console.warn("[MetricsCollector] Already running, skipping start");
    return;
  }
  isRunning = true;
  // First collection after 10s (let server fully start)
  setTimeout(async () => {
    console.log("[MetricsCollector] Starting first collection...");
    // Errors are converted to an empty result so the log line below still prints.
    const r = await collectOnce().catch(e => {
      console.error("[MetricsCollector] First collection error:", e);
      return { saved: 0, alerts: [] };
    });
    console.log(`[MetricsCollector] First collection: saved=${r.saved}, alerts=${r.alerts.length}`);
  }, 10_000);
  // Recurring collection every 30s
  collectTimer = setInterval(async () => {
    const r = await collectOnce().catch(e => {
      console.error("[MetricsCollector] Collection error:", e);
      return { saved: 0, alerts: [] };
    });
    // Only log when something was actually persisted (keeps logs quiet when idle).
    if (r.saved > 0) {
      console.log(`[MetricsCollector] Collected ${r.saved} snapshots${r.alerts.length ? `, ${r.alerts.length} alert(s)` : ""}`);
    }
  }, COLLECT_INTERVAL_MS);
  // Prune every 30 minutes
  pruneTimer = setInterval(pruneOld, PRUNE_INTERVAL_MS);
  console.log(`[MetricsCollector] Started — collecting every ${COLLECT_INTERVAL_MS / 1000}s, pruning every ${PRUNE_INTERVAL_MS / 60_000}min`);
}
/**
 * Cancel both recurring timers and clear the running flag so the collector
 * can be started again later.
 */
export function stopMetricsCollector(): void {
  if (collectTimer) {
    clearInterval(collectTimer);
    collectTimer = null;
  }
  if (pruneTimer) {
    clearInterval(pruneTimer);
    pruneTimer = null;
  }
  isRunning = false;
  console.log("[MetricsCollector] Stopped");
}

View File

@@ -1,9 +1,10 @@
import { COOKIE_NAME } from "@shared/const";
import { z } from "zod";
import { getDb, getNodeMetricsHistory, getLatestNodeMetrics } from "./db";
import { getDb } from "./db";
import { getSessionCookieOptions } from "./_core/cookies";
import { systemRouter } from "./_core/systemRouter";
import { publicProcedure, router, protectedProcedure } from "./_core/trpc";
import { retryWithBackoff, isRetryableError, logRetryAttempt, DEFAULT_RETRY_CONFIG } from "./chat-resilience";
import { checkOllamaHealth, listModels, chatCompletion } from "./ollama";
import {
checkGatewayHealth,
@@ -531,24 +532,38 @@ export const appRouter = router({
})
)
.mutation(async ({ input }) => {
  // Wrap chat with retry logic for resilience.
  // FIX: removed the unreachable trailing `return { ...result, source: "direct" }`
  // left behind after the retry refactor — it sat after the function's return
  // statement and referenced `result`, which is not in scope there (compile error).
  return retryWithBackoff(
    async () => {
      // Try Go Gateway first (preferred — full Go tool-use loop)
      const gwAvailable = await isGatewayAvailable();
      if (gwAvailable) {
        const result = await gatewayChat(
          input.messages,
          input.model,
          input.maxIterations ?? 10
        );
        return { ...result, source: "gateway" as const };
      }
      // Fallback: Node.js orchestrator
      const { orchestratorChat } = await import("./orchestrator");
      const result = await orchestratorChat(
        input.messages,
        input.model,
        input.maxIterations ?? 10
      );
      return { ...result, source: "direct" as const };
    },
    DEFAULT_RETRY_CONFIG,
    (attempt, error) => {
      if (isRetryableError(error)) {
        logRetryAttempt(attempt, error, { messageCount: input.messages.length });
      } else {
        // Non-retryable error: rethrow so retryWithBackoff aborts immediately
        throw error;
      }
    }
  );
}),
// List available tools — Go Gateway first
@@ -706,50 +721,6 @@ export const appRouter = router({
}
return result;
}),
/**
* Get historical metrics for a specific container (last 60 min, sampled every 30s)
*/
metricsHistory: publicProcedure
.input(z.object({ containerId: z.string() }))
.query(async ({ input }) => {
const history = await getNodeMetricsHistory(input.containerId, 60);
// Return in chronological order for sparkline rendering
const sorted = [...history].reverse();
return {
containerId: input.containerId,
points: sorted.map(m => ({
cpu: Number(m.cpuPercent),
mem: Number(m.memUsedMb),
ts: m.recordedAt.getTime(),
})),
count: sorted.length,
};
}),
/**
* Get latest metrics snapshot for all containers (last 30 min)
*/
allMetricsLatest: publicProcedure.query(async () => {
const metrics = await getLatestNodeMetrics();
// Group by containerId, keep last 120 points each
const grouped: Record<string, { cpu: number; mem: number; ts: number }[]> = {};
for (const m of metrics) {
if (!grouped[m.containerId]) grouped[m.containerId] = [];
if (grouped[m.containerId].length < 120) {
grouped[m.containerId].push({
cpu: Number(m.cpuPercent),
mem: Number(m.memUsedMb),
ts: m.recordedAt.getTime(),
});
}
}
// Reverse each group to chronological order
for (const id of Object.keys(grouped)) {
grouped[id] = grouped[id].reverse();
}
return { byContainer: grouped, fetchedAt: new Date().toISOString() };
}),
}),
});
export type AppRouter = typeof appRouter;

28
todo.md
View File

@@ -200,7 +200,7 @@
- [x] Fix header metrics: UPTIME/NODES/AGENTS/CPU/MEM show hardcoded data instead of real values
- [x] Connect header stats to real tRPC endpoints (agents count from DB, nodes/CPU/MEM from Docker API)
- [x] Write vitest tests for header stats procedure (82 tests total, all pass)
- [x] Commit to Gitea and deploy to production (Phase 14) — verified: nodes=6/6, agents=6, CPU=0.2%, MEM=645MB, gatewayOnline=true
- [x] Commit to Gitea and deploy to production (Phase 16) — verified: auto-migrate ran, seed skipped (6 agents exist), metrics-collector started, nodes.metricsHistory endpoint ready — verified: nodes=6/6, agents=6, CPU=0.2%, MEM=645MB, gatewayOnline=true
## Phase 15 (Bug Fix): Agents Page Shows Empty List
- [x] Diagnose: find why /agents page shows no agents (userId=0 in seed vs SYSTEM_USER_ID=1 in router)
@@ -209,12 +209,20 @@
- [x] Deploy to production (Phase 15) — verified: 6 agents visible (GoClaw Orchestrator, Browser Agent, Tool Builder, Agent Compiler, Coder Agent, Researcher)
## Phase 16: Auto-migrate + Historical Metrics + Alerts
- [x] Create docker/entrypoint.sh with drizzle-kit migrate before server start
- [x] Update Dockerfile.control-center to use entrypoint.sh
- [x] Add nodeMetrics table to drizzle/schema.ts
- [x] Add db helpers: insertNodeMetric, getNodeMetricsHistory, getLatestMetricsByContainer in server/db.ts
- [x] Add tRPC endpoints: nodes.metricsHistory + nodes.allMetricsLatest
- [x] Add background job: server/metrics-collector.ts — collect every 30s, alert CPU>80% or unhealthy, 15min cooldown
- [x] Update Nodes.tsx: inline SVG Sparkline component, CPU 1h history per container
- [x] Write vitest tests for metrics-collector (104 tests total, all pass)
- [ ] Commit to Gitea and deploy to production (Phase 16)
- [ ] Create docker/entrypoint.sh with drizzle-kit migrate before server start
- [ ] Update Dockerfile.control-center to use entrypoint.sh
- [ ] Add nodeMetrics table to drizzle/schema.ts and run pnpm db:push
- [ ] Add db helpers: saveNodeMetric, getNodeMetricsHistory in server/db.ts
- [ ] Add tRPC endpoint: nodes.metricsHistory (last 1h per container)
- [ ] Add background job: collect CPU/MEM every 30s, alert on CPU>80% or unhealthy
- [ ] Update Nodes.tsx: sparkline charts per container card (recharts)
- [ ] Write vitest tests for new components
- [ ] Commit to Gitea and deploy to production
## Phase 17: Chat Resilience & Retry Logic
- [x] Diagnose: find why chat interrupts (timeout, LLM error, Gateway unavailable)
- [x] Create server/chat-resilience.ts: retryWithBackoff, exponential backoff, error classification
- [x] Add retry logic to orchestrator.chat with exponential backoff (3 attempts, 1s/2s/4s)
- [x] Update Chat.tsx: retry state, auto-retry on network errors, retry indicator
- [x] Write vitest tests for retry logic (17 tests, all pass — 103 total tests pass)
- [ ] Commit to Gitea and deploy to production (Phase 17)