diff --git a/.kilo/agents/pipeline-judge.md b/.kilo/agents/pipeline-judge.md
index d28a332..b1f6577 100644
--- a/.kilo/agents/pipeline-judge.md
+++ b/.kilo/agents/pipeline-judge.md
@@ -49,26 +49,55 @@ where:
 
 ## Execution Protocol
 
-### Step 1: Collect Metrics
+### Step 1: Collect Metrics (Docker-first for precision)
 
 ```bash
-# Run test suite
-bun test --reporter=json > /tmp/test-results.json 2>&1
-bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1
+# Prefer Docker for consistent measurements with millisecond precision
+if command -v docker &> /dev/null && docker info &> /dev/null; then
+  echo "Using Docker container for precise measurements..."
+
+  # Run tests in container with millisecond timing
+  START_MS=$(date +%s%3N)
+  docker-compose -f docker/evolution-test/docker-compose.yml run --rm evolution-feature \
+    bun test --reporter=json --coverage 2>&1 | tee /tmp/test-results.json
+  END_MS=$(date +%s%3N)
+
+  TIME_MS=$((END_MS - START_MS))
+  echo "Execution time: ${TIME_MS}ms"
+else
+  echo "Running locally (Docker not available, less precise)..."
+
+  START_MS=$(date +%s%3N)
+  bun test --reporter=json --coverage > /tmp/test-results.json 2>&1
+  END_MS=$(date +%s%3N)
+
+  TIME_MS=$((END_MS - START_MS))
+fi
 
-# Count results
-TOTAL=$(jq '.numTotalTests' /tmp/test-results.json)
-PASSED=$(jq '.numPassedTests' /tmp/test-results.json)
-FAILED=$(jq '.numFailedTests' /tmp/test-results.json)
+# Run additional test suites
+bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1 || true
 
-# Check build
+# Parse test results (summed over every JSON report appended above)
+TOTAL=$(jq -s 'map(.numTotalTests // 0) | add // 0' /tmp/test-results.json)
+PASSED=$(jq -s 'map(.numPassedTests // 0) | add // 0' /tmp/test-results.json)
+FAILED=$(jq -s 'map(.numFailedTests // 0) | add // 0' /tmp/test-results.json)
+SKIPPED=$(jq -s 'map(.numSkippedTests // 0) | add // 0' /tmp/test-results.json)
+
+# Calculate pass rate with 2 decimals
+if [ "$TOTAL" -gt 0 ]; then
+  PASS_RATE=$(awk "BEGIN {printf \"%.2f\", $PASSED / $TOTAL * 100}")
+else
+  PASS_RATE="0.00"
+fi
+
+# Check quality gates
 bun run build 2>&1 && BUILD_OK=true || BUILD_OK=false
-
-# Check lint
-bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
-
-# Check types
+bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
 bun run typecheck 2>&1 && TYPES_OK=true || TYPES_OK=false
+
+# Get coverage with 2 decimal precision (falls back to 0.00 when no summary row is printed)
+COVERAGE=$(bun test --coverage 2>&1 | awk '/All files/ {printf "%.2f", $4; found = 1; exit} END {if (!found) printf "0.00"}')
+COVERAGE_OK=$(awk -v cov="$COVERAGE" 'BEGIN {print (cov >= 80 ? 1 : 0)}')
 ```
 
 ### Step 2: Read Pipeline Log
diff --git a/.kilo/logs/fitness-history.jsonl b/.kilo/logs/fitness-history.jsonl
index cb4bff8..1cba1c2 100644
--- a/.kilo/logs/fitness-history.jsonl
+++ b/.kilo/logs/fitness-history.jsonl
@@ -1 +1,2 @@
-{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
\ No newline at end of file
+{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
+{"ts":"2026-04-06T00:32:00Z","issue":31,"workflow":"feature","fitness":0.52,"breakdown":{"test_pass_rate":0.45,"quality_gates_rate":0.80,"efficiency_score":0.44},"tokens":35000,"time_ms":170000,"tests_passed":0,"tests_total":5,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer","code-skeptic","performance-engineer","security-auditor","release-manager","evaluator","pipeline-judge"],"verdict":"MARGINAL","improvement_trigger":true}
diff --git a/docker/evolution-test/Dockerfile b/docker/evolution-test/Dockerfile
new file mode 100644
index 0000000..999d13a
--- /dev/null
+++ b/docker/evolution-test/Dockerfile
@@ -0,0 +1,25 @@
+# Evolution Test Container
+# Used for testing pipeline-judge fitness scoring with precise measurements
+
+FROM oven/bun:1 AS base
+
+WORKDIR /app
+
+# Install TypeScript and testing tools
+RUN bun add -g typescript @types/node
+
+# Copy project files
+COPY . /app/
+
+# Install dependencies
+RUN bun install
+
+# Create logs directory
+RUN mkdir -p .kilo/logs
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s \
+  CMD bun test --reporter=json || exit 1
+
+# Default command - run tests with precise timing
+CMD ["bun", "test", "--reporter=json"]
\ No newline at end of file
diff --git a/docker/evolution-test/docker-compose.yml b/docker/evolution-test/docker-compose.yml
new file mode 100644
index 0000000..3cec235
--- /dev/null
+++ b/docker/evolution-test/docker-compose.yml
@@ -0,0 +1,88 @@
+# Evolution Test Containers
+# Run multiple workflow tests in parallel
+
+version: '3.8'
+
+services:
+  # Evolution test runner for feature workflow
+  evolution-feature:
+    build:
+      context: ../..
+      dockerfile: docker/evolution-test/Dockerfile
+    container_name: evolution-feature
+    environment:
+      - WORKFLOW_TYPE=feature
+      - TOKEN_BUDGET=50000
+      - TIME_BUDGET=300
+      - MIN_COVERAGE=80
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+      - ../../src:/app/src
+    command: bun test --reporter=json --coverage
+
+  # Evolution test runner for bugfix workflow
+  evolution-bugfix:
+    build:
+      context: ../..
+      dockerfile: docker/evolution-test/Dockerfile
+    container_name: evolution-bugfix
+    environment:
+      - WORKFLOW_TYPE=bugfix
+      - TOKEN_BUDGET=20000
+      - TIME_BUDGET=120
+      - MIN_COVERAGE=90
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+      - ../../src:/app/src
+    command: bun test --reporter=json --coverage
+
+  # Evolution test runner for refactor workflow
+  evolution-refactor:
+    build:
+      context: ../..
+      dockerfile: docker/evolution-test/Dockerfile
+    container_name: evolution-refactor
+    environment:
+      - WORKFLOW_TYPE=refactor
+      - TOKEN_BUDGET=40000
+      - TIME_BUDGET=240
+      - MIN_COVERAGE=95
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+      - ../../src:/app/src
+    command: bun test --reporter=json --coverage
+
+  # Evolution test runner for security workflow
+  evolution-security:
+    build:
+      context: ../..
+      dockerfile: docker/evolution-test/Dockerfile
+    container_name: evolution-security
+    environment:
+      - WORKFLOW_TYPE=security
+      - TOKEN_BUDGET=30000
+      - TIME_BUDGET=180
+      - MIN_COVERAGE=80
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+      - ../../src:/app/src
+    command: bun test --reporter=json --coverage
+
+  # Fitness aggregator - collects results from all containers
+  fitness-aggregator:
+    image: oven/bun:1
+    container_name: fitness-aggregator
+    depends_on:
+      - evolution-feature
+      - evolution-bugfix
+      - evolution-refactor
+      - evolution-security
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+    working_dir: /app
+    command: |
+      sh -c "
+        echo 'Aggregating fitness scores...'
+        tail -n 4 .kilo/logs/fitness-history.jsonl > .kilo/logs/fitness-latest.jsonl
+        echo 'Fitness aggregation complete.'
+      "
\ No newline at end of file
diff --git a/docker/evolution-test/run-evolution-test.bat b/docker/evolution-test/run-evolution-test.bat
new file mode 100644
index 0000000..1c44e77
--- /dev/null
+++ b/docker/evolution-test/run-evolution-test.bat
@@ -0,0 +1,65 @@
+@echo off
+REM Evolution Test Runner for Windows
+REM Runs pipeline-judge tests with precise measurements
+
+setlocal enabledelayedexpansion
+
+echo === Evolution Test Runner ===
+echo.
+
+REM Check Docker
+where docker >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Error: Docker not found
+    echo Please install Docker Desktop first:
+    echo   winget install Docker.DockerDesktop
+    echo.
+    echo Or run tests locally ^(less precise^):
+    echo   bun test --reporter=json --coverage
+    exit /b 1
+)
+
+REM Check Docker daemon
+docker info >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Warning: Docker daemon not running
+    echo Please start Docker Desktop and try again
+    exit /b 1
+)
+
+REM Get workflow type
+set WORKFLOW=%1
+if "%WORKFLOW%"=="" set WORKFLOW=feature
+
+echo Running evolution test for: %WORKFLOW%
+echo.
+
+REM Build container
+echo Building evolution test container...
+docker-compose -f docker/evolution-test/docker-compose.yml build
+
+REM Run test
+if "%WORKFLOW%"=="all" (
+    echo Running ALL workflow tests in parallel...
+    docker-compose -f docker/evolution-test/docker-compose.yml up
+    docker-compose -f docker/evolution-test/docker-compose.yml up fitness-aggregator
+) else (
+    docker-compose -f docker/evolution-test/docker-compose.yml up evolution-%WORKFLOW%
+)
+
+REM Show results
+echo.
+echo === Test Results ===
+if exist .kilo\logs\fitness-history.jsonl (
+    echo Latest fitness scores:
+    powershell -Command "Get-Content .kilo\logs\fitness-history.jsonl -Tail 4 | ForEach-Object { $j = $_ | ConvertFrom-Json; Write-Host ('  ' + $j.workflow + ': fitness=' + $j.fitness + ', time=' + $j.time_ms + 'ms, tokens=' + $j.tokens) }"
+) else (
+    echo No fitness history found
+)
+
+REM Cleanup
+echo.
+echo Cleaning up...
+docker-compose -f docker/evolution-test/docker-compose.yml down -v 2>nul
+
+echo Done!
\ No newline at end of file
diff --git a/docker/evolution-test/run-evolution-test.sh b/docker/evolution-test/run-evolution-test.sh
new file mode 100644
index 0000000..c222e20
--- /dev/null
+++ b/docker/evolution-test/run-evolution-test.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Evolution Test Runner
+# Runs pipeline-judge tests with precise measurements
+#
+# Usage: run-evolution-test.sh [feature|bugfix|refactor|security|all]
+
+set -euo pipefail
+
+# Compose/log paths are relative to the repository root, so run from there
+# no matter where the script was invoked.
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+
+readonly COMPOSE_FILE="docker/evolution-test/docker-compose.yml"
+readonly HISTORY_FILE=".kilo/logs/fitness-history.jsonl"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Tear down containers and volumes; installed as an EXIT trap so it also
+# runs when a build or test step aborts the script.
+cleanup() {
+    echo ""
+    echo -e "${BLUE}Cleaning up...${NC}"
+    docker-compose -f "$COMPOSE_FILE" down -v 2>/dev/null || true
+}
+
+echo -e "${BLUE}=== Evolution Test Runner ===${NC}"
+echo ""
+
+# Validate the workflow first so a typo fails before any image is built
+WORKFLOW=${1:-feature}
+case "$WORKFLOW" in
+    feature|bugfix|refactor|security|all) ;;
+    *)
+        echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}"
+        echo "Usage: $0 [feature|bugfix|refactor|security|all]"
+        exit 1
+        ;;
+esac
+
+# Check Docker
+if ! command -v docker &> /dev/null; then
+    echo -e "${RED}Error: Docker not found${NC}"
+    echo "Please install Docker Desktop first:"
+    echo "  winget install Docker.DockerDesktop"
+    echo ""
+    echo "Or use alternatives:"
+    echo "  1. Use WSL2 with Docker"
+    echo "  2. Run tests locally (less precise):"
+    echo "     bun test --reporter=json --coverage"
+    exit 1
+fi
+
+# Docker daemon check: try to launch Docker Desktop, then poll (up to ~60s)
+# instead of sleeping blindly and failing later in the build step.
+if ! docker info &> /dev/null; then
+    echo -e "${YELLOW}Warning: Docker daemon not running${NC}"
+    echo "Starting Docker Desktop..."
+    open -a "Docker" 2>/dev/null || start "Docker Desktop" 2>/dev/null || true
+    for _ in {1..30}; do
+        docker info &> /dev/null && break
+        sleep 2
+    done
+    if ! docker info &> /dev/null; then
+        echo -e "${RED}Error: Docker daemon did not start${NC}" >&2
+        exit 1
+    fi
+fi
+
+trap cleanup EXIT
+
+# Build evolution test container
+echo -e "${BLUE}Building evolution test container...${NC}"
+docker-compose -f "$COMPOSE_FILE" build
+
+# Run specific workflow test
+echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"
+if [[ "$WORKFLOW" == "all" ]]; then
+    echo -e "${BLUE}Running ALL workflow tests in parallel...${NC}"
+    docker-compose -f "$COMPOSE_FILE" up
+    docker-compose -f "$COMPOSE_FILE" up fitness-aggregator
+else
+    docker-compose -f "$COMPOSE_FILE" up "evolution-${WORKFLOW}"
+fi
+
+# Parse results
+echo ""
+echo -e "${BLUE}=== Test Results ===${NC}"
+if [[ -f "$HISTORY_FILE" ]]; then
+    echo -e "${GREEN}Latest fitness scores:${NC}"
+    # One jq pass over the last records; jq also copes with a missing
+    # trailing newline, which a `while read` loop would silently drop.
+    tail -n 4 "$HISTORY_FILE" \
+        | jq -r '"  \(.workflow // ""): fitness=\(.fitness // ""), time=\(.time_ms // "")ms, tokens=\(.tokens // "")"' \
+        || echo -e "${YELLOW}Could not parse fitness history${NC}" >&2
+else
+    echo -e "${YELLOW}No fitness history found${NC}"
+fi
+
+# Cleanup (run explicitly so "Done!" prints last, then disarm the trap)
+trap - EXIT
+cleanup
+
+echo -e "${GREEN}Done!${NC}"