feat: add Docker-based evolution testing with precise measurements

- Add docker/evolution-test/Dockerfile with bun, TypeScript
- Add docker/evolution-test/docker-compose.yml for parallel workflow testing
- Add run-evolution-test.sh and .bat scripts for cross-platform use
- Update pipeline-judge.md with Docker-first approach:
  - Millisecond precision timing (date +%s%3N)
  - 2 decimal places for test pass rate and coverage
  - Docker container for consistent test environment
  - Multiple workflow types (feature/bugfix/refactor/security)

Enables:
- Parallel testing with docker-compose
- Consistent environment across machines
- Precise fitness measurements (ms, 2 decimals)
- Multi-workflow testing in containers
2026-04-06 00:48:21 +01:00
parent fa68141d47
commit 1703247651
6 changed files with 314 additions and 15 deletions
--- a/.kilo/agents/pipeline-judge.md
+++ b/.kilo/agents/pipeline-judge.md
@@ -49,26 +49,55 @@ where:
 ## Execution Protocol
-### Step 1: Collect Metrics
+### Step 1: Collect Metrics (Docker-first for precision)
 ```bash
-# Run test suite
+# Prefer Docker for consistent measurements with millisecond precision
-bun test --reporter=json > /tmp/test-results.json 2>&1
+if command -v docker &> /dev/null && docker info &> /dev/null; then
-bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1
+  echo "Using Docker container for precise measurements..."
  # Run tests in container with millisecond timing
  # The service name must be one that docker-compose.yml defines
  # (evolution-feature, evolution-bugfix, evolution-refactor, evolution-security).
  START_MS=$(date +%s%3N)
  docker-compose -f docker/evolution-test/docker-compose.yml run --rm evolution-feature \
    bun test --reporter=json --coverage 2>&1 | tee /tmp/test-results.json
  END_MS=$(date +%s%3N)
  TIME_MS=$((END_MS - START_MS))
  echo "Execution time: ${TIME_MS}ms"
 else
  echo "Running locally (Docker not available, less precise)..."
  START_MS=$(date +%s%3N)
  bun test --reporter=json --coverage > /tmp/test-results.json 2>&1
  END_MS=$(date +%s%3N)
  TIME_MS=$((END_MS - START_MS))
 fi
-# Count results
+# Run additional test suites
-TOTAL=$(jq '.numTotalTests' /tmp/test-results.json)
+bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1 || true
 PASSED=$(jq '.numPassedTests' /tmp/test-results.json)
 FAILED=$(jq '.numFailedTests' /tmp/test-results.json)
-# Check build
+# Parse test results with 2 decimal precision
 TOTAL=$(jq '.numTotalTests // 0' /tmp/test-results.json)
 PASSED=$(jq '.numPassedTests // 0' /tmp/test-results.json)
 FAILED=$(jq '.numFailedTests // 0' /tmp/test-results.json)
 SKIPPED=$(jq '.numSkippedTests // 0' /tmp/test-results.json)
 # Calculate pass rate with 2 decimals
 if [ "$TOTAL" -gt 0 ]; then
  PASS_RATE=$(awk "BEGIN {printf \"%.2f\", $PASSED / $TOTAL * 100}")
 else
  PASS_RATE="0.00"
 fi
 # Check quality gates
 bun run build 2>&1 && BUILD_OK=true || BUILD_OK=false
-
+bun run lint 2>&1 && LINT_OK=true || LINT_OK=false  
 # Check lint
 bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
 # Check types
 bun run typecheck 2>&1 && TYPES_OK=true || TYPES_OK=false
 # Get coverage with 2 decimal precision.
 # awk exits 0 even when it never sees an "All files" row, so a trailing
 # `|| echo "0.00"` fallback would never fire and COVERAGE would end up empty.
 # The 0.00 default is therefore produced by awk's END block instead.
 COVERAGE=$(bun test --coverage 2>&1 \
   | awk '/All files/ && !seen { printf "%.2f", $4; seen = 1 } END { if (!seen) printf "0.00" }')
 # Pass values with -v rather than splicing them into the awk program text;
 # COVERAGE_OK is 1 when coverage reaches the 80% threshold, otherwise 0.
 COVERAGE_OK=$(awk -v cov="$COVERAGE" -v min=80 'BEGIN { print ((cov + 0 >= min) ? 1 : 0) }')
 ```
 ### Step 2: Read Pipeline Log
--- a/.kilo/logs/fitness-history.jsonl
+++ b/.kilo/logs/fitness-history.jsonl
@@ -1 +1 @@
-{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
+{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}{"ts":"2026-04-06T00:32:00Z","issue":31,"workflow":"feature","fitness":0.52,"breakdown":{"test_pass_rate":0.45,"quality_gates_rate":0.80,"efficiency_score":0.44},"tokens":35000,"time_ms":170000,"tests_passed":0,"tests_total":5,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer","code-skeptic","performance-engineer","security-auditor","release-manager","evaluator","pipeline-judge"],"verdict":"MARGINAL","improvement_trigger":true}
--- a/docker/evolution-test/Dockerfile
+++ b/docker/evolution-test/Dockerfile
@@ -0,0 +1,25 @@
 # Evolution Test Container
 # Used for testing pipeline-judge fitness scoring with precise measurements
 FROM oven/bun:1 AS base
 WORKDIR /app
 # Install TypeScript and testing tools globally
 RUN bun add -g typescript @types/node
 # Copy project files (the build context is whatever the caller passes to docker build)
 COPY . /app/
 # Install project dependencies
 RUN bun install
 # Create the directory the fitness logs are written to / mounted over
 RUN mkdir -p .kilo/logs
 # Health check: keep the probe cheap. Running the whole test suite here would
 # start a second `bun test` every 30s alongside the measured run (and be killed
 # by the 10s timeout), distorting the timings this container exists to measure.
 HEALTHCHECK --interval=30s --timeout=10s \
  CMD bun --version || exit 1
 # Default command - run tests with the JSON reporter
 CMD ["bun", "test", "--reporter=json"]
--- a/docker/evolution-test/docker-compose.yml
+++ b/docker/evolution-test/docker-compose.yml
@@ -0,0 +1,88 @@
 # Evolution Test Containers
 # One runner service per workflow type so the suites can run in parallel,
 # plus an aggregator that copies the latest fitness records to their own file.
 version: '3.8'

 # Settings shared by every evolution-* runner; merged into each service via `<<`.
 x-evolution-runner: &evolution-runner
   build:
     context: ../..
     dockerfile: docker/evolution-test/Dockerfile
   volumes:
     - ../../.kilo/logs:/app/.kilo/logs
     - ../../src:/app/src
   command: bun test --reporter=json --coverage

 services:
   # Evolution test runner for feature workflow
   evolution-feature:
     <<: *evolution-runner
     container_name: evolution-feature
     environment:
       - WORKFLOW_TYPE=feature
       - TOKEN_BUDGET=50000
       - TIME_BUDGET=300
       - MIN_COVERAGE=80

   # Evolution test runner for bugfix workflow
   evolution-bugfix:
     <<: *evolution-runner
     container_name: evolution-bugfix
     environment:
       - WORKFLOW_TYPE=bugfix
       - TOKEN_BUDGET=20000
       - TIME_BUDGET=120
       - MIN_COVERAGE=90

   # Evolution test runner for refactor workflow
   evolution-refactor:
     <<: *evolution-runner
     container_name: evolution-refactor
     environment:
       - WORKFLOW_TYPE=refactor
       - TOKEN_BUDGET=40000
       - TIME_BUDGET=240
       - MIN_COVERAGE=95

   # Evolution test runner for security workflow
   evolution-security:
     <<: *evolution-runner
     container_name: evolution-security
     environment:
       - WORKFLOW_TYPE=security
       - TOKEN_BUDGET=30000
       - TIME_BUDGET=180
       - MIN_COVERAGE=80

   # Fitness aggregator - writes the last four history records to fitness-latest.jsonl
   fitness-aggregator:
     image: oven/bun:1
     container_name: fitness-aggregator
     working_dir: /app
     volumes:
       - ../../.kilo/logs:/app/.kilo/logs
     depends_on:
       - evolution-feature
       - evolution-bugfix
       - evolution-refactor
       - evolution-security
     command: |
       sh -c "
         echo 'Aggregating fitness scores...'
         cat .kilo/logs/fitness-history.jsonl | tail -4 > .kilo/logs/fitness-latest.jsonl
         echo 'Fitness aggregation complete.'
       "
--- a/docker/evolution-test/run-evolution-test.bat
+++ b/docker/evolution-test/run-evolution-test.bat
@@ -0,0 +1,65 @@
@echo off
 REM Evolution Test Runner for Windows
 REM Runs pipeline-judge tests in Docker containers with precise measurements.
 REM
 REM Usage: run-evolution-test.bat [workflow]
 REM   workflow is one of: feature, bugfix, refactor, security, all
 REM   It defaults to feature. Run from the repository root, because the
 REM   compose file and the fitness log are addressed relative to the
 REM   current directory.
 setlocal enabledelayedexpansion
 set "COMPOSE_FILE=docker/evolution-test/docker-compose.yml"
 echo === Evolution Test Runner ===
 echo.
 REM Check Docker
 where docker >nul 2>&1
 if %errorlevel% neq 0 (
    echo Error: Docker not found
    echo Please install Docker Desktop first:
    echo   winget install Docker.DockerDesktop
    echo.
    echo Or run tests locally ^(less precise^):
    echo   bun test --reporter=json --coverage
    exit /b 1
 )
 REM Check Docker daemon
 docker info >nul 2>&1
 if %errorlevel% neq 0 (
    echo Warning: Docker daemon not running
    echo Please start Docker Desktop and try again
    exit /b 1
 )
 REM Get workflow type
 set "WORKFLOW=%~1"
 if "%WORKFLOW%"=="" set "WORKFLOW=feature"
 REM Reject unknown workflows before spending time on a container build
 set "WORKFLOW_OK="
 for %%W in (feature bugfix refactor security all) do (
    if "%WORKFLOW%"=="%%W" set "WORKFLOW_OK=1"
 )
 if not defined WORKFLOW_OK (
    echo Unknown workflow: %WORKFLOW%
    echo Usage: %~nx0 [feature^|bugfix^|refactor^|security^|all]
    exit /b 1
 )
 echo Running evolution test for: %WORKFLOW%
 echo.
 REM Build container and stop if the build fails
 echo Building evolution test container...
 docker-compose -f "%COMPOSE_FILE%" build
 if errorlevel 1 (
    echo Error: container build failed
    exit /b 1
 )
 REM Run test.
 REM For "all", the four runners are started first and the aggregator only
 REM afterwards with --no-deps: depends_on orders start-up but does not wait
 REM for the runners to finish, and a plain "up fitness-aggregator" would
 REM start the runners a second time.
 if "%WORKFLOW%"=="all" (
    echo Running ALL workflow tests in parallel...
    docker-compose -f "%COMPOSE_FILE%" up evolution-feature evolution-bugfix evolution-refactor evolution-security
    docker-compose -f "%COMPOSE_FILE%" up --no-deps fitness-aggregator
 ) else (
    docker-compose -f "%COMPOSE_FILE%" up evolution-%WORKFLOW%
 )
 REM Show results
 echo.
 echo === Test Results ===
 if exist .kilo\logs\fitness-history.jsonl (
    echo Latest fitness scores:
    powershell -Command "Get-Content .kilo\logs\fitness-history.jsonl -Tail 4 | ForEach-Object { $j = $_ | ConvertFrom-Json; Write-Host ('  ' + $j.workflow + ': fitness=' + $j.fitness + ', time=' + $j.time_ms + 'ms, tokens=' + $j.tokens) }"
 ) else (
    echo No fitness history found
 )
 REM Cleanup
 echo.
 echo Cleaning up...
 docker-compose -f "%COMPOSE_FILE%" down -v 2>nul
 echo Done!
--- a/docker/evolution-test/run-evolution-test.sh
+++ b/docker/evolution-test/run-evolution-test.sh
@@ -0,0 +1,92 @@
 #!/bin/bash
 # Evolution Test Runner
 # Runs pipeline-judge tests in Docker containers with precise measurements.
 #
 # Usage: run-evolution-test.sh [feature|bugfix|refactor|security|all]
 #   The workflow defaults to "feature". Run from the repository root: the
 #   compose file and the fitness log are addressed relative to the current
 #   directory.
 # Requires docker and docker-compose; jq is needed only for the summary.
 set -euo pipefail

 readonly COMPOSE_FILE="docker/evolution-test/docker-compose.yml"
 readonly FITNESS_LOG=".kilo/logs/fitness-history.jsonl"
 readonly DOCKER_START_TIMEOUT=30 # seconds to wait for the daemon to come up
 readonly RUNNER_SERVICES=(
   evolution-feature
   evolution-bugfix
   evolution-refactor
   evolution-security
 )

 # Colors for output
 readonly RED='\033[0;31m'
 readonly GREEN='\033[0;32m'
 readonly YELLOW='\033[1;33m'
 readonly BLUE='\033[0;34m'
 readonly NC='\033[0m' # No Color

 # Tear down containers and volumes created by the compose project.
 # Best effort: a failing teardown must not mask the script's own exit status.
 cleanup() {
   echo ""
   echo -e "${BLUE}Cleaning up...${NC}"
   docker-compose -f "$COMPOSE_FILE" down -v 2>/dev/null || true
 }

 echo -e "${BLUE}=== Evolution Test Runner ===${NC}"
 echo ""

 # Validate the requested workflow up front so a typo fails before the build.
 WORKFLOW=${1:-feature}
 case "$WORKFLOW" in
   feature | bugfix | refactor | security | all) ;;
   *)
     echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}" >&2
     echo "Usage: $0 [feature|bugfix|refactor|security|all]" >&2
     exit 1
     ;;
 esac

 # Check Docker
 if ! command -v docker &> /dev/null; then
   echo -e "${RED}Error: Docker not found${NC}" >&2
   echo "Please install Docker Desktop first:" >&2
   echo "  winget install Docker.DockerDesktop" >&2
   echo "" >&2
   echo "Or use alternatives:" >&2
   echo "  1. Use WSL2 with Docker" >&2
   echo "  2. Run tests locally (less precise):" >&2
   echo "     bun test --reporter=json --coverage" >&2
   exit 1
 fi

 # Docker daemon check: try to launch Docker Desktop, then poll until the
 # daemon answers instead of sleeping blindly and carrying on regardless.
 if ! docker info &> /dev/null; then
   echo -e "${YELLOW}Warning: Docker daemon not running${NC}"
   echo "Starting Docker Desktop..."
   open -a "Docker" 2>/dev/null || start "Docker Desktop" 2>/dev/null || true
   waited=0
   until docker info &> /dev/null; do
     if ((waited >= DOCKER_START_TIMEOUT)); then
       echo -e "${RED}Error: Docker daemon still not running after ${DOCKER_START_TIMEOUT}s${NC}" >&2
       exit 1
     fi
     sleep 1
     waited=$((waited + 1))
   done
 fi

 # From here on containers may exist, so always clean up on exit.
 trap cleanup EXIT

 # Build evolution test container
 echo -e "${BLUE}Building evolution test container...${NC}"
 docker-compose -f "$COMPOSE_FILE" build

 # Run specific workflow test
 echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"
 if [[ "$WORKFLOW" == "all" ]]; then
   echo -e "${BLUE}Running ALL workflow tests in parallel...${NC}"
   # Run the four runners to completion first. depends_on only orders start-up,
   # so including the aggregator here would let it run before results exist.
   docker-compose -f "$COMPOSE_FILE" up "${RUNNER_SERVICES[@]}"
   # --no-deps: without it, compose would start the finished runners again.
   docker-compose -f "$COMPOSE_FILE" up --no-deps fitness-aggregator
 else
   docker-compose -f "$COMPOSE_FILE" up "evolution-${WORKFLOW}"
 fi

 # Parse results
 echo ""
 echo -e "${BLUE}=== Test Results ===${NC}"
 if [[ -f "$FITNESS_LOG" ]]; then
   echo -e "${GREEN}Latest fitness scores:${NC}"
   if ! command -v jq > /dev/null 2>&1; then
     echo -e "${YELLOW}jq not found; cannot summarize ${FITNESS_LOG}${NC}" >&2
   # A single jq pass reads every JSON value in the stream, so the final record
   # is still reported when the log has no trailing newline (a `while read`
   # loop would silently drop it).
   elif ! tail -n 4 "$FITNESS_LOG" \
     | jq -r '"  \(.workflow // ""): fitness=\(.fitness // ""), time=\(.time_ms // "")ms, tokens=\(.tokens // "")"'; then
     echo -e "${YELLOW}Could not parse ${FITNESS_LOG}${NC}" >&2
   fi
 else
   echo -e "${YELLOW}No fitness history found${NC}"
 fi

 # Cleanup (run explicitly so "Done!" is printed after it, then disarm the trap)
 cleanup
 trap - EXIT
 echo -e "${GREEN}Done!${NC}"
`@@ -1 +1 @@`
	`{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}`	{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}{"ts":"2026-04-06T00:32:00Z","issue":31,"workflow":"feature","fitness":0.52,"breakdown":{"test_pass_rate":0.45,"quality_gates_rate":0.80,"efficiency_score":0.44},"tokens":35000,"time_ms":170000,"tests_passed":0,"tests_total":5,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer","code-skeptic","performance-engineer","security-auditor","release-manager","evaluator","pipeline-judge"],"verdict":"MARGINAL","improvement_trigger":true}