feat: add Docker-based evolution testing with precise measurements

- Add docker/evolution-test/Dockerfile with bun, TypeScript
- Add docker/evolution-test/docker-compose.yml for parallel workflow testing
- Add run-evolution-test.sh and .bat scripts for cross-platform use
- Update pipeline-judge.md with Docker-first approach:
  - Millisecond precision timing (date +%s%3N)
  - 2 decimal places for test pass rate and coverage
  - Docker container for consistent test environment
  - Multiple workflow types (feature/bugfix/refactor/security)

Enables:
- Parallel testing with docker-compose
- Consistent environment across machines
- Precise fitness measurements (ms, 2 decimals)
- Multi-workflow testing in containers
2026-04-06 00:48:21 +01:00
parent fa68141d47
commit 1703247651
6 changed files with 314 additions and 15 deletions
--- a/.kilo/agents/pipeline-judge.md
+++ b/.kilo/agents/pipeline-judge.md
@@ -49,26 +49,82 @@ where:

 ## Execution Protocol

-### Step 1: Collect Metrics
+### Step 1: Collect Metrics (Docker-first for precision)

 ```bash
-# Run test suite
-bun test --reporter=json > /tmp/test-results.json 2>&1
-bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1
+# Prefer Docker for consistent measurements with millisecond precision
+RESULTS=/tmp/test-results.json
+E2E_RESULTS=/tmp/test-results-e2e.json
+# Compose services are named evolution-<workflow>; default to the feature runner.
+SERVICE="evolution-${WORKFLOW_TYPE:-feature}"
+
+# Epoch time in milliseconds. GNU date expands %3N; BSD/macOS date emits a
+# literal "3N", so fall back to whole seconds rather than break the arithmetic.
+now_ms() {
+  local ms
+  ms=$(date +%s%3N)
+  case "$ms" in
+    '' | *[!0-9]*) ms=$(( $(date +%s) * 1000 )) ;;
+  esac
+  echo "$ms"
+}
+
+if command -v docker &> /dev/null && docker info &> /dev/null; then
+  echo "Using Docker container for precise measurements..."
+
+  # Run tests in container with millisecond timing
+  START_MS=$(now_ms)
+  docker-compose -f docker/evolution-test/docker-compose.yml run --rm "$SERVICE" \
+    bun test --reporter=json --coverage 2>&1 | tee "$RESULTS"
+  END_MS=$(now_ms)
+else
+  echo "Running locally (Docker not available, less precise)..."
+
+  START_MS=$(now_ms)
+  bun test --reporter=json --coverage > "$RESULTS" 2>&1
+  END_MS=$(now_ms)
+fi
+
+TIME_MS=$((END_MS - START_MS))
+echo "Execution time: ${TIME_MS}ms"

-# Count results
-TOTAL=$(jq '.numTotalTests' /tmp/test-results.json)
-PASSED=$(jq '.numPassedTests' /tmp/test-results.json)
-FAILED=$(jq '.numFailedTests' /tmp/test-results.json)
+# Run additional test suites
+# Written to its own file: appending a second JSON document to $RESULTS makes
+# each jq query below emit two values, which breaks the integer comparison.
+bun test:e2e --reporter=json > "$E2E_RESULTS" 2>&1 || true

-# Check build
+# Parse test results with 2 decimal precision
+# Sum one counter across both result files; a missing or unparsable file counts as 0.
+sum_field() {
+  local field=$1 total=0 n f
+  for f in "$RESULTS" "$E2E_RESULTS"; do
+    n=$(jq -s --arg k "$field" 'map(.[$k] // 0) | add // 0' "$f" 2>/dev/null) || n=0
+    total=$((total + ${n:-0}))
+  done
+  echo "$total"
+}
+TOTAL=$(sum_field numTotalTests)
+PASSED=$(sum_field numPassedTests)
+FAILED=$(sum_field numFailedTests)
+SKIPPED=$(sum_field numSkippedTests)
+
+# Calculate pass rate with 2 decimals
+if [ "$TOTAL" -gt 0 ]; then
+  PASS_RATE=$(awk -v passed="$PASSED" -v total="$TOTAL" 'BEGIN {printf "%.2f", passed / total * 100}')
+else
+  PASS_RATE="0.00"
+fi
+
+# Check quality gates
 bun run build 2>&1 && BUILD_OK=true || BUILD_OK=false
-
-# Check lint
-bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
-
-# Check types
+bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
 bun run typecheck 2>&1 && TYPES_OK=true || TYPES_OK=false
+
+# Get coverage with 2 decimal precision
+# awk prints the 0.00 fallback itself: a trailing "|| echo" never fires because the
+# pipeline's exit status is awk's, which is 0 even when no "All files" row exists.
+COVERAGE=$(bun test --coverage 2>&1 | awk '/All files/ && !found {printf "%.2f", $4; found = 1} END {if (!found) printf "0.00"}')
+COVERAGE_OK=$(awk -v cov="$COVERAGE" 'BEGIN {print (cov + 0 >= 80) ? 1 : 0}')
 ```

 ### Step 2: Read Pipeline Log
--- a/.kilo/logs/fitness-history.jsonl
+++ b/.kilo/logs/fitness-history.jsonl
@@ -1 +1,2 @@
-{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
+{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
+{"ts":"2026-04-06T00:32:00Z","issue":31,"workflow":"feature","fitness":0.52,"breakdown":{"test_pass_rate":0.45,"quality_gates_rate":0.80,"efficiency_score":0.44},"tokens":35000,"time_ms":170000,"tests_passed":0,"tests_total":5,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer","code-skeptic","performance-engineer","security-auditor","release-manager","evaluator","pipeline-judge"],"verdict":"MARGINAL","improvement_trigger":true}
--- a/docker/evolution-test/Dockerfile
+++ b/docker/evolution-test/Dockerfile
@@ -0,0 +1,31 @@
+# Evolution Test Container
+# Used for testing pipeline-judge fitness scoring with precise measurements
+#
+# The compose file builds this image with the repository root as context
+# (context: ../..), so the COPY below brings the whole project into /app.
+
+FROM oven/bun:1 AS base
+
+WORKDIR /app
+
+# Install TypeScript and testing tools
+RUN bun add -g typescript @types/node
+
+# Copy project files
+COPY . /app/
+
+# Install dependencies
+RUN bun install
+
+# Create logs directory
+RUN mkdir -p .kilo/logs
+
+# Health check
+# Only verifies that the bun runtime responds. Probing with the full test suite
+# every 30s would compete with the real test run for CPU and skew the timing
+# measurements, and a complete run may not fit inside the 10s timeout.
+HEALTHCHECK --interval=30s --timeout=10s \
+  CMD bun --version || exit 1
+
+# Default command - run tests with precise timing
+CMD ["bun", "test", "--reporter=json"]
--- a/docker/evolution-test/docker-compose.yml
+++ b/docker/evolution-test/docker-compose.yml
@@ -0,0 +1,77 @@
+# Evolution Test Containers
+# Run multiple workflow tests in parallel
+
+version: '3.8'
+
+# Settings shared by every workflow runner. Each service merges this mapping
+# with `<<` and adds only its own container name and budget environment.
+x-evolution-runner: &evolution-runner
+  build:
+    context: ../..
+    dockerfile: docker/evolution-test/Dockerfile
+  volumes:
+    - ../../.kilo/logs:/app/.kilo/logs
+    - ../../src:/app/src
+  command: bun test --reporter=json --coverage
+
+services:
+  # Evolution test runner for feature workflow
+  evolution-feature:
+    <<: *evolution-runner
+    container_name: evolution-feature
+    environment:
+      - WORKFLOW_TYPE=feature
+      - TOKEN_BUDGET=50000
+      - TIME_BUDGET=300
+      - MIN_COVERAGE=80
+
+  # Evolution test runner for bugfix workflow
+  evolution-bugfix:
+    <<: *evolution-runner
+    container_name: evolution-bugfix
+    environment:
+      - WORKFLOW_TYPE=bugfix
+      - TOKEN_BUDGET=20000
+      - TIME_BUDGET=120
+      - MIN_COVERAGE=90
+
+  # Evolution test runner for refactor workflow
+  evolution-refactor:
+    <<: *evolution-runner
+    container_name: evolution-refactor
+    environment:
+      - WORKFLOW_TYPE=refactor
+      - TOKEN_BUDGET=40000
+      - TIME_BUDGET=240
+      - MIN_COVERAGE=95
+
+  # Evolution test runner for security workflow
+  evolution-security:
+    <<: *evolution-runner
+    container_name: evolution-security
+    environment:
+      - WORKFLOW_TYPE=security
+      - TOKEN_BUDGET=30000
+      - TIME_BUDGET=180
+      - MIN_COVERAGE=80
+
+  # Fitness aggregator - collects results from all containers
+  # NOTE(review): short-form depends_on only orders container start-up; it does
+  # not wait for the runners to finish, so this may read the history too early.
+  fitness-aggregator:
+    image: oven/bun:1
+    container_name: fitness-aggregator
+    depends_on:
+      - evolution-feature
+      - evolution-bugfix
+      - evolution-refactor
+      - evolution-security
+    volumes:
+      - ../../.kilo/logs:/app/.kilo/logs
+    working_dir: /app
+    command: |
+      sh -c "
+        echo 'Aggregating fitness scores...'
+        tail -n 4 .kilo/logs/fitness-history.jsonl > .kilo/logs/fitness-latest.jsonl
+        echo 'Fitness aggregation complete.'
+      "
--- a/docker/evolution-test/run-evolution-test.bat
+++ b/docker/evolution-test/run-evolution-test.bat
@@ -0,0 +1,92 @@
+@echo off
+REM Evolution Test Runner for Windows
+REM Runs pipeline-judge tests with precise measurements
+REM
+REM Usage: run-evolution-test.bat [workflow]
+REM   workflow is one of feature, bugfix, refactor, security or all
+REM   and defaults to feature.
+
+REM Delayed expansion is deliberately left off: nothing here uses it, and
+REM when enabled it strips the "!" from literal text such as "Done!".
+setlocal
+
+REM Resolve paths from this script's own folder so the runner works from any
+REM current directory. The compose file sits next to this script and the
+REM logs live two levels up, under the repository root.
+set "COMPOSE_FILE=%~dp0docker-compose.yml"
+set "HISTORY_FILE=%~dp0..\..\.kilo\logs\fitness-history.jsonl"
+
+echo === Evolution Test Runner ===
+echo.
+
+REM Check Docker
+where docker >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Error: Docker not found
+    echo Please install Docker Desktop first:
+    echo   winget install Docker.DockerDesktop
+    echo.
+    echo Or run tests locally ^(less precise^):
+    echo   bun test --reporter=json --coverage
+    exit /b 1
+)
+
+REM Check Docker daemon
+docker info >nul 2>&1
+if %errorlevel% neq 0 (
+    echo Warning: Docker daemon not running
+    echo Please start Docker Desktop and try again
+    exit /b 1
+)
+
+REM Get workflow type
+set "WORKFLOW=%~1"
+if "%WORKFLOW%"=="" set "WORKFLOW=feature"
+
+REM Reject unknown workflow names before spending time on the image build
+set "VALID="
+for %%W in (feature bugfix refactor security all) do if "%WORKFLOW%"=="%%W" set "VALID=1"
+if not defined VALID (
+    echo Unknown workflow: %WORKFLOW%
+    echo Usage: %~nx0 [feature^|bugfix^|refactor^|security^|all]
+    exit /b 1
+)
+
+echo Running evolution test for: %WORKFLOW%
+echo.
+
+REM Build container
+echo Building evolution test container...
+docker-compose -f "%COMPOSE_FILE%" build
+if errorlevel 1 (
+    echo Error: container build failed
+    exit /b 1
+)
+
+REM Run test
+if "%WORKFLOW%"=="all" (
+    echo Running ALL workflow tests in parallel...
+    REM Start only the four runners; a bare up would also launch the aggregator at once
+    docker-compose -f "%COMPOSE_FILE%" up evolution-feature evolution-bugfix evolution-refactor evolution-security
+    REM --no-deps aggregates the results without re-running the runners
+    docker-compose -f "%COMPOSE_FILE%" up --no-deps fitness-aggregator
+) else (
+    docker-compose -f "%COMPOSE_FILE%" up evolution-%WORKFLOW%
+)
+
+REM Show results
+echo.
+echo === Test Results ===
+if exist "%HISTORY_FILE%" (
+    echo Latest fitness scores:
+    powershell -Command "Get-Content -LiteralPath '%HISTORY_FILE%' -Tail 4 | ForEach-Object { $j = $_ | ConvertFrom-Json; Write-Host ('  ' + $j.workflow + ': fitness=' + $j.fitness + ', time=' + $j.time_ms + 'ms, tokens=' + $j.tokens) }"
+) else (
+    echo No fitness history found
+)
+
+REM Cleanup
+echo.
+echo Cleaning up...
+docker-compose -f "%COMPOSE_FILE%" down -v 2>nul
+
+echo Done!
--- a/docker/evolution-test/run-evolution-test.sh
+++ b/docker/evolution-test/run-evolution-test.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Evolution Test Runner
+# Runs pipeline-judge tests with precise measurements
+#
+# Usage: run-evolution-test.sh [feature|bugfix|refactor|security|all]
+#   Defaults to "feature". Paths are resolved from this script's location,
+#   so it can be started from any directory. Requires docker-compose; jq is
+#   used to pretty-print the results.
+
+set -euo pipefail
+
+# Colors for output
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly BLUE='\033[0;34m'
+readonly NC='\033[0m' # No Color
+
+# This script sits next to docker-compose.yml in docker/evolution-test/,
+# two levels below the repository root that holds .kilo/logs.
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+readonly SCRIPT_DIR
+readonly COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.yml"
+readonly HISTORY_FILE="${SCRIPT_DIR}/../../.kilo/logs/fitness-history.jsonl"
+readonly RUNNERS=(evolution-feature evolution-bugfix evolution-refactor evolution-security)
+
+# Run docker-compose against the evolution-test compose file.
+compose() {
+    docker-compose -f "$COMPOSE_FILE" "$@"
+}
+
+# Stop and remove the containers and volumes; best effort, errors are ignored.
+cleanup() {
+    echo ""
+    echo -e "${BLUE}Cleaning up...${NC}"
+    compose down -v 2>/dev/null || true
+}
+
+echo -e "${BLUE}=== Evolution Test Runner ===${NC}"
+echo ""
+
+# Validate the workflow first so a typo fails before the slow image build
+WORKFLOW=${1:-feature}
+case "$WORKFLOW" in
+    feature | bugfix | refactor | security | all) ;;
+    *)
+        echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}" >&2
+        echo "Usage: $0 [feature|bugfix|refactor|security|all]" >&2
+        exit 1
+        ;;
+esac
+
+# Check Docker
+if ! command -v docker &> /dev/null; then
+    echo -e "${RED}Error: Docker not found${NC}"
+    echo "Please install Docker Desktop first:"
+    echo "  winget install Docker.DockerDesktop"
+    echo ""
+    echo "Or use alternatives:"
+    echo "  1. Use WSL2 with Docker"
+    echo "  2. Run tests locally (less precise):"
+    echo "     bun test --reporter=json --coverage"
+    exit 1
+fi
+
+# Docker daemon check
+if ! docker info &> /dev/null; then
+    echo -e "${YELLOW}Warning: Docker daemon not running${NC}"
+    echo "Starting Docker Desktop..."
+    open -a "Docker" 2>/dev/null || start "Docker Desktop" 2>/dev/null || true
+    # Poll for up to 30s instead of sleeping blindly, then fail with a clear
+    # message rather than letting the build die on a connection error.
+    for _ in {1..15}; do
+        if docker info &> /dev/null; then
+            break
+        fi
+        sleep 2
+    done
+    if ! docker info &> /dev/null; then
+        echo -e "${RED}Error: Docker daemon did not start${NC}" >&2
+        exit 1
+    fi
+fi
+
+# Containers may exist from here on, so tear them down on every exit path,
+# including a compose command failing under `set -e`.
+trap cleanup EXIT
+
+# Build evolution test container
+echo -e "${BLUE}Building evolution test container...${NC}"
+compose build
+
+# Run specific workflow test
+echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"
+if [[ "$WORKFLOW" == "all" ]]; then
+    echo -e "${BLUE}Running ALL workflow tests in parallel...${NC}"
+    # Start only the runners; a bare `up` would also launch the aggregator at once.
+    compose up "${RUNNERS[@]}"
+    # --no-deps: aggregate the results without re-running the runners.
+    compose up --no-deps fitness-aggregator
+else
+    compose up "evolution-${WORKFLOW}"
+fi
+
+# Parse results
+echo ""
+echo -e "${BLUE}=== Test Results ===${NC}"
+if [[ -f "$HISTORY_FILE" ]]; then
+    echo -e "${GREEN}Latest fitness scores:${NC}"
+    # A single jq pass formats every record; unlike a `while read` loop it also
+    # prints a final record that is not terminated by a newline.
+    tail -n 4 -- "$HISTORY_FILE" \
+        | jq -r '"  \(.workflow // ""): fitness=\(.fitness // ""), time=\(.time_ms // "")ms, tokens=\(.tokens // "")"' \
+        || echo -e "${YELLOW}Could not parse fitness history (is jq installed?)${NC}" >&2
+else
+    echo -e "${YELLOW}No fitness history found${NC}"
+fi
+
+# Cleanup (run explicitly so its output precedes the final message)
+trap - EXIT
+cleanup
+
+echo -e "${GREEN}Done!${NC}"