feat: add Docker-based evolution testing with precise measurements

- Add docker/evolution-test/Dockerfile with bun, TypeScript
- Add docker/evolution-test/docker-compose.yml for parallel workflow testing
- Add run-evolution-test.sh and .bat scripts for cross-platform use
- Update pipeline-judge.md with Docker-first approach:
  - Millisecond precision timing (date +%s%3N)
  - 2 decimal places for test pass rate and coverage
  - Docker container for consistent test environment
  - Multiple workflow types (feature/bugfix/refactor/security)

Enables:
- Parallel testing with docker-compose
- Consistent environment across machines
- Precise fitness measurements (ms, 2 decimals)
- Multi-workflow testing in containers
This commit is contained in:
¨NW¨
2026-04-06 00:48:21 +01:00
parent fa68141d47
commit 1703247651
6 changed files with 314 additions and 15 deletions

View File

@@ -0,0 +1,25 @@
# Evolution Test Container
# Used for testing pipeline-judge fitness scoring with precise measurements
FROM oven/bun:1 AS base

WORKDIR /app

# Install TypeScript and testing tools globally
RUN bun add -g typescript @types/node

# Copy project files
COPY . /app/

# Install dependencies
RUN bun install

# Create logs directory
RUN mkdir -p .kilo/logs

# Health check: only verify that the Bun runtime responds.
# Running the whole test suite here would start an extra concurrent test run
# every 30s (skewing the timing measurements taken by the main process) and
# would mark the container unhealthy whenever the suite needs more than the
# 10s timeout.
HEALTHCHECK --interval=30s --timeout=10s \
    CMD bun --version || exit 1

# Default command - run tests with precise timing
# NOTE(review): confirm that the Bun release pulled by oven/bun:1 accepts
# --reporter=json for `bun test`.
CMD ["bun", "test", "--reporter=json"]

View File

@@ -0,0 +1,88 @@
---
# Evolution Test Containers
# Run multiple workflow tests in parallel
#
# Every evolution-* service builds the same Dockerfile and runs the test
# suite with coverage; the services differ only in the WORKFLOW_TYPE and
# budget values passed through the environment.
version: '3.8'

services:
  # Evolution test runner for feature workflow
  evolution-feature:
    build:
      context: ../..
      dockerfile: docker/evolution-test/Dockerfile
    container_name: evolution-feature
    environment:
      - WORKFLOW_TYPE=feature
      - TOKEN_BUDGET=50000
      - TIME_BUDGET=300
      - MIN_COVERAGE=80
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
      - ../../src:/app/src
    command: bun test --reporter=json --coverage

  # Evolution test runner for bugfix workflow
  evolution-bugfix:
    build:
      context: ../..
      dockerfile: docker/evolution-test/Dockerfile
    container_name: evolution-bugfix
    environment:
      - WORKFLOW_TYPE=bugfix
      - TOKEN_BUDGET=20000
      - TIME_BUDGET=120
      - MIN_COVERAGE=90
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
      - ../../src:/app/src
    command: bun test --reporter=json --coverage

  # Evolution test runner for refactor workflow
  evolution-refactor:
    build:
      context: ../..
      dockerfile: docker/evolution-test/Dockerfile
    container_name: evolution-refactor
    environment:
      - WORKFLOW_TYPE=refactor
      - TOKEN_BUDGET=40000
      - TIME_BUDGET=240
      - MIN_COVERAGE=95
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
      - ../../src:/app/src
    command: bun test --reporter=json --coverage

  # Evolution test runner for security workflow
  evolution-security:
    build:
      context: ../..
      dockerfile: docker/evolution-test/Dockerfile
    container_name: evolution-security
    environment:
      - WORKFLOW_TYPE=security
      - TOKEN_BUDGET=30000
      - TIME_BUDGET=180
      - MIN_COVERAGE=80
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
      - ../../src:/app/src
    command: bun test --reporter=json --coverage

  # Fitness aggregator - collects results from all containers
  fitness-aggregator:
    image: oven/bun:1
    container_name: fitness-aggregator
    # Short-form depends_on only orders container start-up; it does not wait
    # for the test runs to finish, so the history file may not exist yet when
    # this service starts. The command below therefore checks for the file
    # instead of overwriting fitness-latest.jsonl with empty output.
    depends_on:
      - evolution-feature
      - evolution-bugfix
      - evolution-refactor
      - evolution-security
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
    working_dir: /app
    # Exec form with a block scalar: the script is handed to sh verbatim
    # rather than being re-split from a single quoted string.
    command:
      - sh
      - "-c"
      - |
        echo 'Aggregating fitness scores...'
        if [ -f .kilo/logs/fitness-history.jsonl ]; then
          tail -n 4 .kilo/logs/fitness-history.jsonl > .kilo/logs/fitness-latest.jsonl
          echo 'Fitness aggregation complete.'
        else
          echo 'No fitness history found; nothing to aggregate.'
        fi

View File

@@ -0,0 +1,65 @@
@echo off
REM Evolution Test Runner for Windows
REM Runs pipeline-judge tests with precise measurements
REM
REM Takes one optional argument naming the workflow to run:
REM   feature (default), bugfix, refactor, security, or all
REM Exits with code 1 when Docker is unavailable, the workflow name is
REM unknown, or the container build fails.
setlocal enabledelayedexpansion

REM Compose file shared by every docker-compose call below
set "COMPOSE_YML=docker/evolution-test/docker-compose.yml"

echo === Evolution Test Runner ===
echo.

REM Check Docker
where docker >nul 2>&1
if %errorlevel% neq 0 (
    echo Error: Docker not found
    echo Please install Docker Desktop first:
    echo winget install Docker.DockerDesktop
    echo.
    echo Or run tests locally ^(less precise^):
    echo bun test --reporter=json --coverage
    exit /b 1
)

REM Check Docker daemon
docker info >nul 2>&1
if %errorlevel% neq 0 (
    echo Warning: Docker daemon not running
    echo Please start Docker Desktop and try again
    exit /b 1
)

REM Get workflow type; %~1 strips surrounding quotes from the argument
set "WORKFLOW=%~1"
if not defined WORKFLOW set "WORKFLOW=feature"

REM Reject unknown workflow names up front instead of letting docker-compose
REM fail later on a non-existent service
set "WORKFLOW_OK="
for %%W in (feature bugfix refactor security all) do (
    if "!WORKFLOW!"=="%%W" set "WORKFLOW_OK=1"
)
if not defined WORKFLOW_OK (
    echo Unknown workflow: !WORKFLOW!
    echo Usage: %~nx0 [feature^|bugfix^|refactor^|security^|all]
    exit /b 1
)

echo Running evolution test for: %WORKFLOW%
echo.

REM Build container; stop here if the build fails so no stale image is run
echo Building evolution test container...
docker-compose -f "%COMPOSE_YML%" build
if errorlevel 1 (
    echo Error: failed to build evolution test container
    exit /b 1
)

REM Run test
if "%WORKFLOW%"=="all" (
    echo Running ALL workflow tests in parallel...
    docker-compose -f "%COMPOSE_YML%" up
    REM Re-run the aggregator once the test containers have finished
    docker-compose -f "%COMPOSE_YML%" up fitness-aggregator
) else (
    docker-compose -f "%COMPOSE_YML%" up evolution-%WORKFLOW%
)

REM Show results
echo.
echo === Test Results ===
if exist .kilo\logs\fitness-history.jsonl (
    echo Latest fitness scores:
    powershell -Command "Get-Content .kilo\logs\fitness-history.jsonl -Tail 4 | ForEach-Object { $j = $_ | ConvertFrom-Json; Write-Host (' ' + $j.workflow + ': fitness=' + $j.fitness + ', time=' + $j.time_ms + 'ms, tokens=' + $j.tokens) }"
) else (
    echo No fitness history found
)

REM Cleanup
echo.
echo Cleaning up...
docker-compose -f "%COMPOSE_YML%" down -v 2>nul
echo Done!

View File

@@ -0,0 +1,92 @@
#!/bin/bash
# Evolution Test Runner
# Runs pipeline-judge tests with precise measurements
#
# Usage: pass one optional argument naming the workflow to run:
#   feature (default) | bugfix | refactor | security | all
#
# Environment:
#   DOCKER_START_TIMEOUT  seconds to wait for the Docker daemon after trying
#                         to start it (default: 60)
#
# Exits non-zero when Docker is missing, the daemon never becomes reachable,
# the workflow name is unknown, or a docker-compose step fails.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

COMPOSE_YML="docker/evolution-test/docker-compose.yml"
HISTORY_FILE=".kilo/logs/fitness-history.jsonl"
DOCKER_START_TIMEOUT="${DOCKER_START_TIMEOUT:-60}"

# Run docker-compose against the evolution-test compose file.
compose() {
    docker-compose -f "$COMPOSE_YML" "$@"
}

# Tear down containers and volumes; best effort, never fails the script.
cleanup() {
    echo ""
    echo -e "${BLUE}Cleaning up...${NC}"
    compose down -v 2>/dev/null || true
}

echo -e "${BLUE}=== Evolution Test Runner ===${NC}"
echo ""

# Validate the workflow name before doing any slow work (build, start-up)
WORKFLOW=${1:-feature}
case "$WORKFLOW" in
    feature|bugfix|refactor|security|all)
        ;;
    *)
        echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}"
        echo "Usage: $0 [feature|bugfix|refactor|security|all]"
        exit 1
        ;;
esac

# Check Docker
if ! command -v docker &> /dev/null; then
    echo -e "${RED}Error: Docker not found${NC}"
    echo "Please install Docker Desktop first:"
    echo " winget install Docker.DockerDesktop"
    echo ""
    echo "Or use alternatives:"
    echo " 1. Use WSL2 with Docker"
    echo " 2. Run tests locally (less precise):"
    echo " bun test --reporter=json --coverage"
    exit 1
fi

# Docker daemon check
if ! docker info &> /dev/null; then
    echo -e "${YELLOW}Warning: Docker daemon not running${NC}"
    echo "Starting Docker Desktop..."
    open -a "Docker" 2>/dev/null || start "Docker Desktop" 2>/dev/null || true

    # Poll until the daemon answers instead of sleeping blindly, and give up
    # with a clear error rather than failing later inside the build.
    waited=0
    until docker info &> /dev/null; do
        if [ "$waited" -ge "$DOCKER_START_TIMEOUT" ]; then
            echo -e "${RED}Error: Docker daemon did not start within ${DOCKER_START_TIMEOUT}s${NC}"
            exit 1
        fi
        sleep 2
        waited=$((waited + 2))
    done
fi

# Build evolution test container
echo -e "${BLUE}Building evolution test container...${NC}"
compose build

# From here on containers may exist, so make sure they are removed even when
# set -e aborts the script part-way through.
trap cleanup EXIT

# Run specific workflow test
echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"
if [ "$WORKFLOW" = "all" ]; then
    echo -e "${BLUE}Running ALL workflow tests in parallel...${NC}"
    compose up
    # Re-run the aggregator once the test containers have finished
    compose up fitness-aggregator
else
    compose up "evolution-${WORKFLOW}"
fi

# Parse results
echo ""
echo -e "${BLUE}=== Test Results ===${NC}"
if [ -f "$HISTORY_FILE" ]; then
    echo -e "${GREEN}Latest fitness scores:${NC}"
    if command -v jq &> /dev/null; then
        # One jq process formats all lines; missing fields print as empty.
        tail -n 4 "$HISTORY_FILE" \
            | jq -r '" \(.workflow // ""): fitness=\(.fitness // ""), time=\(.time_ms // "")ms, tokens=\(.tokens // "")"' \
            || echo -e "${YELLOW}Could not parse ${HISTORY_FILE}${NC}"
    else
        # Without jq, fall back to the raw JSON lines rather than aborting.
        echo -e "${YELLOW}jq not found; showing raw entries${NC}"
        tail -n 4 "$HISTORY_FILE"
    fi
else
    echo -e "${YELLOW}No fitness history found${NC}"
fi

# Cleanup (normal path); drop the trap first so it does not run twice
trap - EXIT
cleanup
echo -e "${GREEN}Done!${NC}"