feat: add Docker-based evolution testing with precise measurements
- Add docker/evolution-test/Dockerfile with bun, TypeScript
- Add docker/evolution-test/docker-compose.yml for parallel workflow testing
- Add run-evolution-test.sh and .bat scripts for cross-platform use
- Update pipeline-judge.md with a Docker-first approach:
  - Millisecond precision timing (date +%s%3N)
  - 2 decimal places for test pass rate and coverage
  - Docker container for a consistent test environment
  - Multiple workflow types (feature/bugfix/refactor/security)

Enables:
- Parallel testing with docker-compose
- Consistent environment across machines
- Precise fitness measurements (ms, 2 decimals)
- Multi-workflow testing in containers
This commit is contained in:
@@ -49,26 +49,55 @@ where:
|
|||||||
|
|
||||||
## Execution Protocol
|
## Execution Protocol
|
||||||
|
|
||||||
### Step 1: Collect Metrics
|
### Step 1: Collect Metrics (Docker-first for precision)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Run test suite
|
# Prefer Docker for consistent measurements with millisecond precision
|
||||||
bun test --reporter=json > /tmp/test-results.json 2>&1
|
if command -v docker &> /dev/null && docker info &> /dev/null; then
|
||||||
bun test:e2e --reporter=json >> /tmp/test-results.json 2>&1
|
echo "Using Docker container for precise measurements..."
|
||||||
|
|
||||||
|
# Run tests in container with millisecond timing
|
||||||
|
# Time the containerised run in milliseconds.
# NOTE: `date +%s%3N` relies on GNU date; BSD/macOS date has no %N.
START_MS=$(date +%s%3N)
# The compose file defines evolution-feature/-bugfix/-refactor/-security;
# there is no "evolution-test" service, so run the default (feature) runner.
docker-compose -f docker/evolution-test/docker-compose.yml run --rm evolution-feature \
  bun test --reporter=json --coverage 2>&1 | tee /tmp/test-results.json
END_MS=$(date +%s%3N)

TIME_MS=$((END_MS - START_MS))
echo "Execution time: ${TIME_MS}ms"
|
||||||
|
else
|
||||||
|
echo "Running locally (Docker not available, less precise)..."
|
||||||
|
|
||||||
|
START_MS=$(date +%s%3N)
|
||||||
|
bun test --reporter=json --coverage > /tmp/test-results.json 2>&1
|
||||||
|
END_MS=$(date +%s%3N)
|
||||||
|
|
||||||
|
TIME_MS=$((END_MS - START_MS))
|
||||||
|
fi
|
||||||
|
|
||||||
# Count results
|
# Run additional test suites
|
||||||
TOTAL=$(jq '.numTotalTests' /tmp/test-results.json)
|
# Keep e2e output in its own file: appending a second JSON document to
# /tmp/test-results.json makes the jq counters below return several values.
bun test:e2e --reporter=json > /tmp/test-results-e2e.json 2>&1 || true
|
||||||
PASSED=$(jq '.numPassedTests' /tmp/test-results.json)
|
|
||||||
FAILED=$(jq '.numFailedTests' /tmp/test-results.json)
|
|
||||||
|
|
||||||
# Check build
|
# Parse test results with 2 decimal precision
|
||||||
|
# Read one integer counter from the JSON test report.
# Arguments: $1 - jq path of the counter (e.g. '.numTotalTests')
# Outputs:   the counter on stdout, or 0 when jq fails, the file is missing,
#            or the value is not a plain non-negative integer (the report may
#            contain stderr noise because output was captured with 2>&1).
RESULTS_FILE=/tmp/test-results.json
read_count() {
  local value
  value=$(jq -r "$1 // 0" "$RESULTS_FILE" 2>/dev/null | head -n 1)
  [[ "$value" =~ ^[0-9]+$ ]] || value=0
  printf '%s\n' "$value"
}

TOTAL=$(read_count '.numTotalTests')
PASSED=$(read_count '.numPassedTests')
FAILED=$(read_count '.numFailedTests')
SKIPPED=$(read_count '.numSkippedTests')

# Calculate pass rate with 2 decimals; values are passed with -v instead of
# being spliced into the awk program text.
if [ "$TOTAL" -gt 0 ]; then
  PASS_RATE=$(awk -v passed="$PASSED" -v total="$TOTAL" \
    'BEGIN {printf "%.2f", passed / total * 100}')
else
  PASS_RATE="0.00"
fi
|
||||||
|
|
||||||
|
# Check quality gates
|
||||||
bun run build 2>&1 && BUILD_OK=true || BUILD_OK=false
|
bun run build 2>&1 && BUILD_OK=true || BUILD_OK=false
|
||||||
|
bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
|
||||||
# Check lint
|
|
||||||
bun run lint 2>&1 && LINT_OK=true || LINT_OK=false
|
|
||||||
|
|
||||||
# Check types
|
|
||||||
bun run typecheck 2>&1 && TYPES_OK=true || TYPES_OK=false
|
bun run typecheck 2>&1 && TYPES_OK=true || TYPES_OK=false
|
||||||
|
|
||||||
|
# Get coverage with 2 decimal precision.
# awk exits 0 even when grep matched nothing, so a trailing `|| echo` fallback
# never fires; default an empty result explicitly instead.
COVERAGE=$(bun test --coverage 2>&1 | grep 'All files' | awk '{printf "%.2f", $4; exit}')
COVERAGE=${COVERAGE:-0.00}
# 1 when coverage meets the 80% gate, otherwise 0.
COVERAGE_OK=$(awk -v cov="$COVERAGE" 'BEGIN {print (cov >= 80) ? 1 : 0}')
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 2: Read Pipeline Log
|
### Step 2: Read Pipeline Log
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
|
{"ts":"2026-04-04T02:30:00Z","issue":5,"workflow":"feature","fitness":0.85,"breakdown":{"test_pass_rate":0.95,"quality_gates_rate":0.80,"efficiency_score":0.78},"tokens":38400,"time_ms":245000,"tests_passed":9,"tests_total":10,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer"],"verdict":"PASS"}
{"ts":"2026-04-06T00:32:00Z","issue":31,"workflow":"feature","fitness":0.52,"breakdown":{"test_pass_rate":0.45,"quality_gates_rate":0.80,"efficiency_score":0.44},"tokens":35000,"time_ms":170000,"tests_passed":0,"tests_total":5,"agents":["requirement-refiner","history-miner","system-analyst","sdet-engineer","lead-developer","code-skeptic","performance-engineer","security-auditor","release-manager","evaluator","pipeline-judge"],"verdict":"MARGINAL","improvement_trigger":true}
|
||||||
|
|||||||
25
docker/evolution-test/Dockerfile
Normal file
25
docker/evolution-test/Dockerfile
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Evolution Test Container
# Used for testing pipeline-judge fitness scoring with precise measurements

FROM oven/bun:1 AS base

WORKDIR /app

# Install TypeScript and testing tools
RUN bun add -g typescript @types/node

# Copy project files
COPY . /app/

# Install dependencies
RUN bun install

# Create logs directory
RUN mkdir -p .kilo/logs

# Health check: only confirm that the bun runtime responds.
# Re-running the whole test suite as the probe (every 30s, 10s timeout) would
# compete with the timed test run and fail whenever the suite takes over 10s.
HEALTHCHECK --interval=30s --timeout=10s \
  CMD bun --version || exit 1

# Default command - run tests with precise timing
CMD ["bun", "test", "--reporter=json"]
|
||||||
88
docker/evolution-test/docker-compose.yml
Normal file
88
docker/evolution-test/docker-compose.yml
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
# Evolution Test Containers
# Run multiple workflow tests in parallel

version: '3.8'

# Settings shared by every workflow runner. Each runner service merges this
# mapping with the YAML merge key, so only its container name and budgets
# are spelled out per service.
x-evolution-runner: &evolution-runner
  build:
    context: ../..
    dockerfile: docker/evolution-test/Dockerfile
  volumes:
    - ../../.kilo/logs:/app/.kilo/logs
    - ../../src:/app/src
  command: bun test --reporter=json --coverage

services:
  # Evolution test runner for feature workflow
  evolution-feature:
    <<: *evolution-runner
    container_name: evolution-feature
    environment:
      - WORKFLOW_TYPE=feature
      - TOKEN_BUDGET=50000
      - TIME_BUDGET=300
      - MIN_COVERAGE=80

  # Evolution test runner for bugfix workflow
  evolution-bugfix:
    <<: *evolution-runner
    container_name: evolution-bugfix
    environment:
      - WORKFLOW_TYPE=bugfix
      - TOKEN_BUDGET=20000
      - TIME_BUDGET=120
      - MIN_COVERAGE=90

  # Evolution test runner for refactor workflow
  evolution-refactor:
    <<: *evolution-runner
    container_name: evolution-refactor
    environment:
      - WORKFLOW_TYPE=refactor
      - TOKEN_BUDGET=40000
      - TIME_BUDGET=240
      - MIN_COVERAGE=95

  # Evolution test runner for security workflow
  evolution-security:
    <<: *evolution-runner
    container_name: evolution-security
    environment:
      - WORKFLOW_TYPE=security
      - TOKEN_BUDGET=30000
      - TIME_BUDGET=180
      - MIN_COVERAGE=80

  # Fitness aggregator - collects results from all containers
  # NOTE(review): short-form depends_on only orders container start-up; it
  # does not wait for the runners to finish - confirm this is intended.
  fitness-aggregator:
    image: oven/bun:1
    container_name: fitness-aggregator
    depends_on:
      - evolution-feature
      - evolution-bugfix
      - evolution-refactor
      - evolution-security
    volumes:
      - ../../.kilo/logs:/app/.kilo/logs
    working_dir: /app
    command: |
      sh -c "
        echo 'Aggregating fitness scores...'
        cat .kilo/logs/fitness-history.jsonl | tail -4 > .kilo/logs/fitness-latest.jsonl
        echo 'Fitness aggregation complete.'
      "
|
||||||
65
docker/evolution-test/run-evolution-test.bat
Normal file
65
docker/evolution-test/run-evolution-test.bat
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
@echo off
REM Evolution Test Runner for Windows
REM Runs pipeline-judge tests with precise measurements
REM
REM Usage: run-evolution-test.bat [workflow]
REM   workflow is one of: feature, bugfix, refactor, security, all
REM   (defaults to feature)
REM Run from the repository root: the compose file and .kilo\logs are
REM addressed relative to the current directory.

setlocal enabledelayedexpansion

echo === Evolution Test Runner ===
echo.

REM Check Docker
where docker >nul 2>&1
if %errorlevel% neq 0 (
    echo Error: Docker not found
    echo Please install Docker Desktop first:
    echo winget install Docker.DockerDesktop
    echo.
    echo Or run tests locally ^(less precise^):
    echo bun test --reporter=json --coverage
    exit /b 1
)

REM Check Docker daemon
docker info >nul 2>&1
if %errorlevel% neq 0 (
    echo Warning: Docker daemon not running
    echo Please start Docker Desktop and try again
    exit /b 1
)

REM Get workflow type
set WORKFLOW=%1
if "%WORKFLOW%"=="" set WORKFLOW=feature

REM Reject unknown workflows before building, matching the .sh runner
set VALID_WORKFLOW=
for %%W in (feature bugfix refactor security all) do (
    if "%WORKFLOW%"=="%%W" set VALID_WORKFLOW=1
)
if not defined VALID_WORKFLOW (
    echo Unknown workflow: %WORKFLOW%
    echo Usage: %~nx0 [feature^|bugfix^|refactor^|security^|all]
    exit /b 1
)

echo Running evolution test for: %WORKFLOW%
echo.

REM Build container; stop here if the image cannot be built
echo Building evolution test container...
docker-compose -f docker/evolution-test/docker-compose.yml build
if %errorlevel% neq 0 (
    echo Error: container build failed
    exit /b 1
)

REM Run test
if "%WORKFLOW%"=="all" (
    echo Running ALL workflow tests in parallel...
    docker-compose -f docker/evolution-test/docker-compose.yml up
    REM --no-deps: aggregate the finished runs without restarting the runners
    docker-compose -f docker/evolution-test/docker-compose.yml up --no-deps fitness-aggregator
) else (
    docker-compose -f docker/evolution-test/docker-compose.yml up evolution-%WORKFLOW%
)

REM Show results
echo.
echo === Test Results ===
if exist .kilo\logs\fitness-history.jsonl (
    echo Latest fitness scores:
    powershell -Command "Get-Content .kilo\logs\fitness-history.jsonl -Tail 4 | ForEach-Object { $j = $_ | ConvertFrom-Json; Write-Host (' ' + $j.workflow + ': fitness=' + $j.fitness + ', time=' + $j.time_ms + 'ms, tokens=' + $j.tokens) }"
) else (
    echo No fitness history found
)

REM Cleanup
echo.
echo Cleaning up...
docker-compose -f docker/evolution-test/docker-compose.yml down -v 2>nul

echo Done!
|
||||||
92
docker/evolution-test/run-evolution-test.sh
Normal file
92
docker/evolution-test/run-evolution-test.sh
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/bin/bash
# Evolution Test Runner
# Runs pipeline-judge tests with precise measurements
#
# Usage: run-evolution-test.sh [feature|bugfix|refactor|security|all]
#   Defaults to "feature". Run from the repository root: the compose file and
#   .kilo/logs are addressed relative to the current directory.
# Requires: docker, docker-compose, jq (for the result summary).

set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

readonly COMPOSE_FILE="docker/evolution-test/docker-compose.yml"
readonly HISTORY_FILE=".kilo/logs/fitness-history.jsonl"
# Upper bound on how long to wait for the Docker daemon after launching it.
readonly DOCKER_WAIT_SECS=60

# Run docker-compose against the evolution-test compose file.
compose() {
  docker-compose -f "$COMPOSE_FILE" "$@"
}

# Remove containers and volumes; best-effort, so failures are ignored.
cleanup() {
  echo ""
  echo -e "${BLUE}Cleaning up...${NC}"
  compose down -v 2>/dev/null || true
}

echo -e "${BLUE}=== Evolution Test Runner ===${NC}"
echo ""

# Validate the workflow up front so a typo fails before the image build.
WORKFLOW=${1:-feature}
case "$WORKFLOW" in
  feature | bugfix | refactor | security | all) ;;
  *)
    echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}"
    echo "Usage: $0 [feature|bugfix|refactor|security|all]"
    exit 1
    ;;
esac

# Check Docker
if ! command -v docker &> /dev/null; then
  echo -e "${RED}Error: Docker not found${NC}"
  echo "Please install Docker Desktop first:"
  echo " winget install Docker.DockerDesktop"
  echo ""
  echo "Or use alternatives:"
  echo " 1. Use WSL2 with Docker"
  echo " 2. Run tests locally (less precise):"
  echo " bun test --reporter=json --coverage"
  exit 1
fi

# Docker daemon check: try to launch Docker Desktop, then poll until the
# daemon answers instead of sleeping a fixed time and hoping it came up.
if ! docker info &> /dev/null; then
  echo -e "${YELLOW}Warning: Docker daemon not running${NC}"
  echo "Starting Docker Desktop..."
  open -a "Docker" 2>/dev/null || start "Docker Desktop" 2>/dev/null || true
  waited=0
  until docker info &> /dev/null; do
    if ((waited >= DOCKER_WAIT_SECS)); then
      echo -e "${RED}Error: Docker daemon did not become ready within ${DOCKER_WAIT_SECS}s${NC}"
      exit 1
    fi
    sleep 1
    waited=$((waited + 1))
  done
fi

# From here on containers may exist, so always clean up, even when a step
# fails and `set -e` aborts the script.
trap cleanup EXIT

# Build evolution test container
echo -e "${BLUE}Building evolution test container...${NC}"
compose build

# Run specific workflow test
echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"

if [[ "$WORKFLOW" == "all" ]]; then
  echo -e "${BLUE}Running ALL workflow tests in parallel...${NC}"
  compose up
  # --no-deps: aggregate the finished runs without restarting the runners.
  compose up --no-deps fitness-aggregator
else
  compose up "evolution-${WORKFLOW}"
fi

# Parse results
echo ""
echo -e "${BLUE}=== Test Results ===${NC}"
if [[ -f "$HISTORY_FILE" ]]; then
  echo -e "${GREEN}Latest fitness scores:${NC}"
  # One jq pass formats every record; missing fields print as empty strings.
  tail -n 4 "$HISTORY_FILE" \
    | jq -r '" \(.workflow // ""): fitness=\(.fitness // ""), time=\(.time_ms // "")ms, tokens=\(.tokens // "")"' \
    || echo -e "${YELLOW}Could not parse fitness history (is jq installed?)${NC}"
else
  echo -e "${YELLOW}No fitness history found${NC}"
fi

# Cleanup (run explicitly so "Done!" is printed last, then disarm the trap)
trap - EXIT
cleanup

echo -e "${GREEN}Done!${NC}"
|
||||||
Reference in New Issue
Block a user