Docker files restored for use on other machines with Docker/WSL2.

Available test methods:

1. Docker (isolated environment):
   docker-compose -f docker/evolution-test/docker-compose.yml up evolution-feature

2. Local (bun runtime):
   docker/evolution-test/run-local-test.bat feature
   ./docker/evolution-test/run-local-test.sh feature

Both methods provide:
- Millisecond precision timing
- Fitness score with 2 decimal places
- JSONL logging to .kilo/logs/fitness-history.jsonl
230 lines
6.7 KiB
Bash
230 lines
6.7 KiB
Bash
#!/bin/bash
# Evolution Test Runner (Local Fallback)
# Runs pipeline-judge tests without Docker - less precise but works immediately
#
# Requires `bun` on PATH; the script stops with status 1 when it is missing.

set -e

# Colors for output (ANSI escape sequences, interpreted by `echo -e`)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

echo -e "${BLUE}=== Evolution Test Runner (Local) ===${NC}"
echo ""

# Check bun
if ! command -v bun >/dev/null 2>&1; then
  # Diagnostics go to stderr so they never end up in captured stdout.
  {
    echo -e "${RED}Error: bun not found${NC}"
    echo "Install bun first:"
    echo " curl -fsSL https://bun.sh/install | bash"
  } >&2
  exit 1
fi

#######################################
# Select the budgets for one workflow.
# Globals:   TOKEN_BUDGET, TIME_BUDGET (seconds), MIN_COVERAGE (percent) - written
# Arguments: $1 - workflow name (feature|bugfix|refactor|security)
# Returns:   0 on success, 1 when the workflow name is not recognised
#######################################
set_workflow_budgets() {
  case "$1" in
    feature)
      TOKEN_BUDGET=50000
      TIME_BUDGET=300
      MIN_COVERAGE=80
      ;;
    bugfix)
      TOKEN_BUDGET=20000
      TIME_BUDGET=120
      MIN_COVERAGE=90
      ;;
    refactor)
      TOKEN_BUDGET=40000
      TIME_BUDGET=240
      MIN_COVERAGE=95
      ;;
    security)
      TOKEN_BUDGET=30000
      TIME_BUDGET=180
      MIN_COVERAGE=80
      ;;
    *)
      return 1
      ;;
  esac
}

# Get workflow type (defaults to "feature" when no argument is given)
WORKFLOW=${1:-feature}
echo -e "${GREEN}Running evolution test for: ${WORKFLOW}${NC}"
echo ""

# "all" re-invokes this script once per workflow.
if [[ "$WORKFLOW" == "all" ]]; then
  echo -e "${YELLOW}Running all workflows sequentially...${NC}"
  worst_status=0
  for w in feature bugfix refactor security; do
    child_status=0
    # Capture the status instead of letting `set -e` abort the loop: a
    # MARGINAL (1) or FAIL (2) run must not prevent the remaining workflows
    # from being evaluated.
    "$0" "$w" || child_status=$?
    if ((child_status > worst_status)); then
      worst_status=$child_status
    fi
  done
  # Report the worst result seen; 0 only when every workflow passed.
  exit "$worst_status"
fi

# Set budget based on workflow
if ! set_workflow_budgets "$WORKFLOW"; then
  echo -e "${RED}Unknown workflow: ${WORKFLOW}${NC}"
  echo "Usage: $0 [feature|bugfix|refactor|security|all]"
  exit 1
fi

echo "Token Budget: ${TOKEN_BUDGET}"
echo "Time Budget: ${TIME_BUDGET}s"
echo "Min Coverage: ${MIN_COVERAGE}%"
echo ""

# Create logs directory
mkdir -p .kilo/logs

#######################################
# Print the current Unix time in milliseconds.
# Some `date` implementations (e.g. BSD/macOS) do not expand %N: they exit 0
# and print non-numeric text, so the result is validated instead of relying
# on the exit status. The fallback has one-second resolution.
# Outputs: integer milliseconds on stdout
#######################################
now_ms() {
  local stamp
  stamp=$(date +%s%3N 2>/dev/null) || stamp=""
  if [[ ! "$stamp" =~ ^[0-9]+$ ]]; then
    stamp="$(date +%s)000"
  fi
  printf '%s\n' "$stamp"
}

# Run tests with precise timing
echo -e "${BLUE}Running tests...${NC}"
START_MS=$(now_ms)

# Run bun test with coverage. stdout and stderr are both shown and saved to
# /tmp/test-results.json; `|| true` keeps the script going when tests fail so
# the failure is scored instead of aborting under `set -e`.
# NOTE(review): the result parsing that reads this file expects a JSON summary
# with numTotalTests/numPassedTests fields - confirm the installed bun version
# actually supports `--reporter=json`.
bun test --reporter=json --coverage 2>&1 | tee /tmp/test-results.json || true

END_MS=$(now_ms)
TIME_MS=$((END_MS - START_MS))

echo ""
echo -e "${BLUE}=== Test Results ===${NC}"

#######################################
# Read one integer counter from the saved test output.
# The file may contain non-JSON text around the JSON (stderr is merged into
# it), so jq can print a value and still fail afterwards; only the first
# output line is used and anything that is not a plain integer becomes 0.
# Arguments: $1 - top-level JSON key, e.g. numTotalTests
#            $2 - results file (default: /tmp/test-results.json)
# Outputs:   non-negative integer on stdout (0 when jq or the key is missing)
#######################################
read_count() {
  local key=$1
  local file=${2:-/tmp/test-results.json}
  local value
  value=$(jq -r --arg key "$key" '.[$key] // 0' "$file" 2>/dev/null | head -n 1) || value=""
  [[ "$value" =~ ^[0-9]+$ ]] || value=0
  printf '%s\n' "$value"
}

#######################################
# Print the pass rate as a percentage with 2 decimals.
# Arguments: $1 - passed tests, $2 - total tests
# Outputs:   e.g. "75.00"; "0.00" when total is 0
#######################################
calc_pass_rate() {
  # Values are passed with -v so they are treated as data, not awk source.
  awk -v p="$1" -v t="$2" \
    'BEGIN { r = (t + 0 > 0) ? p / t * 100 : 0; printf "%.2f", r }'
}

# Parse test results
TOTAL=$(read_count numTotalTests)
PASSED=$(read_count numPassedTests)
FAILED=$(read_count numFailedTests)
SKIPPED=$(read_count numPendingTests)

# Calculate pass rate with 2 decimals
PASS_RATE=$(calc_pass_rate "$PASSED" "$TOTAL")

echo "Tests: ${PASSED}/${TOTAL} passed (${PASS_RATE}%)"
echo "Time: ${TIME_MS}ms"

# Quality gates
echo ""
echo -e "${BLUE}=== Quality Gates ===${NC}"

GATES_PASSED=0
TOTAL_GATES=5

#######################################
# Tell whether package.json in the current directory defines a script.
# Needs jq; when jq or package.json is missing the answer is "no".
# Arguments: $1 - script name (e.g. lint)
# Returns:   0 when scripts.<name> exists, non-zero otherwise
#######################################
has_script() {
  jq -e --arg name "$1" '.scripts[$name]' package.json >/dev/null 2>&1
}

#######################################
# Extract the overall coverage figure from the saved test output.
# Uses the 4th whitespace-separated field of the "All files" row, with any
# "%" removed.
# NOTE(review): in bun's text coverage table that field is presumably the
# "% Funcs" column - confirm this is the metric MIN_COVERAGE is meant for.
# Arguments: $1 - results file (default: /tmp/test-results.json)
# Outputs:   a number on stdout; 0 when the row is missing or not numeric
#######################################
read_coverage() {
  local file=${1:-/tmp/test-results.json}
  if [[ ! -r "$file" ]]; then
    echo 0
    return 0
  fi
  awk '
    /All files/ { value = $4; sub(/%/, "", value); found = value }
    END {
      if (found !~ /^[0-9]+(\.[0-9]+)?$/) found = 0
      print found
    }
  ' "$file"
}

# Gate 1: Build
# Judged by exit status rather than by searching the output for words such as
# "done", which also appear in failing builds and are absent from quiet ones.
if bun run build >/dev/null 2>&1; then
  echo -e "${GREEN}✓${NC} Build: PASS"
  GATES_PASSED=$((GATES_PASSED + 1))
else
  echo -e "${RED}✗${NC} Build: FAIL"
fi

# Gate 2: Lint
# A missing lint script is not penalised, but a lint script that exists and
# fails no longer earns the gate. If jq is unavailable the script cannot be
# detected and a failure is treated as "no lint script" (lenient fallback).
if bun run lint >/dev/null 2>&1; then
  echo -e "${GREEN}✓${NC} Lint: PASS"
  GATES_PASSED=$((GATES_PASSED + 1))
elif has_script lint; then
  echo -e "${RED}✗${NC} Lint: FAIL"
else
  echo -e "${YELLOW}-${NC} Lint: SKIPPED (no lint script)"
  GATES_PASSED=$((GATES_PASSED + 1)) # Don't penalize missing lint
fi

# Gate 3: Typecheck (fails only when the output contains a TypeScript error)
if bun run typecheck 2>&1 | grep -q "error TS"; then
  echo -e "${RED}✗${NC} Types: FAIL"
else
  echo -e "${GREEN}✓${NC} Types: PASS"
  GATES_PASSED=$((GATES_PASSED + 1))
fi

# Gate 4: Tests clean (an unparsable failure count is treated as a failure)
if [[ "$FAILED" =~ ^0+$ ]]; then
  echo -e "${GREEN}✓${NC} Tests Clean: PASS"
  GATES_PASSED=$((GATES_PASSED + 1))
else
  echo -e "${RED}✗${NC} Tests Clean: FAIL (${FAILED} failures)"
fi

# Gate 5: Coverage
COVERAGE=$(read_coverage /tmp/test-results.json)
if awk -v c="$COVERAGE" -v m="$MIN_COVERAGE" 'BEGIN { exit !(c + 0 >= m + 0) }'; then
  echo -e "${GREEN}✓${NC} Coverage: PASS (${COVERAGE}%)"
  GATES_PASSED=$((GATES_PASSED + 1))
else
  echo -e "${RED}✗${NC} Coverage: FAIL (${COVERAGE}% < ${MIN_COVERAGE}%)"
fi

# Calculate fitness
echo ""
echo -e "${BLUE}=== Fitness Score ===${NC}"

#######################################
# Print NUM / DEN with 4 decimals; 0.0000 when DEN is not positive.
# Replaces the former "/ (TOTAL + 0.001)" trick, which avoided division by
# zero but also made a perfect run score 0.9999 or less.
# Arguments: $1 - numerator, $2 - denominator
#######################################
ratio() {
  awk -v n="$1" -v d="$2" \
    'BEGIN { r = (d + 0 > 0) ? n / d : 0; printf "%.4f", r }'
}

#######################################
# Print VALUE * WEIGHT with 2 decimals (one table contribution).
# Arguments: $1 - value, $2 - weight
#######################################
weighted() {
  awk -v v="$1" -v w="$2" 'BEGIN { printf "%.2f", v * w }'
}

#######################################
# Print the final fitness with 2 decimals:
#   50% test pass rate + 25% quality gates rate + 25% efficiency.
# Arguments: $1 - test rate, $2 - gates rate, $3 - efficiency (each 0..1)
#######################################
fitness_of() {
  awk -v t="$1" -v g="$2" -v e="$3" \
    'BEGIN { printf "%.2f", (t * 0.50) + (g * 0.25) + (e * 0.25) }'
}

TEST_RATE=$(ratio "$PASSED" "$TOTAL")
GATES_RATE=$(ratio "$GATES_PASSED" "$TOTAL_GATES")

# Efficiency: normalized cost (tokens/time)
# Assume average tokens per test based on budget (one tenth of the budget
# per test, so the token half of the cost saturates at 10 tests).
TOKENS_PER_TEST=$(awk -v b="$TOKEN_BUDGET" 'BEGIN { printf "%.0f", b / 10 }')
EST_TOKENS=$((TOTAL * TOKENS_PER_TEST))
TIME_S=$(awk -v ms="$TIME_MS" 'BEGIN { printf "%.2f", ms / 1000 }')

# Half of the cost comes from estimated tokens, half from elapsed time, each
# relative to its budget; the cost is capped at 1 before being inverted.
NORMALIZED_COST=$(awk -v tok="$EST_TOKENS" -v tb="$TOKEN_BUDGET" \
  -v secs="$TIME_S" -v sb="$TIME_BUDGET" \
  'BEGIN { printf "%.4f", (tok / tb * 0.5) + (secs / sb * 0.5) }')
EFFICIENCY=$(awk -v c="$NORMALIZED_COST" \
  'BEGIN { c += 0; printf "%.4f", 1 - (c > 1 ? 1 : c) }')

# Final fitness score
FITNESS=$(fitness_of "$TEST_RATE" "$GATES_RATE" "$EFFICIENCY")

echo ""
echo -e "| Metric | Value | Weight | Contribution |"
echo -e "|--------|-------|--------|--------------|"
echo -e "| Tests | ${PASS_RATE}% | 50% | $(weighted "$TEST_RATE" 0.50) |"
# The gate count is printed directly; the previous awk call ended in
# "}/<total>", which awk rejects, leaving this cell empty.
echo -e "| Gates | ${GATES_PASSED}/${TOTAL_GATES} | 25% | $(weighted "$GATES_RATE" 0.25) |"
echo -e "| Efficiency | ${TIME_MS}ms / ${EST_TOKENS}tok | 25% | $(weighted "$EFFICIENCY" 0.25) |"
echo ""
echo -e "${GREEN}Fitness Score: ${FITNESS}${NC}"

#######################################
# Map a fitness score to a verdict.
#   >= 0.85 -> PASS, >= 0.70 -> MARGINAL, otherwise FAIL.
# The score is passed with -v so a malformed value is treated as data (and
# compares as 0) instead of being spliced into the awk program.
# Arguments: $1 - fitness score
# Outputs:   PASS, MARGINAL or FAIL on stdout
#######################################
verdict_for() {
  awk -v f="$1" 'BEGIN {
    f += 0
    if (f >= 0.85)      print "PASS"
    else if (f >= 0.70) print "MARGINAL"
    else                print "FAIL"
  }'
}

# Determine verdict
VERDICT=$(verdict_for "$FITNESS")

echo -e "Verdict: ${VERDICT}"

#######################################
# Format one fitness-history record as a single-line JSON object.
# Arguments (in order): timestamp, workflow, fitness, test pass rate,
#   quality gates rate, efficiency score, tokens, time in ms, tests passed,
#   tests total, verdict. Numeric arguments are inserted unquoted.
# Outputs: the JSON object on stdout
#######################################
build_log_entry() {
  printf '{"ts":"%s","workflow":"%s","fitness":%s,"breakdown":{"test_pass_rate":%s,"quality_gates_rate":%s,"efficiency_score":%s},"tokens":%s,"time_ms":%s,"tests_passed":%s,"tests_total":%s,"verdict":"%s"}\n' \
    "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" "$9" "${10}" "${11}"
}

# Log to fitness-history.jsonl
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
# Numeric fields fall back to 0 so the line stays valid JSON even when a
# value could not be computed.
LOG_ENTRY=$(build_log_entry \
  "$TIMESTAMP" "$WORKFLOW" "${FITNESS:-0}" \
  "${TEST_RATE:-0}" "${GATES_RATE:-0}" "${EFFICIENCY:-0}" \
  "${EST_TOKENS:-0}" "${TIME_MS:-0}" "${PASSED:-0}" "${TOTAL:-0}" \
  "$VERDICT")

# Make sure the target directory exists before appending.
mkdir -p .kilo/logs
printf '%s\n' "$LOG_ENTRY" >> .kilo/logs/fitness-history.jsonl
echo ""
echo -e "${BLUE}Logged to .kilo/logs/fitness-history.jsonl${NC}"

#######################################
# Succeed when the fitness score is below the 0.70 improvement threshold.
# The score is passed with -v so it is compared as data rather than being
# spliced into the awk program; an empty value compares as 0.
# Arguments: $1 - fitness score
# Returns:   0 when score < 0.70, 1 otherwise
#######################################
below_threshold() {
  awk -v f="$1" 'BEGIN { exit !(f + 0 < 0.70) }'
}

# Trigger improvement if needed
if below_threshold "$FITNESS"; then
  echo ""
  echo -e "${YELLOW}⚠ Fitness below threshold (0.70)${NC}"
  echo "Running prompt-optimizer is recommended."
  echo ""
  echo "Command: /evolution --workflow ${WORKFLOW}"
fi

# Summary
echo ""
echo -e "${GREEN}=== Summary ===${NC}"
echo "Workflow: ${WORKFLOW}"
echo "Tests: ${PASSED}/${TOTAL} passed (${PASS_RATE}%)"
echo "Quality Gates: ${GATES_PASSED}/${TOTAL_GATES}"
echo "Time: ${TIME_MS}ms"
echo "Fitness: ${FITNESS} (${VERDICT})"
echo ""

# Exit with appropriate code: 0 = PASS, 1 = MARGINAL, 2 = anything else (FAIL)
case "$VERDICT" in
  PASS)
    exit 0
    ;;
  MARGINAL)
    exit 1
    ;;
  *)
    exit 2
    ;;
esac