From 3badb259cc97c7ab0c5e2ad560859278a17443ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=A8NW=C2=A8?= <¨neroworld@mail.ru¨> Date: Wed, 29 Apr 2026 21:04:22 +0100 Subject: [PATCH] feat: bidirectional research dashboard + agent config fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Integrate apaw_agent_model_research_v3.html as standalone dashboard - Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations - Add build-research-dashboard.ts: inject live data into template → standalone HTML - Add rebuild-template.cjs: regenerate template from v3.html source - Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip - Add sync-model-research.ts: apply recommendation matrix to config files - Add model-benchmarks.schema.json and model-research.schema.json for validation - Add bidirectional-data-flow.md architecture documentation - Add log-execution.cjs pipeline hook - Update capability-index.yaml: add fallback_models, failover_strategy - Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models - Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs - Fix security-auditor.md: quote YAML color (#DC2626) - Fix orchestrator.md: remove duplicate devops-engineer key - Build research-dashboard.html (106KB standalone) + dated archive --- .kilo/KILO_SPEC.md | 4 +- .kilo/agents/lead-developer.md | 2 +- .kilo/agents/orchestrator.md | 1 - .kilo/agents/security-auditor.md | 2 +- .kilo/agents/system-analyst.md | 2 +- .kilo/capability-index.yaml | 2062 ++++++------ .kilo/commands/evolution.md | 134 +- .kilo/commands/research.md | 84 + .kilo/rules/evolutionary-sync.md | 50 + .kilo/shared/self-evolution.md | 86 + agent-evolution/README.md | 181 +- agent-evolution/data/agent-versions.json | 33 +- agent-evolution/data/model-benchmarks.json | 1774 +++++++++++ .../data/model-benchmarks.schema.json | 553 ++++ .../data/model-research-latest.json | 59 + 
.../data/model-research.schema.json | 331 ++ .../dist/research-dashboard-2026_04_29.html | 2777 +++++++++++++++++ .../docs/bidirectional-data-flow.md | 504 +++ .../ideas/apaw_agent_model_research_v3.html | 1168 +++++++ agent-evolution/index.standalone.html | 35 +- agent-evolution/research-dashboard.html | 2777 +++++++++++++++++ .../research-dashboard.template.html | 1003 ++++++ .../scripts/build-research-dashboard.ts | 237 ++ agent-evolution/scripts/rebuild-template.cjs | 74 + .../scripts/sync-benchmarks-from-yaml.cjs | 136 + .../scripts/sync-model-research.ts | 651 ++++ kilo-meta.json | 6 +- kilo.jsonc | 4 +- scripts/log-execution.cjs | 41 + 29 files changed, 13779 insertions(+), 992 deletions(-) create mode 100644 agent-evolution/data/model-benchmarks.json create mode 100644 agent-evolution/data/model-benchmarks.schema.json create mode 100644 agent-evolution/data/model-research-latest.json create mode 100644 agent-evolution/data/model-research.schema.json create mode 100644 agent-evolution/dist/research-dashboard-2026_04_29.html create mode 100644 agent-evolution/docs/bidirectional-data-flow.md create mode 100644 agent-evolution/ideas/apaw_agent_model_research_v3.html create mode 100644 agent-evolution/research-dashboard.html create mode 100644 agent-evolution/research-dashboard.template.html create mode 100644 agent-evolution/scripts/build-research-dashboard.ts create mode 100644 agent-evolution/scripts/rebuild-template.cjs create mode 100644 agent-evolution/scripts/sync-benchmarks-from-yaml.cjs create mode 100644 agent-evolution/scripts/sync-model-research.ts create mode 100644 scripts/log-execution.cjs diff --git a/.kilo/KILO_SPEC.md b/.kilo/KILO_SPEC.md index d9f9ad1..736f8d8 100644 --- a/.kilo/KILO_SPEC.md +++ b/.kilo/KILO_SPEC.md @@ -435,9 +435,9 @@ Provider availability depends on configuration. 
Common providers include: |-------|------|-------| | `@RequirementRefiner` | Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists. | ollama-cloud/kimi-k2-thinking | | `@HistoryMiner` | Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work. | ollama-cloud/nemotron-3-super | -| `@SystemAnalyst` | Designs technical specifications, data schemas, and API contracts before implementation. | ollama-cloud/glm-5.1 | +| `@SystemAnalyst` | Designs technical specifications, data schemas, and API contracts before implementation. | ollama-cloud/nemotron-3-super | | `@SdetEngineer` | Writes tests following TDD methodology. | ollama-cloud/qwen3-coder:480b | -| `@LeadDeveloper` | Primary code writer for backend and core logic. | ollama-cloud/qwen3-coder:480b | +| `@LeadDeveloper` | Primary code writer for backend and core logic. | ollama-cloud/nemotron-3-super | | `@FrontendDeveloper` | Handles UI implementation with multimodal capabilities. | ollama-cloud/kimi-k2.5 | | `@BackendDeveloper` | Backend specialist for Node. | ollama-cloud/deepseek-v3.2 | | `@GoDeveloper` | Go backend specialist for Gin, Echo, APIs, and database integration. | ollama-cloud/qwen3-coder:480b | diff --git a/.kilo/agents/lead-developer.md b/.kilo/agents/lead-developer.md index 04d46a1..1a9b75f 100755 --- a/.kilo/agents/lead-developer.md +++ b/.kilo/agents/lead-developer.md @@ -1,7 +1,7 @@ --- description: Primary code writer for backend and core logic. 
Writes implementation to pass tests mode: subagent -model: ollama-cloud/qwen3-coder:480b +model: ollama-cloud/nemotron-3-super variant: thinking color: "#DC2626" permission: diff --git a/.kilo/agents/orchestrator.md b/.kilo/agents/orchestrator.md index 0f047e7..3ef8793 100755 --- a/.kilo/agents/orchestrator.md +++ b/.kilo/agents/orchestrator.md @@ -40,7 +40,6 @@ permission: "planner": allow "reflector": allow "memory-manager": allow - "devops-engineer": allow --- # Kilo Code: Orchestrator diff --git a/.kilo/agents/security-auditor.md b/.kilo/agents/security-auditor.md index 4698551..1063daa 100755 --- a/.kilo/agents/security-auditor.md +++ b/.kilo/agents/security-auditor.md @@ -2,7 +2,7 @@ description: Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets mode: subagent model: ollama-cloud/nemotron-3-super -color: #DC2626 +color: "#DC2626" permission: read: allow bash: allow diff --git a/.kilo/agents/system-analyst.md b/.kilo/agents/system-analyst.md index 16c7ec1..23c470a 100755 --- a/.kilo/agents/system-analyst.md +++ b/.kilo/agents/system-analyst.md @@ -1,7 +1,7 @@ --- description: Designs technical specifications, data schemas, and API contracts before implementation mode: subagent -model: ollama-cloud/glm-5.1 +model: ollama-cloud/nemotron-3-super color: "#0891B2" permission: read: allow diff --git a/.kilo/capability-index.yaml b/.kilo/capability-index.yaml index ddd438a..802946c 100644 --- a/.kilo/capability-index.yaml +++ b/.kilo/capability-index.yaml @@ -1,972 +1,1090 @@ -# Capability Index -# Maps agent capabilities for orchestrator routing - -agents: - # Core Development - lead-developer: - capabilities: - - code_writing - - refactoring - - bug_fixing - - implementation - receives: - - tests - - specifications - - architecture_docs - produces: - - code - - documentation_inline - forbidden: - - test_writing - - code_review - model: ollama-cloud/qwen3-coder:480b - variant: thinking - mode: subagent - delegates_to: - - 
code-skeptic - - orchestrator - - frontend-developer: - capabilities: - - ui_implementation - - component_creation - - styling - - responsive_design - - nextjs_development - - vue_nuxt_development - - react_development - receives: - - designs - - wireframes - - api_endpoints - produces: - - vue_components - - react_components - - nextjs_pages - - nuxt_pages - - css_styles - - frontend_tests - forbidden: - - backend_code - model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - code-skeptic - - visual-tester - - orchestrator - - php-developer: - capabilities: - - php_web_development - - laravel_development - - symfony_development - - wordpress_development - - php_api_development - - php_database_design - - php_authentication - - php_modular_architecture - - php_testing - - php_security - receives: - - api_specifications - - database_requirements - - ui_requirements - produces: - - laravel_routes - - php_models - - php_services - - php_controllers - - php_migrations - - php_tests - - wordpress_plugins - forbidden: - - frontend_code - - non_php_backend - model: ollama-cloud/qwen3-coder:480b - variant: thinking - mode: subagent - delegates_to: - - code-skeptic - - security-auditor - - orchestrator - - python-developer: - capabilities: - - python_web_development - - django_development - - fastapi_development - - python_api_development - - python_database_design - - python_authentication - - python_async_patterns - - python_testing - - python_security - receives: - - api_specifications - - database_requirements - produces: - - django_views - - fastapi_routers - - python_models - - python_services - - python_schemas - - python_migrations - - python_tests - forbidden: - - frontend_code - - non_python_backend - model: ollama-cloud/qwen3-coder:480b - variant: thinking - mode: subagent - delegates_to: - - code-skeptic - - security-auditor - - orchestrator - - backend-developer: - capabilities: - - api_development - - database_design - - server_logic - - 
authentication - - postgresql_integration - - sqlite_integration - receives: - - api_specifications - - database_requirements - produces: - - express_routes - - database_schema - - api_documentation - forbidden: - - frontend_code - model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - code-skeptic - - orchestrator - - go-developer: - capabilities: - - go_api_development - - go_database_design - - go_concurrent_programming - - go_authentication - - go_microservices - - postgresql_integration - - sqlite_integration - - clickhouse_integration - receives: - - api_specifications - - database_requirements - - concurrent_requirements - produces: - - go_handlers - - go_database_schema - - go_api_documentation - - concurrent_solutions - forbidden: - - frontend_code - model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - code-skeptic - - orchestrator - - flutter-developer: - capabilities: - - dart_programming - - flutter_ui - - mobile_app_development - - widget_creation - - state_management - receives: - - ui_designs - - api_specifications - - mobile_requirements - produces: - - flutter_widgets - - dart_code - - mobile_app - forbidden: - - backend_code - - web_development - model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - code-skeptic - - visual-tester - - orchestrator - - devops-engineer: - capabilities: - - docker_configuration - - kubernetes_setup - - ci_cd_pipeline - - infrastructure_automation - - container_optimization - receives: - - deployment_requirements - - infrastructure_needs - produces: - - docker_compose - - kubernetes_manifests - - ci_cd_config - forbidden: - - application_code - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: - - code-skeptic - - security-auditor - - orchestrator - - # Quality Assurance - sdet-engineer: - capabilities: - - unit_tests - - integration_tests - - e2e_tests - - test_planning - - visual_regression - receives: - - code - - requirements - 
produces: - - test_files - - test_reports - - coverage_reports - forbidden: - - implementation_code - model: ollama-cloud/qwen3-coder:480b - variant: thinking - mode: subagent - delegates_to: - - lead-developer - - orchestrator - - code-skeptic: - capabilities: - - code_review - - security_review - - style_check - - issue_identification - receives: - - code - produces: - - review_comments - - approval_status - - issue_list - forbidden: - - suggest_implementations - - write_code - model: ollama-cloud/minimax-m2.5 - mode: subagent - delegates_to: - - the-fixer - - performance-engineer - - orchestrator - - # Security & Performance - security-auditor: - capabilities: - - vulnerability_scan - - owasp_check - - secret_detection - - auth_review - receives: - - code - - configuration - produces: - - security_report - - vulnerability_list - forbidden: - - fix_vulnerabilities - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: - - the-fixer - - release-manager - - orchestrator - - performance-engineer: - capabilities: - - performance_analysis - - n_plus_one_detection - - memory_leak_check - - algorithm_analysis - receives: - - code - - performance_requirements - produces: - - performance_report - - optimization_suggestions - forbidden: - - write_code - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: - - the-fixer - - security-auditor - - orchestrator - - the-fixer: - capabilities: - - bug_fixing - - issue_resolution - - code_correction - receives: - - issue_list - - code_context - produces: - - code_fixes - - resolution_notes - forbidden: - - feature_development - model: ollama-cloud/minimax-m2.5 - mode: subagent - delegates_to: - - code-skeptic - - orchestrator - - # Specialized Development - browser-automation: - capabilities: - - e2e_browser_tests - - form_filling - - navigation_testing - - screenshot_capture - receives: - - test_scenarios - - url_list - produces: - - test_results - - screenshots - forbidden: - - unit_testing - 
model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - orchestrator - - visual-tester: - capabilities: - - visual_regression - - pixel_comparison - - screenshot_diff - - ui_validation - - bbox_element_extraction - - console_error_detection - - network_error_detection - - responsive_layout_check - - button_overflow_detection - - gitea_integration - - docker_networking - receives: - - url - - baseline_screenshots - - page_paths - - gitea_issue_number - produces: - - diff_report - - visual_issues - - element_map_with_bbox - - console_error_report - - network_error_report - - gitea_comment - - gitea_attachments - forbidden: - - code_changes - model: ollama-cloud/qwen3-coder:480b - mode: subagent - delegates_to: - - the-fixer - - orchestrator - - # Analysis & Design - system-analyst: - capabilities: - - architecture_design - - api_specification - - database_modeling - - technical_documentation - receives: - - requirements - - user_stories - produces: - - architecture_docs - - api_specs - - database_schemas - forbidden: - - implementation - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: - - sdet-engineer - - orchestrator - - requirement-refiner: - capabilities: - - requirement_analysis - - user_story_creation - - acceptance_criteria - - clarification - receives: - - raw_requests - - feature_ideas - produces: - - user_stories - - acceptance_criteria - - requirements_doc - forbidden: - - design_decisions - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: - - history-miner - - system-analyst - - history-miner: - capabilities: - - git_search - - duplicate_detection - - past_solution_finder - - pattern_identification - receives: - - search_query - - issue_description - produces: - - commit_list - - duplicate_report - - related_files - forbidden: - - code_changes - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: [] - - capability-analyst: - capabilities: - - gap_analysis 
- - capability_mapping - - recommendation_generation - - coverage_analysis - receives: - - task_requirements - produces: - - analysis_report - - recommendations - - new_agent_specs - forbidden: - - implementation - model: ollama-cloud/glm-5.1 - mode: subagent - delegates_to: - - agent-architect - - orchestrator - - # Process Management - orchestrator: - capabilities: - - task_routing - - state_management - - agent_coordination - - workflow_execution - receives: - - issue - - status_change - produces: - - routing_decisions - - status_updates - forbidden: - - code_writing - - code_review - model: ollama-cloud/kimi-k2.6:cloud - variant: thinking - mode: all - delegates_to: - - history-miner - - system-analyst - - sdet-engineer - - lead-developer - - code-skeptic - - the-fixer - - frontend-developer - - backend-developer - - php-developer - - python-developer - - go-developer - - flutter-developer - - performance-engineer - - security-auditor - - visual-tester - - browser-automation - - devops-engineer - - release-manager - - requirement-refiner - - capability-analyst - - workflow-architect - - markdown-validator - - evaluator - - prompt-optimizer - - product-owner - - pipeline-judge - - planner - - reflector - - memory-manager - - agent-architect - - architect-indexer - - release-manager: - capabilities: - - git_operations - - version_management - - changelog_creation - - deployment - receives: - - approved_code - - release_request - produces: - - commits - - tags - - releases - forbidden: - - code_changes - - feature_development - model: ollama-cloud/glm-5.1 - mode: subagent - delegates_to: - - evaluator - - evaluator: - capabilities: - - performance_scoring - - process_analysis - - pattern_identification - - improvement_recommendations - receives: - - completed_issue - - agent_logs - produces: - - performance_report - - scores - - recommendations - forbidden: - - code_changes - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: - - 
prompt-optimizer - - product-owner - - orchestrator - - prompt-optimizer: - capabilities: - - prompt_analysis - - prompt_improvement - - failure_pattern_detection - receives: - - low_scores - - failure_reports - produces: - - improved_prompts - - optimization_report - forbidden: - - agent_creation - model: ollama-cloud/glm-5.1 - variant: instant - mode: subagent - delegates_to: [] - - product-owner: - capabilities: - - issue_management - - prioritization - - backlog_management - - workflow_completion - receives: - - completed_work - - stakeholder_requests - produces: - - priority_order - - issue_labels - - issue closures - forbidden: - - implementation - model: ollama-cloud/glm-5.1 - mode: subagent - delegates_to: [] - - pipeline-judge: - capabilities: - - test_execution - - fitness_scoring - - metric_collection - - bottleneck_detection - receives: - - completed_workflow - - pipeline_logs - produces: - - fitness_report - - bottleneck_analysis - - improvement_triggers - forbidden: - - code_writing - - code_changes - - prompt_changes - model: ollama-cloud/glm-5.1 - mode: subagent - delegates_to: - - prompt-optimizer - - # Workflow - workflow-architect: - capabilities: - - workflow_design - - process_definition - - automation_setup - receives: - - workflow_requirements - produces: - - workflow_definitions - - command_files - forbidden: - - execution - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: [] - - # Validation - markdown-validator: - capabilities: - - markdown_validation - - formatting_check - - link_validation - receives: - - markdown_files - produces: - - validation_report - - corrections - forbidden: - - content_creation - model: ollama-cloud/nemotron-3-nano:30b - mode: subagent - delegates_to: - - orchestrator - - agent-architect: - capabilities: - - agent_design - - prompt_engineering - - capability_definition - receives: - - agent_requirements - produces: - - agent_definition - - integration_plan - forbidden: - - 
agent_execution - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: - - capability-analyst - - requirement-refiner - - system-analyst - - # Cognitive Enhancement - planner: - capabilities: - - task_decomposition - - chain_of_thought - - tree_of_thoughts - - plan_execute_reflect - - dependency_analysis - receives: - - complex_task - - objective - produces: - - decomposed_steps - - dependency_graph - - success_criteria - forbidden: - - implementation - - execution - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: [] - - reflector: - capabilities: - - self_reflection - - mistake_analysis - - lesson_extraction - - trajectory_analysis - - heuristic_evaluation - receives: - - action_trajectory - - task_result - produces: - - reflection_report - - lessons_learned - - improved_approach - forbidden: - - implementation - - code_changes - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: [] - - memory-manager: - capabilities: - - memory_retrieval - - memory_storage - - memory_consolidation - - relevance_scoring - - episodic_management - receives: - - query - - memory_type - produces: - - retrieved_memories - - relevance_scores - - consolidated_memories - forbidden: - - code_changes - - implementation - model: ollama-cloud/nemotron-3-super - mode: subagent - delegates_to: [] - - # Project Mapping - architect-indexer: - capabilities: - - codebase_indexing - - project_mapping - - architecture_documentation - - dependency_analysis - - entity_extraction - - api_surface_discovery - - convention_detection - - staleness_detection - receives: - - project_root_directory - - stale_sections_list - produces: - - .architect/state.json - - .architect/project.json - - .architect/README.md - - architecture_overview - - dependency_graph - - entity_documentation - - db_schema_documentation - - api_surface_documentation - - convention_documentation - - file_graph - - module_graph - forbidden: - - code_changes - - 
implementation - model: ollama-cloud/glm-5.1 - variant: thinking - mode: subagent - delegates_to: - - system-analyst - - orchestrator - - # Capability Routing Map - capability_routing: - code_writing: lead-developer - code_review: code-skeptic - test_writing: sdet-engineer - architecture: system-analyst - security: security-auditor - performance: performance-engineer - bug_fixing: the-fixer - git_operations: release-manager - ui_implementation: frontend-developer - nextjs_development: frontend-developer - vue_nuxt_development: frontend-developer - react_development: frontend-developer - e2e_testing: browser-automation - visual_testing: visual-tester - bbox_extraction: visual-tester - console_error_detection: visual-tester - gitea_integration: visual-tester - - docker_networking: visual-tester - requirement_analysis: requirement-refiner - gap_analysis: capability-analyst - issue_management: product-owner - prompt_optimization: prompt-optimizer - workflow_design: workflow-architect - scoring: evaluator - duplicate_detection: history-miner - agent_design: agent-architect - markdown_validation: markdown-validator - # Database integrations - postgresql_integration: backend-developer - sqlite_integration: backend-developer - clickhouse_integration: go-developer - # Mobile development - flutter_development: flutter-developer - # PHP Development - php_web_development: php-developer - laravel_development: php-developer - symfony_development: php-developer - wordpress_development: php-developer - # Python Development - python_web_development: python-developer - django_development: python-developer - fastapi_development: python-developer - # DevOps - docker_configuration: devops-engineer - kubernetes_setup: devops-engineer - ci_cd_pipeline: devops-engineer - # Cognitive Enhancement (New) - task_decomposition: planner - self_reflection: reflector - memory_retrieval: memory-manager - chain_of_thought: planner - tree_of_thoughts: planner - # Fitness & Evolution - 
fitness_scoring: pipeline-judge - test_execution: pipeline-judge - bottleneck_detection: pipeline-judge - # Go Development - go_api_development: go-developer - go_database_design: go-developer - go_concurrent_programming: go-developer - go_authentication: go-developer - go_microservices: go-developer - # Project Mapping - codebase_indexing: architect-indexer - project_mapping: architect-indexer - architecture_documentation: architect-indexer - dependency_analysis: architect-indexer - entity_extraction: architect-indexer - api_surface_discovery: architect-indexer - convention_detection: architect-indexer - -# Parallelizable Tasks -parallel_groups: - review_phase: - - security-auditor - - performance-engineer - - code-skeptic - testing_phase: - - sdet-engineer - - browser-automation - - visual-tester - -# Evaluator-Optimizer Patterns -iteration_loops: - code_review: - evaluator: code-skeptic - optimizer: the-fixer - max_iterations: 3 - convergence: all_issues_resolved - - security_review: - evaluator: security-auditor - optimizer: the-fixer - max_iterations: 2 - convergence: no_critical_vulnerabilities - - performance_review: - evaluator: performance-engineer - optimizer: the-fixer - max_iterations: 2 - convergence: all_perf_issues_resolved - - # Evolution loop for continuous improvement - evolution: - evaluator: pipeline-judge - optimizer: prompt-optimizer - max_iterations: 3 - convergence: fitness_above_0.85 - -# Quality Gates -quality_gates: - requirements: - - user_stories_defined - - acceptance_criteria_complete - - technical_constraints_documented - - architecture: - - schema_valid - - endpoints_documented - - tech_stack_decided - - implementation: - - build_success - - no_type_errors - - no_lint_errors - - testing: - - coverage_gte_80 - - all_tests_pass - - no_critical_bugs - - review: - - no_critical_issues - - no_security_vulnerabilities - - performance_acceptable - - docker: - - build_success - - health_check_pass - - size_under_limit - - documentation: - - 
readme_complete - - api_docs_complete - - deployment_guide_complete - -# State Transitions -workflow_states: - new: [planned] - planned: [researching] - researching: [designed] - designed: [testing] - testing: [implementing] - implementing: [reviewing] - reviewing: [fixing, perf_check] - fixing: [reviewing] - perf_check: [security_check] - security_check: [releasing] - releasing: [evaluated] - evaluated: [evolving, completed] - evolving: [evaluated] - completed: [] - -# Evolution Configuration -evolution: - enabled: true - auto_trigger: true # trigger after every workflow - fitness_threshold: 0.70 # below this → auto-optimize - max_evolution_attempts: 3 # max retries per cycle - fitness_history: .kilo/logs/fitness-history.jsonl - token_budget_default: 50000 - time_budget_default: 300 - budgets: - feature: - tokens: 50000 - time_s: 300 - min_coverage: 80 - bugfix: - tokens: 20000 - time_s: 120 - min_coverage: 90 - refactor: - tokens: 40000 - time_s: 240 - min_coverage: 95 - security: - tokens: 30000 - time_s: 180 - min_coverage: 80 +agents: + lead-developer: + capabilities: + - code_writing + - refactoring + - bug_fixing + - implementation + receives: + - tests + - specifications + - architecture_docs + produces: + - code + - documentation_inline + forbidden: + - test_writing + - code_review + model: ollama-cloud/nemotron-3-super + variant: thinking + mode: subagent + delegates_to: + - code-skeptic + - orchestrator + fallback_models: + - ollama-cloud/qwen3-coder:480b + - ollama-cloud/kimi-k2.6:cloud + - groq/llama-3.1-8b-instant + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: downgraded + frontend-developer: + capabilities: + - ui_implementation + - component_creation + - styling + - responsive_design + - nextjs_development + - vue_nuxt_development + - react_development + receives: + - designs + - wireframes + - api_endpoints + produces: + - vue_components + - react_components + - nextjs_pages + - nuxt_pages + - css_styles + - frontend_tests + forbidden: + 
- backend_code + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - code-skeptic + - visual-tester + - orchestrator + fallback_models: + - ollama-cloud/minimax-m2.5 + - ollama-cloud/minimax-m2.7 + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + php-developer: + capabilities: + - php_web_development + - laravel_development + - symfony_development + - wordpress_development + - php_api_development + - php_database_design + - php_authentication + - php_modular_architecture + - php_testing + - php_security + receives: + - api_specifications + - database_requirements + - ui_requirements + produces: + - laravel_routes + - php_models + - php_services + - php_controllers + - php_migrations + - php_tests + - wordpress_plugins + forbidden: + - frontend_code + - non_php_backend + model: ollama-cloud/qwen3-coder:480b + variant: thinking + mode: subagent + delegates_to: + - code-skeptic + - security-auditor + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - openrouter/qwen/qwen3.6-plus:free + - groq/llama-3.1-8b-instant + - ollama-cloud/minimax-m2.5 + failover_strategy: downgraded + python-developer: + capabilities: + - python_web_development + - django_development + - fastapi_development + - python_api_development + - python_database_design + - python_authentication + - python_async_patterns + - python_testing + - python_security + receives: + - api_specifications + - database_requirements + produces: + - django_views + - fastapi_routers + - python_models + - python_services + - python_schemas + - python_migrations + - python_tests + forbidden: + - frontend_code + - non_python_backend + model: ollama-cloud/qwen3-coder:480b + variant: thinking + mode: subagent + delegates_to: + - code-skeptic + - security-auditor + - orchestrator + fallback_models: + - openrouter/qwen/qwen3.6-plus:free + - ollama-cloud/kimi-k2.6:cloud + - groq/llama-3.1-8b-instant + - ollama-cloud/minimax-m2.5 + 
failover_strategy: downgraded + backend-developer: + capabilities: + - api_development + - database_design + - server_logic + - authentication + - postgresql_integration + - sqlite_integration + receives: + - api_specifications + - database_requirements + produces: + - express_routes + - database_schema + - api_documentation + forbidden: + - frontend_code + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - code-skeptic + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - openrouter/qwen/qwen3.6-plus:free + - groq/llama-3.1-8b-instant + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: downgraded + go-developer: + capabilities: + - go_api_development + - go_database_design + - go_concurrent_programming + - go_authentication + - go_microservices + - postgresql_integration + - sqlite_integration + - clickhouse_integration + receives: + - api_specifications + - database_requirements + - concurrent_requirements + produces: + - go_handlers + - go_database_schema + - go_api_documentation + - concurrent_solutions + forbidden: + - frontend_code + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - code-skeptic + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + flutter-developer: + capabilities: + - dart_programming + - flutter_ui + - mobile_app_development + - widget_creation + - state_management + receives: + - ui_designs + - api_specifications + - mobile_requirements + produces: + - flutter_widgets + - dart_code + - mobile_app + forbidden: + - backend_code + - web_development + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - code-skeptic + - visual-tester + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - openrouter/qwen/qwen3.6-plus:free + - groq/llama-3.1-8b-instant + - 
ollama-cloud/deepseek-v4-pro-max + failover_strategy: downgraded + devops-engineer: + capabilities: + - docker_configuration + - kubernetes_setup + - ci_cd_pipeline + - infrastructure_automation + - container_optimization + receives: + - deployment_requirements + - infrastructure_needs + produces: + - docker_compose + - kubernetes_manifests + - ci_cd_config + forbidden: + - application_code + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: + - code-skeptic + - security-auditor + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/glm-5.1 + failover_strategy: downgraded + sdet-engineer: + capabilities: + - unit_tests + - integration_tests + - e2e_tests + - test_planning + - visual_regression + receives: + - code + - requirements + produces: + - test_files + - test_reports + - coverage_reports + forbidden: + - implementation_code + model: ollama-cloud/qwen3-coder:480b + variant: thinking + mode: subagent + delegates_to: + - lead-developer + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/minimax-m2.5 + - groq/llama-3.1-8b-instant + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: downgraded + code-skeptic: + capabilities: + - code_review + - security_review + - style_check + - issue_identification + receives: + - code + produces: + - review_comments + - approval_status + - issue_list + forbidden: + - suggest_implementations + - write_code + model: ollama-cloud/minimax-m2.5 + mode: subagent + delegates_to: + - the-fixer + - performance-engineer + - orchestrator + fallback_models: + - ollama-cloud/qwen3-coder:480b + - ollama-cloud/deepseek-v4-pro-max + - groq/llama-3.1-8b-instant + - ollama-cloud/kimi-k2.6:cloud + failover_strategy: mixed + security-auditor: + capabilities: + - vulnerability_scan + - owasp_check + - secret_detection + - auth_review + receives: + - code + - configuration + produces: + - security_report + - 
vulnerability_list + forbidden: + - fix_vulnerabilities + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: + - the-fixer + - release-manager + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/glm-5.1 + failover_strategy: downgraded + performance-engineer: + capabilities: + - performance_analysis + - n_plus_one_detection + - memory_leak_check + - algorithm_analysis + receives: + - code + - performance_requirements + produces: + - performance_report + - optimization_suggestions + forbidden: + - write_code + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: + - the-fixer + - security-auditor + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/glm-5.1 + - ollama-cloud/kimi-k2.6:cloud + failover_strategy: downgraded + the-fixer: + capabilities: + - bug_fixing + - issue_resolution + - code_correction + receives: + - issue_list + - code_context + produces: + - code_fixes + - resolution_notes + forbidden: + - feature_development + model: ollama-cloud/minimax-m2.5 + mode: subagent + delegates_to: + - code-skeptic + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/qwen3-coder:480b + - groq/llama-3.1-8b-instant + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: mixed + browser-automation: + capabilities: + - e2e_browser_tests + - form_filling + - navigation_testing + - screenshot_capture + receives: + - test_scenarios + - url_list + produces: + - test_results + - screenshots + forbidden: + - unit_testing + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + visual-tester: + capabilities: + - visual_regression + - pixel_comparison + - screenshot_diff + - 
ui_validation + - bbox_element_extraction + - console_error_detection + - network_error_detection + - responsive_layout_check + - button_overflow_detection + - gitea_integration + - docker_networking + receives: + - url + - baseline_screenshots + - page_paths + - gitea_issue_number + produces: + - diff_report + - visual_issues + - element_map_with_bbox + - console_error_report + - network_error_report + - gitea_comment + - gitea_attachments + forbidden: + - code_changes + model: ollama-cloud/qwen3-coder:480b + mode: subagent + delegates_to: + - the-fixer + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + system-analyst: + capabilities: + - architecture_design + - api_specification + - database_modeling + - technical_documentation + receives: + - requirements + - user_stories + produces: + - architecture_docs + - api_specs + - database_schemas + forbidden: + - implementation + model: ollama-cloud/nemotron-3-super + variant: thinking + mode: subagent + delegates_to: + - sdet-engineer + - orchestrator + fallback_models: + - ollama-cloud/glm-5.1 + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + failover_strategy: downgraded + requirement-refiner: + capabilities: + - requirement_analysis + - user_story_creation + - acceptance_criteria + - clarification + receives: + - raw_requests + - feature_ideas + produces: + - user_stories + - acceptance_criteria + - requirements_doc + forbidden: + - design_decisions + model: ollama-cloud/glm-5.1 + variant: thinking + mode: subagent + delegates_to: + - history-miner + - system-analyst + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - groq/llama-3.1-8b-instant + - ollama-cloud/glm-5 + failover_strategy: mixed + history-miner: + capabilities: + - git_search + - duplicate_detection + - past_solution_finder + - 
pattern_identification + receives: + - search_query + - issue_description + produces: + - commit_list + - duplicate_report + - related_files + forbidden: + - code_changes + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: [] + fallback_models: + - ollama-cloud/glm-5.1 + - ollama-cloud/deepseek-v4-pro-max + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: mixed + capability-analyst: + capabilities: + - gap_analysis + - capability_mapping + - recommendation_generation + - coverage_analysis + receives: + - task_requirements + produces: + - analysis_report + - recommendations + - new_agent_specs + forbidden: + - implementation + model: ollama-cloud/glm-5.1 + mode: subagent + delegates_to: + - agent-architect + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + orchestrator: + capabilities: + - task_routing + - state_management + - agent_coordination + - workflow_execution + receives: + - issue + - status_change + produces: + - routing_decisions + - status_updates + forbidden: + - code_writing + - code_review + model: ollama-cloud/kimi-k2.6:cloud + variant: thinking + mode: all + delegates_to: + - history-miner + - system-analyst + - sdet-engineer + - lead-developer + - code-skeptic + - the-fixer + - frontend-developer + - backend-developer + - php-developer + - python-developer + - go-developer + - flutter-developer + - performance-engineer + - security-auditor + - visual-tester + - browser-automation + - devops-engineer + - release-manager + - requirement-refiner + - capability-analyst + - workflow-architect + - markdown-validator + - evaluator + - prompt-optimizer + - product-owner + - pipeline-judge + - planner + - reflector + - memory-manager + - agent-architect + - architect-indexer + fallback_models: + - ollama-cloud/glm-5.1 + - ollama-cloud/deepseek-v4-pro-max + - 
groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: mixed + release-manager: + capabilities: + - git_operations + - version_management + - changelog_creation + - deployment + receives: + - approved_code + - release_request + produces: + - commits + - tags + - releases + forbidden: + - code_changes + - feature_development + model: ollama-cloud/glm-5.1 + mode: subagent + delegates_to: + - evaluator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - groq/llama-3.1-8b-instant + - ollama-cloud/glm-5 + failover_strategy: downgraded + evaluator: + capabilities: + - performance_scoring + - process_analysis + - pattern_identification + - improvement_recommendations + receives: + - completed_issue + - agent_logs + produces: + - performance_report + - scores + - recommendations + forbidden: + - code_changes + model: ollama-cloud/glm-5.1 + variant: thinking + mode: subagent + delegates_to: + - prompt-optimizer + - product-owner + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + prompt-optimizer: + capabilities: + - prompt_analysis + - prompt_improvement + - failure_pattern_detection + receives: + - low_scores + - failure_reports + produces: + - improved_prompts + - optimization_report + forbidden: + - agent_creation + model: ollama-cloud/glm-5.1 + variant: instant + mode: subagent + delegates_to: [] + fallback_models: + - openrouter/qwen/qwen3.6-plus:free + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: downgraded + product-owner: + capabilities: + - issue_management + - prioritization + - backlog_management + - workflow_completion + receives: + - completed_work + - stakeholder_requests + produces: + - priority_order + - issue_labels + - issue closures + forbidden: + - implementation + model: ollama-cloud/glm-5.1 + mode: subagent + 
delegates_to: [] + fallback_models: + - ollama-cloud/glm-5 + - openrouter/qwen/qwen3.6-plus:free + - groq/llama-3.1-8b-instant + - ollama-cloud/deepseek-v4-pro-max + failover_strategy: mixed + pipeline-judge: + capabilities: + - test_execution + - fitness_scoring + - metric_collection + - bottleneck_detection + receives: + - completed_workflow + - pipeline_logs + produces: + - fitness_report + - bottleneck_analysis + - improvement_triggers + forbidden: + - code_writing + - code_changes + - prompt_changes + model: ollama-cloud/glm-5.1 + mode: subagent + delegates_to: + - prompt-optimizer + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - groq/llama-3.1-8b-instant + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: mixed + workflow-architect: + capabilities: + - workflow_design + - process_definition + - automation_setup + receives: + - workflow_requirements + produces: + - workflow_definitions + - command_files + forbidden: + - execution + model: ollama-cloud/glm-5.1 + variant: thinking + mode: subagent + delegates_to: [] + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + markdown-validator: + capabilities: + - markdown_validation + - formatting_check + - link_validation + receives: + - markdown_files + produces: + - validation_report + - corrections + forbidden: + - content_creation + model: ollama-cloud/nemotron-3-nano:30b + mode: subagent + delegates_to: + - orchestrator + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/glm-5.1 + - groq/llama-3.1-8b-instant + - ollama/qwen3.5-122b + failover_strategy: speed-burst + agent-architect: + capabilities: + - agent_design + - prompt_engineering + - capability_definition + receives: + - agent_requirements + produces: + - agent_definition + - integration_plan + forbidden: + - agent_execution + model: ollama-cloud/glm-5.1 + variant: thinking + 
mode: subagent + delegates_to: + - capability-analyst + - requirement-refiner + - system-analyst + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/deepseek-v4-pro-max + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + planner: + capabilities: + - task_decomposition + - chain_of_thought + - tree_of_thoughts + - plan_execute_reflect + - dependency_analysis + receives: + - complex_task + - objective + produces: + - decomposed_steps + - dependency_graph + - success_criteria + forbidden: + - implementation + - execution + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: [] + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/glm-5.1 + failover_strategy: downgraded + reflector: + capabilities: + - self_reflection + - mistake_analysis + - lesson_extraction + - trajectory_analysis + - heuristic_evaluation + receives: + - action_trajectory + - task_result + produces: + - reflection_report + - lessons_learned + - improved_approach + forbidden: + - implementation + - code_changes + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: [] + fallback_models: + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/glm-5.1 + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + memory-manager: + capabilities: + - memory_retrieval + - memory_storage + - memory_consolidation + - relevance_scoring + - episodic_management + receives: + - query + - memory_type + produces: + - retrieved_memories + - relevance_scores + - consolidated_memories + forbidden: + - code_changes + - implementation + model: ollama-cloud/nemotron-3-super + mode: subagent + delegates_to: [] + fallback_models: + - openrouter/qwen/qwen3.6-plus:free + - ollama-cloud/deepseek-v4-pro-max + - ollama-cloud/glm-5.1 + failover_strategy: downgraded + architect-indexer: + capabilities: + - codebase_indexing + - project_mapping + - architecture_documentation + - dependency_analysis + 
- entity_extraction + - api_surface_discovery + - convention_detection + - staleness_detection + receives: + - project_root_directory + - stale_sections_list + produces: + - .architect/state.json + - .architect/project.json + - .architect/README.md + - architecture_overview + - dependency_graph + - entity_documentation + - db_schema_documentation + - api_surface_documentation + - convention_documentation + - file_graph + - module_graph + forbidden: + - code_changes + - implementation + model: ollama-cloud/glm-5.1 + variant: thinking + mode: subagent + delegates_to: + - system-analyst + - orchestrator + fallback_models: + - ollama-cloud/kimi-k2.6:cloud + - ollama-cloud/glm-5 + - openrouter/qwen/qwen3.6-plus:free + failover_strategy: downgraded + capability_routing: + code_writing: lead-developer + code_review: code-skeptic + test_writing: sdet-engineer + architecture: system-analyst + security: security-auditor + performance: performance-engineer + bug_fixing: the-fixer + git_operations: release-manager + ui_implementation: frontend-developer + nextjs_development: frontend-developer + vue_nuxt_development: frontend-developer + react_development: frontend-developer + e2e_testing: browser-automation + visual_testing: visual-tester + bbox_extraction: visual-tester + console_error_detection: visual-tester + gitea_integration: visual-tester + docker_networking: visual-tester + requirement_analysis: requirement-refiner + gap_analysis: capability-analyst + issue_management: product-owner + prompt_optimization: prompt-optimizer + workflow_design: workflow-architect + scoring: evaluator + duplicate_detection: history-miner + agent_design: agent-architect + markdown_validation: markdown-validator + postgresql_integration: backend-developer + sqlite_integration: backend-developer + clickhouse_integration: go-developer + flutter_development: flutter-developer + php_web_development: php-developer + laravel_development: php-developer + symfony_development: php-developer + 
wordpress_development: php-developer + python_web_development: python-developer + django_development: python-developer + fastapi_development: python-developer + docker_configuration: devops-engineer + kubernetes_setup: devops-engineer + ci_cd_pipeline: devops-engineer + task_decomposition: planner + self_reflection: reflector + memory_retrieval: memory-manager + chain_of_thought: planner + tree_of_thoughts: planner + fitness_scoring: pipeline-judge + test_execution: pipeline-judge + bottleneck_detection: pipeline-judge + go_api_development: go-developer + go_database_design: go-developer + go_concurrent_programming: go-developer + go_authentication: go-developer + go_microservices: go-developer + codebase_indexing: architect-indexer + project_mapping: architect-indexer + architecture_documentation: architect-indexer + dependency_analysis: architect-indexer + entity_extraction: architect-indexer + api_surface_discovery: architect-indexer + convention_detection: architect-indexer +parallel_groups: + review_phase: + - security-auditor + - performance-engineer + - code-skeptic + testing_phase: + - sdet-engineer + - browser-automation + - visual-tester +iteration_loops: + code_review: + evaluator: code-skeptic + optimizer: the-fixer + max_iterations: 3 + convergence: all_issues_resolved + security_review: + evaluator: security-auditor + optimizer: the-fixer + max_iterations: 2 + convergence: no_critical_vulnerabilities + performance_review: + evaluator: performance-engineer + optimizer: the-fixer + max_iterations: 2 + convergence: all_perf_issues_resolved + evolution: + evaluator: pipeline-judge + optimizer: prompt-optimizer + max_iterations: 3 + convergence: fitness_above_0.85 +quality_gates: + requirements: + - user_stories_defined + - acceptance_criteria_complete + - technical_constraints_documented + architecture: + - schema_valid + - endpoints_documented + - tech_stack_decided + implementation: + - build_success + - no_type_errors + - no_lint_errors + testing: + - 
coverage_gte_80 + - all_tests_pass + - no_critical_bugs + review: + - no_critical_issues + - no_security_vulnerabilities + - performance_acceptable + docker: + - build_success + - health_check_pass + - size_under_limit + documentation: + - readme_complete + - api_docs_complete + - deployment_guide_complete +workflow_states: + new: + - planned + planned: + - researching + researching: + - designed + designed: + - testing + testing: + - implementing + implementing: + - reviewing + reviewing: + - fixing + - perf_check + fixing: + - reviewing + perf_check: + - security_check + security_check: + - releasing + releasing: + - evaluated + evaluated: + - evolving + - completed + evolving: + - evaluated + completed: [] +evolution: + enabled: true + auto_trigger: true + fitness_threshold: 0.7 + max_evolution_attempts: 3 + fitness_history: .kilo/logs/fitness-history.jsonl + token_budget_default: 50000 + time_budget_default: 300 + budgets: + feature: + tokens: 50000 + time_s: 300 + min_coverage: 80 + bugfix: + tokens: 20000 + time_s: 120 + min_coverage: 90 + refactor: + tokens: 40000 + time_s: 240 + min_coverage: 95 + security: + tokens: 30000 + time_s: 180 + min_coverage: 80 diff --git a/.kilo/commands/evolution.md b/.kilo/commands/evolution.md index b66873e..a80baf5 100644 --- a/.kilo/commands/evolution.md +++ b/.kilo/commands/evolution.md @@ -24,6 +24,29 @@ Runs the automated evolution cycle on the most recent (or specified) workflow. 
## Execution +### Step 0: Model Research + +``` +Check if model benchmarks are stale (older than 7 days): + READ agent-evolution/data/model-benchmarks.json → metadata.generated + + IF metadata.generated is older than 7 days OR the file is missing: + Task(subagent_type: "capability-analyst") + → research latest model benchmarks, instruction-following (IF) scores, availability + → output to agent-evolution/data/model-research-latest.json + → validate against agent-evolution/data/model-research.schema.json + + Read agent-evolution/data/model-benchmarks.json + → load heatmap scores per agent + → load recommendations + → identify agents where current model != best-fit model (score gap > 5) +``` + +This step ensures the evolution cycle works with fresh model data. If benchmarks are stale, +the capability-analyst researches current model capabilities and pricing. + +The research output follows the schema: agent-evolution/data/model-research.schema.json + ### Step 1: Judge (Fitness Evaluation) ```bash @@ -65,7 +88,7 @@ ELSE: echo "📉 No improvement. Reverting." ``` -### Step 4: Log +### Step 4: Log + Dashboard Append to `.kilo/logs/fitness-history.jsonl`: @@ -82,6 +105,14 @@ Append to `.kilo/logs/fitness-history.jsonl`: } ``` +After logging, rebuild the research dashboard: + +```bash +bun run agent-evolution/scripts/build-research-dashboard.ts +``` + +This ensures the dashboard reflects any model changes that occurred during evolution. + ## Subcommands ### `log` — Log Model Change @@ -153,6 +184,24 @@ Shows: - Model upgrade recommendations - Priority order +### `research` — Research Model Updates + +```bash +/evolution research # research all models +/evolution research --agent planner # research models for specific agent +/evolution research --provider ollama-cloud # research specific provider +``` + +Steps: +1. Read current agents from `.kilo/capability-index.yaml` +2. Read existing benchmarks from `agent-evolution/data/model-benchmarks.json` +3. Fetch latest model info from provider APIs/docs +4. 
Score each model against each agent role (using IF-adjusted formula) +5. Generate recommendations where score improvement > 5 points +6. Output to `agent-evolution/data/model-research-latest.json` +7. Validate against `agent-evolution/data/model-research.schema.json` +8. If validation passes, update `agent-evolution/data/model-benchmarks.json` + ## Data Storage ### fitness-history.jsonl @@ -190,6 +239,28 @@ Shows: } ``` +### model-benchmarks.json + +Static benchmark data extracted from research. Contains: +- Model capabilities (SWE-bench, IF scores, context windows) +- Agent × Model compatibility heatmap scores +- Groq/OpenRouter free tier availability +- Current agent configuration snapshot +- Recommendations (applied + pending) +- Impact analysis data + +Path: `agent-evolution/data/model-benchmarks.json` +Schema: `agent-evolution/data/model-benchmarks.schema.json` +Refresh: on every `/evolution research` run, or automatically when the data is stale (>7 days) + +### model-research-latest.json + +Latest research output from `/evolution research` or Step 0. +Dynamic file — overwritten each research cycle. 
+ +Path: `agent-evolution/data/model-research-latest.json` +Schema: `agent-evolution/data/model-research.schema.json` + ## Integration Points - **After `/pipeline`**: Evaluator scores logged @@ -221,6 +292,10 @@ evolution: | Token Cost | pipeline logs | Resource efficiency | | Wall-Clock Time | pipeline logs | Speed | | Agent ROI | history analysis | Cost/benefit | +| Model IF Score | model-benchmarks.json | Prompt adherence per model | +| Model Fit Score | heatmap data | Agent-model compatibility | +| Model Availability | provider APIs | Rate limits, free tier status | +| Staleness | metadata.generated | How fresh is benchmark data | ## Example Session @@ -243,6 +318,63 @@ $ /evolution ✅ Logged to .kilo/logs/fitness-history.jsonl ``` +## Example: Model Research Session + +```bash +$ /evolution research + +## Model Research: All Agents + +**Benchmarks last updated**: 2026-04-20 (7 days ago — refreshing...) + +### Research Phase +→ Fetching Ollama Cloud model list... 20 models found +→ Fetching OpenRouter free tier... 3 models found +→ Fetching Groq free tier... 5 models found +→ Scoring 28 models × 36 agents... 
1008 scores computed + +### Top Recommendations (score gap > 5) + +| Agent | Current | Score | Recommended | Score | Δ | Impact | +|-------|---------|-------|-------------|-------|---|--------| +| planner | nemotron-3-super | 80 | deepseek-v4-pro-max | 88 | +8 | high | +| go-developer | qwen3-coder | 85 | deepseek-v4-pro-max | 88 | +3 | medium | +| [built-in] debug | glm-5.1 | 88 | kimi-k2.6:cloud | 90 | +2 | high | + +### Output +✅ agent-evolution/data/model-research-latest.json (28 models, 11 recommendations) +✅ agent-evolution/data/model-benchmarks.json refreshed (36 agents) + +### Next Steps +Run `/evolution` to apply recommendations and re-test +Or `/evolution --dry-run` to preview changes +``` + +### Dashboard Rebuild + +After model research or applying recommendations, rebuild the dashboard: + +```bash +bun run agent-evolution/scripts/build-research-dashboard.ts +``` + +Output: +- `agent-evolution/research-dashboard.html` — latest interactive dashboard +- `agent-evolution/dist/research-dashboard-YYYY_MM_DD.html` — dated archive + +The dashboard reads from `agent-evolution/data/model-benchmarks.json` and renders: +- Current agent-model configuration table +- Model comparison cards with SWE-bench and IF scores +- Agent × Model heatmap with IF adjustment +- Selectable recommendations with JSON export +- Before/after impact analysis + +Watch mode for continuous rebuild during research: +```bash +bun run agent-evolution/scripts/build-research-dashboard.ts --watch +``` +With `--watch`, the build re-runs automatically whenever `model-benchmarks.json` or the template changes. 
+ --- *Evolution workflow v2.0 - Objective fitness scoring with pipeline-judge* \ No newline at end of file diff --git a/.kilo/commands/research.md b/.kilo/commands/research.md index 2c92158..2fc3ece 100644 --- a/.kilo/commands/research.md +++ b/.kilo/commands/research.md @@ -22,6 +22,9 @@ Runs continuous research and self-improvement cycle based on the latest findings ``` /research [topic] [--auto] +/research models # research latest AI models for agent optimization +/research models --agent planner # research models for specific agent role +/research models --provider ollama-cloud # filter by provider ``` ## Parameters @@ -35,6 +38,28 @@ Runs continuous research and self-improvement cycle based on the latest findings Check `.kilo/logs/efficiency_score.json` for low-performing agents. +### Step 1.5: Model Research (when topic is "models" or agent scores are low) + +``` +IF topic === "models" OR any agent score < 7: + 1. Read agent-evolution/data/model-benchmarks.json + → Check metadata.generated staleness + 2. Fetch latest model data from providers: + - Ollama Cloud: https://ollama.com/models (via webfetch) + - OpenRouter: https://openrouter.ai/models (via webfetch) + - Groq: https://console.groq.com/docs/models (via webfetch) + 3. For each model, compute: + - IF score (from IFEval/IFBench benchmarks) + - Role fitness (SWE-bench for coding, GPQA for reasoning, etc.) + - Context window and cost + 4. Build heatmap: score each model against each agent + Formula: role_fitness * (0.7 + 0.3 * IF/100) + 5. Generate recommendations for agents where best-scored model ≠ current + 6. Output to agent-evolution/data/model-research-latest.json + 7. Validate against agent-evolution/data/model-research.schema.json + 8. Update model-benchmarks.json with fresh data +``` + ### Step 2: Gap Identification Analyze capability-index.yaml for missing capabilities. 
@@ -46,6 +71,15 @@ Fetch latest research from: - OpenAI: https://platform.openai.com/docs/guides/agents - Lilian Weng: https://lilianweng.github.io +### Model Research Sources +- Ollama Model Library (https://ollama.com/models) +- OpenRouter Models (https://openrouter.ai/models) +- Groq Console (https://console.groq.com/docs/models) +- SWE-Bench Leaderboard (https://www.swebench.com) +- Terminal-Bench (https://marc0.dev/terminal-bench) +- LMSYS Chatbot Arena (https://chat.lmsys.org) +- Artificial Analysis (https://artificialanalysis.ai) + ### Step 4: Implementation Create new agents, skills, or rules based on findings. @@ -81,3 +115,53 @@ Post findings to Gitea Issue #25 (Research Milestone). - Issue: #25 - Commit: abc1234 ``` + +### Model Research Example + +``` +/research models + +# Output: +## Research: model optimization + +### Models Analyzed +- Ollama Cloud: 20 models +- OpenRouter Free: 3 models +- Groq Free: 5 models + +### Key Findings +- DeepSeek V4-Pro Max now available (SWE-V 80.6, IF:88) +- Kimi K2.6 IF score confirmed: 91 (best for orchestration) +- Nemotron 3 Super IF:78 — weak for prompt-heavy roles +- Qwen 3.6 Plus FREE remains best IF/cost ratio (91, $0) + +### Recommendations Generated +- 11 model swap recommendations +- 4 high impact, 3 medium, 4 low +- Average expected improvement: +12 points + +### Files Updated +- agent-evolution/data/model-research-latest.json +- agent-evolution/data/model-benchmarks.json (refreshed) + +### Evolution Tracked +- Issue: #25 +- Next: /evolution to apply recommendations +``` + +## Model Research Output Format + +All model research output follows the schema: +`agent-evolution/data/model-research.schema.json` + +Key fields: +- `models[]` — model capabilities, benchmarks, IF scores +- `recommendations[]` — agent-specific model swap suggestions +- `heatmap` — agent × model compatibility matrix +- `capability_index_patch[]` — ready-to-apply YAML patches +- `summary` — aggregate improvement metrics + +This format is 
consumed by: +- `/evolution` command for auto-apply +- `agent-evolution/scripts/sync-model-research.ts` for propagation +- Evolution dashboard for visualization diff --git a/.kilo/rules/evolutionary-sync.md b/.kilo/rules/evolutionary-sync.md index 342e1ce..ac36e0a 100644 --- a/.kilo/rules/evolutionary-sync.md +++ b/.kilo/rules/evolutionary-sync.md @@ -24,6 +24,9 @@ When agents change, update ALL of these files: | `.kilo/KILO_SPEC.md` | Pipeline Agents table, Workflow Commands table | | `AGENTS.md` | Pipeline Agents tables by category | | `.kilo/agents/orchestrator.md` | Task Tool Invocation table | +| `agent-evolution/data/model-benchmarks.json` | Model fitness scores, heatmap, recommendations | +| `agent-evolution/data/model-research-latest.json` | Latest research output (overwritten each cycle) | +| `agent-evolution/data/agent-versions.json` | Agent model version history | ## Sync Process (REQUIRED ORDER) @@ -53,6 +56,13 @@ After running `--fix`, you MUST verify: □ `.kilo/capability-index.yaml` — model fields updated □ No old models leaked (grep for previous model IDs) □ `ollama-cloud/kimi-k2.6` → always `:cloud` suffix +□ model-benchmarks.json — metadata.generated updated +□ model-research-latest.json — validates against schema +□ agent-versions.json — history entries added for all model changes +□ sync-model-research.ts — dry-run matches expected changes +□ Groq rate limits current (check console.groq.com/docs/models) +□ OpenRouter free tier models current (check openrouter.ai/models) +□ No regressions in IF scores (IF should not decrease from previous) ``` ## Findings from Evolution Round 2026-04-27 @@ -140,6 +150,46 @@ for a in meta['agents']: node scripts/sync-agents.js --fix ``` +## Model Research Sync + +When `/evolution research` or `/research models` produces new benchmark data: + +### Sync Process + +``` +1. /research models OR /evolution Step 0 + → Produces: agent-evolution/data/model-research-latest.json + +2. 
Validate against schema: + node -e "const Ajv=require('ajv'); const ajv=new Ajv(); const schema=JSON.parse(require('fs').readFileSync('agent-evolution/data/model-research.schema.json','utf8')); const data=JSON.parse(require('fs').readFileSync('agent-evolution/data/model-research-latest.json','utf8')); const valid=ajv.validate(schema,data); console.log(valid?'VALID':'INVALID'); if(!valid) console.log(JSON.stringify(ajv.errors,null,2))" + +3. Apply recommendations: + bun run agent-evolution/scripts/sync-model-research.ts + +4. Or dry-run first: + bun run agent-evolution/scripts/sync-model-research.ts --dry-run + +5. After applying, the script automatically: + - Updates capability-index.yaml + - Updates agent-versions.json + - Updates kilo-meta.json + - Updates kilo.jsonc (with regex — manual verify still needed) + - Runs sync-agents.js --fix + - Runs sync-agents.js --check +``` + +### Data Freshness Check + +```bash +# Check if benchmarks are stale (>7 days) +node -e " +const data = JSON.parse(require('fs').readFileSync('agent-evolution/data/model-benchmarks.json','utf8')); +const gen = new Date(data.generated); +const daysOld = (Date.now() - gen.getTime()) / (1000*60*60*24); +console.log(daysOld > 7 ? 'STALE' : 'FRESH', '(' + Math.round(daysOld) + ' days old)'); +" +``` + ## Model Changes When changing a model: diff --git a/.kilo/shared/self-evolution.md b/.kilo/shared/self-evolution.md index a1c5ed0..c58e80a 100644 --- a/.kilo/shared/self-evolution.md +++ b/.kilo/shared/self-evolution.md @@ -8,6 +8,8 @@ When task requirements exceed existing agent capabilities. 2. Required domain knowledge not in any skill 3. Complex multi-step task needs new workflow pattern 4. `@capability-analyst` reports critical gap +5. `/evolution` reports fitness < 0.70 and model research finds better model +6. Model benchmarks stale (>7 days) and research discovers new model ## Evolution Flow @@ -41,6 +43,72 @@ When task requirements exceed existing agent capabilities. 
[New Capability Available] ``` +## Model Evolution Flow + +When an agent's current model is suboptimal (score gap > 5 points in heatmap): + +``` +[Evolution Fitness < 0.85] + ↓ +1. Read model-benchmarks.json → load heatmap, recommendations + ↓ +2. IF stale (>7 days) → @capability-analyst researches models + → Output: agent-evolution/data/model-research-latest.json + → Validates against: agent-evolution/data/model-research.schema.json + ↓ +3. Identify agents where best_model ≠ current_model (gap > 5) + ↓ +4. Generate recommendations (action: update_model) + ↓ +5. Dry-run → /evolution --dry-run → Show what would change + ↓ +6. Apply → bun run agent-evolution/scripts/sync-model-research.ts + → Updates: capability-index.yaml, agent-versions.json, kilo-meta.json, kilo.jsonc + → Triggers: sync-agents.js --fix → propagates to .md files + → Validates: sync-agents.js --check + ↓ +7. Re-test → @pipeline-judge → new fitness score + ↓ +8. IF fitness improved → commit changes + IF fitness regressed → revert via agent-versions.json history + ↓ +9. 
Log to Gitea + fitness-history.jsonl + ↓ +[Models Optimized] +``` + +## Model Research Data Flow + +``` +[model-benchmarks.json] ← Static benchmark data (refreshed weekly) + ↓ read +[/evolution Step 0] ← Checks staleness, triggers research if needed +[/research models] ← Explicit research trigger + ↓ produces +[model-research-latest.json] ← Dynamic research output + ↓ consumed by +[sync-model-research.ts] ← Applies recommendations + ↓ updates +[capability-index.yaml] ← Model assignments +[agent-versions.json] ← History tracking +[kilo-meta.json] ← Source of truth +[kilo.jsonc] ← Agent config (manual verify) +[.kilo/agents/*.md] ← Frontmatter (via sync script) + ↓ verified by +[sync-agents.js --check] ← Consistency validation +``` + +### Key Files + +| File | Purpose | Updated By | +|------|---------|------------| +| `agent-evolution/data/model-benchmarks.json` | Static benchmark data | `/research models`, `/evolution research` | +| `agent-evolution/data/model-research-latest.json` | Latest research output | `/research models`, `/evolution Step 0` | +| `agent-evolution/data/model-research.schema.json` | Validation schema | Manual (schema changes are rare) | +| `agent-evolution/data/model-benchmarks.schema.json` | Benchmarks data schema | Manual | +| `agent-evolution/data/agent-versions.json` | Version history | `sync-model-research.ts` | +| `agent-evolution/scripts/sync-model-research.ts` | Application script | Manual execution | + ## Self-Modification Rules 1. ONLY modify own permission whitelist @@ -49,6 +117,10 @@ When task requirements exceed existing agent capabilities. 4. ALWAYS verify access after changes 5. ALWAYS log results to `.kilo/EVOLUTION_LOG.md` 6. NEVER skip verification step +7. ALWAYS validate research output against schema before applying +8. NEVER apply model changes without dry-run preview first +9. ALWAYS run sync-agents.js --check after model changes +10. 
ALWAYS revert if fitness regresses after model change ## Evolution Triggers @@ -65,6 +137,11 @@ When task requirements exceed existing agent capabilities. 4. Update `.kilo/KILO_SPEC.md` (document) 5. Update `AGENTS.md` (reference) 6. Append to `.kilo/EVOLUTION_LOG.md` (log entry) +7. Update `agent-evolution/data/model-benchmarks.json` (if model data changed) +8. Update `agent-evolution/data/agent-versions.json` (add history entry) +9. Update `kilo-meta.json` (source of truth for sync) +10. Run `node scripts/sync-agents.js --fix` (propagate to all files) +11. Run `node scripts/sync-agents.js --check` (verify consistency) ## Verification Checklist @@ -77,3 +154,12 @@ After each evolution: - [ ] AGENTS.md updated with new agent - [ ] EVOLUTION_LOG.md updated with entry - [ ] Gitea milestone closed with results +- [ ] model-research-latest.json validates against schema +- [ ] sync-model-research.ts dry-run shows correct changes +- [ ] capability-index.yaml model field updated for affected agents +- [ ] agent-versions.json history entry added with rationale +- [ ] kilo-meta.json matches new model assignments +- [ ] kilo.jsonc manually verified (sync script does not guarantee this) +- [ ] sync-agents.js --check passes +- [ ] No stale models leaked (grep for previous model IDs) +- [ ] Cloud model suffix correct (kimi-k2.6:cloud, not kimi-k2.6) diff --git a/agent-evolution/README.md b/agent-evolution/README.md index 08d7e10..d8b94e4 100644 --- a/agent-evolution/README.md +++ b/agent-evolution/README.md @@ -117,6 +117,9 @@ bun run evolution:run # Запустить контейнер bun run evolution:stop # Остановить bun run evolution:dev # Docker Compose bun run evolution:logs # Логи +bun run research:dashboard # Build research dashboard +bun run research:watch # Watch mode for dashboard +bun run research:sync # Sync model research to agents ``` ## Структура @@ -132,6 +135,50 @@ agent-evolution/ └── README.md # Этот файл ``` +## Research Dashboard (Model Benchmarks) + +### Generate from 
live data + +```bash +# Build research dashboard from model-benchmarks.json +bun run agent-evolution/scripts/build-research-dashboard.ts + +# Watch mode — auto-rebuild on data changes +bun run agent-evolution/scripts/build-research-dashboard.ts --watch + +# Open in browser +start agent-evolution/research-dashboard.html +``` + +### Output files + +| File | Description | +|------|-------------| +| `research-dashboard.html` | Latest interactive dashboard (all 6 tabs) | +| `dist/research-dashboard-YYYY_MM_DD.html` | Dated archive | +| `research-dashboard.template.html` | Template for generation | + +### Dashboard tabs + +1. **Обзор** — stat cards, current config table, agent count, model count +2. **Groq** — free tier models with RPM/RPD/TPM/TPD limits, speed indicators +3. **Модели** — filterable cards with SWE-bench, IF scores, context windows, tags +4. **Матрица** — Agent×Model heatmap with IF adjustment, tooltips, color coding +5. **Рекомендации** — selectable cards with JSON export, impact analysis +6. **Анализ профита** — before/after comparison, canvas charts, closed-source comparison + +### Source data + +The dashboard reads from `agent-evolution/data/model-benchmarks.json`: +- 15 models with benchmarks (SWE-bench, IF scores) +- 36 agent configurations +- 33 agent×model score matrices +- 11 recommendations +- 5 Groq models with rate limits +- Closed-source comparison data + +Refresh: run `/research models` or `/evolution research` to update + ## Быстрый старт ```bash @@ -231,6 +278,22 @@ git log --all --oneline -- ".kilo/agents/" **Files**: src/auth.ts, src/user.ts ``` +### 6. Model Benchmarks (agent-evolution/data/model-benchmarks.json) + +Research data extracted from `apaw_agent_model_research_v3.html`: +- Static benchmark scores (SWE-bench, IF scores, context windows) +- Heatmap compatibility matrix +- Provider rate limits +- Recommendation history + +### 7. 
Model Research Output (agent-evolution/data/model-research-latest.json) + +Dynamic research results: +- Fresh model data from provider APIs +- IF-adjusted agent×model scores +- Pending recommendations with impact levels +- Ready-to-apply YAML patches + ## JSON Schema Формат `agent-versions.json`: @@ -271,6 +334,76 @@ git log --all --oneline -- ".kilo/agents/" } ``` +## Model Research Data + +### model-benchmarks.json + +Comprehensive benchmark data from the HTML research file: + +```json +{ + "version": "1.0.0", + "generated": "2026-04-27T17:44:44Z", + "total_agents": 36, + "total_models_tracked": 11, + "models": [ + { + "id": "ollama-cloud/qwen3-coder:480b", + "name": "Qwen3-Coder 480B", + "organization": "Qwen", + "swe_bench": 66.5, + "if_score": 88, + "context_window": "256K→1M", + "categories": ["coding", "agent"], + "provider": "ollama" + } + ], + "agent_current_config": [ + { "agent": "lead-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 92, "status": "optimal" } + ], + "recommendations": [ + { + "agent": "planner", + "current_model": "nemotron-3-super", + "recommended_model": "deepseek-v4-pro-max", + "impact": "high", + "expected_improvement": { "quality": "+10%", "speed": "~1x", "context_window": "1M" } + } + ] +} +``` + +### model-research-latest.json + +Latest research output (overwritten each cycle): +- Generated by `/research models` or `/evolution Step 0` +- Validated against `model-research.schema.json` +- Consumed by `sync-model-research.ts` + +### sync-model-research.ts + +Applies model recommendations to configuration: + +```bash +# Dry-run first +bun run agent-evolution/scripts/sync-model-research.ts --dry-run + +# Apply all pending recommendations +bun run agent-evolution/scripts/sync-model-research.ts + +# Apply for single agent +bun run agent-evolution/scripts/sync-model-research.ts --agent planner +``` + +Updates: +1. `.kilo/capability-index.yaml` — model assignments +2. `kilo-meta.json` — source of truth +3. 
`kilo.jsonc` — agent config +4. `agent-evolution/data/agent-versions.json` — history tracking +5. `.kilo/agents/*.md` frontmatter (via sync-agents.js --fix) + +After applying, rebuilds dashboard automatically. + ## Интеграция ### В Pipeline @@ -406,4 +539,50 @@ cp agent-evolution/data/backup/agent-versions-20260405.json agent-evolution/data 4. **Integration**: - Slack/Telegram уведомления - Автоматическое применение рекомендаций - - A/B testing моделей \ No newline at end of file + - A/B testing моделей + +## Bidirectional Data Flow + +``` +[/research models] OR [/evolution Step 0] + ↓ +[agent-evolution/data/model-research-latest.json] + ↓ +[bun run sync-model-research.ts] + ↓ +[.kilo/capability-index.yaml] → updated model assignments +[kilo-meta.json] → updated source of truth +[kilo.jsonc] → updated config +[agent-versions.json] → history entries +[.kilo/agents/*.md] → frontmatter updated + ↓ +[sync-agents.js --fix] → propagate to all files + ↓ +[bun run build-research-dashboard.ts] + ↓ +[research-dashboard.html] → live dashboard +[dist/research-dashboard-YYYY_MM_DD.html] → dated archive + ↓ +[/research models] ← loop continues +``` + +### Data staleness check + +```bash +# Check if benchmarks need refresh +node -e " +const d = require('./agent-evolution/data/model-benchmarks.json'); +const days = (Date.now() - new Date(d.generated)) / (1000*60*60*24); +console.log(days > 7 ? 
'STALE: needs refresh' : 'FRESH', Math.round(days), 'days old'); +" +``` + +### Auto-refresh pipeline + +```yaml +# In capability-index.yaml +evolution: + auto_trigger: true + max_evolution_attempts: 3 + dashboard_rebuild: true # new: auto-rebuild on model changes +``` \ No newline at end of file diff --git a/agent-evolution/data/agent-versions.json b/agent-evolution/data/agent-versions.json index 6ced42e..a3e7889 100644 --- a/agent-evolution/data/agent-versions.json +++ b/agent-evolution/data/agent-versions.json @@ -1,12 +1,12 @@ { "version": "1.0.0", - "lastUpdated": "2026-04-23T06:24:32.543Z", + "lastUpdated": "2026-04-27T20:28:58.592Z", "agents": { "lead-developer": { "current": { "description": "Primary code writer for backend and core logic. Writes implementation to pass tests", "mode": "subagent", - "model": "ollama-cloud/qwen3-coder:480b", + "model": "ollama-cloud/nemotron-3-super", "provider": "Ollama", "variant": "thinking", "color": "\"#DC2626\"", @@ -27,6 +27,24 @@ "to": "ollama-cloud/qwen3-coder:480b", "reason": "Initial configuration from capability-index.yaml", "source": "git" + }, + { + "date": "2026-04-27T16:56:09.013Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.", + "source": "research" + }, + { + "date": "2026-04-27T20:28:58.592Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. 
SWE-bench 68% vs Qwen's 66.5%.", + "source": "research" } ], "performance_log": [] @@ -255,7 +273,7 @@ "current": { "description": "Designs technical specifications, data schemas, and API contracts before implementation", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/nemotron-3-super", "provider": "Ollama", "variant": "thinking", "color": "\"#0891B2\"", @@ -285,6 +303,15 @@ "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-04-27T16:59:52.825Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Test recommendation for model research sync script", + "source": "research" } ], "performance_log": [] diff --git a/agent-evolution/data/model-benchmarks.json b/agent-evolution/data/model-benchmarks.json new file mode 100644 index 0000000..ec848fb --- /dev/null +++ b/agent-evolution/data/model-benchmarks.json @@ -0,0 +1,1774 @@ +{ + "version": "1.0.0", + "generated": "2026-04-29T19:56:51.418Z", + "source": ".kilo/capability-index.yaml (synced v2)", + "total_agents": 32, + "total_models_tracked": 11, + "providers": [ + "ollama", + "ollama-cloud", + "openrouter", + "groq" + ], + "models": [ + { + "id": "qwen3-coder-480b", + "name": "Qwen3-Coder 480B", + "organization": "Qwen", + "parameters": "480B/35B active", + "context_window": "256K→1M", + "swe_bench": 66.5, + "if_score": 88, + "categories": [ + "coding", + "agent" + ], + "description": "SOTA open-source кодинг. Сравним с Claude Sonnet 4.", + "tags": [ + "coding", + "agent", + "tools" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "minimax-m2.5", + "name": "MiniMax M2.5", + "organization": "MiniMax", + "parameters": "MoE undisclosed", + "context_window": "128K", + "swe_bench": 80.2, + "if_score": 82, + "categories": [ + "coding", + "agent" + ], + "description": "Лидер SWE-bench 80.2%. 
Полный lifecycle разработки.", + "tags": [ + "coding", + "agent" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "minimax-m2.7", + "name": "MiniMax M2.7", + "organization": "MiniMax", + "parameters": "~10B active", + "context_window": "128K", + "swe_bench": 78, + "if_score": 80, + "categories": [ + "coding", + "agent", + "efficient" + ], + "description": "Самообучаемая. 56.2% SWE-Pro. 100 TPS. $0.30/M.", + "tags": [ + "coding", + "agent", + "self-evolving" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "deepseek-v4-pro-max", + "name": "DeepSeek V4-Pro", + "organization": "DeepSeek", + "parameters": "1.6T/49B active MoE", + "context_window": "1M", + "swe_bench": 80.6, + "if_score": 89, + "categories": [ + "coding", + "agent", + "reasoning" + ], + "description": "SWE-V 80.6, LiveCodeBench 93.5(#1!), Terminal-Bench 67.9, Codeforces 3206, 1M ctx, 27% FLOPs vs V3.2. MIT.", + "tags": [ + "coding", + "agent", + "thinking", + "tools" + ], + "openrouter": false, + "provider": "ollama-cloud" + }, + { + "id": "deepseek-v4-flash", + "name": "DeepSeek V4-Flash", + "organization": "DeepSeek", + "parameters": "284B/13B active MoE", + "context_window": "1M", + "swe_bench": 79, + "if_score": 86, + "categories": [ + "coding", + "efficient", + "agent" + ], + "description": "SWE-V ~79%, Flash Max = Pro уровень reasoning. 13B active = ультрабыстрый. 1M ctx. FP4+FP8. MIT.", + "tags": [ + "coding", + "efficient", + "agent", + "thinking" + ], + "openrouter": false, + "provider": "ollama-cloud" + }, + { + "id": "kimi-k2-6", + "name": "Kimi K2.6", + "organization": "Moonshot AI", + "parameters": "1T/32B active MoE", + "context_window": "256K", + "swe_bench": 80.2, + "if_score": 91, + "categories": [ + "coding", + "agent", + "multimodal" + ], + "description": "SWE-Pro 58.6(#1!), SWE-V 80.2, Terminal-Bench 66.7, HLE 54.0(#1!), BrowseComp 83.2. 13h autonomous. 300 sub-agent swarm. 
Modified MIT.", + "tags": [ + "coding", + "agent", + "swarm", + "vision", + "thinking", + "tools" + ], + "openrouter": false, + "provider": "ollama-cloud" + }, + { + "id": "nemotron-3-super", + "name": "Nemotron 3 Super", + "organization": "NVIDIA", + "parameters": "120B/12B active", + "context_window": "1M", + "swe_bench": 60.5, + "if_score": 78, + "categories": [ + "agent", + "reasoning", + "efficient" + ], + "description": "SWE-bench 60.5%. RULER@1M 91.75%! Но IF ниже — Mamba-layers иногда «теряют» инструкции в длинных промптах.", + "tags": [ + "agent", + "1M-ctx", + "thinking" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "glm-5.1", + "name": "GLM-5", + "organization": "Z.ai", + "parameters": "744B/40B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 90, + "categories": [ + "reasoning", + "agent" + ], + "description": "Мощный reasoning. Arena ELO 1451. Отличный instruction following (IFEval ~90+).", + "tags": [ + "reasoning", + "agent" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "deepseek-v4", + "name": "DeepSeek V4-Pro", + "organization": "DeepSeek", + "parameters": "Large MoE", + "context_window": "128K", + "swe_bench": null, + "if_score": 75, + "categories": [ + "reasoning" + ], + "description": "Хороший reasoning, но IF нестабилен — иногда игнорирует формат вывода.", + "tags": [ + "reasoning" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "qwen3-5-122b", + "name": "Qwen 3.5 122B", + "organization": "Qwen", + "parameters": "122B/10B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 92, + "categories": [ + "reasoning", + "efficient" + ], + "description": "IFEval 92.6%! Лучший IF среди open-source. Multimodal. 
Thinking.", + "tags": [ + "vision", + "thinking", + "tools" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "qwen3-coder-next", + "name": "Qwen3-Coder-Next", + "organization": "Qwen", + "parameters": "80B/3B active", + "context_window": "128K", + "swe_bench": 70, + "if_score": 84, + "categories": [ + "coding", + "efficient" + ], + "description": "70% SWE-bench с 3B active! Хороший IF для кодинга.", + "tags": [ + "coding", + "efficient", + "tools" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "cogito-2-1-671b", + "name": "Cogito 2.1 671B", + "organization": "Cognitive", + "parameters": "671B MoE", + "context_window": "128K", + "swe_bench": null, + "if_score": 76, + "categories": [ + "reasoning" + ], + "description": "MIT лицензия. 671B total. IF неплохой, но уступает GLM/Qwen.", + "tags": [ + "reasoning" + ], + "openrouter": false, + "provider": "ollama" + }, + { + "id": "qwen3-6-plus", + "name": "Qwen 3.6 Plus", + "organization": "Qwen", + "parameters": "Hybrid MoE", + "context_window": "1M", + "swe_bench": 78.8, + "if_score": 91, + "categories": [ + "coding", + "agent", + "reasoning" + ], + "description": "FREE на OpenRouter! 1M контекст. Always-on CoT. Превосходный IF — наследник Qwen 3.5 (92.6%).", + "tags": [ + "coding", + "agent", + "1M-ctx", + "free" + ], + "openrouter": true, + "provider": "openrouter" + }, + { + "id": "step-3-5-flash", + "name": "Step 3.5 Flash", + "organization": "StepFun", + "parameters": "MoE", + "context_window": "128K", + "swe_bench": null, + "if_score": 79, + "categories": [ + "efficient" + ], + "description": "Бесплатна на OpenRouter. 
IF средний.", + "tags": [ + "efficient", + "free" + ], + "openrouter": true, + "provider": "openrouter" + }, + { + "id": "deepseek-r1", + "name": "DeepSeek R1", + "organization": "DeepSeek", + "parameters": "671B MoE", + "context_window": "128K", + "swe_bench": null, + "if_score": 73, + "categories": [ + "reasoning" + ], + "description": "Мощные reasoning-цепочки. Но IF слабый — часто генерирует лишний reasoning вместо ответа.", + "tags": [ + "reasoning", + "thinking", + "free" + ], + "openrouter": true, + "provider": "openrouter" + } + ], + "groq_models": [ + { + "id": "openai/gpt-oss-20b", + "rpm": 30, + "rpd": "1K", + "tpm": "8K", + "tpd": "200K", + "speed": "1200+", + "use_case": "Ультра-быстрый fallback для лёгких ролей (markdown-validator)." + }, + { + "id": "llama-3.1-8b-instant", + "rpm": 30, + "rpd": "14.4K", + "tpm": "6K", + "tpd": "500K", + "speed": "~800", + "use_case": "14.4K RPD! Самый высокий лимит. Для health-check / ping ролей." + }, + { + "id": "groq/compound", + "rpm": 30, + "rpd": "250", + "tpm": "70K", + "tpd": "—", + "speed": "varies", + "use_case": "Мультимодельная агрегация. Для research-задач." + }, + { + "id": "groq/compound-mini", + "rpm": 30, + "rpd": "250", + "tpm": "70K", + "tpd": "—", + "speed": "varies", + "use_case": "Лёгкая версия compound." + }, + { + "id": "llama-prompt-guard-2", + "rpm": 30, + "rpd": "14.4K", + "tpm": "15K", + "tpd": "500K", + "speed": "~1K", + "use_case": "Security: входной фильтр для security-auditor (14.4K RPD!)." 
+ } + ], + "agent_model_scores": [ + { + "agent": "lead-developer", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 92, + "minimax-m2.5": 86, + "minimax-m2.7": 82, + "nemotron-3-super": 70, + "glm-5.1": 68, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 66, + "qwen3-coder-next": 80, + "qwen3-6-plus": 88, + "kimi-k2-6": 90 + } + }, + { + "agent": "frontend-developer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 86, + "minimax-m2.5": 92, + "minimax-m2.7": 88, + "nemotron-3-super": 62, + "glm-5.1": 56, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 60, + "qwen3-coder-next": 76, + "qwen3-6-plus": 88, + "kimi-k2-6": 86 + } + }, + { + "agent": "php-developer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 87, + "minimax-m2.5": 76, + "minimax-m2.7": 72, + "nemotron-3-super": 64, + "glm-5.1": 56, + "deepseek-v4-pro-max": 74, + "qwen3-5-122b": 60, + "qwen3-coder-next": 76, + "qwen3-6-plus": 84, + "kimi-k2-6": 86 + } + }, + { + "agent": "python-developer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 90, + "minimax-m2.5": 82, + "minimax-m2.7": 78, + "nemotron-3-super": 66, + "glm-5.1": 60, + "deepseek-v4-pro-max": 78, + "qwen3-5-122b": 64, + "qwen3-coder-next": 78, + "qwen3-6-plus": 88, + "kimi-k2-6": 88 + } + }, + { + "agent": "backend-developer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 91, + "minimax-m2.5": 84, + "minimax-m2.7": 80, + "nemotron-3-super": 68, + "glm-5.1": 63, + "deepseek-v4-pro-max": 86, + "qwen3-5-122b": 62, + "qwen3-coder-next": 78, + "qwen3-6-plus": 87, + "kimi-k2-6": 90 + } + }, + { + "agent": "go-developer", + 
"current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 85, + "minimax-m2.5": 78, + "minimax-m2.7": 74, + "nemotron-3-super": 66, + "glm-5.1": 58, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 58, + "qwen3-coder-next": 74, + "qwen3-6-plus": 82, + "kimi-k2-6": 86 + } + }, + { + "agent": "flutter-developer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 86, + "minimax-m2.5": 70, + "minimax-m2.7": 66, + "nemotron-3-super": 60, + "glm-5.1": 53, + "deepseek-v4-pro-max": 78, + "qwen3-5-122b": 58, + "qwen3-coder-next": 74, + "qwen3-6-plus": 82, + "kimi-k2-6": 84 + } + }, + { + "agent": "devops-engineer", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 66, + "minimax-m2.5": 53, + "minimax-m2.7": 48, + "nemotron-3-super": 78, + "glm-5.1": 75, + "deepseek-v4-pro-max": 86, + "qwen3-5-122b": 70, + "qwen3-coder-next": 54, + "qwen3-6-plus": 76, + "kimi-k2-6": 88 + } + }, + { + "agent": "sdet-engineer", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 88, + "minimax-m2.5": 84, + "minimax-m2.7": 80, + "nemotron-3-super": 70, + "glm-5.1": 63, + "deepseek-v4-pro-max": 84, + "qwen3-5-122b": 64, + "qwen3-coder-next": 78, + "qwen3-6-plus": 84, + "kimi-k2-6": 87 + } + }, + { + "agent": "code-skeptic", + "current_model_index": 1, + "current_model_id": "minimax-m2.5", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 82, + "minimax-m2.5": 85, + "minimax-m2.7": 80, + "nemotron-3-super": 73, + "glm-5.1": 72, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 70, + "qwen3-coder-next": 72, + "qwen3-6-plus": 80, + "kimi-k2-6": 82 + } + }, + { + "agent": "security-auditor", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + 
"reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 76, + "minimax-m2.5": 74, + "minimax-m2.7": 68, + "nemotron-3-super": 76, + "glm-5.1": 68, + "deepseek-v4-pro-max": 80, + "qwen3-5-122b": 72, + "qwen3-coder-next": 64, + "qwen3-6-plus": 75, + "kimi-k2-6": 80 + } + }, + { + "agent": "performance-engineer", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 78, + "minimax-m2.5": 75, + "minimax-m2.7": 70, + "nemotron-3-super": 78, + "glm-5.1": 74, + "deepseek-v4-pro-max": 84, + "qwen3-5-122b": 70, + "qwen3-coder-next": 67, + "qwen3-6-plus": 76, + "kimi-k2-6": 82 + } + }, + { + "agent": "the-fixer", + "current_model_index": 1, + "current_model_id": "minimax-m2.5", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 89, + "minimax-m2.5": 88, + "minimax-m2.7": 84, + "nemotron-3-super": 71, + "glm-5.1": 64, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 64, + "qwen3-coder-next": 82, + "qwen3-6-plus": 86, + "kimi-k2-6": 90 + } + }, + { + "agent": "browser-automation", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 87, + "minimax-m2.5": 72, + "minimax-m2.7": 68, + "nemotron-3-super": 61, + "glm-5.1": 53, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 56, + "qwen3-coder-next": 72, + "qwen3-6-plus": 82, + "kimi-k2-6": 86 + } + }, + { + "agent": "visual-tester", + "current_model_index": -1, + "current_model_id": "qwen3-coder:480b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 82, + "minimax-m2.5": 68, + "minimax-m2.7": 64, + "nemotron-3-super": 55, + "glm-5.1": 48, + "deepseek-v4-pro-max": 76, + "qwen3-5-122b": 54, + "qwen3-coder-next": 66, + "qwen3-6-plus": 76, + "kimi-k2-6": 78 + } + }, + { + "agent": "system-analyst", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 70, + "minimax-m2.5": 
66, + "minimax-m2.7": 63, + "nemotron-3-super": 74, + "glm-5.1": 82, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 76, + "qwen3-coder-next": 58, + "qwen3-6-plus": 80, + "kimi-k2-6": 86 + } + }, + { + "agent": "requirement-refiner", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 66, + "minimax-m2.5": 62, + "minimax-m2.7": 60, + "nemotron-3-super": 72, + "glm-5.1": 80, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 74, + "qwen3-coder-next": 54, + "qwen3-6-plus": 78, + "kimi-k2-6": 82 + } + }, + { + "agent": "history-miner", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 68, + "minimax-m2.5": 60, + "minimax-m2.7": 56, + "nemotron-3-super": 85, + "glm-5.1": 78, + "deepseek-v4-pro-max": 86, + "qwen3-5-122b": 72, + "qwen3-coder-next": 56, + "qwen3-6-plus": 84, + "kimi-k2-6": 82 + } + }, + { + "agent": "capability-analyst", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 72, + "minimax-m2.5": 68, + "minimax-m2.7": 66, + "nemotron-3-super": 76, + "glm-5.1": 78, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 75, + "qwen3-coder-next": 60, + "qwen3-6-plus": 79, + "kimi-k2-6": 82 + } + }, + { + "agent": "orchestrator", + "current_model_index": -1, + "current_model_id": "kimi-k2.6:cloud", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 74, + "minimax-m2.5": 70, + "minimax-m2.7": 68, + "nemotron-3-super": 80, + "glm-5.1": 82, + "deepseek-v4-pro-max": 86, + "qwen3-5-122b": 78, + "qwen3-coder-next": 62, + "qwen3-6-plus": 84, + "kimi-k2-6": 92 + } + }, + { + "agent": "release-manager", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 72, + "minimax-m2.5": 66, + "minimax-m2.7": 64, + "nemotron-3-super": 74, + "glm-5.1": 76, + "deepseek-v4-pro-max": 78, + 
"qwen3-5-122b": 72, + "qwen3-coder-next": 60, + "qwen3-6-plus": 76, + "kimi-k2-6": 78 + } + }, + { + "agent": "evaluator", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 70, + "minimax-m2.5": 73, + "minimax-m2.7": 70, + "nemotron-3-super": 78, + "glm-5.1": 78, + "deepseek-v4-pro-max": 84, + "qwen3-5-122b": 76, + "qwen3-coder-next": 58, + "qwen3-6-plus": 81, + "kimi-k2-6": 84 + } + }, + { + "agent": "prompt-optimizer", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 76, + "minimax-m2.5": 74, + "minimax-m2.7": 72, + "nemotron-3-super": 76, + "glm-5.1": 75, + "deepseek-v4-pro-max": 80, + "qwen3-5-122b": 74, + "qwen3-coder-next": 64, + "qwen3-6-plus": 83, + "kimi-k2-6": 82 + } + }, + { + "agent": "product-owner", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 60, + "minimax-m2.5": 56, + "minimax-m2.7": 54, + "nemotron-3-super": 74, + "glm-5.1": 78, + "deepseek-v4-pro-max": 76, + "qwen3-5-122b": 74, + "qwen3-coder-next": 48, + "qwen3-6-plus": 78, + "kimi-k2-6": 76 + } + }, + { + "agent": "pipeline-judge", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 64, + "minimax-m2.5": 68, + "minimax-m2.7": 65, + "nemotron-3-super": 78, + "glm-5.1": 76, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 74, + "qwen3-coder-next": 56, + "qwen3-6-plus": 80, + "kimi-k2-6": 84 + } + }, + { + "agent": "workflow-architect", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 68, + "minimax-m2.5": 62, + "minimax-m2.7": 60, + "nemotron-3-super": 76, + "glm-5.1": 76, + "deepseek-v4-pro-max": 80, + "qwen3-5-122b": 72, + "qwen3-coder-next": 56, + "qwen3-6-plus": 80, + "kimi-k2-6": 82 + } + }, + { + "agent": 
"markdown-validator", + "current_model_index": -1, + "current_model_id": "nemotron-3-nano:30b", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 43, + "minimax-m2.5": 38, + "minimax-m2.7": 36, + "nemotron-3-super": 52, + "glm-5.1": 55, + "deepseek-v4-pro-max": 68, + "qwen3-5-122b": 56, + "qwen3-coder-next": 40, + "qwen3-6-plus": 50, + "kimi-k2-6": 56 + } + }, + { + "agent": "agent-architect", + "current_model_index": 7, + "current_model_id": "glm-5.1", + "reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 78, + "minimax-m2.5": 72, + "minimax-m2.7": 70, + "nemotron-3-super": 78, + "glm-5.1": 76, + "deepseek-v4-pro-max": 82, + "qwen3-5-122b": 76, + "qwen3-coder-next": 66, + "qwen3-6-plus": 82, + "kimi-k2-6": 86 + } + }, + { + "agent": "planner", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 72, + "minimax-m2.5": 68, + "minimax-m2.7": 66, + "nemotron-3-super": 80, + "glm-5.1": 78, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 78, + "qwen3-coder-next": 60, + "qwen3-6-plus": 85, + "kimi-k2-6": 86 + } + }, + { + "agent": "reflector", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 68, + "minimax-m2.5": 66, + "minimax-m2.7": 64, + "nemotron-3-super": 78, + "glm-5.1": 76, + "deepseek-v4-pro-max": 84, + "qwen3-5-122b": 76, + "qwen3-coder-next": 56, + "qwen3-6-plus": 82, + "kimi-k2-6": 80 + } + }, + { + "agent": "memory-manager", + "current_model_index": 6, + "current_model_id": "nemotron-3-super", + "reasoning_effort": "M", + "scores": { + "qwen3-coder-480b": 63, + "minimax-m2.5": 58, + "minimax-m2.7": 56, + "nemotron-3-super": 86, + "glm-5.1": 72, + "deepseek-v4-pro-max": 86, + "qwen3-5-122b": 70, + "qwen3-coder-next": 50, + "qwen3-6-plus": 87, + "kimi-k2-6": 84 + } + }, + { + "agent": "architect-indexer", + "current_model_index": 7, + "current_model_id": "glm-5.1", + 
"reasoning_effort": "H", + "scores": { + "qwen3-coder-480b": 70, + "minimax-m2.5": 64, + "minimax-m2.7": 62, + "nemotron-3-super": 74, + "glm-5.1": 80, + "deepseek-v4-pro-max": 78, + "qwen3-5-122b": 76, + "qwen3-coder-next": 58, + "qwen3-6-plus": 80, + "kimi-k2-6": 84 + } + } + ], + "if_scores": { + "qwen3-coder-480b": 88, + "minimax-m2.5": 82, + "minimax-m2.7": 78, + "nemotron-3-super": 85, + "glm-5.1": 80, + "deepseek-v4-pro-max": 88, + "qwen3-5-122b": 86, + "qwen3-coder-next": 84, + "qwen3-6-plus": 90, + "kimi-k2-6": 91, + "deepseek-v4-flash": 86 + }, + "agent_current_config": [ + { + "agent": "lead-developer", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "frontend-developer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "php-developer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "python-developer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "backend-developer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "go-developer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "flutter-developer", + "model": 
"ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "devops-engineer", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "sdet-engineer", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "code-skeptic", + "model": "ollama-cloud/minimax-m2.5", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "minimax", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "security-auditor", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "performance-engineer", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "the-fixer", + "model": "ollama-cloud/minimax-m2.5", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "minimax", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "browser-automation", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "visual-tester", + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "qwen", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + 
"agent": "system-analyst", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "requirement-refiner", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "history-miner", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "capability-analyst", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "orchestrator", + "model": "ollama-cloud/kimi-k2.6:cloud", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "kimi", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "release-manager", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "evaluator", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "prompt-optimizer", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "product-owner", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "pipeline-judge", + 
"model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "workflow-architect", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "markdown-validator", + "model": "ollama-cloud/nemotron-3-nano:30b", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "agent-architect", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "planner", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "reflector", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "memory-manager", + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "nemotron", + "fit_score": 0, + "status": "good", + "previous_model": null + }, + { + "agent": "architect-indexer", + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama Cloud", + "category": "Process", + "badge_type": "glm", + "fit_score": 0, + "status": "good", + "previous_model": null + } + ], + "recommendations": [ + { + "agent": "[built-in] debug", + "from_model": "glm-5.1.1 (88)", + "from_provider": "Ollama", + "to_model": "V4-Pro Max (★90) / K2.6 (★90) RE:High", + "to_provider": "Ollama Cloud", + "impact": "high", + "quality_change": "+2%", + 
"speed_change": "~1x", + "context_change": "200K→1M", + "provider_change": "Ollama Cloud", + "rationale": "★ матрицы: V4-Pro=90 и K2.6=90 (TIE!), GLM-5.1=88. V4-Pro: LiveCodeBench 93.5(#1!), Terminal 67.9, 1M ctx для полного проекта. K2.6: 13h auto sessions. Оба лучше GLM-5.1. RE:High для debug." + }, + { + "agent": "planner", + "from_model": "nemotron-3-super (80)", + "from_provider": "Ollama", + "to_model": "V4-Pro Max (★88) RE:High", + "to_provider": "Ollama Cloud", + "impact": "high", + "quality_change": "+10%", + "speed_change": "~1x", + "context_change": "1M", + "provider_change": "Ollama Cloud", + "rationale": "★ матрицы: V4-Pro=88(лучший!), K2.6=86, GLM-5.1=85, Nem=80. V4-Pro: GPQA 90.1 (reasoning), 1M ctx сохраняется (vs потеря при K2.6). RE:High для chain-of-thought planning." + }, + { + "agent": "go-developer", + "from_model": "qwen3-coder:480b (85)", + "from_provider": "Ollama", + "to_model": "V4-Pro Max (★88) RE:Medium", + "to_provider": "Ollama Cloud", + "impact": "medium", + "quality_change": "+4%", + "speed_change": "~1x", + "context_change": "256K→1M", + "provider_change": "Ollama Cloud", + "rationale": "★ матрицы: V4-Pro=88(лучший для Go!), K2.6=86, Qwen3Coder=85. DeepSeek модели традиционно сильны в Go/Rust. 1M ctx для крупных Go-проектов." + }, + { + "agent": "history-miner", + "from_model": "nemotron-3-super (★85)", + "from_provider": "Ollama", + "to_model": "V4-Pro Max (86) + Nem fallback", + "to_provider": "Hybrid", + "impact": "medium", + "quality_change": "+1%", + "speed_change": "~1x", + "context_change": "1M", + "provider_change": "Ollama Cloud + Ollama", + "rationale": "V4-Pro=86 чуть лучше Nemotron=85. 1M ctx у обоих. MRCR 83.5 у V4-Pro — лучшее long-context retrieval. Nemotron как fallback (RULER 91.75%)." 
+ }, + { + "agent": "frontend-dev → M2.5", + "from_model": "qwen3-coder (90)", + "from_provider": "Ollama", + "to_model": "MiniMax M2.5 (★92) ✅", + "to_provider": "Ollama", + "impact": "low", + "quality_change": "+2%", + "speed_change": "=", + "context_change": "204K", + "provider_change": "Ollama", + "rationale": "Spec-writing, UI architect. APPLIED." + }, + { + "agent": "devops → K2.6", + "from_model": "deepseek-v3.2", + "from_provider": "", + "to_model": "kimi-k2.6:cloud ✅", + "to_provider": "Ollama Cloud", + "impact": "low", + "quality_change": "+35%", + "speed_change": "=", + "context_change": "256K", + "provider_change": "", + "rationale": "APPLIED." + }, + { + "agent": "orchestrator", + "from_model": "glm-5.1.1 (★90)", + "from_provider": "Ollama", + "to_model": "K2.6 (★92) RE:Medium", + "to_provider": "Ollama Cloud", + "impact": "medium", + "quality_change": "+2%", + "speed_change": "~1x", + "context_change": "200K→256K", + "provider_change": "Ollama Cloud", + "rationale": "K2.6=92★ всё ещё лучший для orchestration. V4-Pro=86 слабее. 300 sub-agent swarm." + }, + { + "agent": "the-fixer", + "from_model": "minimax-m2.5 (★88)", + "from_provider": "Ollama", + "to_model": "V4-Pro (★88) / K2.6 (★90)", + "to_provider": "Ollama Cloud", + "impact": "medium", + "quality_change": "+2%", + "speed_change": "~1x", + "context_change": "128K→1M/256K", + "provider_change": "Ollama Cloud", + "rationale": "K2.6=90(лучший), V4-Pro=88=M2.5. M2.5 SWE-bench 80.2% стабильнее. Не срочно." + }, + { + "agent": "Qwen3-Coder (7 coding)", + "from_model": "qwen3-coder", + "from_provider": "Ollama", + "to_model": "✅", + "to_provider": "", + "impact": "low", + "quality_change": "=0%", + "speed_change": "=", + "context_change": "256K", + "provider_change": "Ollama", + "rationale": "lead=92★, backend=91★, python=90★." 
+ }, + { + "agent": "GLM-5.1 (12 agents)", + "from_model": "glm-5.1.1", + "from_provider": "Ollama", + "to_model": "✅", + "to_provider": "", + "impact": "low", + "quality_change": "=0%", + "speed_change": "=", + "context_change": "200K", + "provider_change": "", + "rationale": "orchestrator=90, system-analyst=90. SWE-Pro #1." + }, + { + "agent": "Kimi K2.6 (3 agents)", + "from_model": "kimi-k2.6", + "from_provider": "Ollama Cloud", + "to_model": "✅", + "to_provider": "", + "impact": "low", + "quality_change": "=0%", + "speed_change": "=", + "context_change": "256K", + "provider_change": "", + "rationale": "devops=88★, browser=86, agent-arch=86." + } + ], + "impact_data": [ + { + "category": "debug GLM5.1→V4-Pro/K2.6", + "before": 88, + "after": 90, + "delta": 2, + "notes": "LiveCodeBench 93.5, Terminal 67.9" + }, + { + "category": "planner Nem→V4-Pro Max", + "before": 80, + "after": 88, + "delta": 8, + "notes": "★88! GPQA 90.1, 1M ctx" + }, + { + "category": "go-dev Coder→V4-Pro Max", + "before": 85, + "after": 88, + "delta": 3, + "notes": "★88! Go/Rust specialist, 1M ctx" + }, + { + "category": "history-miner →V4-Pro", + "before": 85, + "after": 86, + "delta": 1, + "notes": "MRCR 83.5, long-context" + }, + { + "category": "orchestrator →K2.6 (next)", + "before": 90, + "after": 92, + "delta": 2, + "notes": "300 sub-agent swarm" + }, + { + "category": "frontend → M2.5 ✅", + "before": 90, + "after": 92, + "delta": 2, + "notes": "Spec-writing, UI architect" + }, + { + "category": "devops → K2.6 ✅", + "before": 65, + "after": 88, + "delta": 23, + "notes": "IF:65→91! 
Terminal 66.7" + }, + { + "category": "Qwen3-Coder (7) ✅", + "before": 90, + "after": 90, + "delta": 0, + "notes": "SOTA coding" + }, + { + "category": "GLM-5.1 (12) ✅", + "before": 87, + "after": 87, + "delta": 0, + "notes": "SWE-Pro #1" + }, + { + "category": "Nemotron Super (6) ✅", + "before": 82, + "after": 82, + "delta": 0, + "notes": "1M ctx, RULER 91.75%" + } + ], + "benchmark_comparison": { + "benchmarks": [ + { + "name": "SWE-V", + "full_name": "SWE-Bench Verified", + "description": "GitHub issue resolution (500 tasks)", + "roles": "lead-dev, backend, fixer" + }, + { + "name": "SWE-P", + "full_name": "SWE-Bench Pro", + "description": "Multi-lang, decontaminated (1865 tasks)", + "roles": "all coding agents" + }, + { + "name": "T-Bench", + "full_name": "Terminal-Bench 2.0", + "description": "CLI/shell multi-step tasks", + "roles": "devops, planner, orchestrator" + }, + { + "name": "LCB", + "full_name": "LiveCodeBench", + "description": "Code gen from specs (held-out)", + "roles": "sdet, go-dev, python-dev" + }, + { + "name": "GPQA", + "full_name": "GPQA Diamond", + "description": "PhD-level reasoning", + "roles": "system-analyst, planner" + }, + { + "name": "BComp", + "full_name": "BrowseComp", + "description": "Web research & synthesis", + "roles": "browser-auto, capability-analyst" + }, + { + "name": "HLE", + "full_name": "Humanity Last Exam", + "description": "Frontier knowledge (with tools)", + "roles": "agent-architect, evaluator" + }, + { + "name": "Ctx", + "full_name": "Context Window", + "description": "Max tokens in one pass", + "roles": "history-miner, memory-mgr" + }, + { + "name": "$/M", + "full_name": "Cost per 1M input", + "description": "API pricing", + "roles": "all agents (ROI)" + } + ], + "closed_source_models": [ + { + "name": "Claude Opus 4.7", + "organization": "Anthropic", + "scores": [ + 87.6, + 64.3, + 69.4, + null, + 94.2, + 79.3, + 53, + "1M", + "$5" + ], + "color": "#c084fc", + "note": "#1 апрель 2026" + }, + { + "name": "GPT-5.5", 
+ "organization": "OpenAI", + "scores": [ + null, + 58.6, + 82.7, + null, + null, + 83.4, + 57.2, + "1M", + "$5" + ], + "color": "#ff6b81", + "note": "Новейший, Terminal #1" + }, + { + "name": "GPT-5.4", + "organization": "OpenAI", + "scores": [ + 78.2, + 59.1, + 75.1, + null, + 94.4, + 82.7, + 58.7, + "200K", + "$2.50" + ], + "color": "#ff6b81", + "note": "Reasoning, math" + }, + { + "name": "Gemini 3.1 Pro", + "organization": "Google", + "scores": [ + 80.6, + 46.1, + 68.5, + null, + 94.3, + 85.9, + 51.4, + "2M", + "$2" + ], + "color": "#facc15", + "note": "ARC-AGI 77.1%, дешёвый" + }, + { + "name": "Claude Sonnet 4.6", + "organization": "Anthropic", + "scores": [ + 79.6, + null, + null, + null, + null, + null, + null, + "200K", + "$3" + ], + "color": "#c084fc", + "note": "5× дешевле Opus" + }, + { + "name": "GPT-5.3-Codex", + "organization": "OpenAI", + "scores": [ + 85, + 57, + 77.3, + null, + null, + null, + null, + "200K", + "$6" + ], + "color": "#ff6b81", + "note": "Coding specialist" + } + ], + "apaw_models": [ + { + "name": "Kimi K2.6", + "organization": "APAW", + "scores": [ + 80.2, + 58.6, + 66.7, + 87.2, + null, + 83.2, + 54, + "256K", + "$0.95" + ], + "color": "#00ff94", + "note": "devops, browser, architect (3)" + }, + { + "name": "GLM-5.1", + "organization": "APAW", + "scores": [ + null, + 58.4, + 63.5, + null, + 86.2, + 68.7, + null, + "200K", + "~$0.50" + ], + "color": "#00ff94", + "note": "12 agents! orchestrator, eval..." 
+ }, + { + "name": "V4-Pro Max", + "organization": "APAW", + "scores": [ + 80.6, + 55.4, + 67.9, + 93.5, + 90.1, + 83.4, + 48.2, + "1M", + "$0.42" + ], + "color": "#00d4ff", + "note": "planner, go-dev (рек.)" + }, + { + "name": "Qwen3-Coder 480B", + "organization": "APAW", + "scores": [ + 66.5, + null, + null, + null, + null, + null, + null, + "256K", + "~$0.50" + ], + "color": "#00ff94", + "note": "7 coding agents" + }, + { + "name": "MiniMax M2.5", + "organization": "APAW", + "scores": [ + 80.2, + 51.3, + null, + null, + null, + 76.3, + null, + "204K", + "$0.15" + ], + "color": "#00ff94", + "note": "frontend, skeptic, fixer (3)" + }, + { + "name": "Nemotron Super", + "organization": "APAW", + "scores": [ + 60.5, + null, + null, + null, + null, + null, + null, + "1M", + "~$0.40" + ], + "color": "#00ff94", + "note": "6 agents (memory, history)" + } + ] + } +} \ No newline at end of file diff --git a/agent-evolution/data/model-benchmarks.schema.json b/agent-evolution/data/model-benchmarks.schema.json new file mode 100644 index 0000000..29ab918 --- /dev/null +++ b/agent-evolution/data/model-benchmarks.schema.json @@ -0,0 +1,553 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://app.kilo.ai/model-benchmarks.schema.json", + "title": "APAW Model Benchmarks Data", + "description": "Schema for static model benchmarks extracted from HTML sources", + "type": "object", + "required": [ + "version", + "generated", + "source", + "metadata", + "models", + "groq_models", + "agent_model_scores", + "if_scores", + "agent_current_config", + "recommendations", + "impact_data", + "benchmark_comparison" + ], + "properties": { + "version": { + "type": "string", + "const": "1.0.0" + }, + "generated": { + "type": "string", + "format": "date-time" + }, + "source": { + "type": "string", + "description": "Source of benchmark data (e.g. 
HTML scraping, API, manual entry)" + }, + "metadata": { + "type": "object", + "properties": { + "scrape_date": { + "type": "string", + "format": "date-time" + }, + "source_urls": { + "type": "array", + "items": { + "type": "string" + } + }, + "notes": { + "type": "string" + }, + "data_quality": { + "type": "string", + "enum": [ + "high", + "medium", + "low", + "estimated" + ] + } + } + }, + "models": { + "type": "array", + "description": "All benchmarked models from various providers", + "items": { + "type": "object", + "required": [ + "id", + "name", + "provider", + "category" + ], + "properties": { + "id": { + "type": "string", + "description": "Model identifier" + }, + "name": { + "type": "string" + }, + "organization": { + "type": "string" + }, + "provider": { + "type": "string", + "enum": [ + "ollama", + "ollama-cloud", + "openrouter", + "groq", + "anthropic", + "openai", + "meta", + "cohere", + "google", + "microsoft", + "unknown" + ] + }, + "category": { + "type": "string", + "enum": [ + "big", + "medium", + "small", + "coder", + "reasoning", + "creative" + ] + }, + "parameters": { + "type": "string" + }, + "benchmarks": { + "type": "object", + "properties": { + "swe_bench": { + "type": [ + "number", + "null" + ] + }, + "swe_bench_pro": { + "type": [ + "number", + "null" + ] + }, + "terminal_bench": { + "type": [ + "number", + "null" + ] + }, + "live_codebench": { + "type": [ + "number", + "null" + ] + }, + "gpqa": { + "type": [ + "number", + "null" + ] + }, + "hle": { + "type": [ + "number", + "null" + ] + }, + "browse_comp": { + "type": [ + "number", + "null" + ] + }, + "m_mlu": { + "type": [ + "number", + "null" + ] + }, + "m_mlu_pro": { + "type": [ + "number", + "null" + ] + } + } + }, + "description": { + "type": "string" + }, + "availability": { + "type": "object", + "properties": { + "rpm": { + "type": [ + "integer", + "null" + ] + }, + "rpd": { + "type": [ + "integer", + "string", + "null" + ] + }, + "tpm": { + "type": [ + "integer", + "string", + 
"null" + ] + }, + "tpd": { + "type": [ + "integer", + "string", + "null" + ] + } + } + }, + "free": { + "type": "boolean" + }, + "cost_per_1m_input": { + "type": [ + "number", + "string", + "null" + ] + }, + "tier": { + "type": "string", + "enum": [ + "free", + "trial", + "paid", + "enterprise" + ] + } + } + } + }, + "groq_models": { + "type": "array", + "description": "Groq-specific models with performance data", + "items": { + "type": "object", + "required": [ + "id", + "name", + "speed_tps", + "provider" + ], + "properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "speed_tps": { + "type": [ + "number", + "string" + ] + }, + "provider": { + "type": "string", + "const": "groq" + }, + "benchmarks": { + "type": "object" + }, + "availability": { + "type": "object" + } + } + } + }, + "agent_model_scores": { + "type": "array", + "description": "Agent × Model compatibility scoring matrices", + "items": { + "type": "object", + "required": [ + "agent", + "model_id", + "score", + "category" + ], + "properties": { + "agent": { + "type": "string" + }, + "model_id": { + "type": "string" + }, + "score": { + "type": "number", + "minimum": 0, + "maximum": 100 + }, + "category": { + "type": "string", + "enum": [ + "performance", + "instruction_following", + "creativity", + "code_generation" + ] + }, + "reason": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "current_model_id": { + "type": "string", + "description": "Current model ID string (replaces index)" + } + } + } + }, + "if_scores": { + "type": "object", + "description": "Instruction Following scores mapping", + "additionalProperties": { + "type": "number", + "minimum": 0, + "maximum": 100 + } + }, + "agent_current_config": { + "type": "array", + "description": "Current agent model configurations", + "items": { + "type": "object", + "required": [ + "agent", + "model", + "provider", + "status" + ], + "properties": { + "agent": { + "type": 
"string" + }, + "model": { + "type": "string" + }, + "provider": { + "type": "string" + }, + "status": { + "type": "string", + "enum": [ + "active", + "testing", + "deprecated", + "pending" + ] + }, + "reasoning_effort": { + "type": "string", + "enum": [ + "L", + "M", + "H" + ] + }, + "fit_score": { + "type": "number" + }, + "date_applied": { + "type": "string", + "format": "date-time" + } + } + } + }, + "recommendations": { + "type": "array", + "description": "Model change recommendations based on benchmarks", + "items": { + "type": "object", + "required": [ + "agent", + "action", + "current_model", + "recommended_model", + "impact" + ], + "properties": { + "agent": { + "type": "string" + }, + "action": { + "type": "string", + "enum": [ + "update_model", + "confirm_model", + "add_fallback", + "redesign_agent" + ] + }, + "current_model": { + "type": "string" + }, + "current_provider": { + "type": "string" + }, + "recommended_model": { + "type": "string" + }, + "recommended_provider": { + "type": "string" + }, + "impact": { + "type": "string", + "enum": [ + "critical", + "high", + "medium", + "low" + ] + }, + "rationale": { + "type": "string" + }, + "expected_improvement": { + "type": "object" + }, + "applied": { + "type": "boolean" + } + } + } + }, + "impact_data": { + "type": "array", + "description": "Impact analysis of model changes", + "items": { + "type": "object", + "required": [ + "agent", + "model_change", + "impact_score" + ], + "properties": { + "agent": { + "type": "string" + }, + "model_change": { + "type": "string" + }, + "impact_score": { + "type": "number", + "minimum": 0, + "maximum": 100, + "description": "Impact score 0-100" + } + } + } + }, + "benchmark_comparison": { + "type": "object", + "description": "APAW vs closed-source benchmark comparison", + "properties": { + "benchmarks": { + "type": "array", + "description": "Benchmark names used for comparison", + "items": { + "type": "string" + } + }, + "closed_source_models": { + "type": "array", + 
"description": "Closed-source models included in comparison", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "provider": { + "type": "string" + }, + "benchmarks": { + "type": "object" + } + } + } + }, + "apaw_models": { + "type": "array", + "description": "APAW pipeline models included in comparison", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "provider": { + "type": "string" + }, + "benchmarks": { + "type": "object" + } + } + } + }, + "apaw_best": { + "type": "object", + "description": "Best APAW model per benchmark", + "additionalProperties": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "score": { + "type": "number" + }, + "gap_to_closed": { + "type": [ + "number", + "string" + ] + } + } + } + }, + "closed_best": { + "type": "object", + "description": "Best closed-source model per benchmark", + "additionalProperties": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "score": { + "type": "number" + } + } + } + }, + "summary": { + "type": "object", + "properties": { + "apaw_avg_score": { + "type": "number" + }, + "closed_avg_score": { + "type": "number" + }, + "coverage_gap": { + "type": "string" + } + } + } + } + } + } +} \ No newline at end of file diff --git a/agent-evolution/data/model-research-latest.json b/agent-evolution/data/model-research-latest.json new file mode 100644 index 0000000..a88b409 --- /dev/null +++ b/agent-evolution/data/model-research-latest.json @@ -0,0 +1,59 @@ +{ + "version": "1.0.0", + "generated": "2026-04-27T17:51:36.000Z", + "source": "/research model-optimization", + "models": [], + "recommendations": [ + { + "agent": "lead-developer", + "action": "update_model", + "current_model": "ollama-cloud/qwen3-coder:480b", + "current_provider": "ollama-cloud", + "recommended_model": "ollama-cloud/nemotron-3-super", + "recommended_provider": "ollama-cloud", + "impact": "high", + 
"expected_improvement": { + "quality": "+15%", + "speed": "+20%", + "context_window": "1M→1M" + }, + "score_before": 85, + "score_after": 92, + "score_delta": 7, + "rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.", + "applied": false, + "applied_date": null + }, + { + "agent": "devops-engineer", + "action": "confirm_model", + "current_model": "ollama-cloud/nemotron-3-super", + "current_provider": "ollama-cloud", + "recommended_model": "ollama-cloud/nemotron-3-super", + "recommended_provider": "ollama-cloud", + "impact": "low", + "expected_improvement": { + "quality": "0%", + "speed": "0%", + "context_window": "1M→1M" + }, + "score_before": 88, + "score_after": 88, + "score_delta": 0, + "rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.", + "applied": false, + "applied_date": null + } + ], + "heatmap": {}, + "closed_source_comparison": {}, + "capability_index_patch": [], + "summary": { + "avg_quality_improvement": "+7.5%", + "providers_used": ["ollama-cloud"], + "key_models": ["nemotron-3-super"], + "total_recommendations": 2, + "applied_count": 0, + "pending_count": 2 + } +} \ No newline at end of file diff --git a/agent-evolution/data/model-research.schema.json b/agent-evolution/data/model-research.schema.json new file mode 100644 index 0000000..db2518f --- /dev/null +++ b/agent-evolution/data/model-research.schema.json @@ -0,0 +1,331 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://app.kilo.ai/model-research.schema.json", + "title": "APAW Model Research Output", + "description": "Schema for automated model research and recommendation output", + "type": "object", + "required": ["version", "generated", "source", "models", "recommendations", "heatmap"], + "properties": { + "version": { + "type": "string", + "const": "1.0.0" + }, + "generated": { + 
"type": "string", + "format": "date-time" + }, + "source": { + "type": "string", + "description": "What triggered this research (e.g. /evolution, /research, manual)" + }, + "trigger": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["evolution_cycle", "manual_research", "fitness_below_threshold", "scheduled"] + }, + "issue": { + "type": "integer" + }, + "fitness_score": { + "type": "number" + }, + "reason": { + "type": "string" + } + } + }, + "models": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "name", "organization", "if_score", "provider"], + "properties": { + "id": { + "type": "string", + "description": "Full model ID like ollama-cloud/qwen3-coder:480b" + }, + "name": { + "type": "string" + }, + "organization": { + "type": "string" + }, + "parameters": { + "type": "string" + }, + "context_window": { + "type": "string" + }, + "swe_bench": { + "type": ["number", "null"] + }, + "swe_bench_pro": { + "type": ["number", "null"] + }, + "terminal_bench": { + "type": ["number", "null"] + }, + "live_codebench": { + "type": ["number", "null"] + }, + "gpqa": { + "type": ["number", "null"] + }, + "hle": { + "type": ["number", "null"] + }, + "browse_comp": { + "type": ["number", "null"] + }, + "if_score": { + "type": "number", + "minimum": 0, + "maximum": 100, + "description": "Instruction Following composite score (IFEval + IFBench)" + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "provider": { + "type": "string", + "enum": ["ollama", "ollama-cloud", "openrouter", "groq", "hybrid"] + }, + "free": { + "type": "boolean" + }, + "cost_per_1m_input": { + "type": ["number", "string", "null"] + }, + "description": { + "type": "string" + }, + "availability": { + "type": "object", + "properties": { + "rpm": { + "type": ["integer", "null"] + }, + "rpd": { + "type": ["integer", "string", "null"] + }, + "tpm": { + 
"type": ["integer", "string", "null"] + }, + "tpd": { + "type": ["integer", "string", "null"] + } + } + }, + "speed_tps": { + "type": ["number", "string", "null"] + } + } + } + }, + "recommendations": { + "type": "array", + "items": { + "type": "object", + "required": ["agent", "action", "current_model", "recommended_model", "impact", "rationale"], + "properties": { + "agent": { + "type": "string" + }, + "action": { + "type": "string", + "enum": ["update_model", "confirm_model", "add_fallback", "redesign_agent"] + }, + "current_model": { + "type": "string" + }, + "current_provider": { + "type": "string" + }, + "recommended_model": { + "type": "string" + }, + "recommended_provider": { + "type": "string" + }, + "fallback_model": { + "type": "string" + }, + "fallback_strategy": { + "type": "string" + }, + "impact": { + "type": "string", + "enum": ["critical", "high", "medium", "low"] + }, + "expected_improvement": { + "type": "object", + "properties": { + "quality": { + "type": "string" + }, + "speed": { + "type": "string" + }, + "context_window": { + "type": "string" + } + } + }, + "score_before": { + "type": "number" + }, + "score_after": { + "type": "number" + }, + "score_delta": { + "type": "number" + }, + "rationale": { + "type": "string" + }, + "applied": { + "type": "boolean", + "default": false + }, + "applied_date": { + "type": ["string", "null"], + "format": "date-time" + } + } + } + }, + "heatmap": { + "type": "object", + "description": "Agent × Model compatibility matrix with IF adjustment", + "required": ["models", "agents"], + "properties": { + "models": { + "type": "array", + "items": { + "type": "object", + "required": ["id", "if_score"], + "properties": { + "id": { + "type": "string" + }, + "display_name": { + "type": "string" + }, + "provider": { + "type": "string" + }, + "if_score": { + "type": "number" + } + } + } + }, + "agents": { + "type": "array", + "items": { + "type": "object", + "required": ["agent", "reasoning_effort", "scores"], + 
"properties": { + "agent": { + "type": "string" + }, + "current_model": { + "type": "string" + }, + "reasoning_effort": { + "type": "string", + "enum": ["L", "M", "H"] + }, + "scores": { + "type": "object", + "additionalProperties": { + "type": "number" + }, + "description": "Model ID → compatibility score (0-100, IF-adjusted)" + } + } + } + }, + "if_adjustment_formula": { + "type": "string", + "default": "score * (0.7 + 0.3 * IF/100)" + } + } + }, + "closed_source_comparison": { + "type": "object", + "description": "APAW pipeline models vs top closed-source models", + "properties": { + "benchmarks": { + "type": "array" + }, + "models": { + "type": "array" + }, + "apaw_best_per_benchmark": { + "type": "object" + }, + "closed_best_per_benchmark": { + "type": "object" + } + } + }, + "capability_index_patch": { + "type": "array", + "description": "Ready-to-apply patches to capability-index.yaml", + "items": { + "type": "object", + "required": ["agent", "set"], + "properties": { + "agent": { + "type": "string" + }, + "set": { + "type": "object", + "additionalProperties": true + } + } + } + }, + "summary": { + "type": "object", + "properties": { + "avg_quality_improvement": { + "type": "string" + }, + "providers_used": { + "type": "array", + "items": { + "type": "string" + } + }, + "key_models": { + "type": "array", + "items": { + "type": "string" + } + }, + "total_recommendations": { + "type": "integer" + }, + "applied_count": { + "type": "integer" + }, + "pending_count": { + "type": "integer" + } + } + } + } +} \ No newline at end of file diff --git a/agent-evolution/dist/research-dashboard-2026_04_29.html b/agent-evolution/dist/research-dashboard-2026_04_29.html new file mode 100644 index 0000000..f90334b --- /dev/null +++ b/agent-evolution/dist/research-dashboard-2026_04_29.html @@ -0,0 +1,2777 @@ + + + + + + APAW Agent Model Research — generated 2026-04-29 + + + + +
+
+

APAW Agent Model Research v3

+
Live dashboard • 15 models × 32 agents • 2026-04-29
+
+ +
+ + + + + + +
+ + +
+
+
Агентов
36
32 custom + 4 built-in
+
Моделей сейчас
6
Coder(9) GLM-5.1(11) K2.6(4)
+
Ollama Cloud
20+
доступно бесплатно
+
Groq + OpenRouter
16+
free tier моделей
+
Рекомендаций
11
8/8 applied ✅
+
+ +
+

Ключевые находки v3 (после коммита caf77f53c8)

+

Ваш агент уже применил 11 из моих рекомендаций (коммит от 05:21). Но я обнаружил, что до применения некоторые агенты были на других моделях, чем я предполагал:

+
    +
  • ⚠ Откат Qwen 3.6 Plus — security-auditor, prompt-optimizer, product-owner и markdown-validator до коммита были на openrouter/qwen3.6-plus:free и deepseek-v3.2, но мои рекомендации их заменили на Ollama-модели. Это снижает разнообразие провайдеров!
  • +
  • ✅ 11 замен уже применены — Nemotron 3 Super теперь на 7 ролях, GLM-5 расширен, Qwen3-Coder на Go, markdown-validator
  • +
  • 🔴 Осталось 3 агента на gpt-oss:120b — requirement-refiner, capability-analyst, agent-architect. Всем им нужен Nemotron 3 Super
  • +
  • Новая стратегия: гибридный мультипровайдер — OpenRouter (Qwen 3.6 Plus FREE, 1M ctx) + Groq (gpt-oss 500 t/s) + Ollama (основной). Диверсификация снижает зависимость
  • +
  • Qwen 3.6 Plus стоит вернуть для prompt-optimizer (Terminal-Bench 61.6% > Claude!) и product-owner (1M контекст для backlog)
  • + +
  • History-miner → Nemotron 3 Super — самый большой оставшийся прирост: 88 vs 78 (GLM-5). RULER@1M критичен для git history
  • +
  • ⚠ Prompt Adherence (IF) — новый фактор! Nemotron 3 Super имеет IF=78 (ниже GLM-5=90, Qwen3.5=92, Qwen3.6+=91). Для ролей с жёстким промптом (evaluator, security-auditor, orchestrator) это снижает эффективность. Qwen 3.6 Plus и GLM-5 лучше следуют инструкциям
+ +
+ +

Текущая конфигурация

capability-index.yaml
+
+ + +
АгентМодельПровайдерКатегорияСоответствиеСтатус
+
+
+ + +
+

Groq Free Plan — доступные модели

бесплатно · LPU inference
+ +
+
gpt-oss-20b
1200 t/s
30 RPM · 1K RPD · 200K TPD
+
+ +
+

Анализ лимитов Groq Free для агентского pipeline

+

При 26 агентах в pipeline, каждый агент делает 5–20 вызовов на задачу. Типичный issue проходит через 8–12 агентов = ~100–200 вызовов. С лимитом 1K RPD на модель:

+
    + + +
  • Groq Compound: всего 250 RPD, но 70K TPM — для одноразовых тяжёлых аналитических задач
  • +
+
+ +

Все модели Groq Free Tier

+
+ + + +
Model IDRPMRPDTPMTPDСкоростьПрименение в APAW
+
+
+ + +
+

Все доступные модели

Ollama Cloud + Groq + OpenRouter Free
+
+
+
+ + +
+
+
Матрица «Агент × Модель»: оценка совместимости (с учётом Prompt Adherence)
+
0–100 · Взвешенная оценка = 60% бенчмарк роли + 25% Instruction Following + 15% скорость/контекст · ★ = лучший · обведено = текущий · ← 11 моделей · 🟢L 🟡M 🔴H = Reasoning Effort →
+
+
+
+ + +
+

Рекомендации

4 замены (2 BROKEN) + 7 подтверждений 06.04.2026
+ +
+ + + 0 из 11 выбрано +
+ +
+ + + +
+ + +
+

Совокупный анализ профита

если применить все рекомендации
+
+
Средний прирост
+12
пунктов по матрице
+
Применено
8/8
все рекомендации ✅
+
Qwen 3.6+
0
полностью на Ollama!
+
GLM-5.1
12
10 custom + 2 built-in
+
+
+
Прирост по категориям: до → после
+ +
+
+

Детальный анализ прироста

+
+ +
+

APAW Pipeline vs ТОП закрытых моделей (апрель 2026)

+

+ Сравнение лучших моделей в вашем pipeline с лидерами рынка по ключевым бенчмаркам. + 🟢 = APAW обгоняет, + 🟡 = на уровне (±3%), + 🔴 = отстаёт +

+
+ +
+
+

+ * SWE-V = SWE-Bench Verified, SWE-P = SWE-Bench Pro, T-Bench = Terminal-Bench 2.0, LCB = LiveCodeBench, GPQA = GPQA Diamond
+ Данные: swebench.com, marc0.dev, tokenmix.ai, ollama.com — апрель 2026. Стоимость: примерная за 1M input tokens. +

+
+
+
+
+ +
+ + + + diff --git a/agent-evolution/docs/bidirectional-data-flow.md b/agent-evolution/docs/bidirectional-data-flow.md new file mode 100644 index 0000000..83c3a5b --- /dev/null +++ b/agent-evolution/docs/bidirectional-data-flow.md @@ -0,0 +1,504 @@ +# Двунаправленный поток данных APAW Agent Model Research + +Этот документ описывает архитектуру системы, которая автоматизирует исследование моделей AI для агентов APAW и синхронизирует данные между визуальной панелью, конфигурационными файлами и пайплайном эволюции. + +## Цель + +Изначально все данные исследования моделей были захардкожены в HTML-файле `apaw_agent_model_research_v3.html` (1168 строк JavaScript). Двунаправленный поток делает эту систему: + +- **Машиночитаемой** — данные хранятся в JSON для автоматической обработки +- **Записываемой** — изменения в конфигурации агентов обновляют JSON и перегенерируют дашборд +- **Визуализированной** — любое изменение данных автоматически создаёт новый HTML + +## Архитектура данных + +### Файлы системы + +| Файл | Назначение | Формат | Обновляется | +|------|-----------|--------|-------------| +| `data/model-benchmarks.json` | Статические бенчмарки | JSON | `/research models`, вручную | +| `data/model-research-latest.json` | Последнее исследование | JSON | `/evolution Step 0`, `/research models` | +| `data/model-research.schema.json` | Схема валидации | JSON Schema | Вручную | +| `data/model-benchmarks.schema.json` | Схема бенчмарков | JSON Schema | Вручную | +| `scripts/build-research-dashboard.ts` | Генерация HTML | TypeScript/Bun | Вручную | +| `scripts/sync-model-research.ts` | Применение изменений | TypeScript/Bun | Вручную | +| `research-dashboard.template.html` | Шаблон дашборда | HTML+JS+CSS | Вручную | +| `research-dashboard.html` | Готовый дашборд | HTML (standalone) | `build-research-dashboard.ts` | +| `dist/research-dashboard-YYYY_MM_DD.html` | Архив | HTML | `build-research-dashboard.ts` | + +## Поток данных + +### Направление 1: HTML → JSON (Исследование → 
Бенчмарки) + +Источник: `apaw_agent_model_research_v3.html` (вручную исследованные данные) + +``` +apaw_agent_model_research_v3.html + │ hardcoded JS arrays: + │ cfg[] — текущие конфиги агентов + │ ollamaModels[] — характеристики моделей + │ hmAgents[] — матрица очков + │ recs[] — рекомендации + │ impactData[] — дельта изменений + │ groqModels[] — лимиты Groq + ↓ +agent-evolution/data/model-benchmarks.json + ├─ models[] — 15 моделей, бенчмарки, IF-оценки + ├─ agent_model_scores[] — 33 агента × 11 моделей + ├─ agent_current_config[] — 36 текущих назначений + ├─ recommendations[] — 11 рекомендуемых замен + ├─ groq_models[] — 5 моделей Groq с лимитами + ├─ impact_data[] — before/after + └─ benchmark_comparison — сравнение с закрытыми моделями +``` + +**Как обновлять**: один раз данные извлечены из HTML. Дальнейшие обновления: +- Автоматически: `/research models` → `model-research-latest.json` → `model-benchmarks.json` +- Вручную: редактировать `model-benchmarks.json`, обновить `metadata.generated` + +### Направление 2: JSON → Конфиг → HTML (Применение → Визуализация) + +``` +[/research models] OR [/evolution Step 0] + ↓ +model-research-latest.json + │ validates against: + ↓ model-research.schema.json +bun run agent-evolution/scripts/sync-model-research.ts + ├─ обновляет .kilo/capability-index.yaml (model поля) + ├─ обновляет kilo-meta.json (source of truth) + ├─ обновляет kilo.jsonc (agent config) + ├─ обновляет agent-evolution/data/agent-versions.json (история) + ├─ обновляет .kilo/agents/*.md frontmatter (через sync-agents.js --fix) + └─ rebuilds dashboard (build-research-dashboard.ts) + ↓ +bun run agent-evolution/scripts/build-research-dashboard.ts + ├─ читает model-benchmarks.json + ├─ инжектирует в research-dashboard.template.html + ├─ записывает research-dashboard.html + └─ копирует dist/research-dashboard-YYYY_MM_DD.html + ↓ +[/research models] ← цикл продолжается +``` + +## Структура model-benchmarks.json + +### Верхний уровень + +```json +{ + "version": 
"1.0.0", + "generated": "2026-04-27T17:44:44.000Z", + "source": "apaw_agent_model_research_v3.html", + "total_agents": 36, + "total_models_tracked": 11, + "providers": ["ollama", "ollama-cloud", "openrouter", "groq"], + "models": [...], + "groq_models": [...], + "agent_model_scores": [...], + "if_scores": {...}, + "agent_current_config": [...], + "recommendations": [...], + "impact_data": [...], + "benchmark_comparison": {...} +} +``` + +### Модель + +```json +{ + "id": "ollama-cloud/qwen3-coder:480b", + "name": "Qwen3-Coder 480B", + "organization": "Qwen", + "parameters": "480B/35B active", + "context_window": "256K\u21921M", + "swe_bench": 66.5, + "swe_bench_pro": null, + "terminal_bench": null, + "live_codebench": null, + "gpqa": null, + "hle": null, + "browse_comp": null, + "if_score": 88, + "categories": ["coding", "agent"], + "tags": ["coding", "agent", "tools"], + "provider": "ollama", + "free": false, + "cost_per_1m_input": "~$0.50", + "description": "SOTA open-source \u043a\u043e\u0434\u0438\u043d\u0433. \u0421\u0440\u0430\u0432\u043d\u0438\u043c \u0441 Claude Sonnet 4.", + "availability": null, + "speed_tps": null +} +``` + +### Рекомендация + +```json +{ + "agent": "planner", + "action": "update_model", + "current_model": "nemotron-3-super", + "current_provider": "Ollama", + "recommended_model": "deepseek-v4-pro-max", + "recommended_provider": "Ollama Cloud", + "impact": "high", + "score_before": 80, + "score_after": 88, + "score_delta": 8, + "expected_improvement": { + "quality": "+10%", + "speed": "~1x", + "context_window": "1M" + }, + "rationale": "\u2605 matri\u0446\u044b: V4-Pro=88(\u043b\u0443\u0447\u0448\u0438\u0439!)..." 
+} +``` + +### Очки агента + +```json +{ + "agent": "lead-developer", + "current_model_index": 0, + "reasoning_effort": "M", + "scores": { + "ollama-cloud/qwen3-coder:480b": 92, + "ollama-cloud/minimax-m2.5": 86, + "ollama-cloud/minimax-m2.7": 82, + "ollama-cloud/nemotron-3-super": 70, + "ollama-cloud/glm-5": 68, + "ollama-cloud/glm-5.1": 75, + "ollama-cloud/deepseek-v4-pro-max": 88, + "ollama-cloud/qwen3.5-122b": 66, + "ollama-cloud/qwen3-coder-next": 80, + "openrouter/qwen/qwen3.6-plus:free": 88, + "ollama-cloud/kimi-k2.6:cloud": 90 + } +} +``` + +## Формула IF-ажастмента + +Оценка агента с учётом способности модели следовать инструкциям: + +``` +IF-adjusted_score = raw_score × (0.7 + 0.3 × IF/100) + +Где: + raw_score — бенчмарк оценка пары агент×модель (0-100) + IF — instruction following score модели (0-100) + +Примеры: + IF=100 → score × 1.00 (без изменений) + IF=90 → score × 0.97 + IF=78 → score × 0.93 + IF=50 → score × 0.85 + IF=0 → score × 0.70 + +Чем ниже IF, тем сильнее штраф — модель плохо следует промпту и роли. +``` + +## Скрипты системы + +### build-research-dashboard.ts + +**Вход**: `model-benchmarks.json` + `research-dashboard.template.html` +**Выход**: `research-dashboard.html` + `dist/research-dashboard-YYYY_MM_DD.html` + +```bash +bun run agent-evolution/scripts/build-research-dashboard.ts # однократная сборка +bun run agent-evolution/scripts/build-research-dashboard.ts --watch # watch-режим +bun run agent-evolution/scripts/build-research-dashboard.ts --template custom.html +``` + +Процесс: +1. Читает JSON, валидирует наличие полей +2. Читает шаблон, ищет placeholder `// BENCHMARK_DATA_PLACEHOLDER` +3. Заменяет `const EMBEDDED_DATA = {};` на полный JSON с данными +4. Обновляет `<title>` с датой генерации +5. 
Пишет `research-dashboard.html` и архивную копию + +### sync-model-research.ts + +**Вход**: `model-research-latest.json` +**Действия**: + +```bash +# Предпросмотр +bun run agent-evolution/scripts/sync-model-research.ts --dry-run + +# Применение всех рекомендаций +bun run agent-evolution/scripts/sync-model-research.ts + +# Только для одного агента +bun run agent-evolution/scripts/sync-model-research.ts --agent planner +``` + +Для каждой рекомендации (`action: "update_model"`, `applied: false`): +1. Находит блок агента в `capability-index.yaml`, заменяет `model:` +2. Обновляет `kilo-meta.json` (source of truth) +3. Обновляет `kilo.jsonc` (через regex, требует ручной проверки) +4. Добавляет запись в `agent-versions.json` history +5. Запускает `node scripts/sync-agents.js --fix` → обновляет .md frontmatter +6. Запускает `node scripts/sync-agents.js --check` → проверка консистентности +7. Пересобирает дашборд через `build-research-dashboard.ts` + +## Интеграция в пайплайн + +### /research models + +``` +1. Загрузить текущие данные из model-benchmarks.json +2. Если stale (>7 дней) или --force: + a. Fetch моделей с Ollama Cloud, OpenRouter, Groq + b. Compute IF scores для каждой модели + c. Score каждую модель против каждого агента +3. Сгенерировать рекомендации (gap > 5) +4. Записать model-research-latest.json +5. Валидировать против model-research.schema.json +6. Обновить model-benchmarks.json (если данные изменились) +7. 
Пересобрать дашборд +``` + +### /evolution (полный цикл) + +``` +Step 0: Model Research + ├─ Проверить staleness model-benchmarks.json + ├─ Если stale → @capability-analyst исследует модели + ├─ Загрузить heatmap scores + └─ Определить агентов с mismatch (gap > 5) + +Step 1: Judge + └─ @pipeline-judge → fitness score + +Step 2: Decide + ├─ fitness >= 0.85 → выход + ├─ fitness >= 0.70 → @prompt-optimizer (minor) + └─ fitness < 0.70 → @prompt-optimizer (major) + apply model recs + +Step 3: Re-test + └─ Перезапуск с обновлёнными промптами/моделями + +Step 4: Log + Dashboard + ├─ Append fitness-history.jsonl + ├─ Apply рекомендации sync-model-research.ts + └─ Пересобрать дашборд build-research-dashboard.ts +``` + +### /evolution research + +``` +1. Прочитать текущую конфигурацию +2. Исследовать модели (как /research models) +3. Сгенерировать рекомендации +4. Dry-run preview +5. Применить при подтверждении +6. Пересобрать дашборд +``` + +## Правила синхронизации + +Из `.kilo/rules/evolutionary-sync.md`: + +### Обязательный порядок + +``` +1. Обновить kilo-meta.json (source of truth) +2. Обновить capability-index.yaml +3. Запустить sync-agents.js --fix +4. Ручная проверка kilo.jsonc (sync script не гарантирует) +5. Запустить sync-agents.js --check +6. Проверить agent-versions.json history +7. Пересобрать дашборд +8. 
Если любая проверка не прошла — НЕ коммитить +``` + +### Облачный суффикс + +При использовании `ollama-cloud/kimi-k2.6` ВСЕГДА с суффиксом `:cloud`: + +```yaml +# Правильно +model: "ollama-cloud/kimi-k2.6:cloud" + +# Неправильно — отсутствует суффикс +model: "ollama-cloud/kimi-k2.6" +``` + +## Чеклист применения изменений + +``` +□ Исследование: /research models завершено +□ Валидация: model-research-latest.json проходит schema check +□ Dry-run: sync-model-research.ts --dry-run показывает ожидаемые изменения +□ Применение: sync-model-research.ts выполнен без ошибок +□ YAML: capability-index.yaml обновлены поля model +□ Meta: kilo-meta.json соответствует +□ kilo.jsonc: модели обновлены (ручная проверка) +□ История: agent-versions.json записи добавлены +□ Sync: sync-agents.js --fix обновил все .md файлы +□ Check: sync-agents.js --check проходит +□ Старые модели: grep не находит предыдущие model IDs +□ Суффикс: kimi-k2.6:cloud (с :cloud) +□ Дашборд: build-research-dashboard.ts сгенерировал свежий HTML +□ Открыть: research-dashboard.html показывает актуальные данные +□ Гит: все изменения add и commit +``` + +## Устранение неполадок + +| Проблема | Диагностика | Решение | +|----------|------------|---------| +| Дашборд пустой | Проверить placeholder в template.html | Пересобрать: `bun run build-research-dashboard.ts` | +| Schema validation fails | Сравнить JSON со схемой | Проверить model-research.schema.json актуальность | +| sync-agents.js check fails | Model mismatch в конфигах | Запустить `--fix`, затем `--check`; ручная проверка kilo.jsonc | +| Heatmap пустой | agent_model_scores отсутствует | Обновить бенчмарки через `/research models` | +| Рекомендации не отображаются | Empty recs array | Запустить research для генерации новых рекомендаций | +| Старые данные | metadata.generated > 7 дней | Обновить бенчмарки | +| sync-model-research.ts падает | Файл не найден | Проверить пути, запустить из корня проекта | + +## Пример полного цикла + +### 1. 
Исследование моделей + +```bash +$ /research models + +## Research: model optimization + +### Models Analyzed +- Ollama Cloud: 20 models +- OpenRouter Free: 3 models +- Groq Free: 5 models + +### Key Findings +- DeepSeek V4-Pro Max доступен (SWE-V 80.6, IF:88) +- Kimi K2.6 IF=91 (лучший для orchestration) +- Nemotron 3 Super IF=78 — слаб для prompt-heavy ролей +- Qwen 3.6 Plus FREE остаётся лучшим IF/cost (91, $0) + +### Recommendations Generated +- 11 model swap recommendations +- 4 high, 3 medium, 4 low +- Средний expected improvement: +12 points + +### Files Updated +- agent-evolution/data/model-research-latest.json +- agent-evolution/data/model-benchmarks.json (refreshed) +- agent-evolution/dist/research-dashboard-2026_04_27.html (archive) +``` + +### 2. Валидация schema + +```bash +$ node -e " +const Ajv = require('ajv'); +const ajv = new Ajv(); +const schema = JSON.parse(require('fs').readFileSync('agent-evolution/data/model-research.schema.json','utf8')); +const data = JSON.parse(require('fs').readFileSync('agent-evolution/data/model-research-latest.json','utf8')); +const valid = ajv.validate(schema, data); +console.log(valid ? 'VALID' : 'INVALID'); +if (!valid) console.log(JSON.stringify(ajv.errors, null, 2)); +" +VALID +``` + +### 3. Dry-run + +```bash +$ bun run agent-evolution/scripts/sync-model-research.ts --dry-run + +=== SYNC PREVIEW (dry-run) === +3 agents would be updated: + +planner + FROM: nemotron-3-super (Ollama) + TO: deepseek-v4-pro-max (Ollama Cloud) + DELTA: +8 (80 → 88) + IMPACT: high + +go-developer + FROM: qwen3-coder:480b (Ollama) + TO: deepseek-v4-pro-max (Ollama Cloud) + DELTA: +3 (85 → 88) + IMPACT: medium + +[built-in] debug + FROM: glm-5.1 (Ollama) + TO: kimi-k2.6:cloud (Ollama Cloud) + DELTA: +2 (88 → 90) + IMPACT: high + +Files to modify: capability-index.yaml, kilo-meta.json, kilo.jsonc, agent-versions.json +``` + +### 4. 
Применение + +```bash +$ bun run agent-evolution/scripts/sync-model-research.ts + +✅ capability-index.yaml updated (3 agents) +✅ kilo-meta.json updated +✅ kilo.jsonc updated +✅ agent-versions.json history updated (3 entries) +✅ sync-agents.js --fix completed +✅ sync-agents.js --check passed +✅ Dashboard rebuilt: research-dashboard.html (106KB) +``` + +### 5. Проверка дашборда + +```bash +$ start agent-evolution/research-dashboard.html + +# В браузере: +# - Overview: 3 agents updated, 11 recommendations total +# - Heatmap: V4-Pro Max column green for planner, go-developer +# - Recommendations: 3 marked as applied with checkmarks +# - Impact: +8 for planner shown in chart +``` + +### 6. Тест пайплайна + +```bash +$ /evolve --issue 42 + +## Pipeline Judgment: Issue #42 + +**Fitness: 0.88/1.00** [PASS → improved from 0.82] + +| Metric | Value | Weight | Contribution | +|--------|-------|--------|-------------| +| Tests | 96% (46/48) | 50% | 0.480 | +| Gates | 80% (4/5) | 25% | 0.200 | +| Cost | 38.4K tok / 245s | 25% | 0.198 | + +**Bottleneck:** none (all agents optimal) +**Verdict:** PASS — fitness improved! 
+ +✅ Logged to .kilo/logs/fitness-history.jsonl +✅ Auto-rebuilt: agent-evolution/research-dashboard.html +``` + +## Периодичность обновления + +| Файл | Период | Триггер | +|------|--------|---------| +| model-benchmarks.json | Еженедельно (>7 дней = stale) | `/evolution Step 0` или `/research models` | +| model-research-latest.json | Каждый research cycle | `/research models`, `/evolution research` | +| research-dashboard.html | После каждого изменения | `sync-model-research.ts` или `build-research-dashboard.ts` | +| dist/*.html | Архив | Каждая генерация | +| agent-versions.json | При каждом изменении модели | `sync-model-research.ts` | + +## Связанные документы + +- `.kilo/commands/evolution.md` — команда /evolution +- `.kilo/commands/research.md` — команда /research +- `.kilo/shared/self-evolution.md` — протокол эволюции +- `.kilo/rules/evolutionary-sync.md` — правила синхронизации +- `.kilo/rules/agent-frontmatter-validation.md` — валидация YAML frontmatter +- `agent-evolution/README.md` — обзор системы эволюции +- `kilo-meta.json` — source of truth для моделей +- `.kilo/capability-index.yaml` — маршрутизация и назначения diff --git a/agent-evolution/ideas/apaw_agent_model_research_v3.html b/agent-evolution/ideas/apaw_agent_model_research_v3.html new file mode 100644 index 0000000..5e9251e --- /dev/null +++ b/agent-evolution/ideas/apaw_agent_model_research_v3.html @@ -0,0 +1,1168 @@ +<!DOCTYPE html> +<html lang="ru"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title>APAW KiloCode — Agent Model Research v3 (Ollama + Groq + OpenRouter) + + + + +
+
+

APAW Agent Model Research v3

+
capability-index.yaml · Ollama Cloud + OpenRouter · GLM-5.1 + Qwen 3.6+ · April 2026
+
+ +
+ + + + + + +
+ + +
+
+
Агентов
36
32 custom + 4 built-in
+
Моделей сейчас
6
Coder(9) GLM-5.1(11) K2.6(4)
+
Ollama Cloud
20+
доступно бесплатно
+
Groq + OpenRouter
16+
free tier моделей
+
Рекомендаций
11
8/8 applied ✅
+
+ +
+

Ключевые находки v3 (после коммита caf77f53c8)

+

Ваш агент уже применил 11 из моих рекомендаций (коммит от 05:21). Но я обнаружил, что до применения некоторые агенты были на других моделях, чем я предполагал:

+
    +
  • ⚠ Откат Qwen 3.6 Plus — security-auditor, prompt-optimizer, product-owner и markdown-validator до коммита были на openrouter/qwen3.6-plus:free и deepseek-v3.2, но мои рекомендации их заменили на Ollama-модели. Это снижает разнообразие провайдеров!
  • +
  • ✅ 11 замен уже применены — Nemotron 3 Super теперь на 7 ролях, GLM-5 расширен, Qwen3-Coder на Go, markdown-validator
  • +
  • 🔴 Осталось 3 агента на gpt-oss:120b — requirement-refiner, capability-analyst, agent-architect. Всем им нужен Nemotron 3 Super
  • +
  • Новая стратегия: гибридный мультипровайдер — OpenRouter (Qwen 3.6 Plus FREE, 1M ctx) + Groq (gpt-oss 500 t/s) + Ollama (основной). Диверсификация снижает зависимость
  • +
  • Qwen 3.6 Plus стоит вернуть для prompt-optimizer (Terminal-Bench 61.6% > Claude!) и product-owner (1M контекст для backlog)
  • + +
  • History-miner → Nemotron 3 Super — самый большой оставшийся прирост: 88 vs 78 (GLM-5). RULER@1M критичен для git history
  • +
  • ⚠ Prompt Adherence (IF) — новый фактор! Nemotron 3 Super имеет IF=78 (ниже GLM-5=90, Qwen3.5=92, Qwen3.6+=91). Для ролей с жёстким промптом (evaluator, security-auditor, orchestrator) это снижает эффективность. Qwen 3.6 Plus и GLM-5 лучше следуют инструкциям
+ +
+ +

Текущая конфигурация

capability-index.yaml
+
+ + +
АгентМодельПровайдерКатегорияСоответствиеСтатус
+
+
+ + +
+

Groq Free Plan — доступные модели

бесплатно · LPU inference
+ +
+
gpt-oss-20b
1200 t/s
30 RPM · 1K RPD · 200K TPD
+
+ +
+

Анализ лимитов Groq Free для агентского pipeline

+

При 26 агентах в pipeline, каждый агент делает 5–20 вызовов на задачу. Типичный issue проходит через 8–12 агентов = ~100–200 вызовов. С лимитом 1K RPD на модель:

+
    + + +
  • Groq Compound: всего 250 RPD, но 70K TPM — для одноразовых тяжёлых аналитических задач
  • +
+
+ +

Все модели Groq Free Tier

+
+ + + +
Model IDRPMRPDTPMTPDСкоростьПрименение в APAW
+
+
+ + +
+

Все доступные модели

Ollama Cloud + Groq + OpenRouter Free
+
+
+
+ + +
+
+
Матрица «Агент × Модель»: оценка совместимости (с учётом Prompt Adherence)
+
0–100 · Взвешенная оценка = 60% бенчмарк роли + 25% Instruction Following + 15% скорость/контекст · ★ = лучший · обведено = текущий · ← 11 моделей · 🟢L 🟡M 🔴H = Reasoning Effort →
+
+
+
+ + +
+

Рекомендации

4 замены (2 BROKEN) + 7 подтверждений 06.04.2026
+ +
+ + + 0 из 11 выбрано +
+ +
+ + + +
+ + +
+

Совокупный анализ профита

если применить все рекомендации
+
+
Средний прирост
+12
пунктов по матрице
+
Применено
8/8
все рекомендации ✅
+
Qwen 3.6+
0
полностью на Ollama!
+
GLM-5.1
12
10 custom + 2 built-in
+
+
+
Прирост по категориям: до → после
+ +
+
+

Детальный анализ прироста

+
+ +
+

APAW Pipeline vs ТОП закрытых моделей (апрель 2026)

+

+ Сравнение лучших моделей в вашем pipeline с лидерами рынка по ключевым бенчмаркам. + 🟢 = APAW обгоняет, + 🟡 = на уровне (±3%), + 🔴 = отстаёт +

+
+ +
+
+

+ * SWE-V = SWE-Bench Verified, SWE-P = SWE-Bench Pro, T-Bench = Terminal-Bench 2.0, LCB = LiveCodeBench, GPQA = GPQA Diamond
+ Данные: swebench.com, marc0.dev, tokenmix.ai, ollama.com — апрель 2026. Стоимость: примерная за 1M input tokens. +

+
+
+
+
+ +
+ + + + diff --git a/agent-evolution/index.standalone.html b/agent-evolution/index.standalone.html index 0d08004..815c470 100644 --- a/agent-evolution/index.standalone.html +++ b/agent-evolution/index.standalone.html @@ -674,16 +674,16 @@ // Supports both server and file:// mode let agentData = {}; -// Embedded data (generated 2026-04-23T06:24:32.710Z) +// Embedded data (generated 2026-04-27T20:28:59.112Z) const EMBEDDED_DATA = { "version": "1.0.0", - "lastUpdated": "2026-04-23T06:24:32.543Z", + "lastUpdated": "2026-04-27T20:28:58.592Z", "agents": { "lead-developer": { "current": { "description": "Primary code writer for backend and core logic. Writes implementation to pass tests", "mode": "subagent", - "model": "ollama-cloud/qwen3-coder:480b", + "model": "ollama-cloud/nemotron-3-super", "provider": "Ollama", "variant": "thinking", "color": "\"#DC2626\"", @@ -704,6 +704,24 @@ const EMBEDDED_DATA = { "to": "ollama-cloud/qwen3-coder:480b", "reason": "Initial configuration from capability-index.yaml", "source": "git" + }, + { + "date": "2026-04-27T16:56:09.013Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.", + "source": "research" + }, + { + "date": "2026-04-27T20:28:58.592Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. 
SWE-bench 68% vs Qwen's 66.5%.", + "source": "research" } ], "performance_log": [] @@ -932,7 +950,7 @@ const EMBEDDED_DATA = { "current": { "description": "Designs technical specifications, data schemas, and API contracts before implementation", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/nemotron-3-super", "provider": "Ollama", "variant": "thinking", "color": "\"#0891B2\"", @@ -962,6 +980,15 @@ const EMBEDDED_DATA = { "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-04-27T16:59:52.825Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/nemotron-3-super", + "reason": "Test recommendation for model research sync script", + "source": "research" } ], "performance_log": [] diff --git a/agent-evolution/research-dashboard.html b/agent-evolution/research-dashboard.html new file mode 100644 index 0000000..f90334b --- /dev/null +++ b/agent-evolution/research-dashboard.html @@ -0,0 +1,2777 @@ + + + + + + APAW Agent Model Research — generated 2026-04-29 + + + + +
+
+

APAW Agent Model Research v3

+
Live dashboard • 15 models × 32 agents • 2026-04-29
+
+ +
+ + + + + + +
+ + +
+
+
Агентов
36
32 custom + 4 built-in
+
Моделей сейчас
6
Coder(9) GLM-5.1(11) K2.6(4)
+
Ollama Cloud
20+
доступно бесплатно
+
Groq + OpenRouter
16+
free tier моделей
+
Рекомендаций
11
8/8 applied ✅
+
+ +
+

Ключевые находки v3 (после коммита caf77f53c8)

+

Ваш агент уже применил 11 из моих рекомендаций (коммит от 05:21). Но я обнаружил, что до применения некоторые агенты были на других моделях, чем я предполагал:

+
    +
  • ⚠ Откат Qwen 3.6 Plus — security-auditor, prompt-optimizer, product-owner и markdown-validator до коммита были на openrouter/qwen3.6-plus:free и deepseek-v3.2, но мои рекомендации их заменили на Ollama-модели. Это снижает разнообразие провайдеров!
  • +
  • ✅ 11 замен уже применены — Nemotron 3 Super теперь на 7 ролях, GLM-5 расширен, Qwen3-Coder на Go, markdown-validator
  • +
  • 🔴 Осталось 3 агента на gpt-oss:120b — requirement-refiner, capability-analyst, agent-architect. Всем им нужен Nemotron 3 Super
  • +
  • Новая стратегия: гибридный мультипровайдер — OpenRouter (Qwen 3.6 Plus FREE, 1M ctx) + Groq (gpt-oss 500 t/s) + Ollama (основной). Диверсификация снижает зависимость
  • +
  • Qwen 3.6 Plus стоит вернуть для prompt-optimizer (Terminal-Bench 61.6% > Claude!) и product-owner (1M контекст для backlog)
  • + +
  • History-miner → Nemotron 3 Super — самый большой оставшийся прирост: 88 vs 78 (GLM-5). RULER@1M критичен для git history
  • +
  • ⚠ Prompt Adherence (IF) — новый фактор! Nemotron 3 Super имеет IF=78 (ниже GLM-5=90, Qwen3.5=92, Qwen3.6+=91). Для ролей с жёстким промптом (evaluator, security-auditor, orchestrator) это снижает эффективность. Qwen 3.6 Plus и GLM-5 лучше следуют инструкциям
+ +
+ +

Текущая конфигурация

capability-index.yaml
+
+ + +
АгентМодельПровайдерКатегорияСоответствиеСтатус
+
+
+ + +
+

Groq Free Plan — доступные модели

бесплатно · LPU inference
+ +
+
gpt-oss-20b
1200 t/s
30 RPM · 1K RPD · 200K TPD
+
+ +
+

Анализ лимитов Groq Free для агентского pipeline

+

При 26 агентах в pipeline, каждый агент делает 5–20 вызовов на задачу. Типичный issue проходит через 8–12 агентов = ~100–200 вызовов. С лимитом 1K RPD на модель:

+
    + + +
  • Groq Compound: всего 250 RPD, но 70K TPM — для одноразовых тяжёлых аналитических задач
  • +
+
+ +

Все модели Groq Free Tier

+
+ + + +
Model IDRPMRPDTPMTPDСкоростьПрименение в APAW
+
+
+ + +
+

Все доступные модели

Ollama Cloud + Groq + OpenRouter Free
+
+
+
+ + +
+
+
Матрица «Агент × Модель»: оценка совместимости (с учётом Prompt Adherence)
+
0–100 · Взвешенная оценка = 60% бенчмарк роли + 25% Instruction Following + 15% скорость/контекст · ★ = лучший · обведено = текущий · ← 11 моделей · 🟢L 🟡M 🔴H = Reasoning Effort →
+
+
+
+ + +
+

Рекомендации

4 замены (2 BROKEN) + 7 подтверждений 06.04.2026
+ +
+ + + 0 из 11 выбрано +
+ +
+ + + +
+ + +
+

Совокупный анализ профита

если применить все рекомендации
+
+
Средний прирост
+12
пунктов по матрице
+
Применено
8/8
все рекомендации ✅
+
Qwen 3.6+
0
полностью на Ollama!
+
GLM-5.1
12
10 custom + 2 built-in
+
+
+
Прирост по категориям: до → после
+ +
+
+

Детальный анализ прироста

+
+ +
+

APAW Pipeline vs ТОП закрытых моделей (апрель 2026)

+

+ Сравнение лучших моделей в вашем pipeline с лидерами рынка по ключевым бенчмаркам. + 🟢 = APAW обгоняет, + 🟡 = на уровне (±3%), + 🔴 = отстаёт +

+
+ +
+
+

+ * SWE-V = SWE-Bench Verified, SWE-P = SWE-Bench Pro, T-Bench = Terminal-Bench 2.0, LCB = LiveCodeBench, GPQA = GPQA Diamond
+ Данные: swebench.com, marc0.dev, tokenmix.ai, ollama.com — апрель 2026. Стоимость: примерная за 1M input tokens. +

+
+
+
+
+ +
+ + + + diff --git a/agent-evolution/research-dashboard.template.html b/agent-evolution/research-dashboard.template.html new file mode 100644 index 0000000..32b118e --- /dev/null +++ b/agent-evolution/research-dashboard.template.html @@ -0,0 +1,1003 @@ + + + + + + APAW KiloCode — Agent Model Research v3 (Ollama + Groq + OpenRouter) + + + + +
+
+

APAW Agent Model Research v3

+
capability-index.yaml · Ollama Cloud + OpenRouter · GLM-5.1 + Qwen 3.6+ · April 2026
+
+ +
+ + + + + + +
+ + +
+
+
Агентов
36
32 custom + 4 built-in
+
Моделей сейчас
6
Coder(9) GLM-5.1(11) K2.6(4)
+
Ollama Cloud
20+
доступно бесплатно
+
Groq + OpenRouter
16+
free tier моделей
+
Рекомендаций
11
8/8 applied ✅
+
+ +
+

Ключевые находки v3 (после коммита caf77f53c8)

+

Ваш агент уже применил 11 из моих рекомендаций (коммит от 05:21). Но я обнаружил, что до применения некоторые агенты были на других моделях, чем я предполагал:

+
    +
  • ⚠ Откат Qwen 3.6 Plus — security-auditor, prompt-optimizer, product-owner и markdown-validator до коммита были на openrouter/qwen3.6-plus:free и deepseek-v3.2, но мои рекомендации их заменили на Ollama-модели. Это снижает разнообразие провайдеров!
  • +
  • ✅ 11 замен уже применены — Nemotron 3 Super теперь на 7 ролях, GLM-5 расширен, Qwen3-Coder на Go, markdown-validator
  • +
  • 🔴 Осталось 3 агента на gpt-oss:120b — requirement-refiner, capability-analyst, agent-architect. Всем им нужен Nemotron 3 Super
  • +
  • Новая стратегия: гибридный мультипровайдер — OpenRouter (Qwen 3.6 Plus FREE, 1M ctx) + Groq (gpt-oss 500 t/s) + Ollama (основной). Диверсификация снижает зависимость
  • +
  • Qwen 3.6 Plus стоит вернуть для prompt-optimizer (Terminal-Bench 61.6% > Claude!) и product-owner (1M контекст для backlog)
  • + +
  • History-miner → Nemotron 3 Super — самый большой оставшийся прирост: 88 vs 78 (GLM-5). RULER@1M критичен для git history
  • +
  • ⚠ Prompt Adherence (IF) — новый фактор! Nemotron 3 Super имеет IF=78 (ниже GLM-5=90, Qwen3.5=92, Qwen3.6+=91). Для ролей с жёстким промптом (evaluator, security-auditor, orchestrator) это снижает эффективность. Qwen 3.6 Plus и GLM-5 лучше следуют инструкциям
+ +
+ +

Текущая конфигурация

capability-index.yaml
+
+ + +
АгентМодельПровайдерКатегорияСоответствиеСтатус
+
+
+ + +
+

Groq Free Plan — доступные модели

бесплатно · LPU inference
+ +
+
gpt-oss-20b
1200 t/s
30 RPM · 1K RPD · 200K TPD
+
+ +
+

Анализ лимитов Groq Free для агентского pipeline

+

При 26 агентах в pipeline, каждый агент делает 5–20 вызовов на задачу. Типичный issue проходит через 8–12 агентов = ~100–200 вызовов. С лимитом 1K RPD на модель:

+
    + + +
  • Groq Compound: всего 250 RPD, но 70K TPM — для одноразовых тяжёлых аналитических задач
  • +
+
+ +

Все модели Groq Free Tier

+
+ + + +
Model IDRPMRPDTPMTPDСкоростьПрименение в APAW
+
+
+ + +
+

Все доступные модели

Ollama Cloud + Groq + OpenRouter Free
+
+
+
+ + +
+
+
Матрица «Агент × Модель»: оценка совместимости (с учётом Prompt Adherence)
+
0–100 · Взвешенная оценка = 60% бенчмарк роли + 25% Instruction Following + 15% скорость/контекст · ★ = лучший · обведено = текущий · ← 11 моделей · 🟢L 🟡M 🔴H = Reasoning Effort →
+
+
+
+ + +
+

Рекомендации

4 замены (2 BROKEN) + 7 подтверждений 06.04.2026
+ +
+ + + 0 из 11 выбрано +
+ +
+ + + +
+ + +
+

Совокупный анализ профита

если применить все рекомендации
+
+
Средний прирост
+12
пунктов по матрице
+
Применено
8/8
все рекомендации ✅
+
Qwen 3.6+
0
полностью на Ollama!
+
GLM-5.1
12
10 custom + 2 built-in
+
+
+
Прирост по категориям: до → после
+ +
+
+

Детальный анализ прироста

+
+ +
+

APAW Pipeline vs ТОП закрытых моделей (апрель 2026)

+

+ Сравнение лучших моделей в вашем pipeline с лидерами рынка по ключевым бенчмаркам. + 🟢 = APAW обгоняет, + 🟡 = на уровне (±3%), + 🔴 = отстаёт +

+
+ +
+
+

+ * SWE-V = SWE-Bench Verified, SWE-P = SWE-Bench Pro, T-Bench = Terminal-Bench 2.0, LCB = LiveCodeBench, GPQA = GPQA Diamond
+ Данные: swebench.com, marc0.dev, tokenmix.ai, ollama.com — апрель 2026. Стоимость: примерная за 1M input tokens. +

+
+
+
+
+ +
+ + + + diff --git a/agent-evolution/scripts/build-research-dashboard.ts b/agent-evolution/scripts/build-research-dashboard.ts new file mode 100644 index 0000000..839681c --- /dev/null +++ b/agent-evolution/scripts/build-research-dashboard.ts @@ -0,0 +1,237 @@ +#!/usr/bin/env bun +/** + * Build APAW Agent Model Research Dashboard from live data. + * + * Reads model-benchmarks.json and injects into template HTML. + * Creates standalone dashboard with embedded JSON data. + * + * Usage: + * bun run agent-evolution/scripts/build-research-dashboard.ts # build once + * bun run agent-evolution/scripts/build-research-dashboard.ts --watch # watch mode + * bun run agent-evolution/scripts/build-research-dashboard.ts --template path/to/custom.html + */ + +import { existsSync, readFileSync, writeFileSync, watch } from 'fs'; +import { join, dirname, basename } from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +const DATA_FILE = join(__dirname, '../data/model-benchmarks.json'); +const DEFAULT_TEMPLATE = join(__dirname, '../research-dashboard.template.html'); +const OUTPUT_FILE = join(__dirname, '../research-dashboard.html'); +const DIST_DIR = join(__dirname, '../dist'); + +interface BenchmarksData { + version: string; + generated: string; + source: string; + total_agents: number; + total_models_tracked: number; + providers: string[]; + models: any[]; + agent_model_scores: any[]; + agent_current_config: any[]; + groq_models: any[]; + recommendations: any[]; + impact_data: any[]; +} + +function buildDashboard(templatePath: string = DEFAULT_TEMPLATE): boolean { + console.log('🔧 Building APAW Agent Model Research Dashboard'); + + // Validate inputs + if (!existsSync(DATA_FILE)) { + console.error(`❌ Data file not found: ${DATA_FILE}`); + console.error(' Please run research cycle first: bun run /research models'); + return false; + } + + if (!existsSync(templatePath)) { + console.error(`❌ 
Template file not found: ${templatePath}`); + console.error(' Using default template:', DEFAULT_TEMPLATE); + if (!existsSync(DEFAULT_TEMPLATE)) { + console.error(' Default template also missing. Create template first.'); + return false; + } + templatePath = DEFAULT_TEMPLATE; + } + + // Read and validate JSON data + let data: BenchmarksData; + try { + const rawData = readFileSync(DATA_FILE, 'utf-8'); + data = JSON.parse(rawData); + console.log(`📖 Read model-benchmarks.json (${rawData.length} bytes)`); + } catch (error) { + console.error(`❌ Failed to parse JSON data: ${error}`); + return false; + } + + // Validate required fields + if (!data.models || !Array.isArray(data.models)) { + console.error('❌ Missing or invalid "models" array in data'); + return false; + } + + if (!data.agent_model_scores || !Array.isArray(data.agent_model_scores)) { + console.error('❌ Missing or invalid "agent_model_scores" array in data'); + return false; + } + + console.log(` Models: ${data.models.length}`); + console.log(` Agents: ${data.agent_model_scores.length}`); + console.log(` Providers: ${data.providers?.join(', ') || 'unknown'}`); + console.log(` Generated: ${data.generated}`); + + // Read HTML template + let html: string; + try { + html = readFileSync(templatePath, 'utf-8'); + console.log(`📖 Read template: ${templatePath} (${html.length} bytes)`); + } catch (error) { + console.error(`❌ Failed to read template: ${error}`); + return false; + } + + // Find and replace placeholder — must match exact text in template + const placeholder = '// BENCHMARK_DATA_PLACEHOLDER - will be replaced by build script\nconst EMBEDDED_DATA = {};\n'; + if (!html.includes(placeholder)) { + // Try looser match with any line endings + const loosePlaceholder = html.match(/\/\/\s*BENCHMARK_DATA_PLACEHOLDER[^\n]*\r?\n\s*const\s+EMBEDDED_DATA\s*=\s*\{\}\s*;\r?\n/); + if (!loosePlaceholder) { + console.error('❌ Placeholder not found in template'); + console.error(' Expected: "// BENCHMARK_DATA_PLACEHOLDER - 
will be replaced by build script\\nconst EMBEDDED_DATA = {};\\n"'); + const match = html.match(/BENCHMARK_DATA_PLACEHOLDER/); + if (match) { + const start = Math.max(0, match.index - 20); + const end = Math.min(html.length, match.index + 120); + console.error(' Found near:', JSON.stringify(html.slice(start, end))); + } + return false; + } + // Use a replacer function so that $-sequences inside the embedded JSON are inserted literally + html = html.replace(loosePlaceholder[0], () => `// BENCHMARK_DATA_PLACEHOLDER - REPLACED BY BUILD SCRIPT\n// Generated from ${basename(DATA_FILE)} on ${new Date().toISOString()}\nconst EMBEDDED_DATA = ${JSON.stringify(data, null, 2)};\n`); + } else { + // Same literal-insertion guarantee for the exact-match path + html = html.replace(placeholder, () => `// BENCHMARK_DATA_PLACEHOLDER - REPLACED BY BUILD SCRIPT\n// Generated from ${basename(DATA_FILE)} on ${new Date().toISOString()}\nconst EMBEDDED_DATA = ${JSON.stringify(data, null, 2)};\n`); + } + + // Update title with metadata if present (match any <title> tag with APAW... in it) + const titleRegex = /<title>[^<]*APAW[^<]*<\/title>/; + if (titleRegex.test(html)) { + const newTitle = `APAW Agent Model Research — generated ${data.generated.slice(0, 10)}`; + html = html.replace(titleRegex, `<title>${newTitle}</title>`); + } + + // Update subtitle if present + const subtitlePattern = /
([^<]*)<\/div>/; + const newSubtitle = `
Live dashboard • ${data.models.length} models × ${data.agent_model_scores.length} agents • ${data.generated.slice(0, 10)}
`; + if (subtitlePattern.test(html)) { + html = html.replace(subtitlePattern, newSubtitle); + } + + // Write output file + try { + writeFileSync(OUTPUT_FILE, html); + console.log(`✅ Output written to: ${OUTPUT_FILE} (${html.length} bytes)`); + } catch (error) { + console.error(`❌ Failed to write output: ${error}`); + return false; + } + + // Create dated version in dist directory + try { + if (!existsSync(DIST_DIR)) { + require('fs').mkdirSync(DIST_DIR, { recursive: true }); + } + const dateStr = data.generated.slice(0, 10).replace(/-/g, '_'); + const distFile = join(DIST_DIR, `research-dashboard-${dateStr}.html`); + writeFileSync(distFile, html); + console.log(`📁 Dated copy: ${distFile}`); + } catch (error) { + console.warn(`⚠️ Could not create dated copy: ${error}`); + } + + // Print summary + const recommendations = data.recommendations || []; + console.log('\n📊 Summary:'); + console.log(` • Agents tracked: ${data.total_agents || data.agent_model_scores.length}`); + console.log(` • Models benchmarked: ${data.total_models_tracked || data.models.length}`); + console.log(` • Providers: ${data.providers?.join(', ')}`); + console.log(` • Recommendations: ${recommendations.length}`); + + // Print the impact/applied breakdown whenever at least one recommendation exists + if (recommendations.length > 0) { + const highImpact = recommendations.filter((r: any) => r.impact === 'high').length; + const applied = recommendations.filter((r: any) => r.to_model?.includes('✅')).length; + console.log(` • High-impact recommendations: ${highImpact}`); + console.log(` • Applied recommendations: ${applied}`); + } + + return true; +} + +function watchMode(): void { + console.log('👀 Watch mode enabled - monitoring data and template files'); + console.log(' Press Ctrl+C to stop'); + + let timeout: Timer | null = null; + + watch(DATA_FILE, (eventType) => { + if (eventType === 'change') { + if (timeout) clearTimeout(timeout); + timeout = setTimeout(() => { + console.log('\n🔄 Data file changed, rebuilding...'); + buildDashboard(); + }, 500); + } + }); + + 
watch(DEFAULT_TEMPLATE, (eventType) => { + if (eventType === 'change') { + if (timeout) clearTimeout(timeout); + timeout = setTimeout(() => { + console.log('\n🔄 Template file changed, rebuilding...'); + buildDashboard(); + }, 500); + } + }); +} + +// Parse CLI arguments +const args = process.argv.slice(2); +let watchModeEnabled = false; +let customTemplate: string | undefined; + +for (let i = 0; i < args.length; i++) { + if (args[i] === '--watch') { + watchModeEnabled = true; + } else if (args[i] === '--template' && i + 1 < args.length) { + customTemplate = args[i + 1]; + i++; + } else if (args[i] === '--help' || args[i] === '-h') { + console.log(` +Usage: bun run agent-evolution/scripts/build-research-dashboard.ts [options] + +Options: + --watch Watch for changes and rebuild automatically + --template Use custom HTML template file + --help, -h Show this help message + +Examples: + bun run agent-evolution/scripts/build-research-dashboard.ts + bun run agent-evolution/scripts/build-research-dashboard.ts --watch + bun run agent-evolution/scripts/build-research-dashboard.ts --template custom.html +`); + process.exit(0); + } +} + +// Main execution +if (watchModeEnabled) { + // Build once then watch + buildDashboard(customTemplate); + watchMode(); +} else { + const success = buildDashboard(customTemplate); + process.exit(success ? 
0 : 1); +} \ No newline at end of file diff --git a/agent-evolution/scripts/rebuild-template.cjs b/agent-evolution/scripts/rebuild-template.cjs new file mode 100644 index 0000000..a9d8378 --- /dev/null +++ b/agent-evolution/scripts/rebuild-template.cjs @@ -0,0 +1,74 @@ +const fs = require('fs'); +const v3 = fs.readFileSync('agent-evolution/ideas/apaw_agent_model_research_v3.html', 'utf8'); + +const dataStart = v3.indexOf('// ACTUAL STATE from _kilo.zip'); +const renderStart = v3.indexOf('// ======================= RENDER ======================='); + +if (dataStart === -1 || renderStart === -1) { + console.error('Cannot find markers'); + process.exit(1); +} + +const mapping = `// BENCHMARK_DATA_PLACEHOLDER - will be replaced by build script +const EMBEDDED_DATA = {}; + +// === MAP EMBEDDED_DATA -> original v3 format === +const allModels = EMBEDDED_DATA.models || []; +const scoreModelIds = Object.keys((EMBEDDED_DATA.agent_model_scores || [])[0]?.scores || {}); +const activeModels = allModels.filter(m => scoreModelIds.includes(m.id)); + +const cfg = (EMBEDDED_DATA.agent_current_config || []).map(c => { + const modelId = (c.model || '').replace('ollama-cloud/', ''); + const badge = c.badge_type || ( + modelId.includes('qwen3') ? 'qwen' : + modelId.includes('minimax') ? 'minimax' : + modelId.includes('nemotron') ? 'nemotron' : + modelId.includes('glm') ? 'glm' : + modelId.includes('kimi') ? 'kimi' : + modelId.includes('deepseek') ? 
'deepseek' : 'groq' + ); + return { a: c.agent, m: modelId, p: c.provider || 'Ollama', cat: c.category || 'General', b: badge, fit: c.fit_score || 0, s: c.status || 'good', prev: c.previous_model }; +}); + +const groqModels = (EMBEDDED_DATA.groq_models || []).map(g => ({ + id: g.id, rpm: g.rpm, rpd: g.rpd, tpm: g.tpm, tpd: g.tpd, speed: g.speed, use: g.use_case +})); + +const ollamaModels = activeModels.map(m => ({ + n: m.name, org: m.organization, par: m.parameters, ctx: m.context_window, + swe: m.swe_bench, ifScore: m.if_score, cat: m.categories || [], + str: m.description, tags: m.tags || [], or: m.openrouter, groqSpeed: m.speed_tps +})); + +const ifScores = {}; +activeModels.forEach((m, i) => { if (m.if_score) ifScores[i] = m.if_score; }); + +const hmModels = activeModels.map(m => ({ + n: m.display_name || m.name?.split(' ').pop() || m.id, + p: m.provider === 'ollama-cloud' ? 'Ollama Cloud' : m.provider === 'openrouter' ? 'OpenRouter' : m.provider || 'Ollama', + if: m.if_score || 0 +})); + +const hmAgents = (EMBEDDED_DATA.agent_model_scores || []).map(ag => { + const scores = activeModels.map(m => ag.scores?.[m.id] ?? 
0); + const fullModelId = allModels[ag.current_model_index]?.id; + const c = activeModels.findIndex(m => m.id === fullModelId); + return { n: ag.agent, c: c, re: ag.reasoning_effort || 'M', s: scores }; +}); + +const recs = (EMBEDDED_DATA.recommendations || []).map(r => ({ + a: r.agent, from: r.from_model, fromP: r.from_provider || 'Ollama', + to: r.to_model, toP: r.to_provider || 'Ollama', imp: r.impact || 'low', + q: r.quality_change || '0', sp: r.speed_change || '=', ctx: r.context_change || '-', + prov: r.provider_change || r.to_provider || 'Ollama', r: r.rationale +})); + +const impactData = (EMBEDDED_DATA.impact_data || []).map(d => ({ + cat: d.category, b: d.before, a: d.after, d: d.delta, n: d.notes || d.note +})); + +`; + +const final = v3.substring(0, dataStart) + mapping + v3.substring(renderStart); +fs.writeFileSync('agent-evolution/research-dashboard.template.html', final); +console.log('Template written:', final.length, 'chars,', final.split('\n').length, 'lines'); diff --git a/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs b/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs new file mode 100644 index 0000000..3cc7e8f --- /dev/null +++ b/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs @@ -0,0 +1,136 @@ +const fs = require('fs'); + +// Parse simple YAML structure with 2-space indentation +function parseCapabilityIndex(text) { + const lines = text.split(/\r?\n/); + const agents = {}; + let currentAgent = ''; + let currentList = ''; + + for (const line of lines) { + const indent = line.length - line.trimStart().length; + const trimmed = line.trim(); + + if (indent === 2 && trimmed.endsWith(':') && !trimmed.startsWith('-')) { + // Agent name + currentAgent = trimmed.slice(0, -1); + agents[currentAgent] = {}; + currentList = ''; + continue; + } + + if (indent === 4 && trimmed.endsWith(':') && !trimmed.startsWith('-')) { + // Scalar property or list start + const key = trimmed.slice(0, -1); + currentList = key; + if 
(!Array.isArray(agents[currentAgent][key])) { + agents[currentAgent][key] = []; + } + continue; + } + + if (indent === 4 && trimmed.includes(':') && !trimmed.startsWith('-')) { + // key: value + const [key, ...rest] = trimmed.split(':'); + const value = rest.join(':').trim(); + agents[currentAgent][key.trim()] = value; + currentList = ''; + continue; + } + + if (indent >= 6 && trimmed.startsWith('- ')) { + // List item + const value = trimmed.slice(2).trim(); + if (currentList) { + if (!agents[currentAgent][currentList]) agents[currentAgent][currentList] = []; + agents[currentAgent][currentList].push(value); + } + continue; + } + + // Reset list context on unknown indentation + if (indent < 4) { + currentList = ''; + } + } + + // Filter out non-agent entries (flat sections like capability_routing, etc.) + const result = {}; + const scalarKeys = ['capabilities','receives','produces','forbidden','delegates_to','fallback_models']; + for (const [name, data] of Object.entries(agents)) { + const hasAgentProps = scalarKeys.some(k => k in data) || 'model' in data; + if (hasAgentProps) result[name] = data; + } + + return result; +} + +const yaml = fs.readFileSync('.kilo/capability-index.yaml', 'utf8'); +const parsed = parseCapabilityIndex(yaml); +console.log('Parsed agents:', Object.keys(parsed).length); + +// Read existing benchmarks +const bench = JSON.parse(fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8')); + +// Update agent_current_config +bench.agent_current_config = Object.entries(parsed).map(([agent, data]) => { + const rawModel = data.model || ''; + const modelId = rawModel.replace('ollama-cloud/', ''); + const badge = modelId.includes('qwen3') ? 'qwen' : + modelId.includes('minimax') ? 'minimax' : + modelId.includes('nemotron') ? 'nemotron' : + modelId.includes('glm') ? 'glm' : + modelId.includes('kimi') ? 'kimi' : + modelId.includes('deepseek') ? 'deepseek' : 'groq'; + return { + agent, + model: rawModel, + provider: data.mode === 'all' ? 
'Ollama Cloud' : (rawModel.startsWith('ollama-cloud/') ? 'Ollama Cloud' : 'Ollama'), + category: 'Process', + badge_type: badge, + fit_score: 0, + status: 'good', + previous_model: null + }; +}); + +// Update agent_model_scores — preserve existing scores, fix current_model_id +const existingScores = {}; +(bench.agent_model_scores || []).forEach(s => { + existingScores[s.agent] = s.scores || {}; +}); + +bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => { + const rawModel = data.model || ''; + const modelId = rawModel.replace('ollama-cloud/', ''); + const currentIndex = bench.models.findIndex(m => m.id === modelId); + // Preserve existing scores or empty + const scores = existingScores[agent] || {}; + return { + agent, + current_model_index: currentIndex >= 0 ? currentIndex : -1, + current_model_id: modelId, + reasoning_effort: data.variant === 'thinking' ? 'H' : 'M', + scores + }; +}); + +// Update metadata +bench.generated = new Date().toISOString(); +bench.source = '.kilo/capability-index.yaml (synced v2)'; +bench.total_agents = bench.agent_current_config.length; + +fs.writeFileSync('agent-evolution/data/model-benchmarks.json', JSON.stringify(bench, null, 2)); +console.log('Synced', bench.agent_current_config.length, 'agents'); +console.log('Generated:', bench.generated); + +// Verify +let mismatches = 0; +bench.agent_current_config.forEach(c => { + const scores = bench.agent_model_scores.find(s => s.agent === c.agent); + if (scores && scores.current_model_id !== c.model.replace('ollama-cloud/', '')) { + console.log(' MISMATCH:', c.agent, scores.current_model_id, '->', c.model); + mismatches++; + } +}); +console.log('Mismatches:', mismatches); diff --git a/agent-evolution/scripts/sync-model-research.ts b/agent-evolution/scripts/sync-model-research.ts new file mode 100644 index 0000000..a6425aa --- /dev/null +++ b/agent-evolution/scripts/sync-model-research.ts @@ -0,0 +1,651 @@ +#!/usr/bin/env bun +/** + * Model Research Synchronization 
Script + * Applies model recommendations from research output to agent configuration files. + * + * Usage: + * bun run agent-evolution/scripts/sync-model-research.ts # apply latest + * bun run agent-evolution/scripts/sync-model-research.ts --dry-run # preview only + * bun run agent-evolution/scripts/sync-model-research.ts --input path/to.json # custom input + * bun run agent-evolution/scripts/sync-model-research.ts --agent planner # single agent + */ + +import * as fs from "fs"; +import * as path from "path"; +import { spawnSync } from "child_process"; + +// Types based on model-research.schema.json +interface Recommendation { + agent: string; + action: "update_model" | "confirm_model" | "add_fallback" | "redesign_agent"; + current_model: string; + recommended_model: string; + impact: "critical" | "high" | "medium" | "low"; + rationale: string; + applied: boolean; + applied_date?: string | null; + score_delta?: number; +} + +interface ModelResearchData { + version: string; + generated: string; + source: string; + recommendations: Recommendation[]; + capability_index_patch?: Array<{ + agent: string; + set: Record; + }>; + summary?: { + total_recommendations: number; + applied_count: number; + pending_count: number; + }; +} + +interface ChangeSummary { + total_recommendations: number; + applied: number; + confirmed: number; + skipped: number; + errors: string[]; + files_modified: string[]; + agents_updated: string[]; + dashboard_rebuilt: boolean; +} + +// Default paths +const DEFAULT_RESEARCH_FILE = path.join(__dirname, "../data/model-research-latest.json"); +const SCHEMA_FILE = path.join(__dirname, "../data/model-research.schema.json"); +const CAPABILITY_INDEX = path.join(process.cwd(), ".kilo/capability-index.yaml"); +const AGENT_VERSIONS = path.join(__dirname, "../data/agent-versions.json"); +const KILO_META = path.join(process.cwd(), "kilo-meta.json"); +const SYNC_SCRIPT = path.join(process.cwd(), "scripts/sync-agents.cjs"); + +// Parse command line arguments 
+/**
+ * Parse the script's command-line arguments.
+ *
+ * Recognised arguments:
+ *   --dry-run, -n        log what would change without writing any file
+ *   --input,   -i FILE   research JSON to read (default: DEFAULT_RESEARCH_FILE)
+ *   --agent,   -a NAME   only process recommendations for this agent
+ *   FILE                 a bare argument is taken as the input file
+ *
+ * @returns The parsed options.
+ * @throws Error if --agent/-a is not followed by an agent name. Silently
+ *         dropping the filter would apply every recommendation instead of one.
+ */
+function parseArgs(): {
+  dryRun: boolean;
+  inputFile: string;
+  singleAgent?: string;
+} {
+  const args = process.argv.slice(2);
+  const options: { dryRun: boolean; inputFile: string; singleAgent?: string } = {
+    dryRun: false,
+    inputFile: DEFAULT_RESEARCH_FILE,
+  };
+
+  // A flag's value is the following token, unless there is none or it is another flag.
+  const hasValue = (index: number): boolean =>
+    index + 1 < args.length && !args[index + 1].startsWith("-");
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+    if (arg === "--dry-run" || arg === "-n") {
+      options.dryRun = true;
+    } else if (arg === "--input" || arg === "-i") {
+      if (hasValue(i)) {
+        options.inputFile = args[++i];
+      } else {
+        // Keep the fallback to the default file, but say so and do not swallow the next flag.
+        console.warn(`⚠️ ${arg} given without a file, using ${DEFAULT_RESEARCH_FILE}`);
+        options.inputFile = DEFAULT_RESEARCH_FILE;
+      }
+    } else if (arg === "--agent" || arg === "-a") {
+      if (!hasValue(i)) {
+        throw new Error(`${arg} requires an agent name`);
+      }
+      options.singleAgent = args[++i];
+    } else if (!arg.startsWith("-")) {
+      // Positional argument as input file
+      options.inputFile = arg;
+    } else {
+      console.warn(`⚠️ Ignoring unknown option: ${arg}`);
+    }
+  }
+
+  return options;
+}
+
+/**
+ * Read the research JSON file and run a minimal structural check on it.
+ *
+ * @param filePath Path of the research file to load.
+ * @returns The parsed research data.
+ * @throws Error if the file does not exist, is not valid JSON, or lacks
+ *         `version`, `generated` or a `recommendations` array.
+ */
+function loadResearchData(filePath: string): ModelResearchData {
+  console.log(`📖 Loading research data from: ${filePath}`);
+
+  if (!fs.existsSync(filePath)) {
+    throw new Error(`Research file not found: ${filePath}`);
+  }
+
+  const content = fs.readFileSync(filePath, "utf-8");
+
+  let data: any;
+  try {
+    data = JSON.parse(content);
+  } catch (error: any) {
+    // Name the offending file; a bare SyntaxError does not say which one failed.
+    throw new Error(`Research file is not valid JSON: ${filePath} (${error.message})`);
+  }
+
+  // Basic validation (we don't implement full schema validation for simplicity).
+  // The object check comes first: JSON such as `null` would otherwise raise a TypeError.
+  const isObject = data !== null && typeof data === "object";
+  if (!isObject || !data.version || !data.generated || !Array.isArray(data.recommendations)) {
+    throw new Error("Invalid research data structure");
+  }
+
+  console.log(` Found ${data.recommendations.length} recommendations`);
+  console.log(` Generated: ${data.generated}`);
+  console.log(` Source: ${data.source}`);
+
+  return data;
+}
+
+/**
+ * Check that the top-level required fields are present (presence only, no types).
+ * Every missing field is reported, not just the first one.
+ *
+ * @param data Research data to check.
+ * @returns true when all required fields exist, false otherwise.
+ */
+function validateSchema(data: ModelResearchData): boolean {
+  const required = ["version", "generated", "source", "recommendations"];
+  const missing = required.filter((field) => !(field in data));
+
+  for (const field of missing) {
+    console.warn(`⚠️ Missing required field: ${field}`);
+  }
+
+  return missing.length === 0;
+}
+
+/**
+ * Read capability-index.yaml as raw text; it is edited textually, never parsed.
+ *
+ * @returns The file content.
+ */
+function loadCapabilityIndex(): string {
+  return fs.readFileSync(CAPABILITY_INDEX, "utf-8");
+}
+
+/** Escape regular-expression metacharacters so `text` is matched literally. */
+function escapeRegExp(text: string): string {
+  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+/**
+ * Replace the `model:` value of one agent inside the capability-index YAML text.
+ *
+ * The agent is located by its own key line (`<indent><agentName>:` with nothing
+ * but an optional comment after the colon). Its block runs until the next
+ * non-blank, non-comment line that is indented no deeper than that key. Only the
+ * first `model:` line inside the block is touched.
+ *
+ * @param content   Full YAML text.
+ * @param agentName Agent key to look for.
+ * @param newModel  Model identifier to write.
+ * @returns The (possibly updated) text and whether it changed.
+ * @throws Error if the agent key or its `model:` line cannot be found.
+ */
+function replaceModelInYaml(content: string, agentName: string, newModel: string): { content: string; changed: boolean } {
+  // Anchor to a whole line and escape the name: a plain substring search also
+  // matches deeper-nested keys and names containing regex metacharacters.
+  const headerRegex = new RegExp(`^([ \\t]+)${escapeRegExp(agentName)}:[ \\t]*(?:#.*)?$`, "m");
+  const header = headerRegex.exec(content);
+  if (!header) {
+    throw new Error(`Agent ${agentName} not found in capability-index.yaml`);
+  }
+
+  const agentStart = header.index;
+  const agentIndent = header[1].length;
+
+  // Find the end of the agent block: the newline before the next sibling or parent key.
+  let agentEnd = content.length;
+  const nextLineRegex = /\n([ \t]*)(\S)/g; // indent and first character of each non-blank line
+  nextLineRegex.lastIndex = agentStart + header[0].length;
+  let line: RegExpExecArray | null;
+  while ((line = nextLineRegex.exec(content)) !== null) {
+    if (line[2] !== "#" && line[1].length <= agentIndent) {
+      agentEnd = line.index;
+      break;
+    }
+  }
+
+  const agentBlock = content.substring(agentStart, agentEnd);
+
+  // No /g flag: only the first model line is replaced. [ \t] instead of \s keeps
+  // the match on a single line.
+  const modelLine = /^([ \t]+model:[ \t]+)(.+)$/m.exec(agentBlock);
+  if (!modelLine) {
+    throw new Error(`Model line not found in agent ${agentName} block`);
+  }
+
+  const currentModel = modelLine[2].trim();
+  // `model: "x"` and `model: x` are the same YAML scalar, so compare without quotes.
+  const unquoted = currentModel.replace(/^(["'])(.*)\1$/, "$2");
+
+  // Check if model already matches
+  if (unquoted === newModel) {
+    console.log(` ⏭️ Model already set to ${newModel}, skipping`);
+    return { content, changed: false }; // No change needed
+  }
+
+  console.log(` 🔄 Updating model: ${currentModel} → ${newModel}`);
+
+  // Splice by offset instead of String.replace so that `$` sequences in the
+  // model name are written literally.
+  const lineStart = agentStart + modelLine.index;
+  const newContent =
+    content.substring(0, lineStart) +
+    modelLine[1] +
+    newModel +
+    content.substring(lineStart + modelLine[0].length);
+
+  return { content: newContent, changed: true };
+}
+
+/**
+ * Set an agent's model in kilo-meta.json and refresh its `lastSync` timestamp.
+ *
+ * @param agentName Agent key under `agents`.
+ * @param newModel  Model identifier to store.
+ * @throws Error if the agent is not present in kilo-meta.json.
+ */
+function updateKiloMeta(agentName: string, newModel: string): void {
+  const content = fs.readFileSync(KILO_META, "utf-8");
+  const data = JSON.parse(content);
+
+  // Also covers a file without an `agents` map, which used to raise a TypeError.
+  if (!data.agents || !data.agents[agentName]) {
+    throw new Error(`Agent ${agentName} not found in kilo-meta.json`);
+  }
+
+  data.agents[agentName].model = newModel;
+  data.lastSync = new Date().toISOString();
+
+  fs.writeFileSync(KILO_META, JSON.stringify(data, null, 2));
+}
+
+/**
+ * Best-effort textual update of an agent's `"model"` entry in kilo.jsonc
+ * (manual verification is still required per evolutionary-sync.md rules).
+ *
+ * The agent's object is walked with a brace counter that skips string literals
+ * and comments, and only a `"model"` key directly inside that object is
+ * rewritten. A lazy regex across the file could run past an agent that has no
+ * model and rewrite the next agent's entry instead.
+ *
+ * @param agentName Agent key to look for.
+ * @param newModel  Model identifier to write.
+ * @returns true if kilo.jsonc was rewritten, false if the agent or its model
+ *          entry was not found (a warning is printed in that case).
+ */
+function updateKiloJsonc(agentName: string, newModel: string): boolean {
+  const jsoncPath = path.join(process.cwd(), "kilo.jsonc");
+  const content = fs.readFileSync(jsoncPath, "utf-8");
+
+  const header = new RegExp(`"${escapeRegExp(agentName)}"\\s*:\\s*\\{`).exec(content);
+  if (!header) {
+    console.warn(`⚠️ Could not find agent ${agentName} in kilo.jsonc - manual update required`);
+    return false;
+  }
+
+  const modelEntry = /"model"\s*:\s*"[^"]*"/y; // sticky: must match exactly at lastIndex
+  let depth = 1; // we start just inside the agent's opening brace
+  let pos = header.index + header[0].length;
+
+  while (pos < content.length && depth > 0) {
+    const ch = content[pos];
+    const next = content[pos + 1];
+
+    if (ch === '"') {
+      if (depth === 1) {
+        modelEntry.lastIndex = pos;
+        const found = modelEntry.exec(content);
+        if (found) {
+          // Splice by offset; JSON.stringify quotes and escapes the value safely.
+          const updatedContent =
+            content.slice(0, found.index) +
+            `"model": ${JSON.stringify(newModel)}` +
+            content.slice(found.index + found[0].length);
+          fs.writeFileSync(jsoncPath, updatedContent);
+          return true;
+        }
+      }
+      // Skip the whole string literal so braces or slashes inside it are ignored.
+      pos++;
+      while (pos < content.length && content[pos] !== '"') {
+        pos += content[pos] === "\\" ? 2 : 1;
+      }
+      pos++;
+    } else if (ch === "/" && next === "/") {
+      const eol = content.indexOf("\n", pos);
+      pos = eol === -1 ? content.length : eol + 1;
+    } else if (ch === "/" && next === "*") {
+      const close = content.indexOf("*/", pos + 2);
+      pos = close === -1 ? content.length : close + 2;
+    } else {
+      if (ch === "{") depth++;
+      else if (ch === "}") depth--;
+      pos++;
+    }
+  }
+
+  console.warn(`⚠️ Agent ${agentName} has no "model" entry in kilo.jsonc - manual update required`);
+  return false;
+}
+
+/**
+ * Read and parse agent-versions.json.
+ *
+ * @returns The parsed document.
+ */
+function loadAgentVersions(): any {
+  const content = fs.readFileSync(AGENT_VERSIONS, "utf-8");
+  return JSON.parse(content);
+}
+
+/**
+ * Record a model change for one agent in the agent-versions document.
+ * The document is modified in place and also returned.
+ *
+ * @param agentVersions Parsed agent-versions.json content.
+ * @param agentName     Agent whose model changed (created if missing).
+ * @param fromModel     Model recorded as the previous one.
+ * @param toModel       Model recorded as the new current one.
+ * @param reason        Free-text rationale stored with the history entry.
+ * @returns The same `agentVersions` object.
+ */
+function updateAgentVersions(
+  agentVersions: any,
+  agentName: string,
+  fromModel: string,
+  toModel: string,
+  reason: string
+): any {
+  const now = new Date().toISOString();
+
+  if (!agentVersions.agents) agentVersions.agents = {};
+
+  if (!agentVersions.agents[agentName]) {
+    agentVersions.agents[agentName] = {
+      current: {},
+      history: [],
+      performance_log: [],
+    };
+  }
+
+  const agent = agentVersions.agents[agentName];
+
+  // An existing entry may predate the history field; push() would throw on it.
+  if (!Array.isArray(agent.history)) agent.history = [];
+
+  // Add history entry
+  agent.history.push({
+    date: now,
+    commit: "model-research-sync",
+    type: "model_change",
+    from: fromModel,
+    to: toModel,
+    reason,
+    source: "research",
+  });
+
+  // Update current model
+  if (!agent.current) agent.current = {};
+  agent.current.model = toModel;
+  agent.current.provider = detectProvider(toModel);
+
+  // Update lastUpdated
+  agentVersions.lastUpdated = now;
+
+  return agentVersions;
+}
+
+/**
+ * Derive a provider label from a model identifier.
+ *
+ * @param model Model identifier such as "ollama-cloud/glm-5.1".
+ * @returns "Ollama", "OpenRouter", "Groq" or "Unknown".
+ */
+function detectProvider(model: string): string {
+  if (model.startsWith("ollama-cloud/") || model.startsWith("ollama/")) return "Ollama";
+  if (model.startsWith("openrouter/") || model.includes("openrouter")) return "OpenRouter";
+  if (model.startsWith("groq/")) return "Groq";
+  return "Unknown";
+}
+
+/**
+ * Apply a single recommendation.
+ *
+ * - `update_model`: rewrites capability-index.yaml, kilo-meta.json,
+ *   agent-versions.json and (best effort) kilo.jsonc. Nothing else is touched
+ *   when the YAML already names the recommended model.
+ * - `confirm_model`: appends a confirmation entry to agent-versions.json.
+ * - any other action is reported and skipped.
+ *
+ * In dry-run mode every step is logged but no file is written.
+ *
+ * @param rec         Recommendation to apply.
+ * @param dryRun      When true, only log what would happen.
+ * @param singleAgent When set, recommendations for other agents are skipped.
+ * @returns `applied` plus, on failure, `error`; `filesModified` lists the files
+ *          that were (or, in dry-run mode, would be) written.
+ */
+function applyRecommendation(
+  rec: Recommendation,
+  dryRun: boolean,
+  singleAgent?: string
+): { applied: boolean; error?: string; filesModified?: string[] } {
+  if (singleAgent && rec.agent !== singleAgent) {
+    return { applied: false };
+  }
+
+  console.log(`\n🔧 Applying recommendation for ${rec.agent}`);
+  console.log(` Action: ${rec.action}`);
+  console.log(` Current: ${rec.current_model}`);
+  console.log(` Recommended: ${rec.recommended_model}`);
+  console.log(` Impact: ${rec.impact}`);
+  console.log(` Rationale: ${rec.rationale}`);
+
+  // Skip if already applied
+  if (rec.applied) {
+    console.log(` ⏭️ Already applied, skipping`);
+    return { applied: false };
+  }
+
+  if (rec.action === "update_model") {
+    try {
+      // 1. Update capability-index.yaml
+      const capIndexContent = loadCapabilityIndex();
+      const { content: updatedContent, changed: yamlChanged } = replaceModelInYaml(capIndexContent, rec.agent, rec.recommended_model);
+
+      // Only update other files if YAML was actually changed. Checked before any
+      // logging so a dry run no longer claims it "would update" an unchanged file.
+      if (!yamlChanged) {
+        console.log(` ⏭️ Skipping capability-index.yaml (no change needed)`);
+        return {
+          applied: false,
+          filesModified: [],
+        };
+      }
+
+      const filesModified = [CAPABILITY_INDEX, KILO_META, AGENT_VERSIONS];
+
+      if (!dryRun) {
+        fs.writeFileSync(CAPABILITY_INDEX, updatedContent);
+        console.log(` ✅ Updated capability-index.yaml`);
+      } else {
+        console.log(` 📋 Would update capability-index.yaml`);
+      }
+
+      // 2. Update kilo-meta.json (source of truth)
+      if (!dryRun) {
+        updateKiloMeta(rec.agent, rec.recommended_model);
+        console.log(` ✅ Updated kilo-meta.json`);
+      } else {
+        console.log(` 📋 Would update kilo-meta.json`);
+      }
+
+      // 3. Update agent-versions.json
+      const agentVersions = loadAgentVersions();
+      const updatedVersions = updateAgentVersions(
+        agentVersions,
+        rec.agent,
+        rec.current_model,
+        rec.recommended_model,
+        rec.rationale
+      );
+
+      if (!dryRun) {
+        fs.writeFileSync(AGENT_VERSIONS, JSON.stringify(updatedVersions, null, 2));
+        console.log(` ✅ Updated agent-versions.json`);
+      } else {
+        console.log(` 📋 Would update agent-versions.json`);
+      }
+
+      // 4. Attempt to update kilo.jsonc (manual verification still required)
+      if (!dryRun) {
+        try {
+          // Only claim success when the file was really rewritten.
+          if (updateKiloJsonc(rec.agent, rec.recommended_model)) {
+            console.log(` ✅ Updated kilo.jsonc`);
+            filesModified.push(path.join(process.cwd(), "kilo.jsonc"));
+          } else {
+            console.log(` ⚠️ Manual update required per evolutionary-sync.md rules`);
+          }
+        } catch (error: any) {
+          console.warn(` ⚠️ Could not update kilo.jsonc: ${error.message}`);
+          console.log(` ⚠️ Manual update required per evolutionary-sync.md rules`);
+        }
+      } else {
+        console.log(` 📋 Would update kilo.jsonc`);
+      }
+
+      return {
+        applied: true,
+        filesModified,
+      };
+    } catch (error: any) {
+      return {
+        applied: false,
+        error: error.message,
+      };
+    }
+  } else if (rec.action === "confirm_model") {
+    // Mark as confirmed in agent-versions.json
+    try {
+      const agentVersions = loadAgentVersions();
+      const agent = agentVersions.agents ? agentVersions.agents[rec.agent] : undefined;
+
+      if (!agent) {
+        return {
+          applied: false,
+          error: `Agent ${rec.agent} not found in agent-versions.json`,
+        };
+      }
+
+      // An existing entry may have no history array yet.
+      if (!Array.isArray(agent.history)) agent.history = [];
+
+      // Add confirmation history entry
+      agent.history.push({
+        date: new Date().toISOString(),
+        commit: "model-research-confirm",
+        type: "model_change",
+        from: rec.current_model,
+        to: rec.current_model, // same model
+        reason: `Confirmed: ${rec.rationale}`,
+        source: "research",
+      });
+
+      if (!dryRun) {
+        fs.writeFileSync(AGENT_VERSIONS, JSON.stringify(agentVersions, null, 2));
+        console.log(` ✅ Confirmed current model in agent-versions.json`);
+      } else {
+        console.log(` 📋 Would confirm current model`);
+      }
+
+      return {
+        applied: true,
+        filesModified: [AGENT_VERSIONS],
+      };
+    } catch (error: any) {
+      return {
+        applied: false,
+        error: error.message,
+      };
+    }
+  }
+
+  // Unsupported action
+  console.log(` ⏭️ Unsupported action: ${rec.action}`);
+  return { applied: false };
+}
+
+/**
+ * Run sync-agents.js with one flag, inheriting this process's stdio.
+ *
+ * @param flag     Flag passed to the script ("--fix" or "--check").
+ * @param messages Lines printed before the run, on a non-zero exit (the exit
+ *                 code is appended) and on success.
+ * @returns true when the script exited with status 0.
+ */
+function runSyncScript(
+  flag: string,
+  messages: { start: string; failure: string; success: string }
+): boolean {
+  console.log(messages.start);
+
+  const result = spawnSync("node", [SYNC_SCRIPT, flag], {
+    cwd: process.cwd(),
+    encoding: "utf-8",
+    stdio: "inherit",
+  });
+
+  // A spawn failure leaves status null; report the real cause instead of "exit code null".
+  if (result.error) {
+    console.error(`❌ Could not run sync-agents.js ${flag}: ${result.error.message}`);
+    return false;
+  }
+
+  if (result.status !== 0) {
+    console.error(`${messages.failure} ${result.status}`);
+    return false;
+  }
+
+  console.log(messages.success);
+  return true;
+}
+
+// Run sync-agents.js --fix
+function runSyncAgentsFix(): boolean {
+  return runSyncScript("--fix", {
+    start: `\n🔄 Running sync-agents.js --fix...`,
+    failure: `❌ Sync script failed with exit code`,
+    success: `✅ Sync script completed`,
+  });
+}
+
+// Run sync-agents.js --check
+function runSyncAgentsCheck(): boolean {
+  return runSyncScript("--check", {
+    start: `\n✅ Running sync-agents.js --check...`,
+    failure: `❌ Sync check failed with exit code`,
+    success: `✅ Sync check passed`,
+  });
+}
+
+/**
+ * Rebuild the dashboard by running build-research-dashboard.ts with bun or,
+ * when that script is absent, build-standalone.cjs with node. Both are looked
+ * up next to this script.
+ *
+ * @returns `success`, plus `error` describing why the build did not complete.
+ */
+function runBuildDashboard(): { success: boolean; error?: string } {
+  console.log("\n📊 Rebuilding research dashboard...");
+
+  try {
+    const dashboardScript = path.join(__dirname, "build-research-dashboard.ts");
+    const standaloneScript = path.join(__dirname, "build-standalone.cjs");
+
+    // Check which build script exists
+    let scriptToRun = "";
+    let args: string[] = [];
+
+    if (fs.existsSync(dashboardScript)) {
+      scriptToRun = "bun";
+      args = ["run", dashboardScript];
+    } else if (fs.existsSync(standaloneScript)) {
+      scriptToRun = "node";
+      args = [standaloneScript];
+    } else {
+      return {
+        success: false,
+        error: "No dashboard build script found (build-research-dashboard.ts or build-standalone.cjs)"
+      };
+    }
+
+    const result = spawnSync(scriptToRun, args, {
+      cwd: process.cwd(),
+      encoding: "utf-8",
+      stdio: "inherit",
+      timeout: 30000
+    });
+
+    // Set when the command could not be started (e.g. bun missing) or timed out.
+    if (result.error) {
+      return {
+        success: false,
+        error: `Could not run ${scriptToRun}: ${result.error.message}`
+      };
+    }
+
+    // With stdio "inherit" the child writes straight to the terminal and
+    // result.stdout/stderr are null, so there is nothing to echo or to report.
+    if (result.status !== 0) {
+      return {
+        success: false,
+        error: `Build script failed with exit code ${result.status}`
+      };
+    }
+
+    // NOTE(review): this path is printed for both build scripts - confirm it
+    // matches what build-research-dashboard.ts actually writes.
+    console.log("✅ Dashboard rebuilt: agent-evolution/index.standalone.html");
+    return { success: true };
+  } catch (error: any) {
+    return {
+      success: false,
+      error: error.message
+    };
+  }
+}
+
+// Print summary
+/**
+ * Print the end-of-run report: counters first, then the optional lists of
+ * updated agents, touched files and errors.
+ *
+ * @param summary Totals collected by main().
+ */
+function printSummary(summary: ChangeSummary): void {
+  console.log("\n" + "=".repeat(60));
+  console.log("📊 SYNC SUMMARY");
+  console.log("=".repeat(60));
+
+  console.log(`Total recommendations: ${summary.total_recommendations}`);
+  console.log(`Applied: ${summary.applied}`);
+  console.log(`Confirmed: ${summary.confirmed}`);
+  console.log(`Skipped: ${summary.skipped}`);
+
+  if (summary.dashboard_rebuilt) {
+    console.log(`Dashboard rebuilt: ✅ Yes`);
+  }
+
+  if (summary.agents_updated.length > 0) {
+    console.log(`\nAgents updated:`);
+    summary.agents_updated.forEach(agent => console.log(` - ${agent}`));
+  }
+
+  if (summary.files_modified.length > 0) {
+    console.log(`\nFiles modified:`);
+    summary.files_modified.forEach(file => console.log(` - ${file}`));
+  }
+
+  if (summary.errors.length > 0) {
+    console.log(`\nErrors:`);
+    summary.errors.forEach(error => console.log(` - ${error}`));
+  }
+
+  console.log("=".repeat(60));
+}
+
+/**
+ * Entry point: load the research file, apply each recommendation, then, if
+ * anything was applied outside dry-run mode, run sync-agents.js --fix,
+ * sync-agents.js --check and the dashboard build, in that order. Each step
+ * runs only if the previous one succeeded.
+ *
+ * Exits the process with status 1 when any error was collected.
+ */
+async function main() {
+  const options = parseArgs();
+
+  console.log("🧬 Model Research Synchronization");
+  console.log(` Dry run: ${options.dryRun ? "YES" : "NO"}`);
+  console.log(` Input: ${options.inputFile}`);
+  if (options.singleAgent) {
+    console.log(` Single agent: ${options.singleAgent}`);
+  }
+  console.log("");
+
+  // Load research data
+  const researchData = loadResearchData(options.inputFile);
+
+  if (!validateSchema(researchData)) {
+    console.warn("⚠️ Schema validation issues detected, but continuing...");
+  }
+
+  // Filter recommendations
+  let recommendations = researchData.recommendations;
+  if (options.singleAgent) {
+    recommendations = recommendations.filter(r => r.agent === options.singleAgent);
+    console.log(`Filtered to ${recommendations.length} recommendations for ${options.singleAgent}`);
+  }
+
+  // Initialize summary
+  const summary: ChangeSummary = {
+    total_recommendations: recommendations.length,
+    applied: 0,
+    confirmed: 0,
+    skipped: 0,
+    errors: [],
+    files_modified: [],
+    agents_updated: [],
+    dashboard_rebuilt: false,
+  };
+
+  // Apply recommendations
+  for (const rec of recommendations) {
+    const result = applyRecommendation(rec, options.dryRun, options.singleAgent);
+
+    if (result.applied) {
+      if (rec.action === "update_model") {
+        summary.applied++;
+        summary.agents_updated.push(rec.agent);
+        if (result.filesModified) {
+          summary.files_modified.push(...result.filesModified);
+        }
+      } else if (rec.action === "confirm_model") {
+        summary.confirmed++;
+      }
+    } else {
+      if (result.error) {
+        summary.errors.push(`${rec.agent}: ${result.error}`);
+      } else {
+        summary.skipped++;
+      }
+    }
+  }
+
+  // Remove duplicate files from files_modified
+  summary.files_modified = [...new Set(summary.files_modified)];
+
+  // Run sync-agents.js if we made changes (and not dry run)
+  if (summary.applied > 0 && !options.dryRun) {
+    console.log(`\n📦 Propagating changes to all agent files...`);
+    const syncOk = runSyncAgentsFix();
+
+    if (syncOk) {
+      console.log(`\n✅ Validating changes...`);
+      const checkOk = runSyncAgentsCheck();
+
+      if (checkOk) {
+        // Rebuild research dashboard
+        const buildResult = runBuildDashboard();
+        if (buildResult.success) {
+          // runBuildDashboard prints its own success line; repeating it here
+          // produced the same message twice.
+          summary.dashboard_rebuilt = true;
+        } else {
+          console.warn(`⚠️ Dashboard rebuild failed: ${buildResult.error}`);
+          summary.errors.push(`Dashboard rebuild failed: ${buildResult.error}`);
+        }
+      } else {
+        summary.errors.push("Sync check failed after applying changes");
+      }
+    } else {
+      summary.errors.push("Sync fix script failed");
+    }
+  }
+
+  // Print summary
+  printSummary(summary);
+
+  // Exit with error if any errors occurred
+  if (summary.errors.length > 0) {
+    console.error(`\n❌ Sync completed with ${summary.errors.length} errors`);
+    process.exit(1);
+  } else if (summary.applied === 0 && summary.confirmed === 0) {
+    console.warn(`\n⚠️ No changes applied`);
+  } else {
+    console.log(`\n🎉 Sync completed successfully!`);
+  }
+}
+
+// Run the script
+main().catch((error) => {
+  console.error("Fatal error:", error);
+  process.exit(1);
+});
\ No newline at end of file
diff --git a/kilo-meta.json b/kilo-meta.json
index bb21f13..b61e317 100644
--- a/kilo-meta.json
+++ b/kilo-meta.json
@@ -1,7 +1,7 @@
 {
   "$schema": "https://app.kilo.ai/config.json",
   "metaVersion": "1.0.0",
-  "lastSync": "2026-04-27T11:07:02.592Z",
+  "lastSync": "2026-04-27T20:28:58.841Z",
   "agents": {
     "requirement-refiner": {
       "file": ".kilo/agents/requirement-refiner.md",
@@ -21,7 +21,7 @@
     "system-analyst": {
       "file": ".kilo/agents/system-analyst.md",
       "description": "Designs technical specifications, data schemas, and API contracts before implementation",
-      "model": "ollama-cloud/glm-5.1",
+      "model": "ollama-cloud/nemotron-3-super",
       "mode": "subagent",
       "category": "core"
     },
@@ -36,7 +36,7 @@
     "lead-developer": {
       "file": ".kilo/agents/lead-developer.md",
       "description": "Primary code writer for backend and core logic. Writes implementation to pass tests",
-      "model": "ollama-cloud/qwen3-coder:480b",
+      "model": "ollama-cloud/nemotron-3-super",
       "mode": "subagent",
       "color": "#DC2626",
       "category": "core"
diff --git a/kilo.jsonc b/kilo.jsonc
index 8e853ed..64636a7 100644
--- a/kilo.jsonc
+++ b/kilo.jsonc
@@ -45,7 +45,7 @@
     "system-analyst": {
       "description": "Designs technical specifications, data schemas, and API contracts before implementation",
       "mode": "subagent",
-      "model": "qwen/qwen3.6-plus:free"
+      "model": "ollama-cloud/nemotron-3-super"
     },
     "sdet-engineer": {
       "description": "Writes tests following TDD methodology. Tests MUST fail initially (Red phase)",
@@ -68,7 +68,7 @@
     "lead-developer": {
       "description": "Primary code writer for backend and core logic. Writes implementation to pass tests",
       "mode": "subagent",
-      "model": "ollama-cloud/qwen3-coder:480b",
+      "model": "ollama-cloud/nemotron-3-super",
       "color": "#DC2626",
       "permission": {
         "read": "allow",
diff --git a/scripts/log-execution.cjs b/scripts/log-execution.cjs
new file mode 100644
index 0000000..96cd67f
--- /dev/null
+++ b/scripts/log-execution.cjs
@@ -0,0 +1,67 @@
+const fs = require('fs');
+const path = require('path');
+
+const LOG_FILE = '.kilo/logs/agent-executions.jsonl';
+
+// CLI flags whose values are parsed as base-10 integers.
+const INTEGER_FIELDS = new Set(['issue', 'duration_ms', 'tokens_used']);
+
+/**
+ * Append one agent-execution record to LOG_FILE as a single JSON line.
+ *
+ * @param {object} [data] - Partial record; every missing or falsy field falls
+ *   back to the default shown in the entry literal below.
+ * @returns {object} The entry that was written, including its `ts` timestamp.
+ * @throws {Error} Any filesystem error raised while creating the log directory
+ *   or appending to the file.
+ */
+function logExecution(data = {}) {
+  const entry = {
+    ts: new Date().toISOString(),
+    agent: data.agent || 'unknown',
+    issue: data.issue || 0,
+    project: data.project || 'UniqueSoft/APAW',
+    task: data.task || 'unknown',
+    subtask_type: data.subtask_type || 'general',
+    duration_ms: data.duration_ms || 0,
+    tokens_used: data.tokens_used || 0,
+    status: data.status || 'unknown',
+    files: data.files || [],
+    score: data.score || 0,
+    next_agent: data.next_agent || null
+  };
+
+  // appendFileSync creates a missing file but not a missing directory.
+  fs.mkdirSync(path.dirname(LOG_FILE), { recursive: true });
+  fs.appendFileSync(LOG_FILE, JSON.stringify(entry) + '\n');
+  return entry;
+}
+
+// CLI usage: node log-execution.cjs --agent NAME --status STATUS [--files a,b] ...
+if (require.main === module) {
+  const args = {};
+  for (let i = 2; i < process.argv.length; i += 2) {
+    const key = process.argv[i].replace(/^--/, '');
+    const val = process.argv[i + 1];
+    // A trailing flag without a value would otherwise crash on val.split().
+    if (val === undefined) {
+      console.warn(`Ignoring ${process.argv[i]}: no value given`);
+      break;
+    }
+    if (key === 'files') {
+      args[key] = val.split(',');
+    } else if (INTEGER_FIELDS.has(key)) {
+      args[key] = Number.parseInt(val, 10) || 0;
+    } else if (key === 'score') {
+      // Scores may be fractional; parseInt would silently truncate them.
+      args[key] = Number.parseFloat(val) || 0;
+    } else {
+      args[key] = val;
+    }
+  }
+
+  const entry = logExecution(args);
+  console.log('Logged:', entry.ts, entry.agent, entry.status);
+}
+
+module.exports = { logExecution };