chore: add eval scenarios for codeflash-skills tile
Adds 5 scenarios testing sequential debugging, Result type + effort config, test patterns, domain type conventions, and deduplication/repair mechanics. Also adds the tessl-labs/tessl-skill-eval-scenarios dev dependency.
This commit is contained in:
parent
289b75c555
commit
ff2abd29f2
19 changed files with 408 additions and 0 deletions
|
|
@ -72,6 +72,9 @@
|
|||
},
|
||||
"codeflash/codeflash-skills": {
|
||||
"version": "0.2.0"
|
||||
},
|
||||
"tessl-labs/tessl-skill-eval-scenarios": {
|
||||
"version": "0.0.5"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
104
tiles/codeflash-skills/evals/capabilities.json
Normal file
104
tiles/codeflash-skills/evals/capabilities.json
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
{
|
||||
"package_name": "codeflash-skills",
|
||||
"total_capabilities": 14,
|
||||
"capabilities": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "sequential-pipeline-debugging",
|
||||
"description": "Debug optimization failures by walking through pipeline stages sequentially and stopping at the first failure found",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["discovery", "ranking", "context", "AI service", "verification", "deduplication", "repair"]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "token-limit-awareness",
|
||||
"description": "Know that OPTIMIZATION_CONTEXT_TOKEN_LIMIT and TESTGEN_CONTEXT_TOKEN_LIMIT are both 16000 tokens and that exceeding them causes function rejection",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "improvement-threshold",
|
||||
"description": "Know that MIN_IMPROVEMENT_THRESHOLD is 0.05 (5%) and candidates below this speedup are rejected",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["MIN_IMPROVEMENT_THRESHOLD", "STABILITY_WINDOW_SIZE"]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "ast-deduplication",
|
||||
"description": "Know that candidates are deduplicated via AST normalization using normalize_code() and CandidateEvaluationContext.ast_code_to_id",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["normalize_code()", "CandidateEvaluationContext.ast_code_to_id", "code_utils/deduplicate_code.py"]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"name": "repair-trigger-conditions",
|
||||
"description": "Know that repair only triggers when fewer than MIN_CORRECT_CANDIDATES=2 pass, and is skipped when REPAIR_UNMATCHED_PERCENTAGE_LIMIT is exceeded",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["MIN_CORRECT_CANDIDATES", "REPAIR_UNMATCHED_PERCENTAGE_LIMIT", "AIServiceCodeRepairRequest"]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"name": "ai-service-error-patterns",
|
||||
"description": "Know specific log patterns to search for when AI service fails: 'Error generating optimized candidates', 'cli-optimize-error-caught', 'cli-optimize-error-response'",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["AiServiceClient", "api/aiservice.py"]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"name": "behavioral-vs-benchmark-failures",
|
||||
"description": "Distinguish between behavioral test failures (return value/stdout/pass-fail mismatches via TestDiffScope) and benchmark failures (speedup below threshold)",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["TestDiffScope", "RETURN_VALUE", "STDOUT", "DID_PASS"]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"name": "result-type-pattern",
|
||||
"description": "Use Result[L, R] from either.py with Success/Failure constructors and is_successful() checks before unwrap()",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["Result", "Success", "Failure", "is_successful", "unwrap()", "either.py"]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"name": "effort-config-pattern",
|
||||
"description": "Add effort-dependent config via EffortKeys enum, EFFORT_VALUES dict with LOW/MEDIUM/HIGH levels, and get_effort_value()",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["EffortKeys", "EffortLevel", "EFFORT_VALUES", "get_effort_value()", "config_consts.py"]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"name": "module-to-feature-mapping",
|
||||
"description": "Know which codeflash module to modify for different feature types (optimization/ for strategies, api/ for endpoints, languages/ for language support, etc.)",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["MODULE_REFERENCE.md"]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"name": "domain-type-conventions",
|
||||
"description": "Use @dataclass(frozen=True) for immutable data, BaseModel for serializable models, and keep function_types.py dependency-free",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["@dataclass(frozen=True)", "BaseModel", "models/models.py", "models/function_types.py"]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"name": "test-patterns",
|
||||
"description": "Use tmp_path fixture, .resolve() on Paths, .as_posix() for string conversion, full string equality assertions, and awareness of deterministic patches",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["tmp_path", ".resolve()", ".as_posix()", "pytest_plugin.py"]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"name": "quality-check-commands",
|
||||
"description": "Run uv run prek run for formatting/linting, uv run mypy for type checking, and uv run pytest for tests",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["uv run prek run", "uv run mypy", "uv run pytest"]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"name": "language-support-patterns",
|
||||
"description": "Use @register_language decorator, get_language_support() for lookup, singleton pattern via set_current_language()/current_language(), and is_python()/is_javascript() guards",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["@register_language", "get_language_support()", "set_current_language()", "is_python()", "is_javascript()"]
|
||||
}
|
||||
]
|
||||
}
|
||||
1
tiles/codeflash-skills/evals/scenario-1/capability.txt
Normal file
1
tiles/codeflash-skills/evals/scenario-1/capability.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Sequential pipeline debugging with specific thresholds
|
||||
26
tiles/codeflash-skills/evals/scenario-1/criteria.json
Normal file
26
tiles/codeflash-skills/evals/scenario-1/criteria.json
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Sequential stage order",
|
||||
"description": "Investigates pipeline stages in order: discovery before ranking before context before AI service before test failures. Does NOT jump to later stages without checking earlier ones first.",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Token limit value",
|
||||
"description": "References the specific token limit of 16000 for OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT when checking context extraction",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Importance threshold",
|
||||
"description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Stops at failure",
|
||||
"description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
13
tiles/codeflash-skills/evals/scenario-1/task.md
Normal file
13
tiles/codeflash-skills/evals/scenario-1/task.md
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# Diagnose Silent Optimization Skip
|
||||
|
||||
## Context
|
||||
|
||||
A user reports that when running codeflash on their project, a specific function `calculate_metrics` in `analytics/processor.py` never appears in the optimization results. The function exists in the module root, is not in the exclude list, and has not been previously optimized. Trace data shows the function is called frequently but with very short execution times (averaging 0.0005 seconds total addressable time). The function has moderate dependencies.
|
||||
|
||||
## Task
|
||||
|
||||
Write a diagnostic report explaining why this function is being skipped and at which stage in the pipeline the function is filtered out. Include the specific threshold or condition that causes the skip.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
A markdown file `diagnostic-report.md` explaining the root cause.
|
||||
1
tiles/codeflash-skills/evals/scenario-2/capability.txt
Normal file
1
tiles/codeflash-skills/evals/scenario-2/capability.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Result type pattern and effort-dependent configuration
|
||||
31
tiles/codeflash-skills/evals/scenario-2/criteria.json
Normal file
31
tiles/codeflash-skills/evals/scenario-2/criteria.json
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"context": "Tests whether the agent uses the codeflash Result type pattern from either.py and the effort-dependent configuration pattern when implementing a new pipeline feature.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Imports from either.py",
|
||||
"description": "Imports Success, Failure, and is_successful from codeflash.either (NOT from a different error handling module)",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Result return type",
|
||||
"description": "Function returns Result type using Success() for success and Failure() for errors, not exceptions or None",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "is_successful check",
|
||||
"description": "Calls is_successful() or .is_successful() before calling unwrap() on the result",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "EffortKeys enum entry",
|
||||
"description": "Adds a new entry to the EffortKeys enum in config_consts.py",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Three effort levels",
|
||||
"description": "Adds values for all three EffortLevel variants (LOW, MEDIUM, HIGH) in EFFORT_VALUES dict",
|
||||
"max_score": 20
|
||||
}
|
||||
]
|
||||
}
|
||||
21
tiles/codeflash-skills/evals/scenario-2/task.md
Normal file
21
tiles/codeflash-skills/evals/scenario-2/task.md
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# Add Candidate Timeout Feature
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash optimization engine currently has no per-candidate timeout. Some candidates take too long during verification, wasting the optimization budget. A new feature is needed to skip candidates that exceed a configurable time limit during behavioral testing.
|
||||
|
||||
The timeout should vary based on the optimization effort setting — shorter timeouts for low effort runs (to save time) and longer for high effort runs (to allow more complex optimizations).
|
||||
|
||||
## Task
|
||||
|
||||
Implement a `check_candidate_timeout` function in `codeflash/optimization/function_optimizer.py` that:
|
||||
1. Takes a candidate runtime and returns whether the candidate should be skipped
|
||||
2. Uses a configurable timeout threshold that scales with optimization effort
|
||||
3. Handles the error case where the runtime measurement is unavailable
|
||||
|
||||
Also add the necessary configuration constant to `codeflash/code_utils/config_consts.py`.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- Modified `function_optimizer.py` with the new function
|
||||
- Modified `config_consts.py` with the new configuration
|
||||
1
tiles/codeflash-skills/evals/scenario-3/capability.txt
Normal file
1
tiles/codeflash-skills/evals/scenario-3/capability.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Test patterns and deterministic patch awareness
|
||||
26
tiles/codeflash-skills/evals/scenario-3/criteria.json
Normal file
26
tiles/codeflash-skills/evals/scenario-3/criteria.json
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows codeflash test conventions when writing tests, including path handling, temp directory patterns, and awareness of the deterministic patching system.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Uses tmp_path fixture",
|
||||
"description": "Test function uses pytest tmp_path fixture parameter, NOT tempfile.NamedTemporaryFile or tempfile.mkdtemp",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Calls resolve on paths",
|
||||
"description": "Calls .resolve() on Path objects before using them in assertions or function calls",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Full string equality",
|
||||
"description": "Uses exact equality assertions (== or assert_equal) for code string comparisons, NOT substring checks like 'in' or assertIn or contains",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "No real time dependency",
|
||||
"description": "Test does NOT depend on real time.time(), datetime.now(), random values, or uuid generation for correctness. Acknowledges or accounts for deterministic patches if time/random values are involved.",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
24
tiles/codeflash-skills/evals/scenario-3/task.md
Normal file
24
tiles/codeflash-skills/evals/scenario-3/task.md
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# Write Tests for Context Hash Comparison
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash context extraction module has a function `compare_context_hashes(context_a, context_b)` that takes two `CodeOptimizationContext` objects and returns whether their hashing contexts are identical. This is used to detect when the same function has already been optimized.
|
||||
|
||||
```python
|
||||
# In codeflash/context/code_context_extractor.py
|
||||
def compare_context_hashes(context_a: CodeOptimizationContext, context_b: CodeOptimizationContext) -> bool:
|
||||
return context_a.hashing_code_context_hash == context_b.hashing_code_context_hash
|
||||
```
|
||||
|
||||
## Task
|
||||
|
||||
Write a test file `tests/test_context/test_hash_comparison.py` with tests for this function. Include tests for:
|
||||
1. Two contexts with identical code producing the same hash
|
||||
2. Two contexts with different code producing different hashes
|
||||
3. A context compared with itself
|
||||
|
||||
The tests should create temporary Python source files to build realistic context objects.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- `tests/test_context/test_hash_comparison.py`
|
||||
1
tiles/codeflash-skills/evals/scenario-4/capability.txt
Normal file
1
tiles/codeflash-skills/evals/scenario-4/capability.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Domain type conventions and module identification
|
||||
26
tiles/codeflash-skills/evals/scenario-4/criteria.json
Normal file
26
tiles/codeflash-skills/evals/scenario-4/criteria.json
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows codeflash domain type conventions and correctly identifies the right module when adding a new data type for the optimization pipeline.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Placed in models/models.py",
|
||||
"description": "New data type is added to codeflash/models/models.py (NOT models/function_types.py, since it has dependencies on other codeflash modules)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Uses frozen dataclass",
|
||||
"description": "Immutable data type uses @dataclass(frozen=True) decorator, NOT a regular class or unfrozen dataclass",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "BaseModel for serializable",
|
||||
"description": "If a serializable model is needed, uses Pydantic BaseModel (NOT dataclass or dict)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Correct module for feature",
|
||||
"description": "Places the main logic in the correct module for the feature type (e.g., verification/ for test-related, optimization/ for candidate-related, api/ for service-related)",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
21
tiles/codeflash-skills/evals/scenario-4/task.md
Normal file
21
tiles/codeflash-skills/evals/scenario-4/task.md
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# Add Optimization Confidence Score
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash team wants to add a confidence score to each optimization result. The score should capture how confident the system is that an optimization is both correct and beneficial. It combines test coverage percentage, number of passing test cases, and speedup stability into a single metric.
|
||||
|
||||
The score needs to be:
|
||||
- Attached to each candidate during evaluation (immutable once computed)
|
||||
- Included in the final PR report (needs JSON serialization)
|
||||
- Computed during the candidate evaluation phase
|
||||
|
||||
## Task
|
||||
|
||||
1. Define the data types needed for the confidence score
|
||||
2. Write a `compute_confidence_score` function that takes coverage percentage (float), passing test count (int), and stability ratio (float) and returns the confidence result
|
||||
3. Place all code in the appropriate codeflash modules
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- New/modified type definitions in the appropriate models file
|
||||
- New function in the appropriate module
|
||||
1
tiles/codeflash-skills/evals/scenario-5/capability.txt
Normal file
1
tiles/codeflash-skills/evals/scenario-5/capability.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Deduplication mechanics and repair trigger conditions
|
||||
26
tiles/codeflash-skills/evals/scenario-5/criteria.json
Normal file
26
tiles/codeflash-skills/evals/scenario-5/criteria.json
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "AST normalization",
|
||||
"description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), NOT simple string comparison",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Duplicate result copying",
|
||||
"description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Repair trigger threshold",
|
||||
"description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Unmatched percentage limit",
|
||||
"description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
17
tiles/codeflash-skills/evals/scenario-5/task.md
Normal file
17
tiles/codeflash-skills/evals/scenario-5/task.md
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Investigate Low Candidate Diversity
|
||||
|
||||
## Context
|
||||
|
||||
A codeflash user is optimizing a data processing function at medium effort level. The AI service returns 5 candidates, but the optimization log shows only 1 candidate was actually benchmarked. Of the 5 candidates, 1 passed behavioral tests but didn't meet the performance threshold. The user wants to understand what happened to the other 4 candidates and why no repair attempts were made.
|
||||
|
||||
## Task
|
||||
|
||||
Write an analysis document explaining:
|
||||
1. Why only 1 out of 5 candidates was benchmarked
|
||||
2. How the system determines which candidates to actually test
|
||||
3. Under what conditions the system would have attempted to repair the failing candidates
|
||||
4. What the user could change to get more diverse results
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
A markdown file `analysis.md` with the explanation.
|
||||
40
tiles/codeflash-skills/evals/summary.json
Normal file
40
tiles/codeflash-skills/evals/summary.json
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
{
|
||||
"total_scenarios": 5,
|
||||
"capabilities_coverage": {
|
||||
"total_capabilities": 14,
|
||||
"capabilities_tested": 10,
|
||||
"coverage_percentage": 71.4
|
||||
},
|
||||
"complexity_distribution": {
|
||||
"basic": 2,
|
||||
"intermediate": 2,
|
||||
"advanced": 1
|
||||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"index": 1,
|
||||
"capability": "sequential-pipeline-debugging, token-limit-awareness, improvement-threshold",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"capability": "result-type-pattern, effort-config-pattern",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"capability": "test-patterns, quality-check-commands",
|
||||
"complexity": "basic"
|
||||
},
|
||||
{
|
||||
"index": 4,
|
||||
"capability": "domain-type-conventions, module-to-feature-mapping",
|
||||
"complexity": "basic"
|
||||
},
|
||||
{
|
||||
"index": 5,
|
||||
"capability": "ast-deduplication, repair-trigger-conditions",
|
||||
"complexity": "advanced"
|
||||
}
|
||||
]
|
||||
}
|
||||
25
tiles/codeflash-skills/evals/summary_infeasible.json
Normal file
25
tiles/codeflash-skills/evals/summary_infeasible.json
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
{
|
||||
"total_infeasible": 4,
|
||||
"infeasible_capabilities": [
|
||||
{
|
||||
"capability": "ai-service-error-patterns",
|
||||
"complexity": "intermediate",
|
||||
"reasoning": "Requires actual AI service API responses and log output that cannot be meaningfully mocked without bypassing the capability being tested"
|
||||
},
|
||||
{
|
||||
"capability": "behavioral-vs-benchmark-failures",
|
||||
"complexity": "intermediate",
|
||||
"reasoning": "Requires actual test execution results with JUnit XML output and timing data that cannot be generated in a one-shot file-based eval"
|
||||
},
|
||||
{
|
||||
"capability": "language-support-patterns",
|
||||
"complexity": "advanced",
|
||||
"reasoning": "Requires the full language registry system with imports and decorators that would need the codeflash runtime to verify"
|
||||
},
|
||||
{
|
||||
"capability": "quality-check-commands",
|
||||
"complexity": "basic",
|
||||
"reasoning": "Requires running actual uv/prek/mypy commands which need the project environment and dependencies installed"
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
Reference in a new issue