refactor: leverage Jinja2 includes, extends, and composition in testgen prompts

Use {% extends %} to deduplicate sync/async system templates via
base_system.md.j2, {% include %} for conditional JIT content, and a
compose_user.md.j2 wrapper to replace Python string assembly in
build_prompt().
Kevin Turcios 2026-03-02 06:47:38 -05:00
parent f191c12438
commit a1c0ac6ae4
9 changed files with 247 additions and 281 deletions
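
Below is a minimal, self-contained sketch of the pattern this commit adopts, assuming tiny stand-in template bodies; only the template names and the extends/include/block mechanics mirror the real prompts.

# Illustrative sketch: stand-in templates showing how {% extends %} and
# {% block %} deduplicate the sync/async system prompts, and how a
# conditional {% include %} pulls in JIT content. Bodies are invented.
from jinja2 import DictLoader, Environment, StrictUndefined

templates = {
    # Shared skeleton with overridable blocks (stand-in body).
    "base_system.md.j2": (
        "Write tests for {% block target %}`{{ function_name }}`{% endblock %}.\n"
        "{% block async_rules %}{% endblock %}"
        "{% if is_numerical_code %}{% include 'jit_system.md.j2' %}{% endif %}"
    ),
    # The async child overrides only what differs.
    "generate_async_system.md.j2": (
        "{% extends 'base_system.md.j2' %}\n"
        "{% block target %}**async** `{{ function_name }}`{% endblock %}\n"
        "{% block async_rules %}Await the function under test.\n{% endblock %}"
    ),
    "jit_system.md.j2": "JIT note: warm up before timing.",
}

env = Environment(loader=DictLoader(templates), undefined=StrictUndefined)
print(env.get_template("generate_async_system.md.j2").render(
    function_name="fetch_rows", is_numerical_code=True
))
# -> Write tests for **async** `fetch_rows`.
#    Await the function under test.
#    JIT note: warm up before timing.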

View file

@@ -4,8 +4,8 @@ from dataclasses import dataclass
import libcst as cst
from core.languages.python.cst_utils import any_ellipsis_in_cst, ellipsis_in_cst_not_types
from aiservice.common.markdown_utils import split_markdown_code
from core.languages.python.cst_utils import any_ellipsis_in_cst, ellipsis_in_cst_not_types
from core.languages.python.optimizer.context_utils.context_helpers import is_multi_context
from core.languages.python.testgen.preprocessing.preprocess_pipeline import preprocessing_testgen_pipeline

View file

@@ -9,21 +9,22 @@ from typing import TYPE_CHECKING
import sentry_sdk
import stamina
from jinja2 import Environment, FileSystemLoader, StrictUndefined
from libcst import parse_module
from ninja.errors import HttpError
from openai import OpenAIError
from aiservice.analytics.posthog import ph
from aiservice.common.markdown_utils import extract_code_block
from aiservice.common_utils import safe_isort, should_hack_for_demo
from aiservice.env_specific import debug_log_sensitive_data
from aiservice.llm import EXECUTE_MODEL, HAIKU_MODEL, OPENAI_MODEL, calculate_llm_cost, call_llm
from aiservice.models.functions_to_optimize import FunctionToOptimize
from jinja2 import Environment, FileSystemLoader, StrictUndefined
from libcst import parse_module
from ninja.errors import HttpError
from openai import OpenAIError
from core.languages.python.cst_utils import parse_module_to_cst
from core.languages.python.testgen.context import BaseTestGenContext
from core.languages.python.testgen.demo_hacks import hack_for_demo, hack_for_demo_gsq
from core.languages.python.testgen.instrumentation.edit_generated_test import replace_definition_with_import
from core.languages.python.testgen.models import CostTracker, LLMOutputParseError
from core.languages.python.testgen.postprocessing.code_validator import (
CodeValidationError,
has_test_functions,
@@ -41,10 +42,10 @@ from core.shared.testgen_models import (
)
if TYPE_CHECKING:
from aiservice.llm import LLM
from authapp.auth import AuthenticatedRequest
from openai.types.chat import ChatCompletionMessageParam
from aiservice.llm import LLM
from authapp.auth import AuthenticatedRequest
from core.shared.testgen_models import TestGenSchema
_current_dir = Path(__file__).parent
@@ -52,70 +53,40 @@ _prompts_dir = _current_dir / "prompts"
_jinja_env = Environment(loader=FileSystemLoader(_prompts_dir), keep_trailing_newline=True, undefined=StrictUndefined) # noqa: S701 - rendering LLM prompts, not HTML
JIT_INSTRUCTIONS = _jinja_env.get_template("jit_system.md.j2").render()
JIT_USER_REMINDER = _jinja_env.get_template("jit_user.md.j2").render()
def build_prompt(
ctx: BaseTestGenContext,
*,
qualified_name: str,
source_code: str,
notes: str,
function_name: str,
unit_test_package: str,
*,
is_async: bool,
is_numerical_code: bool | None = None,
model_type: str = "openai",
) -> tuple[list[dict[str, str]], str, str]:
if is_async:
system_template_name = "generate_async_system.md.j2"
user_template_name = "generate_async_user.md.j2"
posthog_event_suffix = "async-"
error_context = "async "
else:
system_template_name = "generate_system.md.j2"
user_template_name = "generate_user.md.j2"
posthog_event_suffix = ""
error_context = ""
system_template = "generate_async_system.md.j2" if is_async else "generate_system.md.j2"
system_prompt = _jinja_env.get_template(system_template_name).render(
function_name=ctx.data.qualified_name, model_type=model_type
system_prompt = _jinja_env.get_template(system_template).render(
function_name=qualified_name, model_type=model_type, is_numerical_code=is_numerical_code
)
if is_numerical_code:
system_prompt += f"\n{JIT_INSTRUCTIONS}\n"
execute_system_message = {"role": "system", "content": system_prompt}
# Build a single user message combining notes and the code template
user_parts = []
all_notes = ctx.generate_notes_markdown()
if all_notes:
user_parts.append(all_notes)
if is_numerical_code:
user_parts.append(JIT_USER_REMINDER)
user_parts.append(
_jinja_env.get_template(user_template_name).render(
unit_test_package=unit_test_package,
function_name=function_name,
function_code=ctx.data.source_code_being_tested,
package_comment="",
)
user_prompt = _jinja_env.get_template("compose_user.md.j2").render(
notes=notes,
is_numerical_code=is_numerical_code,
is_async=is_async,
unit_test_package=unit_test_package,
function_name=function_name,
function_code=source_code,
package_comment="",
)
execute_user_message = {"role": "user", "content": "\n\n".join(user_parts)}
execute_messages = [execute_system_message, execute_user_message]
posthog_event_suffix = "async-" if is_async else ""
error_context = "async " if is_async else ""
execute_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
return execute_messages, posthog_event_suffix, error_context
class LLMOutputParseError(Exception):
"""Exception for LLM output parsing failures with raw output context."""
def __init__(self, message: str, raw_llm_output: str, code: str | None = None) -> None:
super().__init__(message)
self.raw_llm_output = raw_llm_output
self.code = code # Code extracted from LLM output, if any
def parse_and_validate_llm_output(
response_content: str,
ctx: BaseTestGenContext,
@@ -172,7 +143,7 @@ async def generate_and_validate_test_code(
ctx: BaseTestGenContext,
python_version: tuple[int, int, int],
error_context: str,
cost_tracker: list[float],
cost_tracker: CostTracker,
user_id: str,
posthog_event_suffix: str,
trace_id: str = "",
@@ -216,7 +187,7 @@ async def generate_and_validate_test_code(
)
cost = calculate_llm_cost(response.raw_response, model)
cost_tracker.append(cost)
cost_tracker.add(cost)
debug_log_sensitive_data(f"LLM {error_context}execute response:\n{response.raw_response.model_dump_json(indent=2)}")
@@ -259,7 +230,9 @@ async def generate_regression_tests_from_function(
model_type: str = "openai",
) -> tuple[str, str | None, str | None, str]:
execute_messages, posthog_event_suffix, error_context = build_prompt(
ctx=ctx,
qualified_name=ctx.data.qualified_name,
source_code=ctx.data.source_code_being_tested,
notes=ctx.generate_notes_markdown(),
function_name=function_name,
unit_test_package=unit_test_package,
is_async=is_async,
@@ -267,7 +240,7 @@ async def generate_regression_tests_from_function(
model_type=model_type,
)
cost_tracker = []
cost_tracker = CostTracker()
try:
validated_code, raw_llm_content = await generate_and_validate_test_code(
messages=execute_messages,
@@ -287,8 +260,6 @@ async def generate_regression_tests_from_function(
is_async=data.function_to_optimize.is_async or data.is_async or False,
test_index=test_index,
)
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
processed_cst = postprocessing_testgen_pipeline(
parse_module_to_cst(validated_code),
@@ -339,17 +310,13 @@ async def generate_regression_tests_from_function(
)
return generated_test_source, instrumented_behavior_tests, instrumented_perf_tests, raw_llm_content # noqa: TRY300
except CodeValidationError as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to generate valid {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to generate valid {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
debug_info = e.to_debug_dict()
debug_info["raw_llm_output"] = getattr(e, "raw_llm_output", None)
raise TestGenerationFailedError(msg, debug_info=debug_info) from e
except LLMOutputParseError as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to parse LLM output for {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to parse LLM output for {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
raise TestGenerationFailedError(
msg,
@@ -361,11 +328,12 @@ async def generate_regression_tests_from_function(
},
) from e
except (SyntaxError, ValueError) as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to generate valid {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to generate valid {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
raise TestGenerationFailedError(msg, debug_info={"stage": "unknown", "validation_error": str(e)}) from e
finally:
if cost_tracker.total > 0:
await update_optimization_cost(trace_id=trace_id, cost=cost_tracker.total, user_id=user_id)
async def testgen_python(
@@ -450,7 +418,7 @@ async def testgen_python(
)
except TestGenerationFailedError as e:
logging.exception(f"Test generation failed. trace_id={data.trace_id}")
logging.exception("Test generation failed. trace_id=%s", data.trace_id)
sentry_sdk.capture_exception(e)
# Return detailed debug info for self-healing
debug_info = None
@@ -467,7 +435,7 @@ async def testgen_python(
)
return 500, TestGenErrorResponseSchema(error=str(e), trace_id=data.trace_id, debug_info=debug_info)
except Exception as e:
logging.exception(f"Test generation failed. trace_id={data.trace_id}")
logging.exception("Test generation failed. trace_id=%s", data.trace_id)
sentry_sdk.capture_exception(e)
return 500, TestGenErrorResponseSchema(
error="Error generating tests. Internal server error.", trace_id=data.trace_id

View file

@@ -1,6 +1,28 @@
from __future__ import annotations
import enum
from dataclasses import dataclass
class TestingMode(enum.Enum):
BEHAVIOR = "behavior"
PERFORMANCE = "performance"
@dataclass
class CostTracker:
total: float = 0.0
calls: int = 0
def add(self, cost: float) -> None:
self.total += cost
self.calls += 1
class LLMOutputParseError(Exception):
"""Exception for LLM output parsing failures with raw output context."""
def __init__(self, message: str, raw_llm_output: str, code: str | None = None) -> None:
super().__init__(message)
self.raw_llm_output = raw_llm_output
self.code = code
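
As a quick usage sketch (made-up costs), the new CostTracker replaces the previous list of floats, so retry counts and totals come from attributes rather than len() and sum():

# Hypothetical numbers; one add() per LLM attempt.
tracker = CostTracker()
tracker.add(0.0123)  # first attempt
tracker.add(0.0098)  # retry
assert tracker.calls == 2                  # was len(cost_tracker)
assert round(tracker.total, 4) == 0.0221   # was sum(cost_tracker)
# The finally block above reports tracker.total once, instead of
# re-summing the list in every except branch.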

View file

@@ -4,7 +4,6 @@ from libcst import Arg, Attribute, Call, CSTTransformer, Module, Name
from core.languages.python.cst_utils import evaluate_expression, make_number_node
# Functions where shape is passed as positional args (not as a tuple/list)
TORCH_SHAPE_AS_ARGS = frozenset(["rand", "randn", "ones", "zeros", "empty"])

View file

@@ -0,0 +1,116 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the {% block role_target %}`{{ function_name }}`{% endblock %} function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
<analysis_steps>
Think step by step before writing code. Analyze the {% block analysis_target %}function{% endblock %}:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block analysis_extra %}{% endblock %}
</analysis_steps>
<test_categories>
{% block categories %}
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
{% endblock %}
</test_categories>
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, {% block mock_extras %}{% endblock %}`Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
{% block quality_extra %}
- Include large-scale test cases to assess performance with realistic data volumes
{% endblock %}
</quality_criteria>
{% block async_rules_section %}{% endblock %}
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance{% block method_call_detail %}{% endblock %}. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `{% block test_signature %}def test_...{% endblock %}` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the {% block md_role_target %}`{{ function_name }}`{% endblock %} function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
## Analysis Steps
Think step by step before writing code. Analyze the {% block md_analysis_target %}function{% endblock %}:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block md_analysis_extra %}{% endblock %}
## Test Categories
{% block md_categories %}
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
{% endblock %}
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, {% block md_mock_extras %}{% endblock %}patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
{% block md_quality_extra %}
- Include large-scale test cases to assess performance with realistic data volumes
{% endblock %}
{% block md_async_rules_section %}{% endblock %}
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance{% block md_method_call_detail %}{% endblock %}. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `{% block md_test_signature %}def test_...{% endblock %}` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, {% block md_critical_mock_extras %}{% endblock %}patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% if is_numerical_code %}
{% include "jit_system.md.j2" %}
{% endif %}

View file

@@ -0,0 +1,13 @@
{% if notes %}
{{ notes }}
{% endif %}
{% if is_numerical_code %}
{% include "jit_user.md.j2" %}
{% endif %}
{% if is_async %}
{% include "generate_async_user.md.j2" %}
{% else %}
{% include "generate_user.md.j2" %}
{% endif %}
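
To see the wrapper in isolation, here is a hedged sketch that renders a compose_user.md.j2 with placeholder include bodies; the wrapper structure mirrors the template above, while the included texts are invented:

# Placeholder include bodies; only the compose wrapper's structure is real.
from jinja2 import DictLoader, Environment, StrictUndefined

env = Environment(loader=DictLoader({
    "compose_user.md.j2": (
        "{% if notes %}{{ notes }}\n{% endif %}"
        "{% if is_numerical_code %}{% include 'jit_user.md.j2' %}\n{% endif %}"
        "{% if is_async %}{% include 'generate_async_user.md.j2' %}"
        "{% else %}{% include 'generate_user.md.j2' %}{% endif %}"
    ),
    "jit_user.md.j2": "Reminder: JIT guidance applies.",
    "generate_user.md.j2": "Test `{{ function_name }}` with {{ unit_test_package }}.",
    "generate_async_user.md.j2": "Write async tests for `{{ function_name }}`.",
}), undefined=StrictUndefined)

print(env.get_template("compose_user.md.j2").render(
    notes="Context notes here.", is_numerical_code=True, is_async=False,
    function_name="fetch_rows", unit_test_package="pytest",
))
# -> Context notes here.
#    Reminder: JIT guidance applies.
#    Test `fetch_rows` with pytest.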

View file

@@ -1,18 +1,13 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the **async** `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
{% extends "base_system.md.j2" %}
<analysis_steps>
Think step by step before writing code. Analyze the async function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block role_target %}**async** `{{ function_name }}`{% endblock %}
{% block analysis_target %}async function{% endblock %}
{% block analysis_extra %}
4. What async-specific edge cases exist (concurrent execution, coroutine handling)?
5. What large-scale or throughput scenarios should be covered?
</analysis_steps>
{% endblock %}
<test_categories>
{% block categories %}
Then write tests organized into four categories:
**Basic tests** — Verify fundamental functionality under normal conditions. Test that the function returns expected values when awaited, and test basic async/await behavior.
@@ -22,17 +17,15 @@ Then write tests organized into four categories:
**Large-scale tests** — Assess performance and scalability with concurrent execution. Test multiple concurrent calls using `asyncio.gather()`. Use data structures up to 1000 elements and loops up to 1000 iterations.
**Throughput tests** — Measure performance under load and high-volume scenarios. Name these functions with `_throughput_` in the name (e.g., `test_function_throughput_high_load`). Test with varying loads (small, medium, large) and sustained execution patterns.
</test_categories>
{% endblock %}
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and async-specific scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, `AsyncMock`, `Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
{% block mock_extras %}`AsyncMock`, {% endblock %}
{% block quality_extra %}
- Include concurrent execution tests using `asyncio.gather()` to assess async performance
- Test proper async/await patterns and coroutine handling
</quality_criteria>
{% endblock %}
{% block async_rules_section %}
<async_rules>
- All test functions must use `async def` and be marked with `@pytest.mark.asyncio`
- Use `await` when calling the async function under test
@@ -41,39 +34,21 @@ Then write tests organized into four categories:
- Never create tests that intentionally timeout or hang. No `asyncio.wait_for()` with short timeouts expecting `TimeoutError`. No tests that rely on timing or delays.
- All throughput test functions must include `_throughput_` in their name
</async_rules>
{% endblock %}
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance with `await instance.method(...)`. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
{% block method_call_detail %} with `await instance.method(...)`{% endblock %}
{% block test_signature %}async def test_...{% endblock %}
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `async def test_...` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the **async** `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
{# ── Markdown branch overrides ── #}
## Analysis Steps
Think step by step before writing code. Analyze the async function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block md_role_target %}**async** `{{ function_name }}`{% endblock %}
{% block md_analysis_target %}async function{% endblock %}
{% block md_analysis_extra %}
4. What async-specific edge cases exist (concurrent execution, coroutine handling)?
5. What large-scale or throughput scenarios should be covered?
{% endblock %}
## Test Categories
{% block md_categories %}
Then write tests organized into four categories:
@@ -84,16 +59,15 @@ Then write tests organized into four categories:
**Large-scale tests** — Assess performance and scalability with concurrent execution. Test multiple concurrent calls using `asyncio.gather()`. Use data structures up to 1000 elements and loops up to 1000 iterations.
**Throughput tests** — Measure performance under load and high-volume scenarios. Name these functions with `_throughput_` in the name (e.g., `test_function_throughput_high_load`). Test with varying loads (small, medium, large) and sustained execution patterns.
{% endblock %}
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and async-specific scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, AsyncMock, patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
{% block md_mock_extras %}AsyncMock, {% endblock %}
{% block md_quality_extra %}
- Include concurrent execution tests using `asyncio.gather()` to assess async performance
- Test proper async/await patterns and coroutine handling
{% endblock %}
{% block md_async_rules_section %}
## Async-Specific Rules
- All test functions must use `async def` and be marked with `@pytest.mark.asyncio`
@@ -102,28 +76,8 @@ Then write tests organized into four categories:
- Test concurrent execution using `asyncio.gather()` where appropriate
- Never create tests that intentionally timeout or hang. No `asyncio.wait_for()` with short timeouts expecting `TimeoutError`. No tests that rely on timing or delays.
- All throughput test functions must include `_throughput_` in their name
{% endblock %}
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance with `await instance.method(...)`. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `async def test_...` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, AsyncMock, patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% block md_method_call_detail %} with `await instance.method(...)`{% endblock %}
{% block md_test_signature %}async def test_...{% endblock %}
{% block md_critical_mock_extras %}AsyncMock, {% endblock %}

View file

@@ -1,103 +1 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
<analysis_steps>
Think step by step before writing code. Analyze the function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
4. What large-scale or performance-relevant scenarios should be covered?
</analysis_steps>
<test_categories>
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
</test_categories>
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, `Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
- Include large-scale test cases to assess performance with realistic data volumes
</quality_criteria>
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `def test_...` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
## Analysis Steps
Think step by step before writing code. Analyze the function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
4. What large-scale or performance-relevant scenarios should be covered?
## Test Categories
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
- Include large-scale test cases to assess performance with realistic data volumes
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `def test_...` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% extends "base_system.md.j2" %}

View file

@@ -2,49 +2,45 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
from typing import TYPE_CHECKING
from aiservice.common_utils import parse_python_version, validate_trace_id
from aiservice.models.functions_to_optimize import FunctionToOptimize
from ninja.errors import HttpError
from aiservice.common_utils import parse_python_version, validate_trace_id
from core.languages.python.testgen.context import BaseTestGenContext, TestGenContextData
from core.languages.python.testgen.instrumentation.instrument_new_tests import instrument_test_source
from core.languages.python.testgen.models import TestingMode
from core.languages.python.testgen.context import BaseTestGenContext, TestGenContextData
if TYPE_CHECKING:
from core.shared.testgen_models import TestGenSchema
class InstrumentTestSourceArgs(TypedDict):
test_source: str
function_to_optimize: FunctionToOptimize
helper_function_names: list[str]
module_path: str
test_module_path: str
test_framework: str
test_timeout: int
python_version: tuple[int, int, int]
def instrument_tests(
generated_test_source: str, data: TestGenSchema, python_version: tuple[int, int, int]
) -> tuple[str | None, str | None]:
common_args: InstrumentTestSourceArgs = {
"test_source": generated_test_source,
"function_to_optimize": data.function_to_optimize,
"helper_function_names": data.helper_function_names or [],
"module_path": data.module_path,
"test_module_path": data.test_module_path,
"test_framework": data.test_framework,
"test_timeout": data.test_timeout,
"python_version": python_version,
}
# instrument_test_source() already applies isort via format_and_float_to_top()
# No need to apply isort again here (was causing double formatting overhead)
behavior_result = instrument_test_source(**common_args, mode=TestingMode.BEHAVIOR)
perf_result = instrument_test_source(**common_args, mode=TestingMode.PERFORMANCE)
behavior_result = instrument_test_source(
test_source=generated_test_source,
function_to_optimize=data.function_to_optimize,
helper_function_names=data.helper_function_names or [],
module_path=data.module_path,
test_module_path=data.test_module_path,
test_framework=data.test_framework,
test_timeout=data.test_timeout,
python_version=python_version,
mode=TestingMode.BEHAVIOR,
)
perf_result = instrument_test_source(
test_source=generated_test_source,
function_to_optimize=data.function_to_optimize,
helper_function_names=data.helper_function_names or [],
module_path=data.module_path,
test_module_path=data.test_module_path,
test_framework=data.test_framework,
test_timeout=data.test_timeout,
python_version=python_version,
mode=TestingMode.PERFORMANCE,
)
return behavior_result, perf_result