refactor: leverage Jinja2 includes, extends, and composition in testgen prompts

Use {% extends %} to deduplicate sync/async system templates via
base_system.md.j2, {% include %} for conditional JIT content, and a
compose_user.md.j2 wrapper to replace Python string assembly in
build_prompt().
Kevin Turcios 2026-03-02 06:47:38 -05:00
parent f191c12438
commit a1c0ac6ae4
9 changed files with 247 additions and 281 deletions
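
Below is a minimal, self-contained sketch of the pattern this commit adopts, assuming tiny stand-in template bodies; only the template names and the extends/include/block mechanics mirror the real prompts.

# Illustrative sketch: stand-in templates showing how {% extends %} and
# {% block %} deduplicate the sync/async system prompts, and how a
# conditional {% include %} pulls in JIT content. Bodies are invented.
from jinja2 import DictLoader, Environment, StrictUndefined

templates = {
    # Shared skeleton with overridable blocks (stand-in body).
    "base_system.md.j2": (
        "Write tests for {% block target %}`{{ function_name }}`{% endblock %}.\n"
        "{% block async_rules %}{% endblock %}"
        "{% if is_numerical_code %}{% include 'jit_system.md.j2' %}{% endif %}"
    ),
    # The async child overrides only what differs.
    "generate_async_system.md.j2": (
        "{% extends 'base_system.md.j2' %}\n"
        "{% block target %}**async** `{{ function_name }}`{% endblock %}\n"
        "{% block async_rules %}Await the function under test.\n{% endblock %}"
    ),
    "jit_system.md.j2": "JIT note: warm up before timing.",
}

env = Environment(loader=DictLoader(templates), undefined=StrictUndefined)
print(env.get_template("generate_async_system.md.j2").render(
    function_name="fetch_rows", is_numerical_code=True
))
# -> Write tests for **async** `fetch_rows`.
#    Await the function under test.
#    JIT note: warm up before timing.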

View file

@@ -4,8 +4,8 @@ from dataclasses import dataclass
import libcst as cst
from core.languages.python.cst_utils import any_ellipsis_in_cst, ellipsis_in_cst_not_types
from aiservice.common.markdown_utils import split_markdown_code
from core.languages.python.cst_utils import any_ellipsis_in_cst, ellipsis_in_cst_not_types
from core.languages.python.optimizer.context_utils.context_helpers import is_multi_context
from core.languages.python.testgen.preprocessing.preprocess_pipeline import preprocessing_testgen_pipeline

View file

@@ -9,21 +9,22 @@ from typing import TYPE_CHECKING
import sentry_sdk
import stamina
from jinja2 import Environment, FileSystemLoader, StrictUndefined
from libcst import parse_module
from ninja.errors import HttpError
from openai import OpenAIError
from aiservice.analytics.posthog import ph
from aiservice.common.markdown_utils import extract_code_block
from aiservice.common_utils import safe_isort, should_hack_for_demo
from aiservice.env_specific import debug_log_sensitive_data
from aiservice.llm import EXECUTE_MODEL, HAIKU_MODEL, OPENAI_MODEL, calculate_llm_cost, call_llm
from aiservice.models.functions_to_optimize import FunctionToOptimize
from jinja2 import Environment, FileSystemLoader, StrictUndefined
from libcst import parse_module
from ninja.errors import HttpError
from openai import OpenAIError
from core.languages.python.cst_utils import parse_module_to_cst
from core.languages.python.testgen.context import BaseTestGenContext
from core.languages.python.testgen.demo_hacks import hack_for_demo, hack_for_demo_gsq
from core.languages.python.testgen.instrumentation.edit_generated_test import replace_definition_with_import
from core.languages.python.testgen.models import CostTracker, LLMOutputParseError
from core.languages.python.testgen.postprocessing.code_validator import (
CodeValidationError,
has_test_functions,
@@ -41,10 +42,10 @@ from core.shared.testgen_models import (
)
if TYPE_CHECKING:
from aiservice.llm import LLM
from authapp.auth import AuthenticatedRequest
from openai.types.chat import ChatCompletionMessageParam
from aiservice.llm import LLM
from authapp.auth import AuthenticatedRequest
from core.shared.testgen_models import TestGenSchema
_current_dir = Path(__file__).parent
@@ -52,70 +53,40 @@ _prompts_dir = _current_dir / "prompts"
_jinja_env = Environment(loader=FileSystemLoader(_prompts_dir), keep_trailing_newline=True, undefined=StrictUndefined) # noqa: S701 - rendering LLM prompts, not HTML
JIT_INSTRUCTIONS = _jinja_env.get_template("jit_system.md.j2").render()
JIT_USER_REMINDER = _jinja_env.get_template("jit_user.md.j2").render()
def build_prompt(
ctx: BaseTestGenContext,
*,
qualified_name: str,
source_code: str,
notes: str,
function_name: str,
unit_test_package: str,
*,
is_async: bool,
is_numerical_code: bool | None = None,
model_type: str = "openai",
) -> tuple[list[dict[str, str]], str, str]:
if is_async:
system_template_name = "generate_async_system.md.j2"
user_template_name = "generate_async_user.md.j2"
posthog_event_suffix = "async-"
error_context = "async "
else:
system_template_name = "generate_system.md.j2"
user_template_name = "generate_user.md.j2"
posthog_event_suffix = ""
error_context = ""
system_template = "generate_async_system.md.j2" if is_async else "generate_system.md.j2"
system_prompt = _jinja_env.get_template(system_template_name).render(
function_name=ctx.data.qualified_name, model_type=model_type
system_prompt = _jinja_env.get_template(system_template).render(
function_name=qualified_name, model_type=model_type, is_numerical_code=is_numerical_code
)
if is_numerical_code:
system_prompt += f"\n{JIT_INSTRUCTIONS}\n"
execute_system_message = {"role": "system", "content": system_prompt}
# Build a single user message combining notes and the code template
user_parts = []
all_notes = ctx.generate_notes_markdown()
if all_notes:
user_parts.append(all_notes)
if is_numerical_code:
user_parts.append(JIT_USER_REMINDER)
user_parts.append(
_jinja_env.get_template(user_template_name).render(
unit_test_package=unit_test_package,
function_name=function_name,
function_code=ctx.data.source_code_being_tested,
package_comment="",
)
user_prompt = _jinja_env.get_template("compose_user.md.j2").render(
notes=notes,
is_numerical_code=is_numerical_code,
is_async=is_async,
unit_test_package=unit_test_package,
function_name=function_name,
function_code=source_code,
package_comment="",
)
execute_user_message = {"role": "user", "content": "\n\n".join(user_parts)}
execute_messages = [execute_system_message, execute_user_message]
posthog_event_suffix = "async-" if is_async else ""
error_context = "async " if is_async else ""
execute_messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
return execute_messages, posthog_event_suffix, error_context
class LLMOutputParseError(Exception):
"""Exception for LLM output parsing failures with raw output context."""
def __init__(self, message: str, raw_llm_output: str, code: str | None = None) -> None:
super().__init__(message)
self.raw_llm_output = raw_llm_output
self.code = code # Code extracted from LLM output, if any
def parse_and_validate_llm_output(
response_content: str,
ctx: BaseTestGenContext,
@@ -172,7 +143,7 @@ async def generate_and_validate_test_code(
ctx: BaseTestGenContext,
python_version: tuple[int, int, int],
error_context: str,
cost_tracker: list[float],
cost_tracker: CostTracker,
user_id: str,
posthog_event_suffix: str,
trace_id: str = "",
@@ -216,7 +187,7 @@ async def generate_and_validate_test_code(
)
cost = calculate_llm_cost(response.raw_response, model)
cost_tracker.append(cost)
cost_tracker.add(cost)
debug_log_sensitive_data(f"LLM {error_context}execute response:\n{response.raw_response.model_dump_json(indent=2)}")
@@ -259,7 +230,9 @@ async def generate_regression_tests_from_function(
model_type: str = "openai",
) -> tuple[str, str | None, str | None, str]:
execute_messages, posthog_event_suffix, error_context = build_prompt(
ctx=ctx,
qualified_name=ctx.data.qualified_name,
source_code=ctx.data.source_code_being_tested,
notes=ctx.generate_notes_markdown(),
function_name=function_name,
unit_test_package=unit_test_package,
is_async=is_async,
@@ -267,7 +240,7 @@ async def generate_regression_tests_from_function(
model_type=model_type,
)
cost_tracker = []
cost_tracker = CostTracker()
try:
validated_code, raw_llm_content = await generate_and_validate_test_code(
messages=execute_messages,
@@ -287,8 +260,6 @@ async def generate_regression_tests_from_function(
is_async=data.function_to_optimize.is_async or data.is_async or False,
test_index=test_index,
)
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
processed_cst = postprocessing_testgen_pipeline(
parse_module_to_cst(validated_code),
@@ -339,17 +310,13 @@ async def generate_regression_tests_from_function(
)
return generated_test_source, instrumented_behavior_tests, instrumented_perf_tests, raw_llm_content # noqa: TRY300
except CodeValidationError as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to generate valid {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to generate valid {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
debug_info = e.to_debug_dict()
debug_info["raw_llm_output"] = getattr(e, "raw_llm_output", None)
raise TestGenerationFailedError(msg, debug_info=debug_info) from e
except LLMOutputParseError as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to parse LLM output for {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to parse LLM output for {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
raise TestGenerationFailedError(
msg,
@@ -361,11 +328,12 @@ async def generate_regression_tests_from_function(
},
) from e
except (SyntaxError, ValueError) as e:
total_llm_cost = sum(cost_tracker)
await update_optimization_cost(trace_id=trace_id, cost=total_llm_cost, user_id=user_id)
msg = f"Failed to generate valid {error_context}test code after {len(cost_tracker)} tries. trace_id={trace_id}"
msg = f"Failed to generate valid {error_context}test code after {cost_tracker.calls} tries. trace_id={trace_id}"
logging.exception(msg)
raise TestGenerationFailedError(msg, debug_info={"stage": "unknown", "validation_error": str(e)}) from e
finally:
if cost_tracker.total > 0:
await update_optimization_cost(trace_id=trace_id, cost=cost_tracker.total, user_id=user_id)
async def testgen_python(
@@ -450,7 +418,7 @@ async def testgen_python(
)
except TestGenerationFailedError as e:
logging.exception(f"Test generation failed. trace_id={data.trace_id}")
logging.exception("Test generation failed. trace_id=%s", data.trace_id)
sentry_sdk.capture_exception(e)
# Return detailed debug info for self-healing
debug_info = None
@@ -467,7 +435,7 @@ async def testgen_python(
)
return 500, TestGenErrorResponseSchema(error=str(e), trace_id=data.trace_id, debug_info=debug_info)
except Exception as e:
logging.exception(f"Test generation failed. trace_id={data.trace_id}")
logging.exception("Test generation failed. trace_id=%s", data.trace_id)
sentry_sdk.capture_exception(e)
return 500, TestGenErrorResponseSchema(
error="Error generating tests. Internal server error.", trace_id=data.trace_id

View file

@@ -1,6 +1,28 @@
from __future__ import annotations
import enum
from dataclasses import dataclass
class TestingMode(enum.Enum):
BEHAVIOR = "behavior"
PERFORMANCE = "performance"
@dataclass
class CostTracker:
total: float = 0.0
calls: int = 0
def add(self, cost: float) -> None:
self.total += cost
self.calls += 1
class LLMOutputParseError(Exception):
"""Exception for LLM output parsing failures with raw output context."""
def __init__(self, message: str, raw_llm_output: str, code: str | None = None) -> None:
super().__init__(message)
self.raw_llm_output = raw_llm_output
self.code = code
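
As a quick usage sketch (made-up costs), the new CostTracker replaces the previous list of floats, so retry counts and totals come from attributes rather than len() and sum():

# Hypothetical numbers; one add() per LLM attempt.
tracker = CostTracker()
tracker.add(0.0123)  # first attempt
tracker.add(0.0098)  # retry
assert tracker.calls == 2                  # was len(cost_tracker)
assert round(tracker.total, 4) == 0.0221   # was sum(cost_tracker)
# The finally block above reports tracker.total once, instead of
# re-summing the list in every except branch.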

View file

@@ -4,7 +4,6 @@ from libcst import Arg, Attribute, Call, CSTTransformer, Module, Name
from core.languages.python.cst_utils import evaluate_expression, make_number_node
# Functions where shape is passed as positional args (not as a tuple/list)
TORCH_SHAPE_AS_ARGS = frozenset(["rand", "randn", "ones", "zeros", "empty"])

View file

@@ -0,0 +1,116 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the {% block role_target %}`{{ function_name }}`{% endblock %} function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
<analysis_steps>
Think step by step before writing code. Analyze the {% block analysis_target %}function{% endblock %}:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block analysis_extra %}{% endblock %}
</analysis_steps>
<test_categories>
{% block categories %}
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
{% endblock %}
</test_categories>
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, {% block mock_extras %}{% endblock %}`Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
{% block quality_extra %}
- Include large-scale test cases to assess performance with realistic data volumes
{% endblock %}
</quality_criteria>
{% block async_rules_section %}{% endblock %}
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance{% block method_call_detail %}{% endblock %}. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `{% block test_signature %}def test_...{% endblock %}` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the {% block md_role_target %}`{{ function_name }}`{% endblock %} function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
## Analysis Steps
Think step by step before writing code. Analyze the {% block md_analysis_target %}function{% endblock %}:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block md_analysis_extra %}{% endblock %}
## Test Categories
{% block md_categories %}
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
{% endblock %}
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, {% block md_mock_extras %}{% endblock %}patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
{% block md_quality_extra %}
- Include large-scale test cases to assess performance with realistic data volumes
{% endblock %}
{% block md_async_rules_section %}{% endblock %}
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance{% block md_method_call_detail %}{% endblock %}. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `{% block md_test_signature %}def test_...{% endblock %}` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, {% block md_critical_mock_extras %}{% endblock %}patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% if is_numerical_code %}
{% include "jit_system.md.j2" %}
{% endif %}

View file

@@ -0,0 +1,13 @@
{% if notes %}
{{ notes }}
{% endif %}
{% if is_numerical_code %}
{% include "jit_user.md.j2" %}
{% endif %}
{% if is_async %}
{% include "generate_async_user.md.j2" %}
{% else %}
{% include "generate_user.md.j2" %}
{% endif %}
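
To see the wrapper in isolation, here is a hedged sketch that renders a compose_user.md.j2 with placeholder include bodies; the wrapper structure mirrors the template above, while the included texts are invented:

# Placeholder include bodies; only the compose wrapper's structure is real.
from jinja2 import DictLoader, Environment, StrictUndefined

env = Environment(loader=DictLoader({
    "compose_user.md.j2": (
        "{% if notes %}{{ notes }}\n{% endif %}"
        "{% if is_numerical_code %}{% include 'jit_user.md.j2' %}\n{% endif %}"
        "{% if is_async %}{% include 'generate_async_user.md.j2' %}"
        "{% else %}{% include 'generate_user.md.j2' %}{% endif %}"
    ),
    "jit_user.md.j2": "Reminder: JIT guidance applies.",
    "generate_user.md.j2": "Test `{{ function_name }}` with {{ unit_test_package }}.",
    "generate_async_user.md.j2": "Write async tests for `{{ function_name }}`.",
}), undefined=StrictUndefined)

print(env.get_template("compose_user.md.j2").render(
    notes="Context notes here.", is_numerical_code=True, is_async=False,
    function_name="fetch_rows", unit_test_package="pytest",
))
# -> Context notes here.
#    Reminder: JIT guidance applies.
#    Test `fetch_rows` with pytest.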

View file

@@ -1,18 +1,13 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the **async** `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
{% extends "base_system.md.j2" %}
<analysis_steps>
Think step by step before writing code. Analyze the async function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block role_target %}**async** `{{ function_name }}`{% endblock %}
{% block analysis_target %}async function{% endblock %}
{% block analysis_extra %}
4. What async-specific edge cases exist (concurrent execution, coroutine handling)?
5. What large-scale or throughput scenarios should be covered?
</analysis_steps>
{% endblock %}
<test_categories>
{% block categories %}
Then write tests organized into four categories:
**Basic tests** — Verify fundamental functionality under normal conditions. Test that the function returns expected values when awaited, and test basic async/await behavior.
@@ -22,17 +17,15 @@ Then write tests organized into four categories:
**Large-scale tests** — Assess performance and scalability with concurrent execution. Test multiple concurrent calls using `asyncio.gather()`. Use data structures up to 1000 elements and loops up to 1000 iterations.
**Throughput tests** — Measure performance under load and high-volume scenarios. Name these functions with `_throughput_` in the name (e.g., `test_function_throughput_high_load`). Test with varying loads (small, medium, large) and sustained execution patterns.
</test_categories>
{% endblock %}
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and async-specific scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, `AsyncMock`, `Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
{% block mock_extras %}`AsyncMock`, {% endblock %}
{% block quality_extra %}
- Include concurrent execution tests using `asyncio.gather()` to assess async performance
- Test proper async/await patterns and coroutine handling
</quality_criteria>
{% endblock %}
{% block async_rules_section %}
<async_rules>
- All test functions must use `async def` and be marked with `@pytest.mark.asyncio`
- Use `await` when calling the async function under test
@@ -41,39 +34,21 @@ Then write tests organized into four categories:
- Never create tests that intentionally timeout or hang. No `asyncio.wait_for()` with short timeouts expecting `TimeoutError`. No tests that rely on timing or delays.
- All throughput test functions must include `_throughput_` in their name
</async_rules>
{% endblock %}
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance with `await instance.method(...)`. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
{% block method_call_detail %} with `await instance.method(...)`{% endblock %}
{% block test_signature %}async def test_...{% endblock %}
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `async def test_...` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the **async** `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
{# ── Markdown branch overrides ── #}
## Analysis Steps
Think step by step before writing code. Analyze the async function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
{% block md_role_target %}**async** `{{ function_name }}`{% endblock %}
{% block md_analysis_target %}async function{% endblock %}
{% block md_analysis_extra %}
4. What async-specific edge cases exist (concurrent execution, coroutine handling)?
5. What large-scale or throughput scenarios should be covered?
{% endblock %}
## Test Categories
{% block md_categories %}
Then write tests organized into four categories:
@@ -84,16 +59,15 @@ Then write tests organized into four categories:
**Large-scale tests** — Assess performance and scalability with concurrent execution. Test multiple concurrent calls using `asyncio.gather()`. Use data structures up to 1000 elements and loops up to 1000 iterations.
**Throughput tests** — Measure performance under load and high-volume scenarios. Name these functions with `_throughput_` in the name (e.g., `test_function_throughput_high_load`). Test with varying loads (small, medium, large) and sustained execution patterns.
{% endblock %}
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and async-specific scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, AsyncMock, patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
{% block md_mock_extras %}AsyncMock, {% endblock %}
{% block md_quality_extra %}
- Include concurrent execution tests using `asyncio.gather()` to assess async performance
- Test proper async/await patterns and coroutine handling
{% endblock %}
{% block md_async_rules_section %}
## Async-Specific Rules
- All test functions must use `async def` and be marked with `@pytest.mark.asyncio`
@@ -102,28 +76,8 @@ Then write tests organized into four categories:
- Test concurrent execution using `asyncio.gather()` where appropriate
- Never create tests that intentionally timeout or hang. No `asyncio.wait_for()` with short timeouts expecting `TimeoutError`. No tests that rely on timing or delays.
- All throughput test functions must include `_throughput_` in their name
{% endblock %}
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance with `await instance.method(...)`. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `async def test_...` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, AsyncMock, patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% block md_method_call_detail %} with `await instance.method(...)`{% endblock %}
{% block md_test_signature %}async def test_...{% endblock %}
{% block md_critical_mock_extras %}AsyncMock, {% endblock %}

View file

@@ -1,103 +1 @@
{% if model_type == "anthropic" %}
<role>
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
</role>
<analysis_steps>
Think step by step before writing code. Analyze the function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
4. What large-scale or performance-relevant scenarios should be covered?
</analysis_steps>
<test_categories>
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
</test_categories>
<quality_criteria>
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- **Never use mocks for test inputs.** Do not use `Mock`, `MagicMock`, `Mock(spec=...)`, `SimpleNamespace`, `patch`, or any fake/stub objects to create test inputs or domain objects. Always construct real instances using the actual class constructors with real arguments. Mocks hide real behavior, silently pass on wrong attribute access, and break when optimized code changes access patterns.
- Include large-scale test cases to assess performance with realistic data volumes
</quality_criteria>
<rules>
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Use real classes** — never define stub, fake, mock, dummy, or placeholder classes. Never use `SimpleNamespace` as a stand-in for real objects. Import real classes from their actual modules and construct real instances. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
</rules>
<output_format>
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `def test_...` function.
- Follow the exact template structure provided in the user message.
</output_format>
{% else %}
You are Codeflash, a world-class Python testing engineer. Your goal is to write a comprehensive, high-quality unit test suite for the `{{ function_name }}` function that fully defines its behavior — passing tests confirm correctness, and any mutation to the source code should cause at least one test to fail.
## Analysis Steps
Think step by step before writing code. Analyze the function:
1. What does it do? What are its inputs, outputs, and return types?
2. What are the normal/expected usage patterns?
3. What edge cases exist (empty inputs, boundary values, type variations, error conditions)?
4. What large-scale or performance-relevant scenarios should be covered?
## Test Categories
Then write tests organized into three categories:
**Basic tests** — Verify fundamental functionality under normal conditions with typical inputs.
**Edge tests** — Evaluate behavior under extreme or unusual conditions: empty inputs, boundary values, special characters, None values, type edge cases.
**Large-scale tests** — Assess performance and scalability. Use data structures up to 1000 elements and loops up to 1000 iterations.
## Test Quality Criteria
- Tests should be diverse — cover a wide range of inputs and scenarios
- Tests must be deterministic — always pass or fail the same way
- Sort tests by difficulty, from easiest to hardest
- Always construct real instances using the actual class constructors with real arguments. Import real classes from their actual modules. Do not use Mock, MagicMock, patch, SimpleNamespace, or any fake/stub objects to create test inputs or domain objects. Real instances expose real behavior; mocks silently pass on wrong attribute access and break when optimized code changes access patterns.
- Include large-scale test cases to assess performance with realistic data volumes
## Rules
- **Preserve the original function** — do not modify, enhance, or add parameters to the function under test. Test it exactly as provided.
- **Always use real classes** — import real classes from their actual modules and construct real instances with real arguments. Do not define stub, fake, mock, dummy, or placeholder classes. Do not use `SimpleNamespace` as a stand-in for real objects. Tests that define their own classes or use fake objects will fail `isinstance()` checks and break when code is optimized.
- **Handle instance methods correctly** — if the function has `self`, import the class, create a real instance, and call the method on the instance. Do not pass `self` manually.
- **Use conftest.py fixtures when provided** — prefer fixtures over manual instantiation. Fixtures are pre-configured and handle setup/teardown.
- **Import everything you use** — every symbol must have a corresponding import.
- **Only import what you use** — do not add unused imports.
- **Use correct import sources** — when the dependency context shows `from X import Y`, use that exact source module.
- **Use correct constructor signatures** — only use constructor arguments shown in the provided context. Use concrete subclasses instead of abstract classes.
- **Valid Python string literals** — use ASCII quotes (`'` or `"`) as delimiters. Unicode curly quotes are not valid Python string delimiters.
## Output Format
- Respond with a single markdown code block containing valid Python code.
- Do not nest code blocks or include markdown fences inside code.
- Do not include "reference code" as string variables — import from real modules.
- The code block must contain at least one `def test_...` function.
- Follow the exact template structure provided in the user message.
## CRITICAL REMINDER
You MUST construct real instances using actual class constructors with real arguments. Import real classes from their real modules. Do NOT use Mock, MagicMock, patch, SimpleNamespace, or any fake/stub objects for test inputs or domain objects.
{% endif %}
{% extends "base_system.md.j2" %}

View file

@@ -2,49 +2,45 @@
from __future__ import annotations
from typing import TYPE_CHECKING, TypedDict
from typing import TYPE_CHECKING
from aiservice.common_utils import parse_python_version, validate_trace_id
from aiservice.models.functions_to_optimize import FunctionToOptimize
from ninja.errors import HttpError
from aiservice.common_utils import parse_python_version, validate_trace_id
from core.languages.python.testgen.context import BaseTestGenContext, TestGenContextData
from core.languages.python.testgen.instrumentation.instrument_new_tests import instrument_test_source
from core.languages.python.testgen.models import TestingMode
from core.languages.python.testgen.context import BaseTestGenContext, TestGenContextData
if TYPE_CHECKING:
from core.shared.testgen_models import TestGenSchema
class InstrumentTestSourceArgs(TypedDict):
test_source: str
function_to_optimize: FunctionToOptimize
helper_function_names: list[str]
module_path: str
test_module_path: str
test_framework: str
test_timeout: int
python_version: tuple[int, int, int]
def instrument_tests(
generated_test_source: str, data: TestGenSchema, python_version: tuple[int, int, int]
) -> tuple[str | None, str | None]:
common_args: InstrumentTestSourceArgs = {
"test_source": generated_test_source,
"function_to_optimize": data.function_to_optimize,
"helper_function_names": data.helper_function_names or [],
"module_path": data.module_path,
"test_module_path": data.test_module_path,
"test_framework": data.test_framework,
"test_timeout": data.test_timeout,
"python_version": python_version,
}
# instrument_test_source() already applies isort via format_and_float_to_top()
# No need to apply isort again here (was causing double formatting overhead)
behavior_result = instrument_test_source(**common_args, mode=TestingMode.BEHAVIOR)
perf_result = instrument_test_source(**common_args, mode=TestingMode.PERFORMANCE)
behavior_result = instrument_test_source(
test_source=generated_test_source,
function_to_optimize=data.function_to_optimize,
helper_function_names=data.helper_function_names or [],
module_path=data.module_path,
test_module_path=data.test_module_path,
test_framework=data.test_framework,
test_timeout=data.test_timeout,
python_version=python_version,
mode=TestingMode.BEHAVIOR,
)
perf_result = instrument_test_source(
test_source=generated_test_source,
function_to_optimize=data.function_to_optimize,
helper_function_names=data.helper_function_names or [],
module_path=data.module_path,
test_module_path=data.test_module_path,
test_framework=data.test_framework,
test_timeout=data.test_timeout,
python_version=python_version,
mode=TestingMode.PERFORMANCE,
)
return behavior_result, perf_result