Merge branch 'gpt-5-mini' into update-prompts-mini-5
commit 3c8408d73b

3 changed files with 243 additions and 84 deletions
@@ -4,25 +4,27 @@ import json
import logging
import re
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING

import sentry_sdk
from aiservice.env_specific import debug_log_sensitive_data
from aiservice.models.aimodels import OPTIMIZATION_REVIEW_MODEL, calculate_llm_cost
from log_features.log_event import update_optimization_cost, update_optimization_features_review
from ninja import NinjaAPI, Schema
from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam
from packaging import version

from aiservice.analytics.posthog import ph
from aiservice.env_specific import debug_log_sensitive_data
from aiservice.llm import OPTIMIZATION_REVIEW_MODEL, calculate_llm_cost, call_llm
from log_features.log_event import update_optimization_cost, update_optimization_features_review

if TYPE_CHECKING:
    from aiservice.llm import LLM
    from openai.types.chat import ChatCompletionMessageParam

from aiservice.llm import LLM

optimization_review_api = NinjaAPI(urls_namespace="optimization_review")

current_dir = Path(__file__).parent
SYSTEM_PROMPT = (current_dir / "prompts" / "optimization_review_system_prompt.md").read_text()
USER_PROMPT_TEMPLATE = (current_dir / "prompts" / "user_prompt_template.md").read_text()


class OptimizationReviewErrorSchema(Schema):
    error: str
@@ -37,11 +39,15 @@ class ReviewLevel(str, Enum):
class OptimizationReviewResponseSchema(Schema):
    review: ReviewLevel
    review_explanation: str
    performance_score: int | None = None  # 1-10
    code_quality_score: int | None = None  # 1-10
    hot_path_score: int | None = None  # 1-10


class OptimizationReviewSchema(Schema):
    trace_id: str
    code_diff: str
    original_code: str  # Complete original function/code
    optimized_code: str  # Complete optimized function/code
    original_runtime: str
    optimized_runtime: str
    speedup: str
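As a quick orientation to the new optional scoring fields, here is a hedged, self-contained sketch of how such a response object could be built. It uses plain pydantic as a stand-in for ninja.Schema (which is pydantic-based); the enum member names and every value are assumptions for illustration, not taken from this diff.

```python
from enum import Enum

from pydantic import BaseModel


class ReviewLevel(str, Enum):
    # Member names assumed for illustration; the rating values come from the prompt.
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


class OptimizationReviewResponseSketch(BaseModel):
    review: ReviewLevel
    review_explanation: str
    performance_score: int | None = None  # 1-10
    code_quality_score: int | None = None  # 1-10
    hot_path_score: int | None = None  # 1-10


# Scores stay None when the model omits them; the values here are invented.
example = OptimizationReviewResponseSketch(
    review="medium",
    review_explanation="Algorithmic win with a small readability cost.",
    performance_score=7,
    code_quality_score=6,
    hot_path_score=4,
)
print(example.review, example.performance_score)
```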
@@ -61,83 +67,26 @@ class OptimizationReviewSchema(Schema):
def _build_optimization_review_messages(data: OptimizationReviewSchema) -> list[ChatCompletionMessageParam]:
    if version.parse(data.codeflash_version) <= version.parse("0.18.2") and data.generated_tests:
        data.generated_tests = f"```python\n{data.generated_tests}\n```"
    system_prompt = """You are an expert software engineer who writes really fast programs and is an expert in optimizing runtime and memory requirements of a program by rewriting it. You have deep expertise in modern programming best practices, and clean code principles.

Codeflash is a tool which finds the fastest version of Python code by generating candidate optimizations with LLMs and verifying its correctness and performance gains.
    system_message = ChatCompletionSystemMessageParam(role="system", content=SYSTEM_PROMPT)

Your task is to review an Optimization Pull Request created by Codeflash and recommend whether the developer should merge the optimization or not.
    user_prompt = USER_PROMPT_TEMPLATE.format(
        original_runtime=data.original_runtime,
        optimized_runtime=data.optimized_runtime,
        speedup=data.speedup,
        loop_count=data.loop_count,
        coverage_message=data.coverage_message,
        python_version=data.python_version or "Not specified",
        original_code=data.original_code,
        optimized_code=data.optimized_code,
        explanation=data.explanation,
        generated_tests=data.generated_tests or "Not Available",
        existing_tests=data.existing_tests or "Not Available",
        replay_tests=data.replay_tests or "Not Available",
        benchmark_details=data.benchmark_details or "Not Available",
        calling_fn_details=data.calling_fn_details or "Not Available",
    )

You are provided the following information to succeed in the Optimization Pull Request review process -

- code_diff - The unified diff of the optimized candidate with respect to the baseline code.
- overall_runtime_details - How much faster the optimized code was on average, with the number of loops run, along with the test coverage percentage.
- explanation - The original explanation provided by Codeflash.
- generated_tests (Optional) - The regression tests that were run to test for performance and correctness, with runtime results annotated next to the respective test case.
- existing_tests (Optional) - A table consisting of performance changes over existing tests.
- replay_tests (Optional) - A table consisting of performance changes over replay tests.
- benchmark_details (Optional) - A table showing the runtime on a benchmark workload and the percentage of the time taken by the function.
- python_version - The version of python the code would be executed on.

Information for determining if the function is in a hot path.
- calling_fn_details - Python markdown blocks with filename and references of some functions which call the function being optimized. The filenames and/or references could indicate if the function being optimized is in a hot path. The reference could have the function being called from a place that is important, for example in a loop, which means the effect of optimization might be important.

Guidelines to follow while reviewing the optimization pull request -
- Look closely at overall_runtime_details, generated_tests, existing_tests, replay_tests and benchmark_details to determine if the speedups are high enough to be merged. Take into consideration whether there are any significant slowdowns in inputs you deem important.
- Introduction of the `global` and `nonlocal` keywords in the code_diff is **HIGHLY DISCOURAGED** as it reduces code clarity and maintainability, introduces hidden dependencies, can cause subtle bugs and breaks modularity.
- If the only optimizations are micro-optimizations like inlining a function call, or localizing variables or methods (attribute lookup optimizations) - these can negatively affect the review. The performance improvements are minimal and come at a substantial cost to readability.
- Look closely at code_diff to determine if the optimizations make sense or are spurious micro-optimizations which may not help. Also consider the trade-off between reduced code quality/readability and the performance gain. Micro-optimizations can help if the function is important to be optimized or is in a hot path.
- Look closely at calling_fn_details to determine whether the function being optimized is called in a hot path or not, and if the context where the function is called may benefit from the optimization.
- If there are some changes that make the code unclean, but the core optimization logic makes sense, then you can still recommend it. The user can then clean up the changes before merging.

Based on the information you have, assess if as an expert reviewer, you recommend the developer who wrote the original code to accept and merge the optimizations. You can start first by assigning a score of 1 to 10 to the pull request where 1 indicates lowest recommendation of merging the optimization and 10 indicates highest recommendation of merging the optimization.

Here are the possible final ratings you can assign to the pull request.

- 'high' - Recommend this optimization based on the guidelines. This recommendation will open a Pull Request for the developer to review the optimization.
- 'medium' - The optimization mostly makes sense but there are some tradeoffs or the review is subjective, which does not make it a clear recommendation. This recommendation will create a comment in the existing Pull Request with a link for the developer to view a high level summary that does not grab too much attention and enables the developer to review only if they're interested.
- 'low' - Can't recommend this optimization. This will lead to rejection of this optimization and it won't be shown to the developer for consideration.

Output as a json markdown block with the key named as 'rating' and value being one of 'high', 'medium' or 'low' based on your assessment.
"""
    system_message = ChatCompletionSystemMessageParam(role="system", content=system_prompt)

    user_prompt = f"""
You are given an optimization Pull Request with the following details:

--- Code Comparison ---
code_diff:
{data.code_diff}

--- overall_runtime_details ---
Original Runtime: {data.original_runtime}
Optimized Runtime: {data.optimized_runtime}
Reported Speedup in percentage: {data.speedup}
Maximum loops : {data.loop_count}
Test Coverage : {data.coverage_message}

--- Explanation ---
{data.explanation}

--- Test Results ---
generate_tests:
{data.generated_tests or "Not Available"}

existing_tests:
{data.existing_tests or "Not Available"}

replay_tests:
{data.replay_tests or "Not Available"}

Benchmark Report Table:
{data.benchmark_details or "Not Available"}

--- Hot Path Analysis ---
calling_fn_details:
{data.calling_fn_details or "Not Available"}

Please review this optimization Pull request.
Output as a json markdown block with the key named as 'rating' and value being one of 'high', 'medium' or 'low' based on your assessment.
"""
    user_message = ChatCompletionUserMessageParam(role="user", content=user_prompt)

    return [system_message, user_message]
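The net effect of this hunk is that both prompts now live in markdown files next to the module and the user prompt is filled in with str.format. A minimal sketch of that pattern follows; the directory layout mirrors the diff, but the snippet and all placeholder values are illustrative, not the project's actual module.

```python
from pathlib import Path

# Illustrative sketch of the template-based prompt construction introduced above.
# Assumes a prompts/ directory next to this file containing the two markdown files
# the diff adds; any literal braces in a template would need escaping as {{ }} for
# str.format to work.
prompts_dir = Path(__file__).parent / "prompts"
system_prompt = (prompts_dir / "optimization_review_system_prompt.md").read_text()
user_template = (prompts_dir / "user_prompt_template.md").read_text()

user_prompt = user_template.format(
    original_runtime="120 ms",            # all values below are hypothetical
    optimized_runtime="45 ms",
    speedup="62%",
    loop_count=5,
    coverage_message="87% coverage",
    python_version="3.12",
    original_code="def f(): ...",
    optimized_code="def f(): ...",
    explanation="Replaced a nested loop with a dict lookup.",
    generated_tests="Not Available",
    existing_tests="Not Available",
    replay_tests="Not Available",
    benchmark_details="Not Available",
    calling_fn_details="Not Available",
)

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
```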
@@ -174,10 +123,42 @@ async def get_optimization_review(
    match = re.match(r"(.*?)```json(?:\n|\\n)(.*?)```(.*)", review_text, re.DOTALL | re.MULTILINE)
    if match:
        try:
            review_level = json.loads(match.group(2).lower().strip())
            review_data = json.loads(match.group(2).lower().strip())
            review_explanation = match.group(1) + match.group(3)

            # Extract scores if present
            performance_score = review_data.get("performance_score")
            code_quality_score = review_data.get("code_quality_score")
            hot_path_score = review_data.get("hot_path_score")

            # Extract reasoning
            performance_reasoning = review_data.get("performance_reasoning", "")
            code_quality_reasoning = review_data.get("code_quality_reasoning", "")
            hot_path_reasoning = review_data.get("hot_path_reasoning", "")

            # Append scores and reasoning to explanation for internal visibility
            if any([performance_score, code_quality_score, hot_path_score]):
                scores_summary = "\n\n---\n**Dimension Scores:**"
                if performance_score:
                    scores_summary += f"\n- **Performance: {performance_score}/10**"
                    if performance_reasoning:
                        scores_summary += f"\n  {performance_reasoning}"
                if code_quality_score:
                    scores_summary += f"\n- **Code Quality: {code_quality_score}/10**"
                    if code_quality_reasoning:
                        scores_summary += f"\n  {code_quality_reasoning}"
                if hot_path_score:
                    scores_summary += f"\n- **Hot Path: {hot_path_score}/10**"
                    if hot_path_reasoning:
                        scores_summary += f"\n  {hot_path_reasoning}"
                review_explanation += scores_summary

            review = OptimizationReviewResponseSchema(
                review=review_level["rating"], review_explanation=review_explanation
                review=review_data["rating"],
                review_explanation=review_explanation,
                performance_score=performance_score,
                code_quality_score=code_quality_score,
                hot_path_score=hot_path_score,
            )
        except Exception as e:
            # invalid response
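For reference, a self-contained sketch of the parsing approach used in this hunk: split the model reply around a ```json fence with the same regex, parse the fenced body, and keep the surrounding prose as the explanation. The sample reply and variable names below are illustrative only.

```python
import json
import re

# Hypothetical model reply: prose, then a fenced JSON block, then more prose.
review_text = (
    "The optimization replaces a nested loop with a dict lookup.\n"
    "```json\n"
    '{"rating": "high", "performance_score": 8, "performance_reasoning": "consistent 60% speedup."}\n'
    "```\n"
    "Overall a clear win."
)

match = re.match(r"(.*?)```json(?:\n|\\n)(.*?)```(.*)", review_text, re.DOTALL | re.MULTILINE)
if match:
    # The diff lowercases the fenced body before parsing, as reproduced here.
    review_data = json.loads(match.group(2).lower().strip())
    review_explanation = match.group(1) + match.group(3)
    print(review_data["rating"], review_data.get("performance_score"))
    print(review_explanation.strip())
```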
prompts/optimization_review_system_prompt.md
@@ -0,0 +1,122 @@
You are an expert software engineer specializing in performance optimization. You have deep expertise in:
- Algorithm analysis and computational complexity
- Performance profiling and bottleneck identification
- Code quality assessment and maintainability tradeoffs
- Hot path analysis and optimization impact assessment

# Your Task
Review an Optimization Pull Request created by Codeflash (an automated Python optimization tool) and provide a recommendation on whether the developer should merge the optimization.

# Available Information

You will receive structured data containing:

<performance_metrics>
- original_runtime: Baseline execution time
- optimized_runtime: Optimized execution time
- speedup_percentage: Performance improvement
- loop_count: Number of benchmark iterations
- test_coverage: Code coverage percentage
</performance_metrics>

<code_changes>
- original_code: Complete original function/code
- optimized_code: Complete optimized function/code
</code_changes>

<explanation>
- Codeflash's explanation of the optimization
</explanation>

<validation_data>
- generated_tests: Regression tests with runtime annotations (if available)
- existing_tests: Performance changes on existing test suite (if available)
- replay_tests: Performance changes on recorded execution traces (if available)
- benchmark_details: Detailed benchmark workload analysis (if available)
</validation_data>

<codebase_context>
- calling_fn_details: Functions that call the optimized function, with file paths and code context
- python_version: Target Python version
</codebase_context>

# Review Guidelines

## Critical Anti-Patterns (Automatic Rejection Indicators)
1. **Global/Nonlocal Keywords**: Introduction of `global` or `nonlocal` is HIGHLY DISCOURAGED: it reduces clarity, introduces hidden dependencies, and causes subtle bugs
2. **Pure Micro-optimizations Without Context**: Variable localization, method lookup caching, or function inlining with minimal gains (<10% speedup) and no hot path justification
3. **Module-Level Import Changes Only**: Optimizations that only reorganize imports without algorithmic improvements

## Positive Quality Signals
1. **Algorithmic Improvements**: Better data structures, reduced complexity (O(n²) → O(n log n))
2. **Hot Path Optimizations**: Even small gains matter if the function is called frequently in critical paths
3. **Vectorization/Parallelization**: Leveraging numpy, multiprocessing, or async patterns
4. **Memory Efficiency**: Reduced allocations, generator usage, lazy evaluation
5. **Substantive Speedups**: Significant performance gains with maintained or improved readability

## Assessment Process

Follow this structured reasoning:

1. **Performance Impact Analysis**
   - Evaluate speedup magnitude and consistency across test cases
   - Check for performance regressions on any test cases
   - Assess whether the speedup justifies any code quality tradeoffs

2. **Code Quality Evaluation**
   - Identify the optimization techniques used
   - Assess readability and maintainability impact
   - Check for anti-patterns (global/nonlocal, excessive micro-optimizations)

3. **Hot Path Verification**
   - Analyze calling_fn_details for execution frequency indicators
   - Look for loops, recursive calls, or high-frequency invocation patterns
   - Determine whether the function is in performance-critical paths

4. **Risk Assessment**
   - Consider test coverage adequacy
   - Evaluate the complexity of the changes
   - Assess maintenance burden

# Rating Criteria

Assign ONE of three ratings:

- **high**: Clear recommendation to merge
  * Significant speedup OR moderate speedup in a hot path
  * No critical anti-patterns
  * Algorithmic or substantive improvements
  * Maintained or improved code quality
  * Good test coverage

- **medium**: Optimization has merit but requires developer judgment
  * Moderate speedup with acceptable tradeoffs
  * Minor readability concerns but core logic is sound
  * Uncertain hot path importance
  * May need cleanup before merge

- **low**: Do not recommend
  * Minimal speedup with readability loss
  * Pure micro-optimizations without hot path justification
  * Contains critical anti-patterns (global/nonlocal)
  * Import-only changes
  * Significant code quality degradation

# Output Format

First provide your analysis following the assessment process above.
Then output a JSON markdown block with this exact structure:

```json
{
  "rating": "high" | "medium" | "low",
  "performance_score": 1-10,
  "performance_reasoning": "Brief explanation of why this score",
  "code_quality_score": 1-10,
  "code_quality_reasoning": "Brief explanation of why this score",
  "hot_path_score": 1-10,
  "hot_path_reasoning": "Brief explanation of why this score - mention specific indicators found (loops in calling functions, module importance, recursion, etc.)"
}
```

**Important:** For each score, provide concrete reasoning based on the evidence available. For hot_path_score, explicitly mention what indicators you found or didn't find in the calling_fn_details.
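To make the output contract concrete, here is a hedged Python sketch of a parsed reply that satisfies the format above, with the sort of sanity checks the service side could apply. Every value is invented for illustration and is not taken from this commit.

```python
# Hypothetical example of a parsed reply that follows the output format above;
# all scores and reasoning strings are invented.
parsed_reply = {
    "rating": "medium",
    "performance_score": 6,
    "performance_reasoning": "Consistent ~25% speedup on generated tests, no regressions.",
    "code_quality_score": 5,
    "code_quality_reasoning": "Caches attribute lookups in a loop; small readability cost.",
    "hot_path_score": 4,
    "hot_path_reasoning": "One call site inside a request-handling loop; no recursion found.",
}

# Sanity checks mirroring the stated constraints: a valid rating and 1-10 scores.
assert parsed_reply["rating"] in {"high", "medium", "low"}
assert all(
    1 <= parsed_reply[key] <= 10
    for key in ("performance_score", "code_quality_score", "hot_path_score")
)
```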
prompts/user_prompt_template.md
@@ -0,0 +1,56 @@
<optimization_review_request>

<performance_metrics>
<original_runtime>{original_runtime}</original_runtime>
<optimized_runtime>{optimized_runtime}</optimized_runtime>
<speedup_percentage>{speedup}</speedup_percentage>
<loop_count>{loop_count}</loop_count>
<test_coverage>{coverage_message}</test_coverage>
<python_version>{python_version}</python_version>
</performance_metrics>

<code_changes>
<original_code>
```python
{original_code}
```
</original_code>

<optimized_code>
```python
{optimized_code}
```
</optimized_code>
</code_changes>

<explanation>
{explanation}
</explanation>

<validation_data>
<generated_tests>
{generated_tests}
</generated_tests>

<existing_tests>
{existing_tests}
</existing_tests>

<replay_tests>
{replay_tests}
</replay_tests>

<benchmark_details>
{benchmark_details}
</benchmark_details>
</validation_data>

<codebase_context>
<calling_functions>
{calling_fn_details}
</calling_functions>
</codebase_context>

</optimization_review_request>

Please analyze this optimization following the structured assessment process and provide your rating.