codeflash-internal/django/aiservice/optimization_review/optimization_review.py

from __future__ import annotations

import json
import logging
import re
from enum import Enum
from typing import TYPE_CHECKING, cast

import sentry_sdk
from aiservice.env_specific import create_llm_client, debug_log_sensitive_data, llm_clients
from aiservice.models.aimodels import OPTIMIZATION_REVIEW_MODEL, calculate_llm_cost
from log_features.log_event import update_optimization_cost, update_optimization_features_review
from ninja import NinjaAPI, Schema
from openai.types.chat import ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam
from packaging import version

if TYPE_CHECKING:
    from aiservice.models.aimodels import LLM
from aiservice.analytics.posthog import ph

optimization_review_api = NinjaAPI(urls_namespace="optimization_review")


class OptimizationReviewErrorSchema(Schema):
    error: str


class ReviewLevel(str, Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


class OptimizationReviewResponseSchema(Schema):
    review: ReviewLevel
    review_explanation: str


class OptimizationReviewSchema(Schema):
    trace_id: str
    code_diff: str
    original_runtime: str
    optimized_runtime: str
    speedup: str
    existing_tests: str | None
    generated_tests: str | None
    replay_tests: str | None
    benchmark_details: str | None
    coverage_message: str
    loop_count: int
    explanation: str
    calling_fn_details: str
    python_version: str | None = None
    codeflash_version: str = "0.18.2"


def _build_optimization_review_messages(
    data: OptimizationReviewSchema,
) -> list[ChatCompletionSystemMessageParam | ChatCompletionUserMessageParam]:
    if version.parse(data.codeflash_version) <= version.parse("0.18.2") and data.generated_tests:
        data.generated_tests = f"```python\n{data.generated_tests}\n```"
    system_prompt = """You are an expert software engineer who writes really fast programs and is an expert in optimizing runtime and memory requirements of a program by rewriting it. You have deep expertise in modern programming best practices, and clean code principles.

Codeflash is a tool which finds the fastest version of Python code by generating candidate optimizations with LLMs and verifying its correctness and performance gains.

Your task is to review an Optimization Pull Request created by Codeflash and recommend whether the developer should merge the optimization or not.

You are provided the following information to succeed in the Optimization Pull Request review process -

- code_diff - The unified diff of the optimized candidate with respect to the baseline code.
- overall_runtime_details - How much faster the optimized code was on an average with the number of loops ran along with test coverage percentage.
- explanation - The original explanation provided by Codeflash.
- generated_tests (Optional) - The regression tests that were run to test for performance and correctness, with runtime results annotated next to the respective test case.
- existing_tests (Optional) - A table consisting of performance changes over existing tests.
- replay_tests (Optional) - A table consisting of performance changes over replay tests.
- benchmark_details (Optional) - A table showing the runtime on a benchmark workload and the percentage of the time taken by the function.
- python_version - The version of python the code would be executed on.

Information for determining if the function is in a hot path.
- calling_fn_details - Python markdown blocks with filename and references of some functions which call the function being optimized. The filenames and/or references could indicate if the function being optimized is in a hot path. The reference could have the function being called from a place that is important, for example in a loop, which means the effect of optimization might be important.

Guidelines to follow while reviewing the optimization pull request-
- Look closely at overall_runtime_details, generated_tests, existing_tests, replay_tests and benchmark_details to determine if the speedups are high enough to be merged. Take into consideration whether there are any significant slowdowns in inputs you deem important.
- Introduction of the `global` and `nonlocal` keywords in the code_diff is **HIGHLY DISCOURAGED** as it reduces code clarity and maintainability, introduces hidden dependencies, can cause subtle bugs and breaks modularity.
- If the only optimizations are micro-optimizations like inlining a function call, or localizing variables or methods (not being used in a loop) - these can negatively affect the review, especially if python_version is older than 3.11.
- Look closely at code_diff to determine if the optimizations make sense or are spurious micro-optimizations which may not help. Also consider the trade-off between reduced code quality/readability and the performance gain. Micro-optimizations can help if the function is important to be optimized or is in a hot path.
- Look closely at calling_fn_details to determine whether the function being optimized is called in a hot path or not, and if the context where the function is called may benefit from the optimization.
- If there are some changes that make the code unclean, but the core optimization logic makes sense, then you can still recommend it. The user can then clean up the changes before merging.

Based on the information you have, assess if as an expert reviewer, you recommend the developer who wrote the original code to accept and merge the optimizations. You can start first by assigning a score of 1 to 10 to the pull request where 1 indicates lowest recommendation of merging the optimization and 10 indicates highest recommendation of merging the optimization.

Here are the possible final ratings you can assign to the pull request.

- 'high' - Recommend this optimization based on the guidelines. This recommendation will open a Pull Request for the developer to review the optimization.
- 'medium' - The optimization mostly makes sense but there are some tradeoffs or the review is subjective, which does not make it a clear recommendation. This recommendation will create a comment in the existing Pull Request with a link for the developer to view a high level summary that does not grab too much attention and enables the developer to review only if they're interested.
- 'low' - Can't recommend this optimization. This will lead to rejection of this optimization and it won't be shown to the developer for consideration.

Output as a json markdown block with the key named as 'rating' and value being one of 'high', 'medium' or 'low' based on your assessment.
"""
    system_message = ChatCompletionSystemMessageParam(role="system", content=system_prompt)

    user_prompt = f"""
You are given an optimization Pull Request with the following details:

--- Code Comparison ---
code_diff:
{data.code_diff}

--- overall_runtime_details ---
Original Runtime: {data.original_runtime}
Optimized Runtime: {data.optimized_runtime}
Reported Speedup in percentage: {data.speedup}
Maximum loops : {data.loop_count}
Test Coverage : {data.coverage_message}

--- Explanation ---
{data.explanation}

--- Test Results ---
generate_tests:
{data.generated_tests or "Not Available"}

existing_tests:
{data.existing_tests or "Not Available"}

replay_tests:
{data.replay_tests or "Not Available"}

Benchmark Report Table:
{data.benchmark_details or "Not Available"}

--- Hot Path Analysis ---
calling_fn_details:
{data.calling_fn_details or "Not Available"}

Please review this optimization Pull request.
Output as a json markdown block with the key named as 'rating' and value being one of 'high', 'medium' or 'low' based on your assessment.
"""
    user_message = ChatCompletionUserMessageParam(role="user", content=user_prompt)

    return [system_message, user_message]


async def get_optimization_review(
    request,
    data: OptimizationReviewSchema,
    optimization_review_model: LLM = OPTIMIZATION_REVIEW_MODEL,  # noqa: ANN001
) -> tuple[int, OptimizationReviewResponseSchema | OptimizationReviewErrorSchema]:
    """Compute optimization review via Claude."""
    ph(request.user, "aiservice-optimization-review-called")
    try:
        messages = _build_optimization_review_messages(data)

        debug_log_sensitive_data(f"{messages[0]}{messages[1]}")

        llm_client = llm_clients[optimization_review_model.model_type]
        # Call Claude API with retries
        response = await llm_client.with_options(max_retries=2).chat.completions.create(
            model=optimization_review_model.name, messages=messages
        )
        # Calculate and update cost
        cost = calculate_llm_cost(response, optimization_review_model)
        if cost:
            await update_optimization_cost(data.trace_id, cost)

        # Extract review text from Claude response
        review_text = cast("str", response.choices[0].message.content).strip()
        match = re.match(r"(.*?)```json(?:\n|\\n)(.*?)```(.*)", review_text, re.DOTALL | re.MULTILINE)
        if match:
            try:
                review_level = json.loads(match.group(2).lower().strip())
                review_explanation = match.group(1) + match.group(3)
                review = OptimizationReviewResponseSchema(
                    review=review_level["rating"], review_explanation=review_explanation
                )
            except Exception as e:
                # invalid response
                logging.exception("Invalid optimization review response")
                sentry_sdk.capture_exception(e)
                debug_log_sensitive_data(f"Invalid response : {e}")
                return 500, OptimizationReviewErrorSchema(error="Invalid response")
            else:
                ph(request.user, "aiservice-optimization-review-successful")
                return 200, review
        else:
            return 500, OptimizationReviewErrorSchema(error="Invalid response")
    except Exception as e:
        logging.exception("Error in optimization_review")
        sentry_sdk.capture_exception(e)
        return 500, OptimizationReviewErrorSchema(error="Internal server error")


@optimization_review_api.post(
    "/",
    response={
        200: OptimizationReviewResponseSchema,
        400: OptimizationReviewErrorSchema,
        500: OptimizationReviewErrorSchema,
    },
)
async def optimization_review(
    request,
    data: OptimizationReviewSchema,  # noqa: ANN001
) -> tuple[int, OptimizationReviewResponseSchema | OptimizationReviewErrorSchema]:
    response_code, output = await get_optimization_review(request, data)
    try:
        if response_code == 200:
            review_event = output.review.value
            review_explanation = output.review_explanation
        else:
            review_event = output.error
            review_explanation = ""
        await update_optimization_features_review(
            trace_id=data.trace_id,
            review_quality=review_event,
            review_explanation=review_explanation,
            calling_fn_details=data.calling_fn_details,
        )
    except Exception as e:  # noqa: BLE001
        debug_log_sensitive_data(f"event logging failed for optimization review {e}")
    return response_code, output