# codeflash-internal/experiments/gen_inspired_tests.py
# Derived from https://github.com/openai/openai-cookbook/blob/main/examples/Unit_test_writing_using_a_multi-step_prompt.ipynb
# TODO: This file is only here as a temporary reference implementation of how an early version of LLM-inspired tests was written.
# It didn't work very well. This should be improved significantly.
import ast  # used for detecting whether generated Python code is valid
import platform

import openai  # used for calling the OpenAI API (the legacy openai.ChatCompletion interface, openai < 1.0)

from aiservice.llm import EXECUTE_MODEL, EXPLAIN_MODEL, LLM, PLAN_MODEL
from codeflash.code_utils.code_extractor import get_code
from codeflash.code_utils.code_utils import ellipsis_in_ast, get_imports_from_file
from codeflash.models.models import TestsInFile
from codeflash.verification.gen_regression_tests import print_message_delta, print_messages


def regression_tests_from_function_with_inspiration(
    function_to_test: str,  # Python function to test, as a string
    function_name: str,  # name of the function to test
    existing_unit_test_path_and_function: list[
        TestsInFile
    ],  # existing unit tests (file path plus test function name) that already test the function
    unit_test_package: str = "pytest",  # unit testing package; use the name as it appears in the import statement
    approx_min_cases_to_cover: int = 7,  # minimum number of test case categories to cover (approximate)
    print_text: bool = False,  # optionally prints text; helpful for understanding the function & debugging
    explain_model: LLM = EXPLAIN_MODEL,  # model used to generate the text explanation in step 1
    plan_model: LLM = PLAN_MODEL,  # model used to plan and write the tests in step 2
    execute_model: LLM = EXECUTE_MODEL,  # model used to generate code in step 3 (currently unused in this version)
    temperature: float = 0.4,  # temperature = 0 can sometimes get stuck in repetitive loops, so we use 0.4
    reruns_if_fail: int = 1,  # if the output code cannot be parsed, re-run the generation up to N times
    python_version: tuple[int, int, int] = tuple(int(ver) for ver in platform.python_version_tuple()),
) -> str:
"""Returns a unit test for a given Python function, using a 3-step GPT prompt."""
# TODO: This step is exactly the same as the non-inspired test generator. Merge them into one to save on API calls
# Step 1: Generate an explanation of the function
# create a markdown-formatted message that asks GPT to explain the function, formatted as a bullet list
explain_system_message = {
"role": "system",
"content": "You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists.",
}
explain_user_message = {
"role": "user",
"content": f"""Please explain the following Python function '{function_name}'. Review what each element of the function is doing precisely and what the author's intentions may have been. Organize your explanation as a markdown-formatted, bulleted list.
```python
{function_to_test}
```""",
}
explain_messages = [explain_system_message, explain_user_message]
if print_text:
print_messages(explain_messages)
    try:
        explanation_response = openai.ChatCompletion.create(
            model=explain_model.name, messages=explain_messages, temperature=temperature, stream=True
        )
    except Exception as e:
        print(str(e))
        if reruns_if_fail > 0:
            print("Rerunning...")
            return regression_tests_from_function_with_inspiration(
                function_to_test=function_to_test,
                function_name=function_name,
                existing_unit_test_path_and_function=existing_unit_test_path_and_function,
                unit_test_package=unit_test_package,
                approx_min_cases_to_cover=approx_min_cases_to_cover,
                print_text=print_text,
                explain_model=explain_model,  # pass the caller's models through instead of resetting to the defaults
                plan_model=plan_model,
                execute_model=execute_model,
                temperature=temperature,
                reruns_if_fail=reruns_if_fail - 1,  # decrement rerun counter when calling again
                python_version=python_version,
            )
        raise  # out of reruns; explanation_response would be unbound below
    explanation = ""
    for chunk in explanation_response:
        # In the legacy streaming API, each chunk looks like {"choices": [{"delta": {"content": "..."}}]}
        delta = chunk["choices"][0]["delta"]
        if print_text:
            print_message_delta(delta)
        if "content" in delta:
            explanation += delta["content"]
    explain_assistant_message = {"role": "assistant", "content": explanation}
    # Step 2: Ask GPT to plan and write the new unit tests in a single prompt,
    # taking inspiration from the existing tests that already cover this function
    existing_unit_tests = []
    max_inspiration_tests = 3
    tests_in_file: TestsInFile
    inspired_test_imports = []
    test_files = set()
    for i, tests_in_file in enumerate(existing_unit_test_path_and_function):
        if i >= max_inspiration_tests:
            break  # check the cap first so we don't collect imports from test files we won't show
        path = tests_in_file.test_file
        test_files.add(path)
        function = tests_in_file.test_function
        tester_code = get_code(path, function)
        code = f"""```python
{tester_code}
```"""
        existing_unit_tests.append(code)
    for path in test_files:
        imports_ast = get_imports_from_file(file_path=path)
        inspired_test_imports.append(imports_ast)
package_comment = ""
execute_user_message = {
"role": "user",
"content": f"""A good unit test suite should aim to:
- Test the function's behavior for a wide range of possible inputs
- Test edge cases that the author may not have foreseen
- Take advantage of the features of `{unit_test_package}` to make the tests easy to write and maintain
- Be easy to read and understand, with clean code and descriptive names
- Be deterministic, so that the tests always pass or fail in the same way
To test the function above, here {"are a few unit tests" if len(existing_unit_tests) > 1 else "is a unit test"} that already exist:
{f"{chr(10)}".join(existing_unit_tests)}
Using Python and the `{unit_test_package}` package, and taking inspiration from the existing test cases above, write a new suite of unit tests for the function '{function_name}'. Include helpful comments to explain each line. Generate concrete runnable test without using ellipsis. Reply only with code, formatted as follows:
```python
# imports
import {unit_test_package} # used for our unit tests
{{insert other imports as needed}}
# function to test
{function_to_test}
# unit tests
{package_comment}
{{insert unit test code here}}
```
""",
}
plan_messages = [explain_system_message, explain_user_message, explain_assistant_message, execute_user_message]
if print_text:
print_messages([execute_user_message])
    try:
        execute_response = openai.ChatCompletion.create(
            model=plan_model.name, messages=plan_messages, temperature=temperature, stream=True
        )
    except Exception as e:
        print(str(e))
        if reruns_if_fail > 0:
            print("Rerunning...")
            return regression_tests_from_function_with_inspiration(
                function_to_test=function_to_test,
                function_name=function_name,
                existing_unit_test_path_and_function=existing_unit_test_path_and_function,
                unit_test_package=unit_test_package,
                approx_min_cases_to_cover=approx_min_cases_to_cover,
                print_text=print_text,
                explain_model=explain_model,  # pass the caller's models through instead of resetting to the defaults
                plan_model=plan_model,
                execute_model=execute_model,
                temperature=temperature,
                reruns_if_fail=reruns_if_fail - 1,  # decrement rerun counter when calling again
                python_version=python_version,
            )
        raise  # out of reruns; execute_response would be unbound below
    execution = ""
    for chunk in execute_response:
        delta = chunk["choices"][0]["delta"]
        if print_text:
            print_message_delta(delta)
        if "content" in delta:
            execution += delta["content"]
    # Extract the generated code from the reply and check the output for errors.
    # If the reply has no ```python fence, fall back to the raw reply and let the parse check below trigger a rerun.
    code = execution.split("```python")[1].split("```")[0].strip() if "```python" in execution else execution.strip()
    # TODO: This adds a bunch of redundant imports, clean them up
    tests_list = [imp for sublist in inspired_test_imports for imp in sublist]
    # Wrap the import nodes in a Module so ast.unparse gets a documented input type
    code = ast.unparse(ast.Module(body=tests_list, type_ignores=[])) + "\n" + code
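    # One possible cleanup for the TODO above (a sketch only, not wired in): keep the
    # first occurrence of each import, keyed by its unparsed source text:
    #   seen: set[str] = set()
    #   deduped: list[ast.stmt] = []
    #   for node in tests_list:
    #       src = ast.unparse(node)
    #       if src not in seen:
    #           seen.add(src)
    #           deduped.append(node)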
    try:
        module = ast.parse(code, feature_version=python_version[:2])
        if ellipsis_in_ast(module):
            # If the test generator emitted an ellipsis, it punted on writing the
            # concrete test cases, so we should regenerate
            raise SyntaxError("Ellipsis in generated test code, regenerating...")
    except SyntaxError:
        print("Syntax error in generated code.")
        if reruns_if_fail > 0:
            print("Rerunning...")
            return regression_tests_from_function_with_inspiration(
                function_to_test=function_to_test,
                function_name=function_name,
                existing_unit_test_path_and_function=existing_unit_test_path_and_function,
                unit_test_package=unit_test_package,
                approx_min_cases_to_cover=approx_min_cases_to_cover,
                print_text=print_text,
                explain_model=explain_model,  # pass the caller's models through instead of resetting to the defaults
                plan_model=plan_model,
                execute_model=execute_model,
                temperature=temperature,
                reruns_if_fail=reruns_if_fail - 1,  # decrement rerun counter when calling again
                python_version=python_version,
            )
    # return the unit test as a string (even if the retries were exhausted)
    return code
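
# Illustrative call (a sketch; the inputs are assumed, and TestsInFile's field names
# are inferred from the attribute access in the loop above):
#   tests = [TestsInFile(test_file="tests/test_foo.py", test_function="test_foo_basic")]
#   suite = regression_tests_from_function_with_inspiration(
#       function_to_test="def foo(x):\n    return x + 1\n",
#       function_name="foo",
#       existing_unit_test_path_and_function=tests,
#   )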


class ModifyInspiredTests(ast.NodeTransformer):
    """Collect the imports in a generated test module and strip them from the tree."""

    def __init__(self, import_list, test_framework):
        self.import_list = import_list
        self.test_framework = test_framework

    def visit_Import(self, node: ast.Import):
        # Collect the import; returning None makes NodeTransformer drop it from the tree
        self.import_list.append(node)

    def visit_ImportFrom(self, node: ast.ImportFrom):
        # Collect the from-import; returning None makes NodeTransformer drop it from the tree
        self.import_list.append(node)

    def visit_ClassDef(self, node: ast.ClassDef):
        # No unittest-specific transformations needed since we only support pytest
        return node
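
# A minimal sketch of how ModifyInspiredTests might be applied (assumed usage;
# nothing in this file calls it yet). The import visitors return None, so
# NodeTransformer removes those nodes from the tree while we keep a copy:
#   source = "import pytest\n\ndef test_add():\n    assert 1 + 1 == 2\n"
#   collected: list[ast.stmt] = []
#   tree = ModifyInspiredTests(collected, "pytest").visit(ast.parse(source))
#   body_only = ast.unparse(tree)  # the tests, with their imports stripped
#   imports_only = ast.unparse(ast.Module(body=collected, type_ignores=[]))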