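"""Tests for the optimization post-processing helpers in optimizer.postprocess."""
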
from optimizer.postprocess import (
    optimizations_postprocessing_pipeline,
    cleanup_explanations,
    deduplicate_optimizations,
    fix_missing_docstring,
    dedup_and_sort_imports,
)


def test_postprocess_optimizations_basic():
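    # The full pipeline should pass a single, genuinely new optimization through unchanged.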
    original_code = "print('hello')"
    optimizations = [("print('hi')", "Simplified print statement")]

    expected = [("print('hi')", "Simplified print statement")]
    actual = optimizations_postprocessing_pipeline(original_code, optimizations)

    assert actual == expected


def test_postprocess_deduplicates():
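    # Duplicate candidate code is collapsed; the explanation from the last duplicate is kept.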
    original_code = "print('hello')"
    optimizations = [("print('hi')", "Simplified print"), ("print('hi')", "Simplified print again")]

    expected = [("print('hi')", "Simplified print again")]
    actual = deduplicate_optimizations(original_code, optimizations)

    assert actual == expected


def test_postprocess_bubble_sort():
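    # A real rewrite of the original function should survive the full pipeline intact.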
    original_code = """
def sorter(arr):
    for i in range(len(arr)):
        for j in range(len(arr) - 1):
            if arr[j] > arr[j + 1]:
                temp = arr[j]
                arr[j] = arr[j + 1]
                arr[j + 1] = temp
    return arr
"""

    optimizations = [
        (
            "def sorter(arr):\n    return sorted(arr)\n",
            "Optimized to use built-in sorted() function",
        ),
    ]

    expected = [
        (
            "def sorter(arr):\n    return sorted(arr)\n",
            "Optimized to use built-in sorted() function",
        )
    ]

    actual = optimizations_postprocessing_pipeline(original_code, optimizations)

    assert actual == expected


def test_optimize_bubble_sort_multiple():
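    # Duplicate rewrites collapse to one entry, the no-op candidate (identical to the
    # original code) is dropped, and a trailing colon in the explanation becomes a period.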
    original_code = """
def sorter(arr):
    for i in range(len(arr)):
        for j in range(len(arr) - 1):
            if arr[j] > arr[j + 1]:
                temp = arr[j]
                arr[j] = arr[j + 1]
                arr[j + 1] = temp
    return arr
"""

    optimizations = [
        (
            "def sorter(arr):\n    return sorted(arr)\n",
            "Your program is performing a bubble sort, which is known for its poor performance in large datasets because it has a worst and average time complexity of O(n^2). We can make this run faster by using a more efficient sorting algorithm. Python's built-in sorted() function uses the Timsort algorithm, which has an average and worst case time complexity of O(n log n), making it much faster and more efficient for large datasets.\n\nHere's how you can rewrite your function:\n\n\nThis change significantly improves the time complexity and memory usage of your function.",
        ),
        (
            "def sorter(arr):\n    return sorted(arr)\n",
            "Your original program is using bubble sort which has a time complexity of O(n^2) making it inefficient for large data sets. A faster sorting algorithm could be applied here, 'Timsort', which is a hybrid sorting algorithm, derived from merge sort and insertion sort, designed to perform well on many kinds of real-world data. This algorithm is built-in as a native sorting algorithm in Python, and hence it runs considerably faster than any algorithm implemented in Python.\n\nHere is the updated faster program:\n\n",
        ),
        (
            original_code,
            "Blah blah Blah.",
        ),
    ]

    expected = [
        (
            "def sorter(arr):\n    return sorted(arr)\n",
            "Your original program is using bubble sort which has a time complexity of O(n^2) making it inefficient for large data sets. A faster sorting algorithm could be applied here, 'Timsort', which is a hybrid sorting algorithm, derived from merge sort and insertion sort, designed to perform well on many kinds of real-world data. This algorithm is built-in as a native sorting algorithm in Python, and hence it runs considerably faster than any algorithm implemented in Python.\n\nHere is the updated faster program.\n\n",
        ),
    ]

    actual = optimizations_postprocessing_pipeline(original_code, optimizations)

    assert actual == expected


def test_filter_explanations():
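    # cleanup_explanations should drop "Here is the code:"-style lead-ins and embedded
    # code fences, and turn trailing colons into periods, leaving other prose intact.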
    explanations = [
        """Here is an optimized version of the function. I used an compiled regular expression for better performance and file.readlines() for faster file reading. Here is the code:

This code could be even faster for larger files as it reads the file all at once and uses the compiled regular expression to match lines. It also reduces function calls in the loop, which tend to be expensive, and uses native Python operations.""",
        """Firstly, I noticed some repeated code which you can eliminate. This can improve the runtime performance of your application and also the readability of your code. In the two functions is_target_function_line() and update_line_node(), you loop through ast.walk(line_node) twice to look for a matched function name. This operation is unnecessary and can be improved to iterate once.

I provide an optimized version of your code below. I've used a dictionary to store the result of ast.walk(line_node) so that we only need to scan the nodes once. Please note that refactoring might raise unexpected side effects, depending on the interlink of these functions with other parts of the code. It's advisable to thoroughly test your application after changes.""",
        """The provided program implements a bubble sort, which has a time complexity of O(n^2). A more efficient sorting algorithm is Timsort, which Python's built-in sorted function uses. It has a time complexity of O(n log n). Here's how you can rewrite your function using Python's built-in sorting function:
```
def sorter(arr):
    return sorted(arr)
```
The built-in sorted function returns a new sorted list, and does not modify the original list. If you want to sort the list in place (modify the original list), you can use the sort method of list objects:

Other than the notable performance difference with large input data due to the more effective sorting algorithm, these function versions also incorporate Python's swapping mechanism which too saves additional time compared to traditional element swapping.""",
        """Your given python function is already optimized to the maximum, there's no possible way to rewrite it in a faster way. It simply returns a value from a dictionary which is an O(1) operation meaning it will take the same constant time no matter how large the input dictionary is.

This function is performing in constant time (O(1)) and can't be optimized further.""",
        """The original code you provided is already quite efficient in terms of computational complexity. Nonetheless, we can still make a small optimization to it. Instead of using str.join to turn a list of commands into a string and executing it with subprocess.run, we can pass the list directly to subprocess.run, this way, python will handle it optimally.

Here is the optimized code:

The changes made:

Running the command directly from the list hf_spaces_deploy_cmd.
I added str(e) in the except block. This is because formatting a low-level exception may cause another exception because not all exceptions implicitly call str() when they're in a format string. So it's safer to do it explicitly.

These changes are mostly about making the code a little bit more Pythonic and safer, they may not notably speed up the execution. Your original code is already written well and does not do any heavy computations to optimize.""",
        """Here is an optimized version of the function. I used an compiled regular expression for better performance and file.readlines() for faster file reading. Here is the code:

This code could be even faster for larger files as it reads the file all at once and uses the compiled regular expression to match lines. It also reduces function calls in the loop, which tend to be expensive, and uses native Python operations.""",
        """The current program is using a try-except block and calling the logging.warning function when the length of the string is zero. However, this does not need to be caught as an exception and can be handled in a simple if condition check. Also, the in operator is inefficient as it scans through all the characters in string.printable for every character in s resulting in a time complexity of O(n*m). A one-time creation of set(string.printable) can greatly improve the execution time. Here is the optimized version of the code:

This version of the program should run faster for larger inputs. Note that this program makes a trade-off between runtime and memory, using additional memory for storing string.printable in a set.""",
        """The provided code is already well optimized in terms of Python, as it uses the dictionary's get() method which provides a default value when a specified key is not in the dictionary and avoids raising an exception. This is more efficient than checking if the key is in the dictionary then getting the value. However, a slight enhancement would be pre-computing default values in the else statement:

This may slightly speed up runtime due to reduction in function calls but any performance gain would most likely be negligible as the given code was already quite efficient. Also, remember that optimizations can sometimes make code harder to read and should only be pursued when necessary.""",
        """However, if the default parameter config = None is a common case and creating a new instance of BaseEmbedderConfig is a heavy or costly operation, we could improve it by using a global mutable default parameter to avoid creating a new instance each time we call __init__, as follows:

Note: This suggested optimization should be applied carefully because if you modify the default_config object, modifications will persist across calls which can lead to unexpected behavior. If BaseEmbedderConfig's instances are not supposed to be modified after creation, this would not be an issue.

The rest of your code looks efficient and there is not much opportunity for further optimization given the current information. However, keep in mind that the performance of the __init__ method and to_embeddings heavily depends on the BaseEmbedderConfig class and embedding_fn method, which are not provided. Those might provide opportunities for optimization.""",
        """The given code mostly consists of class and method definitions, so there's limited room for optimization because the computational complexity is relatively low. However, there are minor changes that can be applied for a bit of efficiency:

Using is not None comparisons in place of or checks in the constructor could prevent potential issues with zero values (0, '', False, etc.), which are False in a boolean context.

Used c.__class__ is MessageContentText instead of isinstance(c, MessageContentText). This checks for the exact type, not subclasses, which is faster if no subclassing is being used.""",
    ]

    expected = [
        """Here is an optimized version of the function. I used an compiled regular expression for better performance and file.readlines() for faster file reading.

This code could be even faster for larger files as it reads the file all at once and uses the compiled regular expression to match lines. It also reduces function calls in the loop, which tend to be expensive, and uses native Python operations.""",
        """Firstly, I noticed some repeated code which you can eliminate. This can improve the runtime performance of your application and also the readability of your code. In the two functions is_target_function_line() and update_line_node(), you loop through ast.walk(line_node) twice to look for a matched function name. This operation is unnecessary and can be improved to iterate once.

I provide an optimized version of your code below. I've used a dictionary to store the result of ast.walk(line_node) so that we only need to scan the nodes once. Please note that refactoring might raise unexpected side effects, depending on the interlink of these functions with other parts of the code. It's advisable to thoroughly test your application after changes.""",
        """The provided program implements a bubble sort, which has a time complexity of O(n^2). A more efficient sorting algorithm is Timsort, which Python's built-in sorted function uses. It has a time complexity of O(n log n). Here's how you can rewrite your function using Python's built-in sorting function.

The built-in sorted function returns a new sorted list, and does not modify the original list. If you want to sort the list in place (modify the original list), you can use the sort method of list objects.

Other than the notable performance difference with large input data due to the more effective sorting algorithm, these function versions also incorporate Python's swapping mechanism which too saves additional time compared to traditional element swapping.""",
        """Your given python function is already optimized to the maximum, there's no possible way to rewrite it in a faster way. It simply returns a value from a dictionary which is an O(1) operation meaning it will take the same constant time no matter how large the input dictionary is.

This function is performing in constant time (O(1)) and can't be optimized further.""",
        """The original code you provided is already quite efficient in terms of computational complexity. Nonetheless, we can still make a small optimization to it. Instead of using str.join to turn a list of commands into a string and executing it with subprocess.run, we can pass the list directly to subprocess.run, this way, python will handle it optimally.


The changes made.

Running the command directly from the list hf_spaces_deploy_cmd.
I added str(e) in the except block. This is because formatting a low-level exception may cause another exception because not all exceptions implicitly call str() when they're in a format string. So it's safer to do it explicitly.

These changes are mostly about making the code a little bit more Pythonic and safer, they may not notably speed up the execution. Your original code is already written well and does not do any heavy computations to optimize.""",
        """Here is an optimized version of the function. I used an compiled regular expression for better performance and file.readlines() for faster file reading.

This code could be even faster for larger files as it reads the file all at once and uses the compiled regular expression to match lines. It also reduces function calls in the loop, which tend to be expensive, and uses native Python operations.""",
        """The current program is using a try-except block and calling the logging.warning function when the length of the string is zero. However, this does not need to be caught as an exception and can be handled in a simple if condition check. Also, the in operator is inefficient as it scans through all the characters in string.printable for every character in s resulting in a time complexity of O(n*m). A one-time creation of set(string.printable) can greatly improve the execution time.

This version of the program should run faster for larger inputs. Note that this program makes a trade-off between runtime and memory, using additional memory for storing string.printable in a set.""",
        """The provided code is already well optimized in terms of Python, as it uses the dictionary's get() method which provides a default value when a specified key is not in the dictionary and avoids raising an exception. This is more efficient than checking if the key is in the dictionary then getting the value. However, a slight enhancement would be pre-computing default values in the else statement.

This may slightly speed up runtime due to reduction in function calls but any performance gain would most likely be negligible as the given code was already quite efficient. Also, remember that optimizations can sometimes make code harder to read and should only be pursued when necessary.""",
        """However, if the default parameter config = None is a common case and creating a new instance of BaseEmbedderConfig is a heavy or costly operation, we could improve it by using a global mutable default parameter to avoid creating a new instance each time we call __init__.

Note: This suggested optimization should be applied carefully because if you modify the default_config object, modifications will persist across calls which can lead to unexpected behavior. If BaseEmbedderConfig's instances are not supposed to be modified after creation, this would not be an issue.

The rest of your code looks efficient and there is not much opportunity for further optimization given the current information. However, keep in mind that the performance of the __init__ method and to_embeddings heavily depends on the BaseEmbedderConfig class and embedding_fn method, which are not provided. Those might provide opportunities for optimization.""",
        """The given code mostly consists of class and method definitions, so there's limited room for optimization because the computational complexity is relatively low. However, there are minor changes that can be applied for a bit of efficiency.

Using is not None comparisons in place of or checks in the constructor could prevent potential issues with zero values (0, '', False, etc.), which are False in a boolean context.

Used c.__class__ is MessageContentText instead of isinstance(c, MessageContentText). This checks for the exact type, not subclasses, which is faster if no subclassing is being used.""",
    ]
    code_and_explanations = [("", exp) for exp in explanations]
    expected_code_and_explanations = [("", exp) for exp in expected]
    filtered = cleanup_explanations("", code_and_explanations)
    for filtered_elem, expected_elem in zip(filtered, expected_code_and_explanations):
        assert filtered_elem[1] == expected_elem[1]


def test_docstring_fix():
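    # Docstrings dropped by the optimization are restored from the original code;
    # docstrings the optimization itself rewrote are kept as rewritten.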
    original_code = '''def test_1():
    """useful docstring"""
    pass
'''
    optimized_code = """def test_1():
    pass
"""
    expected = '''def test_1():
    """useful docstring"""
    pass
'''
    actual = fix_missing_docstring(original_code, [(optimized_code, "")])

    assert actual[0][0] == expected

    original_code = '''def test_1():
    """useful docstring
    has a lot of multi-line details."""
    pass
def test_2():
    """useful docstring"""
    pass
'''
    optimized_code = '''def test_1():
    pass
def test_2():
    """useful docstring v2"""
    pass
'''
    expected = '''def test_1():
    """useful docstring
    has a lot of multi-line details."""
    pass
def test_2():
    """useful docstring v2"""
    pass
'''
    actual = fix_missing_docstring(original_code, [(optimized_code, "")])
    assert actual[0][0] == expected

    original_code = '''class TestClass:
    def test_1(self):
        """useful docstring"""
        pass
'''
    optimized_code = """class TestClass:
    def test_1(self):
        pass
"""
    expected = '''class TestClass:
    def test_1(self):
        """useful docstring"""
        pass
'''
    actual = fix_missing_docstring(original_code, [(optimized_code, "")])
    assert actual[0][0] == expected

    original_code = '''def test_1():
    """useful docstring2"""
    pass
class TestClass:
    def test_1(self):
        """useful docstring"""
        pass
'''
    optimized_code = """def test_1():
    pass
class TestClass:
    def test_1(self):
        pass
"""
    expected = '''def test_1():
    """useful docstring2"""
    pass
class TestClass:
    def test_1(self):
        """useful docstring"""
        pass
'''
    actual = fix_missing_docstring(original_code, [(optimized_code, "")])
    assert actual[0][0] == expected

    # Testing garbage code
    original_code = """1sfdljs923$%fjh"""

    actual = fix_missing_docstring(original_code, [(original_code, "")])
    assert actual[0][0] == original_code


def test_cleanup_imports_deduplicates():
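    # A duplicated import in the optimized code should be removed.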
    original_code = """
import os
import sys


def foo():
    return os.path.join(sys.path[0], 'bar')
"""
    optimizations = [
        (
            """
import os
import sys
import os


def foo():
    return os.path.join(sys.path[0], 'bar')
""",
            "Removed duplicate imports",
        ),
    ]

    expected = [
        (
            """
import os
import sys


def foo():
    return os.path.join(sys.path[0], 'bar')
""",
            "Removed duplicate imports",
        )
    ]

    actual = dedup_and_sort_imports(original_code, optimizations)

    assert actual == expected


def test_cleanup_imports_sorts_and_deduplicates():
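    # Out-of-order and duplicated imports in the optimized code are sorted and deduplicated.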
    original_code = """
import os
import sys
import json
import os


def foo():
    return os.path.join(sys.path[0], 'bar')
"""
    optimizations = [
        (
            """
import sys
import os
import json
import os


def foo():
    return os.path.join(sys.path[0], 'bar')
""",
            "Sorted and removed duplicate imports",
        ),
    ]

    expected = [
        (
            """
import json
import os
import sys


def foo():
    return os.path.join(sys.path[0], 'bar')
""",
            "Sorted and removed duplicate imports",
        )
    ]

    actual = dedup_and_sort_imports(original_code, optimizations)

    assert actual == expected


# Doesn't work with isort, but maybe we don't care for now
#
# def test_cleanup_imports_sorts_multiple_blocks():
#     original_code = """
# import os
# import sys
#
# def foo():
#     return os.path.join(sys.path[0], 'bar')
#
# import json
# import os
# """
#     optimizations = [
#         (
#             """
# import sys
# import os
#
# def foo():
#     return os.path.join(sys.path[0], 'bar')
#
# import os
# import json
# """,
#             "Sorted imports and ensured they are not duplicated across multiple blocks",
#         ),
#     ]
#
#     expected = [
#         (
#             """
# import json
# import os
# import sys
#
# def foo():
#     return os.path.join(sys.path[0], 'bar')
# """,
#             "Sorted imports and ensured they are not duplicated across multiple blocks",
#         )
#     ]
#
#     actual = cleanup_imports(original_code, optimizations)
#
#     assert actual == expected