codeflash-internal/experiments/remove_subprocess_mock.ipynb

{
"cells": [
{
"cell_type": "code",
"outputs": [],
"source": [
"import ast\n",
"\n",
"from openai import OpenAI"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-31T01:40:25.977922Z",
"start_time": "2024-01-31T01:40:25.972813Z"
}
},
"id": "8d108492763a25a5",
"execution_count": 86
},
{
"cell_type": "code",
"outputs": [],
"source": [
"color_prefix_by_role = {\n",
" \"system\": \"\\033[0m\", # gray\n",
" \"user\": \"\\033[0m\", # gray\n",
" \"assistant\": \"\\033[92m\", # green\n",
"}\n",
"def ellipsis_in_ast(module: ast.AST) -> bool:\n",
" for node in ast.walk(module):\n",
" if isinstance(node, ast.Constant):\n",
" if node.value == ...:\n",
" return True\n",
" return False\n",
"\n",
"def print_messages(messages, color_prefix_by_role=color_prefix_by_role) -> None:\n",
" \"\"\"Prints messages sent to or from GPT.\"\"\"\n",
" for message in messages:\n",
" role = message[\"role\"]\n",
" color_prefix = color_prefix_by_role[role]\n",
" content = message[\"content\"]\n",
" print(f\"{color_prefix}\\n[{role}]\\n{content}\")\n",
"\n",
"openai_client = OpenAI(api_key=\"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\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-31T01:40:25.995518Z",
"start_time": "2024-01-31T01:40:25.976927Z"
}
},
"id": "694116ed54c4af6a",
"execution_count": 87
},
{
"cell_type": "code",
"execution_count": 88,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-01-31T01:40:26.003586Z",
"start_time": "2024-01-31T01:40:25.998970Z"
}
},
"outputs": [],
"source": [
"def regression_tests_from_function(\n",
" function_code: str, # Python function to test, as a string\n",
" function_name: str, # the function to test\n",
" unit_test_package: str = \"unittest\", # unit testing package; use the name as it appears in the import statement\n",
" approx_min_cases_to_cover: int = 7, # minimum number of test case categories to cover (approximate)\n",
" print_text: bool = True, # optionally prints text; helpful for understanding the function & debugging\n",
" explain_model= \"gpt-4-1106-preview\", # model used to generate text plans in step 1\n",
" plan_model= \"gpt-4-1106-preview\", # model used to generate text plans in steps 2 and 2b\n",
" execute_model= \"gpt-4-1106-preview\", # model used to generate code in step 3\n",
" temperature: float = 0.4, # temperature = 0 can sometimes get stuck in repetitive loops, so we use 0.4\n",
") -> str:\n",
" \"\"\"Returns a unit test for a given Python function, using a 3-step GPT prompt.\"\"\"\n",
"\n",
" # Step 1: Generate an explanation of the function\n",
"\n",
" # create a markdown-formatted message that asks GPT to explain the function, formatted as a bullet list\n",
" explain_system_message = {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists.\",\n",
" }\n",
" explain_user_message = {\n",
" \"role\": \"user\",\n",
" \"content\": f\"\"\"Please explain the following Python function '{function_name}'. Review what each element of the function is doing precisely and what the author's intentions may have been. Organize your explanation as a markdown-formatted, bulleted list.\n",
"\n",
"```python\n",
"{function_code}\n",
"```\"\"\",\n",
" }\n",
" explain_messages = [explain_system_message, explain_user_message]\n",
" if print_text:\n",
" print_messages(explain_messages)\n",
" try:\n",
" explanation_response = openai_client.with_options(\n",
" max_retries=2\n",
" ).chat.completions.create(\n",
" model=explain_model, messages=explain_messages, temperature=temperature\n",
" )\n",
" except Exception as e:\n",
" print(str(e))\n",
" raise Exception(e)\n",
" explanation = explanation_response.choices[0].message.content\n",
" explain_assistant_message = {\"role\": \"assistant\", \"content\": explanation}\n",
"\n",
" # Step 2: Generate a plan to write a unit test\n",
"\n",
" # Asks GPT to plan out cases the units tests should cover, formatted as a bullet list\n",
" plan_user_message = {\n",
" \"role\": \"user\",\n",
" \"content\": f\"\"\"A good unit test suite should aim to:\n",
"- Test the function's behavior for a wide range of possible inputs\n",
"- Test edge cases that the author may not have foreseen\n",
"- Take advantage of the features of {unit_test_package} to make the tests easy to write and maintain\n",
"- Be easy to read and understand, with clean code and descriptive names\n",
"- Be deterministic, so that the tests always pass or fail in the same way\n",
"- It should not mock or stub any dependencies, so do not use {unit_test_package}.mock or any other similar mocking or stubbing module, so that the testing environment is as close to the production environment as possible\n",
"\n",
"To help unit test the function above, list diverse scenarios that the function should be able to handle (and under each scenario, include a few examples as sub-bullets).\"\"\",\n",
" }\n",
" plan_messages = [\n",
" explain_system_message,\n",
" explain_user_message,\n",
" explain_assistant_message,\n",
" plan_user_message,\n",
" ]\n",
" if print_text:\n",
" print_messages([plan_user_message])\n",
" try:\n",
" plan_response = openai_client.with_options(max_retries=2).chat.completions.create(\n",
" model=plan_model, messages=plan_messages, temperature=temperature\n",
" )\n",
" except Exception as e:\n",
" print(str(e))\n",
" raise Exception(e)\n",
" plan = plan_response.choices[0].message.content\n",
" plan_assistant_message = {\"role\": \"assistant\", \"content\": plan}\n",
"\n",
" # Step 2b: If the plan is short, ask GPT to elaborate further\n",
" # this counts top-level bullets (e.g., categories), but not sub-bullets (e.g., test cases)\n",
" num_bullets = max(plan.count(\"\\n-\"), plan.count(\"\\n*\"))\n",
" elaboration_needed = num_bullets < approx_min_cases_to_cover\n",
" if elaboration_needed:\n",
" elaboration_user_message = {\n",
" \"role\": \"user\",\n",
" \"content\": f\"\"\"In addition to those scenarios above, list a few rare or unexpected edge cases (and as before, under each edge case, include a few examples as sub-bullets).\"\"\",\n",
" }\n",
" elaboration_messages = [\n",
" explain_system_message,\n",
" explain_user_message,\n",
" explain_assistant_message,\n",
" plan_user_message,\n",
" plan_assistant_message,\n",
" elaboration_user_message,\n",
" ]\n",
" if print_text:\n",
" print_messages([elaboration_user_message])\n",
" try:\n",
" elaboration_response = openai_client.with_options(\n",
" max_retries=2\n",
" ).chat.completions.create(\n",
" model=plan_model,\n",
" messages=elaboration_messages,\n",
" temperature=temperature,\n",
" )\n",
" except Exception as e:\n",
" print(str(e))\n",
" raise Exception(e)\n",
"\n",
" elaboration = elaboration_response.choices[0].message.content\n",
" elaboration_assistant_message = {\"role\": \"assistant\", \"content\": elaboration}\n",
"\n",
" # Step 3: Generate the unit test\n",
"\n",
" # create a markdown-formatted prompt that asks GPT to complete a unit test\n",
" package_comment = \"\"\n",
" # if unit_test_package == \"pytest\":\n",
" # package_comment = \"# below, each test case is represented by a tuple passed to the @pytest.mark.parametrize decorator\"\n",
" execute_system_message = {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You write careful, accurate unit tests. When asked to reply only with code, you write all of your code in a single block.\",\n",
" }\n",
" execute_user_message = {\n",
" \"role\": \"user\",\n",
" \"content\": f\"\"\"Using Python and the {unit_test_package} package, write a suite of unit tests for the function '{function_name}', following the cases above. Include helpful comments to explain each line. Reply only with code, formatted as follows:\n",
"\n",
"```python\n",
"# imports\n",
"import {unit_test_package} # used for our unit tests\n",
"{{insert other imports as needed}}\n",
"\n",
"# function to test\n",
"{function_code}\n",
"\n",
"# unit tests\n",
"{package_comment}\n",
"{{insert unit test code here}}\n",
"```\"\"\",\n",
" }\n",
" execute_messages = [\n",
" execute_system_message,\n",
" explain_user_message,\n",
" explain_assistant_message,\n",
" plan_user_message,\n",
" plan_assistant_message,\n",
" ]\n",
" if elaboration_needed:\n",
" execute_messages += [elaboration_user_message, elaboration_assistant_message]\n",
" execute_messages += [execute_user_message]\n",
" if print_text:\n",
" print_messages([execute_system_message, execute_user_message])\n",
" # TODO: Implement a fallback if the code is too long, implement a straightforward way to write the tests rather than the iterative approach\n",
" tries = 2\n",
" while tries > 0:\n",
" try:\n",
" execute_response = openai_client.with_options(\n",
" max_retries=2\n",
" ).chat.completions.create(\n",
" model=execute_model,\n",
" messages=execute_messages,\n",
" temperature=temperature,\n",
" )\n",
" except Exception as e:\n",
" print(str(e))\n",
" raise Exception(e)\n",
" execution = execute_response.choices[0].message.content\n",
"\n",
" # check the output for errors\n",
" code = execution.split(\"```python\")[1].split(\"```\")[0].strip()\n",
" try:\n",
" module = ast.parse(code)\n",
" if ellipsis_in_ast(module):\n",
" # If the test generator is generating ellipsis, it is punting on generating\n",
" # the concrete test cases and we should re-generate\n",
" raise SyntaxError(\"Ellipsis in generated test code, regenerating...\")\n",
" break\n",
" except SyntaxError as e:\n",
" tries -= 1\n",
" print(f\"Syntax error in generated code.\")\n",
" continue\n",
" if tries == 0:\n",
" raise Exception(\"Failed to generate test code\")\n",
"\n",
" # return the unit test as a string\n",
" return code\n"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"code = \"\"\"def lint_code(path: str) -> str:\n",
" logging.info(\"Formatting code with black...\")\n",
" # black currently does not have a stable public API, so we are using the CLI\n",
" # the main problem is custom config parsing https://github.com/psf/black/issues/779\n",
" assert os.path.exists(path), f\"File {path} does not exist. Cannot format the file. Exiting...\"\n",
" result = subprocess.run([\"black\", path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
" if result.returncode == 0:\n",
" logging.info(\"OK\")\n",
" else:\n",
" logging.error(\"Failed to format\")\n",
" with open(path, \"r\") as f:\n",
" new_code = f.read()\"\"\""
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-31T01:40:26.006842Z",
"start_time": "2024-01-31T01:40:26.004637Z"
}
},
"id": "80fefa4a71d32347",
"execution_count": 89
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[0m\n",
"[system]\n",
"You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists.\n",
"\u001B[0m\n",
"[user]\n",
"Please explain the following Python function 'lint_code'. Review what each element of the function is doing precisely and what the author's intentions may have been. Organize your explanation as a markdown-formatted, bulleted list.\n",
"\n",
"```python\n",
"def lint_code(path: str) -> str:\n",
" logging.info(\"Formatting code with black...\")\n",
" # black currently does not have a stable public API, so we are using the CLI\n",
" # the main problem is custom config parsing https://github.com/psf/black/issues/779\n",
" assert os.path.exists(path), f\"File {path} does not exist. Cannot format the file. Exiting...\"\n",
" result = subprocess.run([\"black\", path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
" if result.returncode == 0:\n",
" logging.info(\"OK\")\n",
" else:\n",
" logging.error(\"Failed to format\")\n",
" with open(path, \"r\") as f:\n",
" new_code = f.read()\n",
"```\n",
"\u001B[0m\n",
"[user]\n",
"A good unit test suite should aim to:\n",
"- Test the function's behavior for a wide range of possible inputs\n",
"- Test edge cases that the author may not have foreseen\n",
"- Take advantage of the features of unittest to make the tests easy to write and maintain\n",
"- Be easy to read and understand, with clean code and descriptive names\n",
"- Be deterministic, so that the tests always pass or fail in the same way\n",
"- It should not mock or stub any dependencies, so do not use unittest.mock or any other similar mocking or stubbing module, so that the testing environment is as close to the production environment as possible\n",
"\n",
"To help unit test the function above, list diverse scenarios that the function should be able to handle (and under each scenario, include a few examples as sub-bullets).\n",
"\u001B[0m\n",
"[user]\n",
"In addition to those scenarios above, list a few rare or unexpected edge cases (and as before, under each edge case, include a few examples as sub-bullets).\n",
"\u001B[0m\n",
"[system]\n",
"You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You write careful, accurate unit tests. When asked to reply only with code, you write all of your code in a single block.\n",
"\u001B[0m\n",
"[user]\n",
"Using Python and the unittest package, write a suite of unit tests for the function 'lint_code', following the cases above. Include helpful comments to explain each line. Reply only with code, formatted as follows:\n",
"\n",
"```python\n",
"# imports\n",
"import unittest # used for our unit tests\n",
"{insert other imports as needed}\n",
"\n",
"# function to test\n",
"def lint_code(path: str) -> str:\n",
" logging.info(\"Formatting code with black...\")\n",
" # black currently does not have a stable public API, so we are using the CLI\n",
" # the main problem is custom config parsing https://github.com/psf/black/issues/779\n",
" assert os.path.exists(path), f\"File {path} does not exist. Cannot format the file. Exiting...\"\n",
" result = subprocess.run([\"black\", path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
" if result.returncode == 0:\n",
" logging.info(\"OK\")\n",
" else:\n",
" logging.error(\"Failed to format\")\n",
" with open(path, \"r\") as f:\n",
" new_code = f.read()\n",
"\n",
"# unit tests\n",
"\n",
"{insert unit test code here}\n",
"```\n"
]
}
],
"source": [
"tests = regression_tests_from_function(code, \"lint_code\", approx_min_cases_to_cover=10)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-31T01:44:06.796824Z",
"start_time": "2024-01-31T01:40:26.009670Z"
}
},
"id": "3cb5881311ca16e2",
"execution_count": 90
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# imports\n",
"import unittest # used for our unit tests\n",
"import os # used to interact with the filesystem\n",
"import tempfile # used to create temporary files and directories\n",
"import subprocess # used to run external commands\n",
"import logging # used to capture logging output\n",
"from unittest.mock import patch # used to patch modules and functions within them\n",
"\n",
"# function to test\n",
"def lint_code(path: str) -> str:\n",
" logging.info(\"Formatting code with black...\")\n",
" assert os.path.exists(path), f\"File {path} does not exist. Cannot format the file. Exiting...\"\n",
" result = subprocess.run([\"black\", path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
" if result.returncode == 0:\n",
" logging.info(\"OK\")\n",
" else:\n",
" logging.error(\"Failed to format\")\n",
" with open(path, \"r\") as f:\n",
" new_code = f.read()\n",
"\n",
"# unit tests\n",
"class TestLintCode(unittest.TestCase):\n",
"\n",
" # Set up a temporary directory before each test\n",
" def setUp(self):\n",
" self.test_dir = tempfile.TemporaryDirectory()\n",
"\n",
" # Clean up the temporary directory after each test\n",
" def tearDown(self):\n",
" self.test_dir.cleanup()\n",
"\n",
" # Test normal operation with a file that needs formatting\n",
" def test_normal_operation(self):\n",
" # Create a temporary Python file that needs formatting\n",
" test_file_path = os.path.join(self.test_dir.name, \"test.py\")\n",
" with open(test_file_path, \"w\") as test_file:\n",
" test_file.write(\"def foo():\\n return 1\\n\")\n",
"\n",
" # Run the lint_code function on the test file\n",
" with patch('subprocess.run') as mock_run:\n",
" mock_run.return_value.returncode = 0 # Simulate successful black formatting\n",
" lint_code(test_file_path)\n",
"\n",
" # Check that the file was formatted (mocked)\n",
" mock_run.assert_called_once_with([\"black\", test_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
"\n",
" # Test that an assertion error is raised when the file does not exist\n",
" def test_file_does_not_exist(self):\n",
" # Define a path to a non-existent file\n",
" test_file_path = os.path.join(self.test_dir.name, \"nonexistent.py\")\n",
"\n",
" # Run the lint_code function and check for AssertionError\n",
" with self.assertRaises(AssertionError):\n",
" lint_code(test_file_path)\n",
"\n",
" # Test that an error is logged when black fails to format the file\n",
" def test_black_failure(self):\n",
" # Create a temporary Python file with valid code\n",
" test_file_path = os.path.join(self.test_dir.name, \"test.py\")\n",
" with open(test_file_path, \"w\") as test_file:\n",
" test_file.write(\"def foo():\\n return 1\\n\")\n",
"\n",
" # Run the lint_code function and simulate a black failure\n",
" with patch('subprocess.run') as mock_run, \\\n",
" patch('logging.error') as mock_log_error:\n",
" mock_run.return_value.returncode = 1 # Simulate black formatting failure\n",
" lint_code(test_file_path)\n",
"\n",
" # Check that an error was logged\n",
" mock_log_error.assert_called_once_with(\"Failed to format\")\n",
"\n",
" # Test that the function logs \"OK\" when black successfully formats the file\n",
" def test_black_success_logging(self):\n",
" # Create a temporary Python file with valid code\n",
" test_file_path = os.path.join(self.test_dir.name, \"test.py\")\n",
" with open(test_file_path, \"w\") as test_file:\n",
" test_file.write(\"def foo():\\n return 1\\n\")\n",
"\n",
" # Run the lint_code function and simulate a successful black run\n",
" with patch('subprocess.run') as mock_run, \\\n",
" patch('logging.info') as mock_log_info:\n",
" mock_run.return_value.returncode = 0 # Simulate successful black formatting\n",
" lint_code(test_file_path)\n",
"\n",
" # Check that the \"OK\" message was logged\n",
" mock_log_info.assert_any_call(\"OK\")\n",
"\n",
"# Run the unit tests if this script is executed\n",
"if __name__ == '__main__':\n",
" unittest.main()\n"
]
}
],
"source": [
"print(tests)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-01-31T01:44:06.799741Z",
"start_time": "2024-01-31T01:44:06.794094Z"
}
},
"id": "639c96f42318a856",
"execution_count": 91
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}