stdout comparison in E2E

2025-02-18 05:25:31 -05:00 · 2025-02-18 05:25:31 -05:00 · f40c388ef6
commit f40c388ef6
parent 1f25df9061
3 changed files with 33 additions and 11 deletions
--- a/code_to_optimize/bubble_sort.py
+++ b/code_to_optimize/bubble_sort.py
@ -1,8 +1,10 @@
 def sorter(arr):  # Bubble-sorts arr in place and returns the same list object.
+    print("codeflash stdout: Sorting list")  # stdout marker; the pytest E2E script expects this print statement in the candidate output
    for i in range(len(arr)):  # one full pass over the list per element
        for j in range(len(arr) - 1):  # walk every adjacent pair
            if arr[j] > arr[j + 1]:
                temp = arr[j]  # swap the out-of-order pair via a temporary
                arr[j] = arr[j + 1]
                arr[j + 1] = temp
+    print(f"result: {arr}")  # second print statement the same E2E script expects in the candidate output
    return arr
--- a/tests/scripts/end_to_end_test_bubblesort_pytest.py
+++ b/tests/scripts/end_to_end_test_bubblesort_pytest.py
@ -11,11 +11,15 @@ def run_test(expected_improvement_pct: int) -> bool:
        test_framework="pytest",
        min_improvement_x=1.0,
        coverage_expectations=[
-            CoverageExpectation(function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8])
+            CoverageExpectation(
+                function_name="sorter", expected_coverage=100.0, expected_lines=[2, 3, 4, 5, 6, 7, 8, 9, 10]
+            )
        ],
    )
    cwd = (pathlib.Path(__file__).parent.parent.parent / "code_to_optimize").resolve()
-    return run_codeflash_command(cwd, config, expected_improvement_pct)
+    return run_codeflash_command(
+        cwd, config, expected_improvement_pct, ['print("codeflash stdout: Sorting list")', 'print(f"result: {arr}")']
+    )


 if __name__ == "__main__":
--- a/tests/scripts/end_to_end_test_utilities.py
+++ b/tests/scripts/end_to_end_test_utilities.py
@ -63,19 +63,21 @@ def validate_coverage(stdout: str, expectations: list[CoverageExpectation]) -> b
        assert coverage_match, f"Failed to find coverage data for {expect.function_name}"

        coverage = float(coverage_match.group(1))
-        assert (
-            coverage == expect.expected_coverage
-        ), f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        assert coverage == expect.expected_coverage, (
+            f"Coverage was {coverage} instead of {expect.expected_coverage} for function: {expect.function_name}"
+        )

        executed_lines = list(map(int, coverage_match.group(2).split(", ")))
-        assert (
-            executed_lines == expect.expected_lines
-        ), f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        assert executed_lines == expect.expected_lines, (
+            f"Executed lines were {executed_lines} instead of {expect.expected_lines} for function: {expect.function_name}"
+        )

    return True


-def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
+def run_codeflash_command(
+    cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int, expected_in_stdout: list[str] = None
+) -> bool:
    logging.basicConfig(level=logging.INFO)
    if config.trace_mode:
        return run_trace_test(cwd, config, expected_improvement_pct)
@ -97,12 +99,21 @@ def run_codeflash_command(cwd: pathlib.Path, config: TestConfig, expected_improv
    return_code = process.wait()
    stdout = "".join(output)

-    if not validate_output(stdout, return_code, expected_improvement_pct, config):
+    validated = validate_output(stdout, return_code, expected_improvement_pct, config)
+    if not validated:
        # Write original file contents back to file
        path_to_file.write_text(file_contents, "utf-8")
        logging.info("Codeflash run did not meet expected requirements for testing, reverting file changes.")
        return False
-    return True
+
+    if expected_in_stdout:
+        stdout_validated = validate_stdout_in_candidate(stdout, expected_in_stdout)
+        if not stdout_validated:
+            logging.error("Failed to find expected output in candidate output")
+            validated = False
+        logging.info(f"Success: Expected output found in candidate output")
+
+    return validated


 def build_command(cwd: pathlib.Path, config: TestConfig, test_root: pathlib.Path) -> list[str]:
@ -164,6 +175,11 @@ def validate_output(stdout: str, return_code: int, expected_improvement_pct: int
    return True


+def validate_stdout_in_candidate(stdout: str, expected_in_stdout: list[str]) -> bool:
+    """Check that every expected snippet appears in the best-candidate section of stdout.
+
+    The section starts at the first "INFO     Best candidate" marker and ends at
+    the next "Best Candidate Explanation" marker that follows it.
+
+    Args:
+        stdout: Captured output of the codeflash run.
+        expected_in_stdout: Substrings that must all occur inside the section.
+
+    Returns:
+        True when every snippet is found in the section (vacuously True for an
+        empty list). With no start marker the section is empty, so any
+        non-empty expectation list yields False.
+    """
+    start = stdout.find("INFO     Best candidate")
+    if start == -1:
+        # No candidate section at all: nothing can match.
+        candidate_output = ""
+    else:
+        # Look for the end marker only after the start marker, so an earlier
+        # occurrence cannot produce an empty slice. If it is missing, keep the
+        # rest of stdout rather than letting find()'s -1 chop the last character.
+        end = stdout.find("Best Candidate Explanation", start)
+        candidate_output = stdout[start:] if end == -1 else stdout[start:end]
+    return all(expected in candidate_output for expected in expected_in_stdout)
+
+
 def run_trace_test(cwd: pathlib.Path, config: TestConfig, expected_improvement_pct: int) -> bool:
    # First command: Run the tracer
    test_root = cwd / "tests" / (config.test_framework or "")