Mirror of https://github.com/codeflash-ai/codeflash-internal.git, synced 2026-05-04 18:25:18 +00:00

Merge pull request #1164 from codeflash-ai/catch-one-more-exception-type

minor crash fix

Commit 8f607755a3
14 changed files with 331 additions and 188 deletions
.github/workflows/end-to-end-test-tracer-replay.yaml (vendored, new file, 34 lines)
@@ -0,0 +1,34 @@
name: end-to-end-test

on:
  pull_request:
  workflow_dispatch:

defaults:
  run:
    working-directory: ./cli

jobs:
  tracer-replay:
    runs-on: ubuntu-latest
    env:
      CODEFLASH_API_KEY: ${{ secrets.CODEFLASH_API_KEY }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: 3.11.6
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry install --with dev
      - name: Run Codeflash to optimize code
        id: optimize_code
        run: |
          poetry env use python
          poetry run python tests/scripts/end_to_end_test_tracer_replay.py
@@ -0,0 +1,6 @@
[tool.codeflash]
disable-telemetry = true
formatter-cmds = ["ruff check --exit-zero --fix $file", "ruff format $file"]
module-root = "."
test-framework = "pytest"
tests-root = "tests"
@@ -0,0 +1,14 @@
def funcA(number):
    k = 0
    for i in range(number * 100):
        k += i
    # Simplify the for loop by using sum with a range object
    j = sum(range(number))

    # Use a generator expression directly in join for more efficiency
    return " ".join(str(i) for i in range(number))


if __name__ == "__main__":
    for i in range(10, 31, 10):
        funcA(10)
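The workload above deliberately mixes a redundant accumulation loop with the already-optimized sum/range and join forms. A minimal, self-contained sketch of why the sum form is the faster pattern; the sizes and repeat counts are illustrative only, not taken from this diff:

import timeit

# Compare an explicit accumulation loop against the built-in sum over a range.
loop_time = timeit.timeit("k = 0\nfor i in range(1000):\n    k += i", number=1000)
sum_time = timeit.timeit("sum(range(1000))", number=1000)
print(f"loop: {loop_time:.4f}s  sum: {sum_time:.4f}s")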
@@ -32,7 +32,7 @@ class TestFunction:


def discover_unit_tests(
cfg: TestConfig, discover_only_these_tests: list[str] | None = None
cfg: TestConfig, discover_only_these_tests: list[Path] | None = None
) -> dict[str, list[FunctionCalledInTest]]:
if cfg.test_framework == "pytest":
return discover_tests_pytest(cfg, discover_only_these_tests)

@@ -43,7 +43,7 @@ def discover_unit_tests(


def discover_tests_pytest(
cfg: TestConfig, discover_only_these_tests: list[str] | None = None
cfg: TestConfig, discover_only_these_tests: list[Path] | None = None
) -> dict[str, list[FunctionCalledInTest]]:
tests_root = cfg.tests_root
project_root = cfg.project_root_path
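The two hunks above change discover_only_these_tests from a list of strings to a list of Path objects. A hedged sketch of the new call shape; the config object and the test file name are placeholders, not values from this diff:

from pathlib import Path

# Hypothetical call shape after the change; "cfg" stands in for a TestConfig
# built elsewhere, and the file name is a placeholder.
replay_test = Path("tests/test_workload__replay_test_0.py")
function_to_tests = discover_unit_tests(cfg, discover_only_these_tests=[replay_test])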
@@ -81,10 +81,10 @@ class FunctionVisitor(cst.CSTVisitor):


class FunctionWithReturnStatement(ast.NodeVisitor):
def __init__(self, file_path: str) -> None:
def __init__(self, file_path: Path) -> None:
self.functions: list[FunctionToOptimize] = []
self.ast_path: list[FunctionParent] = []
self.file_path: str = file_path
self.file_path: Path = file_path

def visit_FunctionDef(self, node: FunctionDef) -> None:
# Check if the function has a return statement and add it to the list
@@ -188,7 +188,7 @@ def get_functions_to_optimize(
class_name = None
only_function_name = split_function[0]
found_function = None
for fn in functions.get(str(file), []):
for fn in functions.get(file, []):
if only_function_name == fn.function_name and (
class_name is None or class_name == fn.top_level_parent_name
):

@@ -196,7 +196,7 @@ def get_functions_to_optimize(
if found_function is None:
msg = f"Function {only_function_name} not found in file {file} or the function does not have a 'return' statement."
raise ValueError(msg)
functions[str(file)] = [found_function]
functions[file] = [found_function]
else:
logger.info("Finding all functions modified in the current git diff ...")
ph("cli-optimizing-git-diff")
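The switch from functions.get(str(file), []) to functions.get(file, []) matters because str and Path keys do not collide in a dict. A self-contained illustration of the lookup mismatch this avoids:

from pathlib import Path

# A dict keyed by Path objects is not reachable through the equivalent string,
# so mixing the two silently returns the default.
functions = {Path("module/util.py"): ["funcA"]}
print(functions.get("module/util.py", []))        # [] -- str key misses
print(functions.get(Path("module/util.py"), []))  # ['funcA'] -- Path key matches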
@@ -247,23 +247,23 @@ def get_all_files_and_functions(module_root_path: Path) -> dict[str, list[FunctionToOptimize]]:
return dict(files_list)


def find_all_functions_in_file(file_path: Path) -> dict[str, list[FunctionToOptimize]]:
functions: dict[str, list[FunctionToOptimize]] = {}
def find_all_functions_in_file(file_path: Path) -> dict[Path, list[FunctionToOptimize]]:
functions: dict[Path, list[FunctionToOptimize]] = {}
with file_path.open(encoding="utf8") as f:
try:
ast_module = ast.parse(f.read())
except Exception as e:
logger.exception(e)
return functions
function_name_visitor = FunctionWithReturnStatement(str(file_path))
function_name_visitor = FunctionWithReturnStatement(file_path)
function_name_visitor.visit(ast_module)
functions[str(file_path)] = function_name_visitor.functions
functions[file_path] = function_name_visitor.functions
return functions


def get_all_replay_test_functions(
replay_test: str, test_cfg: TestConfig, project_root_path: Path
) -> dict[str, list[FunctionToOptimize]]:
replay_test: Path, test_cfg: TestConfig, project_root_path: Path
) -> dict[Path, list[FunctionToOptimize]]:
function_tests = discover_unit_tests(test_cfg, discover_only_these_tests=[replay_test])
# Get the absolute file paths for each function, excluding class name if present
filtered_valid_functions = defaultdict(list)
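find_all_functions_in_file parses the file with ast and keeps only the functions that FunctionWithReturnStatement accepts. A standalone sketch of that discovery idea, simplified and not the repository's visitor:

import ast

# Parse a source snippet and keep the functions that contain a return statement.
source = "def a():\n    return 1\n\ndef b():\n    pass\n"
tree = ast.parse(source)
with_return = [
    node.name
    for node in ast.walk(tree)
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
    and any(isinstance(child, ast.Return) for child in ast.walk(node))
]
print(with_return)  # ['a']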
@@ -292,7 +292,7 @@ def get_all_replay_test_functions(
file_path = Path(project_root_path, *file_path_parts).with_suffix(".py")
file_to_functions_map[file_path].append((function, function_name, class_name))
for file_path, functions in file_to_functions_map.items():
all_valid_functions: dict[str, list[FunctionToOptimize]] = find_all_functions_in_file(file_path=file_path)
all_valid_functions: dict[Path, list[FunctionToOptimize]] = find_all_functions_in_file(file_path=file_path)
filtered_list = []
for function in functions:
function_name, function_name_only, class_name = function
@@ -407,7 +407,7 @@ def inspect_top_level_functions_or_methods(


def filter_functions(
modified_functions: dict[str, list[FunctionToOptimize]],
modified_functions: dict[Path, list[FunctionToOptimize]],
tests_root: Path,
ignore_paths: list[Path],
project_root: Path,
@@ -431,7 +431,8 @@ def filter_functions(
tests_root_str = str(tests_root)
module_root_str = str(module_root)
# We desperately need Python 3.10+ only support to make this code readable with structural pattern matching
for file_path, functions in modified_functions.items():
for file_path_path, functions in modified_functions.items():
file_path = str(file_path_path)
if file_path.startswith(tests_root_str + os.sep):
test_functions_removed_count += len(functions)
continue
@@ -499,10 +500,11 @@ def filter_files_optimized(file_path: Path, tests_root: Path, ignore_paths: list
return False
if submodule_paths is None:
submodule_paths = ignored_submodule_paths(module_root)
return not (
file_path in submodule_paths
or any(file_path.is_relative_to(submodule_path) for submodule_path in submodule_paths)
)
if file_path in submodule_paths or any(
file_path.is_relative_to(submodule_path) for submodule_path in submodule_paths
):
return False
return True


def function_has_return_statement(function_node: FunctionDef | AsyncFunctionDef) -> bool:
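The rewritten tail of filter_files_optimized behaves the same as the old return not (...) form: a file is rejected when it equals or sits below an ignored submodule path. A self-contained sketch of that check; the paths are made up:

from pathlib import Path

submodule_paths = [Path("/repo/vendor").resolve()]
candidate = Path("/repo/vendor/pkg/mod.py").resolve()
# Rejected: the candidate is inside an ignored submodule.
rejected = candidate in submodule_paths or any(
    candidate.is_relative_to(p) for p in submodule_paths
)
print(not rejected)  # False -> the file would be filtered out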
@@ -44,6 +44,7 @@ from codeflash.models.ExperimentMetadata import ExperimentMetadata
from codeflash.models.models import (
BestOptimization,
CodeOptimizationContext,
FunctionParent,
GeneratedTests,
GeneratedTestsList,
OptimizationSet,

@@ -51,9 +52,6 @@ from codeflash.models.models import (
OriginalCodeBaseline,
TestFile,
TestFiles,
OptimizedCandidate,
FunctionCalledInTest,
FunctionParent,
)
from codeflash.optimization.function_context import get_constrained_function_context_and_helper_functions
from codeflash.result.create_pr import check_create_pr, existing_tests_source_for
@@ -72,7 +70,7 @@ if TYPE_CHECKING:

from returns.result import Result

from codeflash.models.models import FunctionSource
from codeflash.models.models import FunctionCalledInTest, FunctionSource, OptimizedCandidate


class Optimizer:
@@ -245,7 +243,6 @@ class Optimizer:

baseline_result = self.establish_original_code_baseline(
function_to_optimize.qualified_name,
generated_tests_paths,
function_to_tests.get(module_path + "." + function_to_optimize.qualified_name, []),
)
console.rule()
@@ -410,6 +407,7 @@ class Optimizer:
original_test_results=original_code_baseline.overall_test_results,
tests_in_file=only_run_this_test_function,
)
console.rule()
if not is_successful(run_results):
optimized_runtimes[candidate.optimization_id] = None
is_correct[candidate.optimization_id] = False
@@ -635,7 +633,8 @@ class Optimizer:
def instrument_existing_tests(
self, function_to_optimize: FunctionToOptimize, function_to_tests: dict[str, list[FunctionCalledInTest]]
) -> set[Path]:
relevant_test_files_count = 0
existing_test_files_count = 0
replay_test_files_count = 0
unique_instrumented_test_files = set()

func_qualname = function_to_optimize.qualified_name_with_modules_from_root(self.args.project_root)
@@ -644,10 +643,17 @@ class Optimizer:
else:
test_file_invocation_positions = defaultdict(list)
for tests_in_file in function_to_tests.get(func_qualname):
test_file_invocation_positions[tests_in_file.tests_in_file.test_file].append(tests_in_file.position)
for test_file, positions in test_file_invocation_positions.items():
test_file_invocation_positions[
(tests_in_file.tests_in_file.test_file, tests_in_file.tests_in_file.test_type)
].append(tests_in_file.position)
for (test_file, test_type), positions in test_file_invocation_positions.items():
path_obj_test_file = Path(test_file)
relevant_test_files_count += 1
if test_type == TestType.EXISTING_UNIT_TEST:
existing_test_files_count += 1
elif test_type == TestType.REPLAY_TEST:
replay_test_files_count += 1
else:
raise ValueError(f"Unexpected test type: {test_type}")
success, injected_test = inject_profiling_into_existing_test(
test_path=path_obj_test_file,
call_positions=positions,
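The grouping above now keys invocation positions by the pair (test file, test type), so existing unit tests and replay tests for the same function are counted and instrumented separately. A self-contained sketch of that bucketing; the file names and type labels are illustrative strings, not the repository's enum values:

from collections import defaultdict

positions_by_key = defaultdict(list)
for test_file, test_type, position in [
    ("tests/test_a.py", "EXISTING_UNIT_TEST", 3),
    ("tests/replay_a.py", "REPLAY_TEST", 7),
    ("tests/test_a.py", "EXISTING_UNIT_TEST", 9),
]:
    positions_by_key[(test_file, test_type)].append(position)
print(dict(positions_by_key))
# {('tests/test_a.py', 'EXISTING_UNIT_TEST'): [3, 9], ('tests/replay_a.py', 'REPLAY_TEST'): [7]}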
@@ -674,12 +680,13 @@ class Optimizer:
instrumented_file_path=new_test_path,
original_source=None,
original_file_path=Path(test_file),
test_type=TestType.EXISTING_UNIT_TEST,
test_type=test_type,
)
)
logger.info(
f"Discovered {relevant_test_files_count} existing unit test file"
f"{'s' if relevant_test_files_count != 1 else ''} for {func_qualname}"
f"Discovered {existing_test_files_count} existing unit test file"
f"{'s' if existing_test_files_count != 1 else ''} and {replay_test_files_count} replay test file"
f"{'s' if replay_test_files_count != 1 else ''} for {func_qualname}"
)
return unique_instrumented_test_files
@@ -756,7 +763,7 @@ class Optimizer:
return Success((generated_tests, OptimizationSet(control=candidates, experiment=candidates_experiment)))

def establish_original_code_baseline(
self, function_name: str, generated_tests_paths: list[Path], tests_in_file: list[FunctionCalledInTest]
self, function_name: str, tests_in_file: list[FunctionCalledInTest]
) -> Result[tuple[OriginalCodeBaseline, list[str]], str]:
# For the original function - run the tests and get the runtime
@@ -772,32 +779,31 @@ class Optimizer:
else:
test_env["PYTHONPATH"] += os.pathsep + str(self.args.project_root)

first_test_types = []
first_test_functions = []
only_run_these_test_functions_for_test_files: dict[str, str] = {}

for test_file in self.test_files.get_by_type(TestType.EXISTING_UNIT_TEST).test_files:
# Replay tests can have hundreds of test functions and running them can be very slow,
# so we only run the test functions that are relevant to the function we are optimizing
for test_file in self.test_files.get_by_type(TestType.REPLAY_TEST).test_files:
relevant_tests_in_file = [
test_in_file
for test_in_file in tests_in_file
if test_in_file.tests_in_file.test_file == test_file.original_file_path
]
is_replay_test = (
first_test_type := relevant_tests_in_file[0].tests_in_file.test_type
) == TestType.REPLAY_TEST
first_test_types.append(first_test_type)
first_test_functions.append(
relevant_tests_in_file[0].tests_in_file.test_function if is_replay_test else None
)
if is_replay_test and len(relevant_tests_in_file) > 1:
logger.warning(f"Multiple tests found for the replay test {test_file}. Should not happen")
first_test_functions.extend([None] * len(generated_tests_paths))
only_run_these_test_functions_for_test_files[test_file.instrumented_file_path] = relevant_tests_in_file[
0
].tests_in_file.test_function

if len(relevant_tests_in_file) > 1:
logger.warning(
f"Multiple tests found in the replay test {test_file} for {function_name}. Should not happen"
)

if test_framework == "pytest":
unittest_results = self.run_and_parse_tests(
test_env=test_env,
test_files=self.test_files,
optimization_iteration=0,
test_functions=first_test_functions,
test_functions=only_run_these_test_functions_for_test_files,
testing_time=TOTAL_LOOPING_TIME,
)
else:
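The baseline run now passes a mapping rather than a positional list of test functions: each instrumented replay-test file is paired with the single test function that exercises the function being optimized, so only that test is run. A hedged sketch of what the dict might hold; the path and function name are placeholders:

only_run_these_test_functions_for_test_files = {
    "/tmp/codeflash/test_workload__replay_test_0.py": "test_funcA",  # placeholder entry
}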
@@ -811,7 +817,7 @@ class Optimizer:
test_env=test_env,
test_files=self.test_files,
optimization_iteration=0,
test_functions=first_test_functions,
test_functions=only_run_these_test_functions_for_test_files,
testing_time=TOTAL_LOOPING_TIME,
)
unittest_results.merge(unittest_loop_results)
@@ -887,120 +893,111 @@ class Optimizer:
) -> Result[OptimizedCandidateResult, str]:
assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]

instrumented_unittests_created_for_function = self.test_files.get_by_type(TestType.EXISTING_UNIT_TEST)
generated_tests_paths = self.test_files.get_by_type(TestType.GENERATED_REGRESSION)
with progress_bar("Testing optimization candidate"):
success = True

success = True
test_env = os.environ.copy()
test_env["CODEFLASH_TEST_ITERATION"] = str(optimization_candidate_index)
test_env["CODEFLASH_TRACER_DISABLE"] = "1"
if "PYTHONPATH" not in test_env:
test_env["PYTHONPATH"] = str(self.args.project_root)
else:
test_env["PYTHONPATH"] += os.pathsep + str(self.args.project_root)

test_env = os.environ.copy()
test_env["CODEFLASH_TEST_ITERATION"] = str(optimization_candidate_index)
test_env["CODEFLASH_TRACER_DISABLE"] = "1"
if "PYTHONPATH" not in test_env:
test_env["PYTHONPATH"] = str(self.args.project_root)
else:
test_env["PYTHONPATH"] += os.pathsep + str(self.args.project_root)
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)

first_test_types = []
first_test_functions = []
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)
only_run_these_test_functions_for_test_files: dict[str, str] = {}
# Replay tests can have hundreds of test functions and running them can be very slow,
# so we only run the test functions that are relevant to the function we are optimizing
for test_file in self.test_files.get_by_type(TestType.REPLAY_TEST).test_files:
relevant_tests_in_file = [
test_in_file
for test_in_file in tests_in_file
if test_in_file.tests_in_file.test_file == test_file.original_file_path
]
only_run_these_test_functions_for_test_files[test_file.instrumented_file_path] = relevant_tests_in_file[
0
].tests_in_file.test_function

for test_file in instrumented_unittests_created_for_function:
relevant_tests_in_file = [
test_in_file
for test_in_file in tests_in_file
if test_in_file.tests_in_file.test_file == test_file.original_file_path
]
is_replay_test = (
first_test_type := relevant_tests_in_file[0].tests_in_file.test_type
) == TestType.REPLAY_TEST
first_test_types.append(first_test_type)
first_test_functions.append(
relevant_tests_in_file[0].tests_in_file.test_function if is_replay_test else None
)
if is_replay_test and len(relevant_tests_in_file) > 1:
logger.warning(
f"Multiple tests found for the replay test {test_file.original_file_path}. Should not happen"
)
first_test_functions.extend([None] * len(generated_tests_paths))
if test_framework == "pytest":
candidate_results = self.run_and_parse_tests(
test_env=test_env,
test_files=self.test_files,
optimization_iteration=optimization_candidate_index,
test_functions=first_test_functions,
testing_time=TOTAL_LOOPING_TIME,
)
loop_count = (
max(all_loop_indices)
if (all_loop_indices := {result.loop_index for result in candidate_results.test_results})
else 0
)
else:
candidate_results = TestResults()
start_time: float = time.time()
loop_count = 0
for i in range(100):
if i >= 5 and time.time() - start_time >= TOTAL_LOOPING_TIME:
break
test_env["CODEFLASH_LOOP_INDEX"] = str(i + 1)
candidate_loop_results = self.run_and_parse_tests(
if test_framework == "pytest":
candidate_results = self.run_and_parse_tests(
test_env=test_env,
test_files=self.test_files,
optimization_iteration=optimization_candidate_index,
test_functions=first_test_functions,
test_functions=only_run_these_test_functions_for_test_files,
testing_time=TOTAL_LOOPING_TIME,
)
loop_count = i + 1
candidate_results.merge(candidate_loop_results)
loop_count = (
max(all_loop_indices)
if (all_loop_indices := {result.loop_index for result in candidate_results.test_results})
else 0
)
else:
candidate_results = TestResults()
start_time: float = time.time()
loop_count = 0
for i in range(100):
if i >= 5 and time.time() - start_time >= TOTAL_LOOPING_TIME:
break
test_env["CODEFLASH_LOOP_INDEX"] = str(i + 1)
candidate_loop_results = self.run_and_parse_tests(
test_env=test_env,
test_files=self.test_files,
optimization_iteration=optimization_candidate_index,
test_functions=only_run_these_test_functions_for_test_files,
testing_time=TOTAL_LOOPING_TIME,
)
loop_count = i + 1
candidate_results.merge(candidate_loop_results)

initial_loop_candidate_results = TestResults(
test_results=[result for result in candidate_results.test_results if result.loop_index == 1]
)

console.print(
TestResults.report_to_tree(
initial_loop_candidate_results.get_test_pass_fail_report_by_type(),
title="Overall initial loop test results for candidate",
initial_loop_candidate_results = TestResults(
test_results=[result for result in candidate_results.test_results if result.loop_index == 1]
)
)
console.rule()

initial_loop_original_test_results = TestResults(
test_results=[result for result in original_test_results.test_results if result.loop_index == 1]
)

if compare_test_results(initial_loop_original_test_results, initial_loop_candidate_results):
logger.info("Test results matched!")
console.rule()
equal_results = True
else:
logger.info("Test results did not match the test results of the original code.")
console.rule()
success = False
equal_results = False

if (total_candidate_timing := candidate_results.total_passed_runtime()) == 0:
logger.warning("The overall test runtime of the optimized function is 0, couldn't run tests.")
console.rule()
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.bin")).unlink(missing_ok=True)

get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)
if not equal_results:
success = False

if not success:
return Failure("Failed to run the optimized candidate.")

return Success(
OptimizedCandidateResult(
max_loop_count=loop_count,
best_test_runtime=total_candidate_timing,
test_results=candidate_results,
optimization_candidate_index=optimization_candidate_index,
total_candidate_timing=total_candidate_timing,
console.print(
TestResults.report_to_tree(
initial_loop_candidate_results.get_test_pass_fail_report_by_type(),
title="Overall initial loop test results for candidate",
)
)
console.rule()

initial_loop_original_test_results = TestResults(
test_results=[result for result in original_test_results.test_results if result.loop_index == 1]
)

if compare_test_results(initial_loop_original_test_results, initial_loop_candidate_results):
logger.info("Test results matched!")
console.rule()
equal_results = True
else:
logger.info("Test results did not match the test results of the original code.")
console.rule()
success = False
equal_results = False

if (total_candidate_timing := candidate_results.total_passed_runtime()) == 0:
logger.warning("The overall test runtime of the optimized function is 0, couldn't run tests.")
console.rule()
get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.bin")).unlink(missing_ok=True)

get_run_tmp_file(Path(f"test_return_values_{optimization_candidate_index}.sqlite")).unlink(missing_ok=True)
if not equal_results:
success = False

if not success:
return Failure("Failed to run the optimized candidate.")

return Success(
OptimizedCandidateResult(
max_loop_count=loop_count,
best_test_runtime=total_candidate_timing,
test_results=candidate_results,
optimization_candidate_index=optimization_candidate_index,
total_candidate_timing=total_candidate_timing,
)
)

def run_and_parse_tests(
self,
@@ -27,7 +27,7 @@ from copy import copy
from io import StringIO
from pathlib import Path
from types import FrameType
from typing import Any, ClassVar, List, Optional
from typing import Any, ClassVar, List

import dill
import isort
@@ -42,6 +42,7 @@ from codeflash.tracing.tracing_utils import FunctionModules
from codeflash.verification.verification_utils import get_test_file_path


# Debug this file by simply adding print statements. This file is not meant to be debugged by the debugger.
class Tracer:
"""Use this class as a 'with' context manager to trace a function call,
input arguments, and profiling info.
@@ -50,11 +51,11 @@ class Tracer:
def __init__(
self,
output: str = "codeflash.trace",
functions: Optional[List[str]] = None,
functions: list[str] | None = None,
disable: bool = False,
config_file_path: Path | None = None,
max_function_count: int = 256,
timeout: Optional[int] = None,  # seconds
timeout: int | None = None,  # seconds
) -> None:
""":param output: The path to the output trace file
:param functions: List of functions to trace. If None, trace all functions
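The constructor shown above, together with the class docstring, suggests the usual entry point is a with-block. A hedged usage sketch; the traced workload call and the timeout value are placeholders, not taken from this diff:

from codeflash.tracer import Tracer

with Tracer(output="codeflash.trace", max_function_count=256, timeout=60):
    run_my_workload()  # placeholder for the code being traced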
@@ -91,7 +92,8 @@ class Tracer:
}
self.max_function_count = max_function_count
self.config, found_config_path = parse_config_file(config_file_path)
self.project_root = project_root_from_module_root(self.config["module_root"], found_config_path)
self.project_root = project_root_from_module_root(Path(self.config["module_root"]), found_config_path)
print("project_root", self.project_root)
self.ignored_functions = {"<listcomp>", "<genexpr>", "<dictcomp>", "<setcomp>", "<lambda>", "<module>"}

self.file_being_called_from: str = str(Path(sys._getframe().f_back.f_code.co_filename).name).replace(".", "_")
@@ -160,7 +162,7 @@ class Tracer:
remapped_callers = [{"key": k, "value": v} for k, v in callers.items()]
cur.execute(
"INSERT INTO pstats VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
(Path(func[0]).resolve(), func[1], func[2], func[3], cc, nc, tt, ct, json.dumps(remapped_callers)),
(str(Path(func[0]).resolve()), func[1], func[2], func[3], cc, nc, tt, ct, json.dumps(remapped_callers)),
)
self.con.commit()
@@ -177,7 +179,7 @@ class Tracer:
function
for function in self.function_modules
if self.function_count[
function.file_name
str(function.file_name)
+ ":"
+ (function.class_name + ":" if function.class_name else "")
+ function.function_name
@@ -193,14 +195,17 @@ class Tracer:
)
function_path = "_".join(self.functions) if self.functions else self.file_being_called_from
test_file_path = get_test_file_path(
test_dir=self.config["tests_root"], function_name=function_path, test_type="replay"
test_dir=Path(self.config["tests_root"]), function_name=function_path, test_type="replay"
)
replay_test = isort.code(replay_test)
with open(test_file_path, "w", encoding="utf8") as file:
file.write(replay_test)

console.print(
f"Codeflash: Traced {self.trace_count} function calls successfully and replay test created at - {test_file_path}"
f"Codeflash: Traced {self.trace_count} function calls successfully and replay test created at - {test_file_path}",
crop=False,
soft_wrap=False,
overflow="ignore",
)

def tracer_logic(self, frame: FrameType, event: str):
@@ -212,12 +217,12 @@ class Tracer:
console.print(f"Codeflash: Timeout reached! Stopping tracing at {self.timeout} seconds.")
return
code = frame.f_code
file_name = code.co_filename
file_name = Path(code.co_filename).resolve()
# TODO : It currently doesn't log the last return call from the first function

if code.co_name in self.ignored_functions:
return
if not Path(file_name).exists():
if not file_name.exists():
return
if self.functions:
if code.co_name not in self.functions:
@@ -236,7 +241,6 @@ class Tracer:
except:
# someone can override the getattr method and raise an exception. I'm looking at you wrapt
return
file_name = Path(file_name).resolve()
function_qualified_name = f"{file_name}:{(class_name + ':' if class_name else '')}{code.co_name}"
if function_qualified_name in self.ignored_qualified_functions:
return
@@ -250,9 +254,9 @@ class Tracer:
self.function_count[function_qualified_name] = 0
file_valid = filter_files_optimized(
file_path=file_name,
tests_root=self.config["tests_root"],
ignore_paths=self.config["ignore_paths"],
module_root=self.config["module_root"],
tests_root=Path(self.config["tests_root"]),
ignore_paths=[Path(p) for p in self.config["ignore_paths"]],
module_root=Path(self.config["module_root"]),
)
if not file_valid:
# we don't want to trace this function because it cannot be optimized
@@ -279,7 +283,7 @@ class Tracer:
sys.setrecursionlimit(10000)
# We do not pickle self for __init__ to avoid recursion errors, and instead instantiate its class
# directly with the rest of the arguments in the replay tests. We copy the arguments to avoid memory
# leaks, bad references or side-effects when unpickling.
# leaks, bad references or side effects when unpickling.
arguments = dict(arguments.items())
if class_name and code.co_name == "__init__":
del arguments["self"]
@@ -297,7 +301,16 @@ class Tracer:
return
cur.execute(
"INSERT INTO function_calls VALUES(?, ?, ?, ?, ?, ?, ?, ?)",
(event, code.co_name, class_name, file_name, frame.f_lineno, frame.f_back.__hash__(), t_ns, local_vars),
(
event,
code.co_name,
class_name,
str(file_name),
frame.f_lineno,
frame.f_back.__hash__(),
t_ns,
local_vars,
),
)
self.trace_count += 1
self.next_insert -= 1
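file_name is now a Path, and the insert above stringifies it because sqlite3 only binds the basic types (None, int, float, str, bytes) unless an adapter is registered. A self-contained sketch of the failure and the str() fix:

import sqlite3
from pathlib import Path

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE calls (file TEXT)")
try:
    con.execute("INSERT INTO calls VALUES (?)", (Path("workload.py"),))
except sqlite3.Error as e:
    print("Path object rejected:", e)
con.execute("INSERT INTO calls VALUES (?)", (str(Path("workload.py")),))
print(con.execute("SELECT file FROM calls").fetchall())  # [('workload.py',)]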
@@ -596,7 +609,7 @@ def main():
globs = {"run_module": runpy.run_module, "modname": unknown_args[0]}
else:
progname = unknown_args[0]
sys.path.insert(0, str(Path(progname).parent))
sys.path.insert(0, str(Path(progname).resolve().parent))
with io.open_code(progname) as fp:
code = compile(fp.read(), progname, "exec")
spec = importlib.machinery.ModuleSpec(name="__main__", loader=None, origin=progname)
@@ -1,3 +1,4 @@
from pathlib import Path
from typing import Optional

from pydantic import dataclasses

@@ -6,7 +7,7 @@ from pydantic import dataclasses
@dataclasses.dataclass
class FunctionModules:
function_name: str
file_name: str
file_name: Path
module_name: str
class_name: Optional[str] = None
line_no: Optional[int] = None
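FunctionModules now carries its file name as a Path, so the pydantic dataclass validates the field instead of passing a raw string through. A hedged construction sketch; the argument values are placeholders:

from pathlib import Path

from codeflash.tracing.tracing_utils import FunctionModules

fm = FunctionModules(
    function_name="funcA",            # placeholder values throughout
    file_name=Path("workload.py"),
    module_name="workload",
)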
@@ -72,7 +72,7 @@ def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, tes
test_type = test_files.get_test_type_by_instrumented_file_path(test_file_path)
try:
test_pickle = pickle.loads(test_pickle_bin) if loop_index == 1 else None
except (AttributeError, ModuleNotFoundError, IndexError) as e:
except Exception as e:
logger.exception(f"Failed to load pickle file. Exception: {e}")
return test_results
assert test_type is not None, f"Test type not found for {test_file_path}"
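This appears to be the change the branch name refers to: unpickling recorded return values can fail with more exception types than the three previously listed, so the parser now catches broadly and skips that result instead of crashing. A self-contained sketch of the pattern, using a deliberately truncated payload:

import pickle

payload = pickle.dumps({"result": 42})[:-3]  # corrupt the payload on purpose
try:
    value = pickle.loads(payload)
except Exception as e:  # broad catch, mirroring the change above
    print(f"Failed to load pickle file. Exception: {e}")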
@@ -115,7 +115,7 @@ def parse_sqlite_test_results(sqlite_file_path: Path, test_files: TestFiles, tes
loop_index = val[4]
try:
ret_val = (pickle.loads(val[7]) if loop_index == 1 else None,)
except (AttributeError, ModuleNotFoundError, IndexError):
except Exception:
continue
test_results.add(
function_test_invocation=FunctionTestInvocation(
@@ -184,14 +184,7 @@ def parse_test_xml(
return test_results

test_class_path = testcase.classname
try:
test_function = testcase.name.split("[", 1)[0] if "[" in testcase.name else testcase.name
except ValueError as e:
xml_content = test_xml_file_path.read_text(encoding="utf-8")
logger.exception(
f"Failed to parse test function name from {testcase.name} in {xml_content} Exception:{e}"
)
raise
test_function = testcase.name.split("[", 1)[0] if "[" in testcase.name else testcase.name
if test_file_name is None:
if test_class_path:
# TODO : This might not be true if the test is organized under a class
@@ -222,7 +215,7 @@ def parse_test_xml(
continue
timed_out = False
if test_config.test_framework == "pytest":
loop_index = int(testcase.name.split("[ ", 1)[1][:-2]) if "[" in testcase.name else 1
loop_index = int(testcase.name.split("[ ")[-1][:-2]) if "[" in testcase.name else 1
if len(testcase.result) > 1:
logger.warning(f"!!!!!Multiple results for {testcase.name} in {test_xml_file_path}!!!")
if len(testcase.result) == 1:
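The loop index parsing now takes the last "[ " chunk of the test id rather than the first, which holds up when the test name itself contains brackets (for example from parametrization). A self-contained check of the new expression; the test ids are made up:

for name in ["test_funcA[ 3 ]", "test_param[x][ 12 ]", "test_plain"]:
    loop_index = int(name.split("[ ")[-1][:-2]) if "[" in name else 1
    print(name, "->", loop_index)
# test_funcA[ 3 ] -> 3
# test_param[x][ 12 ] -> 12
# test_plain -> 1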
@@ -25,13 +25,6 @@ def run_tests(
pytest_max_loops: int = 100_000,
) -> tuple[Path, subprocess.CompletedProcess]:
assert test_framework in ["pytest", "unittest"]
# TODO: Make this work for replay tests
for i, test_file in enumerate(test_paths):
if (
only_run_these_test_functions and test_file.test_type == TestType.REPLAY_TEST
):  # "__replay_test" in test_path:
# TODO: This might not work for replay tests
test_paths[i] = str(test_file.instrumented_file_path) + "::" + only_run_these_test_functions

if test_framework == "pytest":
result_file_path = get_run_tmp_file(Path("pytest_results.xml"))
@@ -51,7 +44,14 @@ def run_tests(
"--codeflash_loops_scope=session",
]

test_files = [str(file.instrumented_file_path) for file in test_paths.test_files]
test_files = []
for file in test_paths.test_files:
if file.test_type == TestType.REPLAY_TEST:
test_files.append(
str(file.instrumented_file_path) + "::" + only_run_these_test_functions[file.instrumented_file_path]
)
else:
test_files.append(str(file.instrumented_file_path))

results = subprocess.run(
pytest_cmd_list + test_files + pytest_args,
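With the per-file mapping in place, replay test files are narrowed to a single function using pytest's standard file::function node-id syntax, while other files still run in full. A hedged sketch of the selection list that loop builds; the paths and names are placeholders:

test_files = [
    "/tmp/codeflash/test_workload__replay_test_0.py::test_funcA",  # replay test, one function
    "/tmp/codeflash/test_existing_unit_0.py",                      # existing unit test, whole file
]
# These entries are appended to the pytest command line built above.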
@@ -27,7 +27,7 @@ def main():
improvement_pct = int(re.search(r"📈 ([\d,]+)% improvement", stdout).group(1).replace(",", ""))
improvement_x = float(improvement_pct) / 100

assert improvement_pct > 5, f"Performance improvement percentage was {improvement_pct}, which was not above 10%"
assert improvement_pct > 10, f"Performance improvement percentage was {improvement_pct}, which was not above 10%"
assert improvement_x > 0.1, f"Performance improvement rate was {improvement_x}x, which was not above 0.1x"

# Check for the line indicating the number of discovered existing unit tests
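The raised threshold still relies on the same regular expression to pull the improvement percentage out of the CLI output. A self-contained check of that extraction on a made-up output line:

import re

stdout = "⚡️ Optimization successful! 📈 1,234% improvement"  # made-up example line
improvement_pct = int(re.search(r"📈 ([\d,]+)% improvement", stdout).group(1).replace(",", ""))
print(improvement_pct)  # 1234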
cli/tests/scripts/end_to_end_test_tracer_replay.py (new file, 66 lines)
@@ -0,0 +1,66 @@
import os
import pathlib
import re
import subprocess


def main():
    cwd = (
        pathlib.Path(__file__).parent.parent.parent / "code_to_optimize" / "code_directories" / "simple_tracer_e2e"
    ).resolve()
    print("cwd", cwd)
    command = ["python", "-m", "codeflash.tracer", "-o", "codeflash.trace", "workload.py"]
    process = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=str(cwd), env=os.environ.copy()
    )
    output = []

    for line in process.stdout:
        print(line, end="")  # Print each line in real-time
        output.append(line)  # Store each line in the output variable
    return_code = process.wait()
    stdout = "".join(output)
    assert return_code == 0, f"The codeflash command returned exit code {return_code} instead of 0"
    functions_traced = re.search(r"Traced (\d+) function calls successfully and replay test created at - (.*)$", stdout)
    assert functions_traced, "Failed to find any traced functions or replay test"
    assert int(functions_traced.group(1)) == 3, "Failed to find the correct number of traced functions"
    replay_test_path = pathlib.Path(functions_traced.group(2))
    assert replay_test_path, "Failed to find the replay test file path"
    assert replay_test_path.exists(), f"Replay test file does not exist at - {replay_test_path}"

    command = ["python", "../../../codeflash/main.py", "--replay-test", str(replay_test_path), "--no-pr"]
    process = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=str(cwd), env=os.environ.copy()
    )
    output = []

    for line in process.stdout:
        print(line, end="")  # Print each line in real-time
        output.append(line)  # Store each line in the output variable
    return_code = process.wait()
    stdout = "".join(output)
    assert return_code == 0, f"The codeflash command returned exit code {return_code} instead of 0"

    improvement_pct = int(re.search(r"📈 ([\d,]+)% improvement", stdout).group(1).replace(",", ""))
    improvement_x = float(improvement_pct) / 100

    assert improvement_pct > 10, f"Performance improvement percentage was {improvement_pct}, which was not above 10%"
    assert improvement_x > 0.1, f"Performance improvement rate was {improvement_x}x, which was not above 0.1x"

    # Check for the line indicating the number of discovered existing unit tests
    unit_test_search = re.search(r"Discovered (\d+) existing unit tests", stdout)
    num_unit_tests = int(unit_test_search.group(1))
    assert num_unit_tests == 1, f"Could not find 1 existing unit test, found {num_unit_tests} instead"

    # check if the replay test was correctly run for the original code
    m = re.search(r"Replay Tests - Passed: (\d+), Failed: (\d+)", stdout)
    assert m, "Failed to run replay tests"

    passed, failed = int(m.group(1)), int(m.group(2))

    assert passed > 0, f"Expected >0 passed replay tests, found {passed}"
    assert failed == 0, f"Expected 0 failed replay tests, found {failed}"


if __name__ == "__main__":
    main()
@@ -2,6 +2,7 @@ import tempfile
from pathlib import Path

from codeflash.discovery.functions_to_optimize import (
filter_files_optimized,
find_all_functions_in_file,
get_functions_to_optimize,
inspect_top_level_functions_or_methods,
@@ -19,7 +20,7 @@ def test_function_eligible_for_optimization() -> None:
f.write(function)
f.flush()
functions_found = find_all_functions_in_file(Path(f.name))
assert functions_found[f.name][0].function_name == "test_function_eligible_for_optimization"
assert functions_found[Path(f.name)][0].function_name == "test_function_eligible_for_optimization"

# Has no return statement
function = """def test_function_not_eligible_for_optimization():
@@ -31,7 +32,7 @@ def test_function_eligible_for_optimization() -> None:
f.write(function)
f.flush()
functions_found = find_all_functions_in_file(Path(f.name))
assert len(functions_found[f.name]) == 0
assert len(functions_found[Path(f.name)]) == 0


def test_find_top_level_function_or_method():
@@ -149,3 +150,19 @@ def functionA():
for file in functions:
assert functions[file][0].qualified_name == "functionA"
assert functions[file][0].function_name == "functionA"


def test_filter_files_optimized():
tests_root = Path("tests").resolve()
module_root = Path().resolve()
ignore_paths = []

file_path_test = Path("tests/test_function_discovery.py").resolve()
file_path_same_level = Path("file.py").resolve()
file_path_different_level = Path("src/file.py").resolve()
file_path_above_level = Path("../file.py").resolve()

assert not filter_files_optimized(file_path_test, tests_root, ignore_paths, module_root)
assert filter_files_optimized(file_path_same_level, tests_root, ignore_paths, module_root)
assert filter_files_optimized(file_path_different_level, tests_root, ignore_paths, module_root)
assert not filter_files_optimized(file_path_above_level, tests_root, ignore_paths, module_root)