prevent duplicate global assignments when reverting helpers

2025-08-23 18:34:37 +03:00 · 2025-08-23 18:34:37 +03:00 · e7de51a2df
commit e7de51a2df
parent 02b4d6557a
4 changed files with 262 additions and 2 deletions
--- a/codeflash/code_utils/code_replacer.py
+++ b/codeflash/code_utils/code_replacer.py
@ -412,11 +412,17 @@ def replace_function_definitions_in_module(
    module_abspath: Path,
    preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]],
    project_root_path: Path,
+    global_assignments_added_before: bool = False,  # noqa: FBT001, FBT002
 ) -> bool:
    source_code: str = module_abspath.read_text(encoding="utf8")
    code_to_apply = get_optimized_code_for_module(module_abspath.relative_to(project_root_path), optimized_code)
+
    new_code: str = replace_functions_and_add_imports(
-        add_global_assignments(code_to_apply, source_code),
+        # adding the global assignments before replacing the code, not after
+        # becuase of an "edge case" where the optimized code intoduced a new import and a global assignment using that import
+        # and that import wasn't used before, so it was ignored when calling AddImportsVisitor.add_needed_import inside replace_functions_and_add_imports (because the global assignment wasn't added yet)
+        # this was added at https://github.com/codeflash-ai/codeflash/pull/448
+        add_global_assignments(code_to_apply, source_code) if not global_assignments_added_before else source_code,
        function_names,
        code_to_apply,
        module_abspath,
--- a/codeflash/context/unused_definition_remover.py
+++ b/codeflash/context/unused_definition_remover.py
@ -537,6 +537,7 @@ def revert_unused_helper_functions(
                    module_abspath=file_path,
                    preexisting_objects=set(),  # Empty set since we're reverting
                    project_root_path=project_root,
+                    global_assignments_added_before=True,  # since we revert helpers functions after applying the optimization, we know that the file already has global assignments added, otherwise they would be added twice.
                )

                if reverted_code:
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@ -820,7 +820,10 @@ class FunctionOptimizer:
        return new_code, new_helper_code

    def replace_function_and_helpers_with_optimized_code(
-        self, code_context: CodeOptimizationContext, optimized_code: CodeStringsMarkdown, original_helper_code: str
+        self,
+        code_context: CodeOptimizationContext,
+        optimized_code: CodeStringsMarkdown,
+        original_helper_code: dict[Path, str],
    ) -> bool:
        did_update = False
        read_writable_functions_by_file_path = defaultdict(set)
--- a/tests/test_code_replacement.py
+++ b/tests/test_code_replacement.py
@ -3228,3 +3228,253 @@ class HuggingFaceModel(Model):
    assert not re.search(r"^import aiohttp as aiohttp_\b", new_code, re.MULTILINE)  # conditional alias import: import <name> as <alias>
    assert not re.search(r"^from math import pi as PI, sin as sine\b", new_code, re.MULTILINE)  # conditional multiple aliases imports
    assert "from huggingface_hub import AsyncInferenceClient, ChatCompletionInputTool" not in new_code # conditional from import
+
+
+def test_test():
+    root_dir = Path(__file__).parent.parent.resolve()
+    main_file = Path(root_dir / "code_to_optimize/temp_main.py").resolve()
+
+    original_code = '''"""Chunking objects not specific to a particular chunking strategy."""
+
+from __future__ import annotations
+
+import collections
+import copy
+from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
+
+import regex
+from typing_extensions import Self, TypeAlias
+
+from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
+from unstructured.documents.elements import (
+    CompositeElement,
+    ConsolidationStrategy,
+    Element,
+    ElementMetadata,
+    Table,
+    TableChunk,
+    Title,
+)
+from unstructured.utils import lazyproperty
+
+# ================================================================================================
+# MODEL
+# ================================================================================================
+
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
+
+Provided for reference only, for example so the ingest CLI can advertise the default value in its
+UI. External chunking-related functions (e.g. in ingest or decorators) should use
+`max_characters: int | None = None` and not apply this default themselves. Only
+`ChunkingOptions.max_characters` should apply a default value.
+"""
+
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
+"""When False, respect page-boundaries (no two elements from different page in same chunk).
+
+Only operative for "by_title" chunking strategy.
+"""
+
+BoundaryPredicate: TypeAlias = Callable[[Element], bool]
+"""Detects when element represents crossing a semantic boundary like section or page."""
+
+TextAndHtml: TypeAlias = tuple[str, str]
+
+# ================================================================================================
+# PRE-CHUNKER
+# ================================================================================================
+
+
+class PreChunker:
+    """Gathers sequential elements into pre-chunks as length constraints allow.
+
+    The pre-chunker's responsibilities are:
+
+    - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
+      either side of those boundaries into different sections. In this case, the primary indicator
+      of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
+      semantic boundary when `multipage_sections` is `False`.
+
+    - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
+      into sections as big as possible without exceeding the chunk window size.
+
+    - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
+      and only produce a section that exceeds the chunk window size when there is a single element
+      with text longer than that window.
+
+    A Table element is placed into a section by itself. CheckBox elements are dropped.
+
+    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
+    a new "section", hence the "by-title" designation.
+    """
+
+    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
+        self._elements = elements
+        self._opts = opts
+
+    @lazyproperty
+    def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks."""
+        return self._opts.boundary_predicates
+
+    def _is_in_new_semantic_unit(self, element: Element) -> bool:
+        """True when `element` begins a new semantic unit such as a section or page."""
+        # -- all detectors need to be called to update state and avoid double counting
+        # -- boundaries that happen to coincide, like Table and new section on same element.
+        # -- Using `any()` would short-circuit on first True.
+        semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
+        return any(semantic_boundaries)
+
+'''
+    main_file.write_text(original_code, encoding="utf-8")
+    optim_code = f'''```python:{main_file.relative_to(root_dir)}
+# ================================================================================================
+# PRE-CHUNKER
+# ================================================================================================
+
+from __future__ import annotations
+from typing import Iterable
+from unstructured.documents.elements import Element
+from unstructured.utils import lazyproperty
+
+class PreChunker:
+    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
+        self._elements = elements
+        self._opts = opts
+
+    @lazyproperty
+    def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks."""
+        return self._opts.boundary_predicates
+
+    def _is_in_new_semantic_unit(self, element: Element) -> bool:
+        """True when `element` begins a new semantic unit such as a section or page."""
+        # Use generator expression for lower memory usage and avoid building intermediate list
+        for pred in self._boundary_predicates:
+            if pred(element):
+                return True
+        return False
+```
+'''
+
+    func = FunctionToOptimize(function_name="_is_in_new_semantic_unit", parents=[FunctionParent("PreChunker", "ClassDef")], file_path=main_file)
+    test_config = TestConfig(
+        tests_root=root_dir / "tests/pytest",
+        tests_project_rootdir=root_dir,
+        project_root_path=root_dir,
+        test_framework="pytest",
+        pytest_cmd="pytest",
+    )
+    func_optimizer = FunctionOptimizer(function_to_optimize=func, test_cfg=test_config)
+    code_context: CodeOptimizationContext = func_optimizer.get_code_optimization_context().unwrap()
+
+    original_helper_code: dict[Path, str] = {}
+    helper_function_paths = {hf.file_path for hf in code_context.helper_functions}
+    for helper_function_path in helper_function_paths:
+        with helper_function_path.open(encoding="utf8") as f:
+            helper_code = f.read()
+            original_helper_code[helper_function_path] = helper_code
+
+    func_optimizer.args = Args()
+    func_optimizer.replace_function_and_helpers_with_optimized_code(
+        code_context=code_context, optimized_code=CodeStringsMarkdown.parse_markdown_code(optim_code), original_helper_code=original_helper_code
+    )
+
+
+    new_code = main_file.read_text(encoding="utf-8")
+    main_file.unlink(missing_ok=True)
+
+    expected = '''"""Chunking objects not specific to a particular chunking strategy."""
+
+from __future__ import annotations
+
+import collections
+import copy
+from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
+
+import regex
+from typing_extensions import Self, TypeAlias
+
+from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
+from unstructured.documents.elements import (
+    CompositeElement,
+    ConsolidationStrategy,
+    Element,
+    ElementMetadata,
+    Table,
+    TableChunk,
+    Title,
+)
+from unstructured.utils import lazyproperty
+
+# ================================================================================================
+# MODEL
+# ================================================================================================
+
+CHUNK_MAX_CHARS_DEFAULT: int = 500
+"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
+
+Provided for reference only, for example so the ingest CLI can advertise the default value in its
+UI. External chunking-related functions (e.g. in ingest or decorators) should use
+`max_characters: int | None = None` and not apply this default themselves. Only
+`ChunkingOptions.max_characters` should apply a default value.
+"""
+
+CHUNK_MULTI_PAGE_DEFAULT: bool = True
+"""When False, respect page-boundaries (no two elements from different page in same chunk).
+
+Only operative for "by_title" chunking strategy.
+"""
+
+BoundaryPredicate: TypeAlias = Callable[[Element], bool]
+"""Detects when element represents crossing a semantic boundary like section or page."""
+
+TextAndHtml: TypeAlias = tuple[str, str]
+
+# ================================================================================================
+# PRE-CHUNKER
+# ================================================================================================
+
+
+class PreChunker:
+    """Gathers sequential elements into pre-chunks as length constraints allow.
+
+    The pre-chunker's responsibilities are:
+
+    - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
+      either side of those boundaries into different sections. In this case, the primary indicator
+      of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
+      semantic boundary when `multipage_sections` is `False`.
+
+    - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
+      into sections as big as possible without exceeding the chunk window size.
+
+    - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
+      and only produce a section that exceeds the chunk window size when there is a single element
+      with text longer than that window.
+
+    A Table element is placed into a section by itself. CheckBox elements are dropped.
+
+    The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
+    a new "section", hence the "by-title" designation.
+    """
+    def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
+        self._elements = elements
+        self._opts = opts
+
+    @lazyproperty
+    def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
+        """The semantic-boundary detectors to be applied to break pre-chunks."""
+        return self._opts.boundary_predicates
+
+    def _is_in_new_semantic_unit(self, element: Element) -> bool:
+        """True when `element` begins a new semantic unit such as a section or page."""
+        # Use generator expression for lower memory usage and avoid building intermediate list
+        for pred in self._boundary_predicates:
+            if pred(element):
+                return True
+        return False
+
+'''
+    assert new_code == expected