diff --git a/codeflash/code_utils/code_replacer.py b/codeflash/code_utils/code_replacer.py index 740e578b6..712df8c30 100644 --- a/codeflash/code_utils/code_replacer.py +++ b/codeflash/code_utils/code_replacer.py @@ -412,11 +412,17 @@ def replace_function_definitions_in_module( module_abspath: Path, preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]], project_root_path: Path, + global_assignments_added_before: bool = False, # noqa: FBT001, FBT002 ) -> bool: source_code: str = module_abspath.read_text(encoding="utf8") code_to_apply = get_optimized_code_for_module(module_abspath.relative_to(project_root_path), optimized_code) + new_code: str = replace_functions_and_add_imports( - add_global_assignments(code_to_apply, source_code), + # adding the global assignments before replacing the code, not after + # becuase of an "edge case" where the optimized code intoduced a new import and a global assignment using that import + # and that import wasn't used before, so it was ignored when calling AddImportsVisitor.add_needed_import inside replace_functions_and_add_imports (because the global assignment wasn't added yet) + # this was added at https://github.com/codeflash-ai/codeflash/pull/448 + add_global_assignments(code_to_apply, source_code) if not global_assignments_added_before else source_code, function_names, code_to_apply, module_abspath, diff --git a/codeflash/context/unused_definition_remover.py b/codeflash/context/unused_definition_remover.py index cf57af031..749757315 100644 --- a/codeflash/context/unused_definition_remover.py +++ b/codeflash/context/unused_definition_remover.py @@ -537,6 +537,7 @@ def revert_unused_helper_functions( module_abspath=file_path, preexisting_objects=set(), # Empty set since we're reverting project_root_path=project_root, + global_assignments_added_before=True, # since we revert helpers functions after applying the optimization, we know that the file already has global assignments added, otherwise they would be added twice. ) if reverted_code: diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index ba1b79492..07c596412 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -820,7 +820,10 @@ class FunctionOptimizer: return new_code, new_helper_code def replace_function_and_helpers_with_optimized_code( - self, code_context: CodeOptimizationContext, optimized_code: CodeStringsMarkdown, original_helper_code: str + self, + code_context: CodeOptimizationContext, + optimized_code: CodeStringsMarkdown, + original_helper_code: dict[Path, str], ) -> bool: did_update = False read_writable_functions_by_file_path = defaultdict(set) diff --git a/tests/test_code_replacement.py b/tests/test_code_replacement.py index 26e6e915b..fa0ad5adb 100644 --- a/tests/test_code_replacement.py +++ b/tests/test_code_replacement.py @@ -3228,3 +3228,253 @@ class HuggingFaceModel(Model): assert not re.search(r"^import aiohttp as aiohttp_\b", new_code, re.MULTILINE) # conditional alias import: import as assert not re.search(r"^from math import pi as PI, sin as sine\b", new_code, re.MULTILINE) # conditional multiple aliases imports assert "from huggingface_hub import AsyncInferenceClient, ChatCompletionInputTool" not in new_code # conditional from import + + +def test_test(): + root_dir = Path(__file__).parent.parent.resolve() + main_file = Path(root_dir / "code_to_optimize/temp_main.py").resolve() + + original_code = '''"""Chunking objects not specific to a particular chunking strategy.""" + +from __future__ import annotations + +import collections +import copy +from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast + +import regex +from typing_extensions import Self, TypeAlias + +from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable +from unstructured.documents.elements import ( + CompositeElement, + ConsolidationStrategy, + Element, + ElementMetadata, + Table, + TableChunk, + Title, +) +from unstructured.utils import lazyproperty + +# ================================================================================================ +# MODEL +# ================================================================================================ + +CHUNK_MAX_CHARS_DEFAULT: int = 500 +"""Hard-max chunk-length when no explicit value specified in `max_characters` argument. + +Provided for reference only, for example so the ingest CLI can advertise the default value in its +UI. External chunking-related functions (e.g. in ingest or decorators) should use +`max_characters: int | None = None` and not apply this default themselves. Only +`ChunkingOptions.max_characters` should apply a default value. +""" + +CHUNK_MULTI_PAGE_DEFAULT: bool = True +"""When False, respect page-boundaries (no two elements from different page in same chunk). + +Only operative for "by_title" chunking strategy. +""" + +BoundaryPredicate: TypeAlias = Callable[[Element], bool] +"""Detects when element represents crossing a semantic boundary like section or page.""" + +TextAndHtml: TypeAlias = tuple[str, str] + +# ================================================================================================ +# PRE-CHUNKER +# ================================================================================================ + + +class PreChunker: + """Gathers sequential elements into pre-chunks as length constraints allow. + + The pre-chunker's responsibilities are: + + - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on + either side of those boundaries into different sections. In this case, the primary indicator + of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a + semantic boundary when `multipage_sections` is `False`. + + - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit + into sections as big as possible without exceeding the chunk window size. + + - **Minimize chunks that must be split mid-text.** Precompute the text length of each section + and only produce a section that exceeds the chunk window size when there is a single element + with text longer than that window. + + A Table element is placed into a section by itself. CheckBox elements are dropped. + + The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates + a new "section", hence the "by-title" designation. + """ + + def __init__(self, elements: Iterable[Element], opts: ChunkingOptions): + self._elements = elements + self._opts = opts + + @lazyproperty + def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: + """The semantic-boundary detectors to be applied to break pre-chunks.""" + return self._opts.boundary_predicates + + def _is_in_new_semantic_unit(self, element: Element) -> bool: + """True when `element` begins a new semantic unit such as a section or page.""" + # -- all detectors need to be called to update state and avoid double counting + # -- boundaries that happen to coincide, like Table and new section on same element. + # -- Using `any()` would short-circuit on first True. + semantic_boundaries = [pred(element) for pred in self._boundary_predicates] + return any(semantic_boundaries) + +''' + main_file.write_text(original_code, encoding="utf-8") + optim_code = f'''```python:{main_file.relative_to(root_dir)} +# ================================================================================================ +# PRE-CHUNKER +# ================================================================================================ + +from __future__ import annotations +from typing import Iterable +from unstructured.documents.elements import Element +from unstructured.utils import lazyproperty + +class PreChunker: + def __init__(self, elements: Iterable[Element], opts: ChunkingOptions): + self._elements = elements + self._opts = opts + + @lazyproperty + def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: + """The semantic-boundary detectors to be applied to break pre-chunks.""" + return self._opts.boundary_predicates + + def _is_in_new_semantic_unit(self, element: Element) -> bool: + """True when `element` begins a new semantic unit such as a section or page.""" + # Use generator expression for lower memory usage and avoid building intermediate list + for pred in self._boundary_predicates: + if pred(element): + return True + return False +``` +''' + + func = FunctionToOptimize(function_name="_is_in_new_semantic_unit", parents=[FunctionParent("PreChunker", "ClassDef")], file_path=main_file) + test_config = TestConfig( + tests_root=root_dir / "tests/pytest", + tests_project_rootdir=root_dir, + project_root_path=root_dir, + test_framework="pytest", + pytest_cmd="pytest", + ) + func_optimizer = FunctionOptimizer(function_to_optimize=func, test_cfg=test_config) + code_context: CodeOptimizationContext = func_optimizer.get_code_optimization_context().unwrap() + + original_helper_code: dict[Path, str] = {} + helper_function_paths = {hf.file_path for hf in code_context.helper_functions} + for helper_function_path in helper_function_paths: + with helper_function_path.open(encoding="utf8") as f: + helper_code = f.read() + original_helper_code[helper_function_path] = helper_code + + func_optimizer.args = Args() + func_optimizer.replace_function_and_helpers_with_optimized_code( + code_context=code_context, optimized_code=CodeStringsMarkdown.parse_markdown_code(optim_code), original_helper_code=original_helper_code + ) + + + new_code = main_file.read_text(encoding="utf-8") + main_file.unlink(missing_ok=True) + + expected = '''"""Chunking objects not specific to a particular chunking strategy.""" + +from __future__ import annotations + +import collections +import copy +from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast + +import regex +from typing_extensions import Self, TypeAlias + +from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable +from unstructured.documents.elements import ( + CompositeElement, + ConsolidationStrategy, + Element, + ElementMetadata, + Table, + TableChunk, + Title, +) +from unstructured.utils import lazyproperty + +# ================================================================================================ +# MODEL +# ================================================================================================ + +CHUNK_MAX_CHARS_DEFAULT: int = 500 +"""Hard-max chunk-length when no explicit value specified in `max_characters` argument. + +Provided for reference only, for example so the ingest CLI can advertise the default value in its +UI. External chunking-related functions (e.g. in ingest or decorators) should use +`max_characters: int | None = None` and not apply this default themselves. Only +`ChunkingOptions.max_characters` should apply a default value. +""" + +CHUNK_MULTI_PAGE_DEFAULT: bool = True +"""When False, respect page-boundaries (no two elements from different page in same chunk). + +Only operative for "by_title" chunking strategy. +""" + +BoundaryPredicate: TypeAlias = Callable[[Element], bool] +"""Detects when element represents crossing a semantic boundary like section or page.""" + +TextAndHtml: TypeAlias = tuple[str, str] + +# ================================================================================================ +# PRE-CHUNKER +# ================================================================================================ + + +class PreChunker: + """Gathers sequential elements into pre-chunks as length constraints allow. + + The pre-chunker's responsibilities are: + + - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on + either side of those boundaries into different sections. In this case, the primary indicator + of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a + semantic boundary when `multipage_sections` is `False`. + + - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit + into sections as big as possible without exceeding the chunk window size. + + - **Minimize chunks that must be split mid-text.** Precompute the text length of each section + and only produce a section that exceeds the chunk window size when there is a single element + with text longer than that window. + + A Table element is placed into a section by itself. CheckBox elements are dropped. + + The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates + a new "section", hence the "by-title" designation. + """ + def __init__(self, elements: Iterable[Element], opts: ChunkingOptions): + self._elements = elements + self._opts = opts + + @lazyproperty + def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: + """The semantic-boundary detectors to be applied to break pre-chunks.""" + return self._opts.boundary_predicates + + def _is_in_new_semantic_unit(self, element: Element) -> bool: + """True when `element` begins a new semantic unit such as a section or page.""" + # Use generator expression for lower memory usage and avoid building intermediate list + for pred in self._boundary_predicates: + if pred(element): + return True + return False + +''' + assert new_code == expected