prevent duplicate global assignments when reverting helpers
This commit is contained in:
parent
02b4d6557a
commit
e7de51a2df
4 changed files with 262 additions and 2 deletions
|
|
@ -412,11 +412,17 @@ def replace_function_definitions_in_module(
|
|||
module_abspath: Path,
|
||||
preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]],
|
||||
project_root_path: Path,
|
||||
global_assignments_added_before: bool = False, # noqa: FBT001, FBT002
|
||||
) -> bool:
|
||||
source_code: str = module_abspath.read_text(encoding="utf8")
|
||||
code_to_apply = get_optimized_code_for_module(module_abspath.relative_to(project_root_path), optimized_code)
|
||||
|
||||
new_code: str = replace_functions_and_add_imports(
|
||||
add_global_assignments(code_to_apply, source_code),
|
||||
# adding the global assignments before replacing the code, not after
|
||||
# becuase of an "edge case" where the optimized code intoduced a new import and a global assignment using that import
|
||||
# and that import wasn't used before, so it was ignored when calling AddImportsVisitor.add_needed_import inside replace_functions_and_add_imports (because the global assignment wasn't added yet)
|
||||
# this was added at https://github.com/codeflash-ai/codeflash/pull/448
|
||||
add_global_assignments(code_to_apply, source_code) if not global_assignments_added_before else source_code,
|
||||
function_names,
|
||||
code_to_apply,
|
||||
module_abspath,
|
||||
|
|
|
|||
|
|
@ -537,6 +537,7 @@ def revert_unused_helper_functions(
|
|||
module_abspath=file_path,
|
||||
preexisting_objects=set(), # Empty set since we're reverting
|
||||
project_root_path=project_root,
|
||||
global_assignments_added_before=True, # since we revert helpers functions after applying the optimization, we know that the file already has global assignments added, otherwise they would be added twice.
|
||||
)
|
||||
|
||||
if reverted_code:
|
||||
|
|
|
|||
|
|
@ -820,7 +820,10 @@ class FunctionOptimizer:
|
|||
return new_code, new_helper_code
|
||||
|
||||
def replace_function_and_helpers_with_optimized_code(
|
||||
self, code_context: CodeOptimizationContext, optimized_code: CodeStringsMarkdown, original_helper_code: str
|
||||
self,
|
||||
code_context: CodeOptimizationContext,
|
||||
optimized_code: CodeStringsMarkdown,
|
||||
original_helper_code: dict[Path, str],
|
||||
) -> bool:
|
||||
did_update = False
|
||||
read_writable_functions_by_file_path = defaultdict(set)
|
||||
|
|
|
|||
|
|
@ -3228,3 +3228,253 @@ class HuggingFaceModel(Model):
|
|||
assert not re.search(r"^import aiohttp as aiohttp_\b", new_code, re.MULTILINE) # conditional alias import: import <name> as <alias>
|
||||
assert not re.search(r"^from math import pi as PI, sin as sine\b", new_code, re.MULTILINE) # conditional multiple aliases imports
|
||||
assert "from huggingface_hub import AsyncInferenceClient, ChatCompletionInputTool" not in new_code # conditional from import
|
||||
|
||||
|
||||
def test_test():
|
||||
root_dir = Path(__file__).parent.parent.resolve()
|
||||
main_file = Path(root_dir / "code_to_optimize/temp_main.py").resolve()
|
||||
|
||||
original_code = '''"""Chunking objects not specific to a particular chunking strategy."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import copy
|
||||
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
|
||||
|
||||
import regex
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
ConsolidationStrategy,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
Title,
|
||||
)
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
# ================================================================================================
|
||||
# MODEL
|
||||
# ================================================================================================
|
||||
|
||||
CHUNK_MAX_CHARS_DEFAULT: int = 500
|
||||
"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
|
||||
|
||||
Provided for reference only, for example so the ingest CLI can advertise the default value in its
|
||||
UI. External chunking-related functions (e.g. in ingest or decorators) should use
|
||||
`max_characters: int | None = None` and not apply this default themselves. Only
|
||||
`ChunkingOptions.max_characters` should apply a default value.
|
||||
"""
|
||||
|
||||
CHUNK_MULTI_PAGE_DEFAULT: bool = True
|
||||
"""When False, respect page-boundaries (no two elements from different page in same chunk).
|
||||
|
||||
Only operative for "by_title" chunking strategy.
|
||||
"""
|
||||
|
||||
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
|
||||
"""Detects when element represents crossing a semantic boundary like section or page."""
|
||||
|
||||
TextAndHtml: TypeAlias = tuple[str, str]
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class PreChunker:
|
||||
"""Gathers sequential elements into pre-chunks as length constraints allow.
|
||||
|
||||
The pre-chunker's responsibilities are:
|
||||
|
||||
- **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
|
||||
either side of those boundaries into different sections. In this case, the primary indicator
|
||||
of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
|
||||
semantic boundary when `multipage_sections` is `False`.
|
||||
|
||||
- **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
|
||||
into sections as big as possible without exceeding the chunk window size.
|
||||
|
||||
- **Minimize chunks that must be split mid-text.** Precompute the text length of each section
|
||||
and only produce a section that exceeds the chunk window size when there is a single element
|
||||
with text longer than that window.
|
||||
|
||||
A Table element is placed into a section by itself. CheckBox elements are dropped.
|
||||
|
||||
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
|
||||
a new "section", hence the "by-title" designation.
|
||||
"""
|
||||
|
||||
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
|
||||
self._elements = elements
|
||||
self._opts = opts
|
||||
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
return self._opts.boundary_predicates
|
||||
|
||||
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||
# -- all detectors need to be called to update state and avoid double counting
|
||||
# -- boundaries that happen to coincide, like Table and new section on same element.
|
||||
# -- Using `any()` would short-circuit on first True.
|
||||
semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
|
||||
return any(semantic_boundaries)
|
||||
|
||||
'''
|
||||
main_file.write_text(original_code, encoding="utf-8")
|
||||
optim_code = f'''```python:{main_file.relative_to(root_dir)}
|
||||
# ================================================================================================
|
||||
# PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Iterable
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
class PreChunker:
|
||||
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
|
||||
self._elements = elements
|
||||
self._opts = opts
|
||||
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
return self._opts.boundary_predicates
|
||||
|
||||
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||
# Use generator expression for lower memory usage and avoid building intermediate list
|
||||
for pred in self._boundary_predicates:
|
||||
if pred(element):
|
||||
return True
|
||||
return False
|
||||
```
|
||||
'''
|
||||
|
||||
func = FunctionToOptimize(function_name="_is_in_new_semantic_unit", parents=[FunctionParent("PreChunker", "ClassDef")], file_path=main_file)
|
||||
test_config = TestConfig(
|
||||
tests_root=root_dir / "tests/pytest",
|
||||
tests_project_rootdir=root_dir,
|
||||
project_root_path=root_dir,
|
||||
test_framework="pytest",
|
||||
pytest_cmd="pytest",
|
||||
)
|
||||
func_optimizer = FunctionOptimizer(function_to_optimize=func, test_cfg=test_config)
|
||||
code_context: CodeOptimizationContext = func_optimizer.get_code_optimization_context().unwrap()
|
||||
|
||||
original_helper_code: dict[Path, str] = {}
|
||||
helper_function_paths = {hf.file_path for hf in code_context.helper_functions}
|
||||
for helper_function_path in helper_function_paths:
|
||||
with helper_function_path.open(encoding="utf8") as f:
|
||||
helper_code = f.read()
|
||||
original_helper_code[helper_function_path] = helper_code
|
||||
|
||||
func_optimizer.args = Args()
|
||||
func_optimizer.replace_function_and_helpers_with_optimized_code(
|
||||
code_context=code_context, optimized_code=CodeStringsMarkdown.parse_markdown_code(optim_code), original_helper_code=original_helper_code
|
||||
)
|
||||
|
||||
|
||||
new_code = main_file.read_text(encoding="utf-8")
|
||||
main_file.unlink(missing_ok=True)
|
||||
|
||||
expected = '''"""Chunking objects not specific to a particular chunking strategy."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import copy
|
||||
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
|
||||
|
||||
import regex
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
from unstructured.common.html_table import HtmlCell, HtmlRow, HtmlTable
|
||||
from unstructured.documents.elements import (
|
||||
CompositeElement,
|
||||
ConsolidationStrategy,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Table,
|
||||
TableChunk,
|
||||
Title,
|
||||
)
|
||||
from unstructured.utils import lazyproperty
|
||||
|
||||
# ================================================================================================
|
||||
# MODEL
|
||||
# ================================================================================================
|
||||
|
||||
CHUNK_MAX_CHARS_DEFAULT: int = 500
|
||||
"""Hard-max chunk-length when no explicit value specified in `max_characters` argument.
|
||||
|
||||
Provided for reference only, for example so the ingest CLI can advertise the default value in its
|
||||
UI. External chunking-related functions (e.g. in ingest or decorators) should use
|
||||
`max_characters: int | None = None` and not apply this default themselves. Only
|
||||
`ChunkingOptions.max_characters` should apply a default value.
|
||||
"""
|
||||
|
||||
CHUNK_MULTI_PAGE_DEFAULT: bool = True
|
||||
"""When False, respect page-boundaries (no two elements from different page in same chunk).
|
||||
|
||||
Only operative for "by_title" chunking strategy.
|
||||
"""
|
||||
|
||||
BoundaryPredicate: TypeAlias = Callable[[Element], bool]
|
||||
"""Detects when element represents crossing a semantic boundary like section or page."""
|
||||
|
||||
TextAndHtml: TypeAlias = tuple[str, str]
|
||||
|
||||
# ================================================================================================
|
||||
# PRE-CHUNKER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class PreChunker:
|
||||
"""Gathers sequential elements into pre-chunks as length constraints allow.
|
||||
|
||||
The pre-chunker's responsibilities are:
|
||||
|
||||
- **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
|
||||
either side of those boundaries into different sections. In this case, the primary indicator
|
||||
of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
|
||||
semantic boundary when `multipage_sections` is `False`.
|
||||
|
||||
- **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
|
||||
into sections as big as possible without exceeding the chunk window size.
|
||||
|
||||
- **Minimize chunks that must be split mid-text.** Precompute the text length of each section
|
||||
and only produce a section that exceeds the chunk window size when there is a single element
|
||||
with text longer than that window.
|
||||
|
||||
A Table element is placed into a section by itself. CheckBox elements are dropped.
|
||||
|
||||
The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
|
||||
a new "section", hence the "by-title" designation.
|
||||
"""
|
||||
def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
|
||||
self._elements = elements
|
||||
self._opts = opts
|
||||
|
||||
@lazyproperty
|
||||
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
|
||||
"""The semantic-boundary detectors to be applied to break pre-chunks."""
|
||||
return self._opts.boundary_predicates
|
||||
|
||||
def _is_in_new_semantic_unit(self, element: Element) -> bool:
|
||||
"""True when `element` begins a new semantic unit such as a section or page."""
|
||||
# Use generator expression for lower memory usage and avoid building intermediate list
|
||||
for pred in self._boundary_predicates:
|
||||
if pred(element):
|
||||
return True
|
||||
return False
|
||||
|
||||
'''
|
||||
assert new_code == expected
|
||||
|
|
|
|||
Loading…
Reference in a new issue