react changes for interactive pattern

Sarthak Agarwal 2026-03-18 03:27:35 +05:30
parent 9df446b72f
commit 4bc89f2b9d
13 changed files with 584 additions and 26 deletions

View file

@ -15,7 +15,12 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from codeflash.languages.javascript.parse import DomMutationProfile, InteractionDurationProfile, RenderProfile
from codeflash.languages.javascript.parse import (
DomMutationProfile,
InteractionDurationProfile,
InteractionRenderProfile,
RenderProfile,
)
logger = logging.getLogger(__name__)
@ -52,6 +57,29 @@ def _aggregate_avg_duration(profiles: list[RenderProfile]) -> float:
return sum(p.actual_duration_ms for p in profiles) / len(profiles)
@dataclass(frozen=True)
class InteractionComparison:
"""Per-interaction render count comparison."""
interaction_label: str
original_render_count: int
optimized_render_count: int
@property
def reduction_pct(self) -> float:
if self.original_render_count == 0:
return 0.0
return (
(self.original_render_count - self.optimized_render_count)
/ self.original_render_count
* 100
)
@property
def improved(self) -> bool:
return self.optimized_render_count < self.original_render_count
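For example, an interaction whose render count drops from 10 to 4 after optimization (values invented for illustration) reports a 60% reduction:

ic = InteractionComparison(interaction_label="click_1", original_render_count=10, optimized_render_count=4)
ic.reduction_pct  # (10 - 4) / 10 * 100 == 60.0
ic.improved       # True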
@dataclass(frozen=True)
class RenderBenchmark:
"""Comparison of original vs optimized render metrics.
@ -83,6 +111,12 @@ class RenderBenchmark:
optimized_interaction_duration_ms: float = 0.0
original_burst_count: int = 0
optimized_burst_count: int = 0
# Per-interaction render comparisons
per_interaction_comparisons: tuple[InteractionComparison, ...] = ()
@property
def has_per_interaction_data(self) -> bool:
return len(self.per_interaction_comparisons) > 0
@property
def render_count_reduction_pct(self) -> float:
@ -161,6 +195,81 @@ class RenderBenchmark:
return self.original_interaction_duration_ms > 0 or self.optimized_interaction_duration_ms > 0
def validate_render_count_stability(runs: list[list[RenderProfile]]) -> str:
"""Compare render counts across multiple runs to assess measurement confidence.
Args:
runs: List of render profile lists, one per validation run.
Returns:
"high" if counts are identical across all runs,
"low" if any component's render count varies by >= 2 across runs.
Falls back to "high" if there's only 1 run or no profiles.
"""
if len(runs) <= 1:
return "high"
# Group by component across runs: {component_name: [max_render_count_per_run]}
per_component_counts: dict[str, list[int]] = {}
for run_profiles in runs:
by_comp = _group_by_component(run_profiles)
seen_components = set()
for comp_name, profiles in by_comp.items():
seen_components.add(comp_name)
count = _aggregate_render_count(profiles)
per_component_counts.setdefault(comp_name, []).append(count)
# Components not seen in this run get 0
for comp_name in per_component_counts:
if comp_name not in seen_components:
per_component_counts[comp_name].append(0)
for comp_name, counts in per_component_counts.items():
spread = max(counts) - min(counts)
if spread >= 2:
logger.warning(
"[REACT] Unstable render count for %s across %d runs: %s (spread=%d)",
comp_name,
len(runs),
counts,
spread,
)
return "low"
if spread == 1:
logger.info(
"[REACT] Minor render count variance for %s across %d runs: %s (±1)",
comp_name,
len(runs),
counts,
)
return "high"
def _build_interaction_comparisons(
original_profiles: list[InteractionRenderProfile],
optimized_profiles: list[InteractionRenderProfile],
) -> tuple[InteractionComparison, ...]:
"""Build per-interaction render comparisons from original and optimized profiles."""
orig_by_label: dict[str, int] = {}
for p in original_profiles:
orig_by_label[p.interaction_label] = orig_by_label.get(p.interaction_label, 0) + p.render_count
opt_by_label: dict[str, int] = {}
for p in optimized_profiles:
opt_by_label[p.interaction_label] = opt_by_label.get(p.interaction_label, 0) + p.render_count
all_labels = list(dict.fromkeys(list(orig_by_label.keys()) + list(opt_by_label.keys())))
comparisons: list[InteractionComparison] = []
for label in all_labels:
comparisons.append(
InteractionComparison(
interaction_label=label,
original_render_count=orig_by_label.get(label, 0),
optimized_render_count=opt_by_label.get(label, 0),
)
)
return tuple(comparisons)
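A sketch of the comparison construction with invented profiles; the InteractionRenderProfile fields match the dataclass added to parse.py in this commit.

from codeflash.languages.javascript.parse import InteractionRenderProfile

orig = [
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_1", render_count=5),
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_2", render_count=5),
]
opt = [
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_1", render_count=1),
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_2", render_count=5),
]

for ic in _build_interaction_comparisons(orig, opt):
    print(ic.interaction_label, ic.reduction_pct, ic.improved)
# click_1 80.0 True
# click_2 0.0 False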
def compare_render_benchmarks(
original_profiles: list[RenderProfile],
optimized_profiles: list[RenderProfile],
@ -169,6 +278,8 @@ def compare_render_benchmarks(
target_component_name: str | None = None,
original_interaction_durations: list[InteractionDurationProfile] | None = None,
optimized_interaction_durations: list[InteractionDurationProfile] | None = None,
original_interaction_renders: list[InteractionRenderProfile] | None = None,
optimized_interaction_renders: list[InteractionRenderProfile] | None = None,
) -> RenderBenchmark | None:
"""Compare original and optimized render profiles with phase awareness.
@ -252,6 +363,13 @@ def compare_render_benchmarks(
)
opt_bursts = max((d.burst_count for d in optimized_interaction_durations), default=0)
# Build per-interaction render comparisons
interaction_comparisons: tuple[InteractionComparison, ...] = ()
if original_interaction_renders and optimized_interaction_renders:
interaction_comparisons = _build_interaction_comparisons(
original_interaction_renders, optimized_interaction_renders
)
return RenderBenchmark(
component_name=component_name,
original_render_count=orig_count,
@ -271,6 +389,7 @@ def compare_render_benchmarks(
optimized_interaction_duration_ms=opt_interaction_ms,
original_burst_count=orig_bursts,
optimized_burst_count=opt_bursts,
per_interaction_comparisons=interaction_comparisons,
)
@ -325,4 +444,18 @@ def format_render_benchmark_for_pr(benchmark: RenderBenchmark) -> str:
if benchmark.render_speedup_x > 1:
lines.append(f"\nRender time improved **{benchmark.render_speedup_x:.1f}x**.")
# Per-interaction breakdown table
if benchmark.has_per_interaction_data:
lines.append("")
lines.append("#### Per-Interaction Breakdown")
lines.append("")
lines.append("| Interaction | Before | After | Change |")
lines.append("|-------------|--------|-------|--------|")
for ic in benchmark.per_interaction_comparisons:
change = f"{ic.reduction_pct:.1f}% fewer" if ic.improved else "no change"
lines.append(
f"| {ic.interaction_label} | {ic.original_render_count} renders "
f"| {ic.optimized_render_count} renders | {change} |"
)
return "\n".join(lines)

View file

@ -236,6 +236,37 @@ def _extract_props_type(func: FunctionNode, source: str, analyzer: TreeSitterAna
return None
# Virtualization library imports that require real layout for meaningful benchmarks
_VIRTUALIZATION_IMPORTS = re.compile(
r"""(?:from|import)\s+['"](?:"""
r"react-window|react-virtuoso|react-virtual|@tanstack/react-virtual"
r"|react-virtualized|@tanstack/virtual-core"
r""")['"]""",
)
# Layout APIs that return zeros in jsdom
_LAYOUT_API_USAGE = re.compile(
r"\b(?:getBoundingClientRect|offsetWidth|offsetHeight|clientWidth|clientHeight"
r"|scrollTop|scrollHeight|scrollWidth|scrollLeft"
r"|IntersectionObserver|ResizeObserver)\b"
)
def needs_real_layout(source: str) -> bool:
"""Detect whether a component depends on real layout APIs unavailable in jsdom.
Returns True if the source imports virtualization libraries or uses layout
measurement APIs (getBoundingClientRect, offsetWidth, IntersectionObserver, etc.)
that return zeros/stubs in jsdom.
When True, jsdom-based render benchmarks may be inaccurate. Callers should
log a warning; Playwright support is deferred.
"""
if _VIRTUALIZATION_IMPORTS.search(source):
return True
return bool(_LAYOUT_API_USAGE.search(source))
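Rough examples of sources that trip each branch (snippets invented; only the matched tokens matter to the regexes above):

virtualized = "import { FixedSizeList } from 'react-window';"
measuring = "const rect = ref.current.getBoundingClientRect();"
plain = "export function Counter({ value }) { return <span>{value}</span>; }"

needs_real_layout(virtualized)  # True, via _VIRTUALIZATION_IMPORTS
needs_real_layout(measuring)    # True, via _LAYOUT_API_USAGE
needs_real_layout(plain)        # False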
def _is_wrapped_in_memo(func: FunctionNode, source: str) -> bool:
"""Check if the component is already wrapped in React.memo or memo()."""
# Check if the variable declaration wrapping this function uses memo()

View file

@ -164,6 +164,10 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
count=1,
)
# Auto-inject per-interaction render tracking markers around fireEvent/userEvent calls.
# This gives per-interaction A/B signal without the LLM needing to know about it.
result = inject_interaction_markers(result)
# Warn if no tests contain interaction calls — mount-phase-only markers are
# not useful for measuring optimization effectiveness.
if not has_react_test_interactions(result):
@ -173,6 +177,18 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
component_info.function_name,
)
# Check interaction density — fewer than MIN_INTERACTION_CALLS total interactions
# means the test is unlikely to produce enough update-phase renders for reliable measurement.
interaction_count = count_interaction_calls(result)
if interaction_count < MIN_INTERACTION_CALLS:
logger.error(
"[REACT] Generated tests for %s have only %d interaction calls (minimum %d). "
"Render count measurement will have low confidence.",
component_info.function_name,
interaction_count,
MIN_INTERACTION_CALLS,
)
# Warn if tests lack high-density interaction patterns (loops or 3+ sequential calls)
if not has_high_density_interactions(result):
logger.warning(
@ -184,6 +200,75 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
return result
# Pattern to find the variable assigned from captureRenderPerf (await or sync)
# Matches: const result = await codeflash.captureRenderPerf(...)
# const { container } = await codeflash.captureRenderPerf(...)
# let result = codeflash.captureRenderPerf(...)
_CAPTURE_RENDER_RESULT_PATTERN = re.compile(
r"(?:const|let|var)\s+(?:\{[^}]+\}|(\w+))\s*=\s*(?:await\s+)?(?:\w+\.)?captureRenderPerf\(",
)
# Pattern matching fireEvent.* or userEvent.* standalone calls (not in comments)
_INTERACTION_CALL_PATTERN = re.compile(
r"^(\s*)((?:await\s+)?(?:fireEvent\.\w+|userEvent\.\w+)\s*\([^)]*\))\s*;",
re.MULTILINE,
)
def _extract_interaction_label(call_text: str) -> str:
"""Extract a short label from an interaction call, e.g. 'click' from 'fireEvent.click(...)'."""
m = re.search(r"(?:fireEvent|userEvent)\.(\w+)", call_text)
return m.group(1) if m else "interaction"
def inject_interaction_markers(test_source: str) -> str:
"""Inject _codeflashMarkInteraction() calls before each fireEvent/userEvent call.
Only injects when captureRenderPerf is used (the result object has the method).
Assigns a label derived from the interaction type (click, change, type, etc.)
and a sequential counter for uniqueness.
"""
if "captureRenderPerf" not in test_source:
return test_source
# Find the result variable name from captureRenderPerf assignment
# Support both: const result = ... and const { container, ...rest } = ...
result_var = None
capture_match = _CAPTURE_RENDER_RESULT_PATTERN.search(test_source)
if capture_match:
# Group 1 is the simple variable name; for destructuring we need a different approach
result_var = capture_match.group(1)
if not result_var:
# First match was destructured; fall back to any simple-variable captureRenderPerf assignment
destr_match = re.search(
r"(?:const|let|var)\s+(\w+)\s*=\s*(?:await\s+)?(?:\w+\.)?captureRenderPerf\(",
test_source,
)
if destr_match:
result_var = destr_match.group(1)
if not result_var:
# Can't determine result variable — skip injection
return test_source
# Find all interaction calls and inject marker before each
interaction_counter: dict[str, int] = {}
lines = test_source.split("\n")
new_lines: list[str] = []
for line in lines:
m = _INTERACTION_CALL_PATTERN.match(line)
if m:
indent = m.group(1)
call_text = m.group(2)
label = _extract_interaction_label(call_text)
interaction_counter[label] = interaction_counter.get(label, 0) + 1
unique_label = f"{label}_{interaction_counter[label]}"
marker_line = f"{indent}{result_var}._codeflashMarkInteraction('{unique_label}');"
new_lines.append(marker_line)
new_lines.append(line)
return "\n".join(new_lines)
# Patterns that indicate a test triggers user interactions causing re-renders
_INTERACTION_PATTERNS = re.compile(
r"fireEvent\.|userEvent\.|\.rerender\(|rerender\(|act\("
@ -200,6 +285,24 @@ def has_react_test_interactions(test_source: str) -> bool:
return bool(_INTERACTION_PATTERNS.search(test_source))
# Minimum interaction calls for reliable render count measurement
MIN_INTERACTION_CALLS = 3
# Pattern matching individual interaction calls (fireEvent.*, userEvent.*, .rerender(), rerender())
_INTERACTION_CALL_COUNT_PATTERN = re.compile(
r"(?:fireEvent\.\w+|userEvent\.\w+|\.rerender\(|(?<!\.)rerender\()\s*\(",
)
def count_interaction_calls(test_source: str) -> int:
"""Count the number of interaction calls in a test source.
Counts fireEvent.*, userEvent.*, and rerender() calls. Used to assess
whether tests produce enough update-phase renders for reliable measurement.
"""
return len(_INTERACTION_CALL_COUNT_PATTERN.findall(test_source))
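A small example, assuming the pattern above counts rerender calls as its comment describes. The count is static, so a loop counts once; has_high_density_interactions (below) is what credits loops.

sample = (
    "for (let i = 0; i < 5; i++) { fireEvent.click(btn); }\n"
    "await userEvent.type(input, 'hello');\n"
    "result.rerender(<TodoList items={next} />);\n"
)
count_interaction_calls(sample)  # 3, which just meets MIN_INTERACTION_CALLS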
# Patterns for loops containing interaction calls
_LOOP_WITH_INTERACTION = re.compile(
r"for\s*\([^)]*\)\s*\{[^}]*(?:fireEvent\.|userEvent\.|rerender\()",

View file

@ -37,6 +37,9 @@ jest_end_pattern = re.compile(r"!######([^:]+):([^:]+):([^:]+):([^:]+):([^:]+):(
# Format: !######REACT_RENDER:{component}:{phase}:{actualDuration}:{baseDuration}:{renderCount}######!
REACT_RENDER_MARKER_PATTERN = re.compile(r"!######REACT_RENDER:([^:]+):([^:]+):([^:]+):([^:]+):(\d+)######!")
# Validation run boundary marker (separates output from multiple validation runs)
REACT_VALIDATION_RUN_BOUNDARY = "!######REACT_VALIDATION_RUN_BOUNDARY######!"
# DOM mutation marker pattern
# Format: !######DOM_MUTATIONS:{component}:{mutationCount}######!
DOM_MUTATION_MARKER_PATTERN = re.compile(r"!######DOM_MUTATIONS:([^:]+):(\d+)######!")
@ -45,6 +48,10 @@ DOM_MUTATION_MARKER_PATTERN = re.compile(r"!######DOM_MUTATIONS:([^:]+):(\d+)###
# Format: !######REACT_INTERACTION_DURATION:{component}:{durationMs}:{burstCount}######!
REACT_INTERACTION_DURATION_PATTERN = re.compile(r"!######REACT_INTERACTION_DURATION:([^:]+):([^:]+):(\d+)######!")
# Per-interaction render count marker pattern
# Format: !######REACT_INTERACTION_RENDERS:{component}:{label}:{renderCount}######!
REACT_INTERACTION_RENDERS_PATTERN = re.compile(r"!######REACT_INTERACTION_RENDERS:([^:]+):([^:]+):(\d+)######!")
@dataclass(frozen=True)
class RenderProfile:
@ -147,6 +154,50 @@ def parse_interaction_duration_markers(stdout: str) -> list[InteractionDurationP
return profiles
@dataclass(frozen=True)
class InteractionRenderProfile:
"""Per-interaction render count from a single boundary marker."""
component_name: str
interaction_label: str
render_count: int
def parse_interaction_render_markers(stdout: str) -> list[InteractionRenderProfile]:
"""Parse per-interaction render count markers from test output.
Returns a list of InteractionRenderProfile instances, one per marker found.
"""
profiles: list[InteractionRenderProfile] = []
for match in REACT_INTERACTION_RENDERS_PATTERN.finditer(stdout):
try:
profiles.append(
InteractionRenderProfile(
component_name=match.group(1),
interaction_label=match.group(2),
render_count=int(match.group(3)),
)
)
except (ValueError, IndexError) as e:
logger.debug("Failed to parse interaction render marker: %s", e)
return profiles
def parse_per_run_render_profiles(stdout: str) -> list[list[RenderProfile]]:
"""Split multi-run stdout by boundary markers and parse render profiles per run.
When ``n_validation_runs > 1``, the test runner inserts
``REACT_VALIDATION_RUN_BOUNDARY`` markers between runs. This function
splits on those boundaries and parses each segment independently.
Returns a list of render profile lists (one per validation run).
If no boundary markers are found, returns a single-element list with
the profiles from the entire stdout.
"""
segments = stdout.split(REACT_VALIDATION_RUN_BOUNDARY)
return [parse_react_render_markers(segment) for segment in segments]
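A sketch of the per-interaction marker round trip; the marker lines are invented but follow the format printed by _codeflashMarkInteraction in the Jest helper.

stdout = (
    "!######REACT_INTERACTION_RENDERS:TodoList:click_1:5######!\n"
    "!######REACT_INTERACTION_RENDERS:TodoList:click_2:5######!\n"
)
parse_interaction_render_markers(stdout)
# [InteractionRenderProfile(component_name='TodoList', interaction_label='click_1', render_count=5),
#  InteractionRenderProfile(component_name='TodoList', interaction_label='click_2', render_count=5)]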
def _extract_jest_console_output(suite_elem: Any) -> str:
"""Extract console output from Jest's JUnit XML system-out element.

View file

@ -2432,6 +2432,7 @@ class JavaScriptSupport:
target_duration_seconds: float = 10.0,
test_framework: str | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, Any]:
"""Run benchmarking tests using the detected test framework.
@ -2482,6 +2483,7 @@ class JavaScriptSupport:
max_loops=max_loops,
target_duration_ms=int(target_duration_seconds * 1000),
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
def run_line_profile_tests(

View file

@ -1064,6 +1064,7 @@ def run_jest_benchmarking_tests(
target_duration_ms: int = 10_000, # 10 seconds for benchmarking tests
stability_check: bool = True,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, subprocess.CompletedProcess[str]]:
"""Run Jest benchmarking tests with in-process session-level looping.
@ -1075,6 +1076,11 @@ def run_jest_benchmarking_tests(
- Timing data is collected per iteration
- Stability is checked within the runner
For React components with n_validation_runs > 1, runs the test suite
multiple times and concatenates all stdout. Each run's render markers
are separated by ``!######REACT_VALIDATION_RUN_BOUNDARY######!`` markers
so the caller can split and compare render counts across runs.
Args:
test_paths: TestFiles object containing test file information.
test_env: Environment variables for the test run.
@ -1085,6 +1091,10 @@ def run_jest_benchmarking_tests(
max_loops: Maximum number of loop iterations.
target_duration_ms: Target TOTAL duration in milliseconds for all loops.
stability_check: Whether to enable stability-based early stopping.
is_react_component: Whether the target is a React component.
n_validation_runs: Number of times to run the test suite for render
count validation (React only). Each run's output is concatenated
with boundary markers.
Returns:
Tuple of (result_file_path, subprocess_result with stdout from all iterations).
@ -1211,25 +1221,70 @@ def run_jest_benchmarking_tests(
f"target_duration={target_duration_ms}ms, stability_check={stability_check}"
)
# Determine effective number of validation runs (only >1 for React)
effective_validation_runs = n_validation_runs if is_react_component and n_validation_runs > 1 else 1
total_start_time = time.time()
try:
run_args = get_cross_platform_subprocess_run_args(
cwd=effective_cwd, env=jest_env, timeout=total_timeout, check=False, text=True, capture_output=True
)
result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
# Combine stderr into stdout for timing markers
stdout = result.stdout or ""
if result.stderr:
stdout = stdout + "\n" + result.stderr if stdout else result.stderr
if effective_validation_runs == 1:
result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
# Create result with combined stdout
result = subprocess.CompletedProcess(args=result.args, returncode=result.returncode, stdout=stdout, stderr="")
if result.returncode != 0:
logger.debug(f"Jest benchmarking failed with return code {result.returncode}")
logger.debug(f"Jest benchmarking stdout: {result.stdout}")
logger.debug(f"Jest benchmarking stderr: {result.stderr}")
stdout = result.stdout or ""
if result.stderr:
stdout = stdout + "\n" + result.stderr if stdout else result.stderr
result = subprocess.CompletedProcess(
args=result.args, returncode=result.returncode, stdout=stdout, stderr=""
)
if result.returncode != 0:
logger.debug(f"Jest benchmarking failed with return code {result.returncode}")
logger.debug(f"Jest benchmarking stdout: {result.stdout}")
logger.debug(f"Jest benchmarking stderr: {result.stderr}")
else:
# Multi-run validation for React: run N times, concatenate output with boundary markers
logger.debug(
f"Running {effective_validation_runs} validation runs for React render count stability"
)
combined_stdout_parts: list[str] = []
last_returncode = 0
last_args = jest_cmd
for run_idx in range(effective_validation_runs):
run_result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
run_stdout = run_result.stdout or ""
if run_result.stderr:
run_stdout = run_stdout + "\n" + run_result.stderr if run_stdout else run_result.stderr
combined_stdout_parts.append(run_stdout)
# Add boundary marker between runs (not after the last one)
if run_idx < effective_validation_runs - 1:
combined_stdout_parts.append(
"\n!######REACT_VALIDATION_RUN_BOUNDARY######!\n"
)
last_returncode = run_result.returncode
last_args = run_result.args
if run_result.returncode != 0:
logger.debug(
f"Jest benchmarking run {run_idx + 1}/{effective_validation_runs} "
f"failed with return code {run_result.returncode}"
)
logger.debug(
f"Validation run {run_idx + 1}/{effective_validation_runs} complete"
)
combined_stdout = "".join(combined_stdout_parts)
result = subprocess.CompletedProcess(
args=last_args, returncode=last_returncode, stdout=combined_stdout, stderr=""
)
except subprocess.TimeoutExpired:
logger.warning(f"Jest benchmarking timed out after {total_timeout}s")

View file

@ -404,6 +404,8 @@ class OptimizedCandidateResult(BaseModel):
render_profiles: Optional[list[Any]] = None
dom_mutations: Optional[list[Any]] = None
interaction_durations: Optional[list[Any]] = None
interaction_render_profiles: Optional[list[Any]] = None
render_count_confidence: str = "high"
class GeneratedTests(BaseModel):
@ -640,6 +642,8 @@ class OriginalCodeBaseline(BaseModel):
render_profiles: Optional[list[Any]] = None
dom_mutations: Optional[list[Any]] = None
interaction_durations: Optional[list[Any]] = None
interaction_render_profiles: Optional[list[Any]] = None
render_count_confidence: str = "high"
class CoverageStatus(Enum):

View file

@ -575,7 +575,7 @@ class FunctionOptimizer:
if not self.is_react_component or not test_results.perf_stdout:
return None
try:
from codeflash.languages.javascript.parse import parse_react_render_markers
from codeflash.languages.javascript.parse import parse_react_render_markers # noqa: PLC0415
profiles = parse_react_render_markers(test_results.perf_stdout)
if profiles:
@ -589,6 +589,26 @@ class FunctionOptimizer:
logger.debug("Failed to parse React render markers", exc_info=True)
return None
def compute_render_count_confidence(self, test_results: TestResults) -> str:
"""Compute render count confidence from multi-run validation output.
Splits stdout by validation run boundaries and compares render counts
across runs. Returns "high" if counts are stable (within ±1 per component),
"low" if any component's count varies by >= 2.
"""
if not self.is_react_component or not test_results.perf_stdout:
return "high"
try:
from codeflash.languages.javascript.frameworks.react.benchmarking import validate_render_count_stability # noqa: PLC0415
from codeflash.languages.javascript.parse import parse_per_run_render_profiles # noqa: PLC0415
per_run_profiles = parse_per_run_render_profiles(test_results.perf_stdout)
if len(per_run_profiles) <= 1:
return "high"
return validate_render_count_stability(per_run_profiles)
except Exception:
logger.debug("Failed to compute render count confidence", exc_info=True)
return "high"
def parse_dom_mutations_from_results(self, test_results: TestResults) -> list | None:
"""Parse DOM mutation markers from test stdout."""
if not self.is_react_component or not test_results.perf_stdout:
@ -620,6 +640,21 @@ class FunctionOptimizer:
logger.debug("Failed to parse interaction duration markers", exc_info=True)
return None
def parse_interaction_render_profiles_from_results(self, test_results: TestResults) -> list | None:
"""Parse per-interaction render count markers from test stdout."""
if not self.is_react_component or not test_results.perf_stdout:
return None
try:
from codeflash.languages.javascript.parse import parse_interaction_render_markers # noqa: PLC0415
profiles = parse_interaction_render_markers(test_results.perf_stdout)
if profiles:
logger.debug(f"Parsed {len(profiles)} per-interaction render profiles from test output")
return profiles
except Exception:
logger.debug("Failed to parse interaction render markers", exc_info=True)
return None
def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]:
should_run_experiment = self.experiment_id is not None
logger.info(f"!lsp|Function Trace ID: {self.function_trace_id}")
@ -1022,6 +1057,8 @@ class FunctionOptimizer:
target_component_name=self.function_to_optimize.function_name,
original_interaction_durations=original_code_baseline.interaction_durations,
optimized_interaction_durations=candidate_result.interaction_durations,
original_interaction_renders=original_code_baseline.interaction_render_profiles,
optimized_interaction_renders=candidate_result.interaction_render_profiles,
)
best_optimization = BestOptimization(
@ -1228,6 +1265,10 @@ class FunctionOptimizer:
eval_ctx.record_successful_candidate(candidate.optimization_id, candidate_result.best_test_runtime, perf_gain)
# Check if this is a successful optimization
low_confidence = (
original_code_baseline.render_count_confidence == "low"
or candidate_result.render_count_confidence == "low"
)
is_successful_opt = speedup_critic(
candidate_result,
original_code_baseline.runtime,
@ -1237,6 +1278,7 @@ class FunctionOptimizer:
original_concurrency_metrics=original_code_baseline.concurrency_metrics,
best_concurrency_ratio_until_now=None,
original_render_profiles=original_code_baseline.render_profiles,
render_count_low_confidence=low_confidence,
) and quantity_of_tests_critic(candidate_result)
tree = self.build_runtime_info_tree(
@ -1974,6 +2016,27 @@ class FunctionOptimizer:
f"[REACT-TESTGEN] {len(tests_without_interactions)} tests still lack interactions after retries"
)
# Check interaction density across all perf tests — if total interaction calls
# are below the minimum, preemptively flag low confidence.
from codeflash.languages.javascript.frameworks.react.testgen import ( # noqa: PLC0415
MIN_INTERACTION_CALLS,
count_interaction_calls,
)
total_interactions = sum(
count_interaction_calls(t.instrumented_perf_test_source) for t in tests
)
if total_interactions < MIN_INTERACTION_CALLS:
logger.error(
"[REACT-TESTGEN] Total interaction calls across all perf tests: %d (minimum %d). "
"Render count confidence will be set to low.",
total_interactions,
MIN_INTERACTION_CALLS,
)
self.insufficient_test_interactions = True
else:
self.insufficient_test_interactions = False
if not tests:
logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
@ -2619,6 +2682,7 @@ class FunctionOptimizer:
enable_coverage=False,
code_context=code_context,
is_react_component=self.is_react_component,
n_validation_runs=3 if self.is_react_component else 1,
)
logger.debug(f"[BENCHMARK-DONE] Got {len(benchmarking_results.test_results)} benchmark results")
finally:
@ -2629,10 +2693,41 @@ class FunctionOptimizer:
self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
)
# Parse React render profiles, DOM mutations, and interaction durations from performance test stdout
# Parse React render profiles, DOM mutations, interaction durations, and per-interaction renders
original_render_profiles = self.parse_render_profiles_from_results(benchmarking_results)
original_dom_mutations = self.parse_dom_mutations_from_results(benchmarking_results)
original_interaction_durations = self.parse_interaction_durations_from_results(benchmarking_results)
original_interaction_renders = self.parse_interaction_render_profiles_from_results(benchmarking_results)
original_render_confidence = self.compute_render_count_confidence(benchmarking_results)
# Validate that baseline render profiles contain update-phase markers.
# Tests that only produce mount-phase markers cannot measure optimization effectiveness.
if self.is_react_component and original_render_profiles:
has_update_phase = any(p.phase == "update" for p in original_render_profiles)
if not has_update_phase:
logger.error(
"[REACT] Baseline render profiles contain zero update-phase markers. "
"Perf tests may lack interactions — render-based acceptance will require 30%% threshold."
)
original_render_confidence = "low"
# Propagate insufficient interaction count from testgen phase
if self.is_react_component and getattr(self, "insufficient_test_interactions", False):
original_render_confidence = "low"
# Warn if the component uses layout APIs that jsdom cannot measure
if self.is_react_component:
try:
source = self.function_to_optimize.file_path.read_text("utf-8")
from codeflash.languages.javascript.frameworks.react.discovery import needs_real_layout # noqa: PLC0415
if needs_real_layout(source):
logger.warning(
"[REACT] Component uses layout APIs (virtualization, getBoundingClientRect, etc.) "
"— jsdom benchmarks may be inaccurate. Playwright support is planned."
)
except Exception:
logger.debug("Failed to check layout API usage", exc_info=True)
console.print(
TestResults.report_to_tree(
@ -2702,6 +2797,8 @@ class FunctionOptimizer:
render_profiles=original_render_profiles,
dom_mutations=original_dom_mutations,
interaction_durations=original_interaction_durations,
interaction_render_profiles=original_interaction_renders,
render_count_confidence=original_render_confidence,
),
functions_to_remove,
)
@ -2891,6 +2988,7 @@ class FunctionOptimizer:
testing_time=total_looping_time,
enable_coverage=False,
is_react_component=self.is_react_component,
n_validation_runs=3 if self.is_react_component else 1,
)
finally:
self.restore_source_after_profiler(pre_profiler_source)
@ -2901,10 +2999,21 @@ class FunctionOptimizer:
candidate_fto_code, candidate_helper_code, self.function_to_optimize.file_path
)
# Parse React render profiles, DOM mutations, and interaction durations from candidate performance test stdout
# Parse React render profiles, DOM mutations, interaction durations, and per-interaction renders
candidate_render_profiles = self.parse_render_profiles_from_results(candidate_benchmarking_results)
candidate_dom_mutations = self.parse_dom_mutations_from_results(candidate_benchmarking_results)
candidate_interaction_durations = self.parse_interaction_durations_from_results(candidate_benchmarking_results)
candidate_interaction_renders = self.parse_interaction_render_profiles_from_results(candidate_benchmarking_results)
candidate_render_confidence = self.compute_render_count_confidence(candidate_benchmarking_results)
if self.is_react_component and candidate_render_profiles:
has_update_phase = any(p.phase == "update" for p in candidate_render_profiles)
if not has_update_phase:
logger.error(
"[REACT] Candidate render profiles contain zero update-phase markers. "
"Render-based acceptance will require 30%% threshold."
)
candidate_render_confidence = "low"
# Use effective_loop_count which represents the minimum number of timing samples
# across all test cases. This is more accurate for JavaScript tests where
# capturePerf does internal looping with potentially different iteration counts per test.
@ -2958,6 +3067,8 @@ class FunctionOptimizer:
render_profiles=candidate_render_profiles,
dom_mutations=candidate_dom_mutations,
interaction_durations=candidate_interaction_durations,
interaction_render_profiles=candidate_interaction_renders,
render_count_confidence=candidate_render_confidence,
)
)
@ -2975,6 +3086,7 @@ class FunctionOptimizer:
code_context: CodeOptimizationContext | None = None,
line_profiler_output_file: Path | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[TestResults | dict, CoverageData | None]:
coverage_database_file = None
coverage_config_file = None
@ -3017,6 +3129,7 @@ class FunctionOptimizer:
test_framework=self.test_cfg.test_framework,
js_project_root=self.test_cfg.js_project_root,
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
else:
msg = f"Unexpected testing type: {testing_type}"

View file

@ -73,6 +73,7 @@ def speedup_critic(
original_concurrency_metrics: ConcurrencyMetrics | None = None,
best_concurrency_ratio_until_now: float | None = None,
original_render_profiles: list | None = None,
render_count_low_confidence: bool = False,
) -> bool:
"""Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user.
@ -131,6 +132,7 @@ def speedup_critic(
original_interaction_duration_ms=benchmark.original_interaction_duration_ms,
optimized_interaction_duration_ms=benchmark.optimized_interaction_duration_ms,
trust_duration=False,
low_confidence=render_count_low_confidence,
)
throughput_improved = True # Default to True if no throughput data
@ -312,6 +314,7 @@ def render_efficiency_critic(
original_interaction_duration_ms: float = 0.0,
optimized_interaction_duration_ms: float = 0.0,
trust_duration: bool = True,
low_confidence: bool = False,
) -> bool:
"""Evaluate whether a React optimization reduces re-renders, render time, or DOM mutations sufficiently.
@ -322,8 +325,12 @@ def render_efficiency_critic(
When ``trust_duration`` is False (e.g. jsdom where actualDuration is noise),
render duration is excluded from the acceptance criteria.
When ``low_confidence`` is True (render counts varied across validation
runs), the render count reduction threshold is raised from 20% to 30%
to reduce false positives from measurement noise.
Accepts if:
- Update render count reduced by >= 20% (primary), OR total render count reduced by >= 20% (fallback)
- Update render count reduced by >= threshold (primary), OR total render count reduced by >= threshold (fallback)
- OR render duration reduced by >= MIN_IMPROVEMENT_THRESHOLD (when trust_duration=True)
- OR DOM mutations reduced by >= 20%
- OR child component render reduction >= MIN_CHILD_RENDER_REDUCTION (captures useCallback/memo optimizations)
@ -333,16 +340,38 @@ def render_efficiency_critic(
if original_render_count == 0 and original_dom_mutations == 0 and child_render_reduction == 0:
return False
# Use update-phase counts as primary signal when available
# Use update-phase counts as primary signal when available.
# When the ONLY signal is mount-phase render count (no update-phase data, no DOM mutations,
# no child reduction, no interaction data), we cannot meaningfully evaluate the optimization.
# Mount count reductions are not a valid React optimization signal — memoization optimizations
# often *increase* mount cost while reducing update-phase renders.
# When update-phase data exists, ONLY use it for render count acceptance —
# total count (which includes mount) dilutes the signal.
has_update_data = original_update_render_count > 0 or optimized_update_render_count > 0
effective_orig_count = original_update_render_count if has_update_data else original_render_count
effective_opt_count = optimized_update_render_count if has_update_data else optimized_render_count
has_dom_signal = original_dom_mutations > 0
has_child_signal = child_render_reduction > 0
has_interaction_signal = original_interaction_duration_ms > 0
# Check render count reduction
if not has_update_data and not has_dom_signal and not has_child_signal and not has_interaction_signal:
return False
# Check render count reduction (higher threshold when confidence is low)
render_count_threshold = 0.30 if low_confidence else MIN_RENDER_COUNT_REDUCTION_PCT
count_improved = False
if effective_orig_count > 0:
count_reduction = (effective_orig_count - effective_opt_count) / effective_orig_count
count_improved = count_reduction >= MIN_RENDER_COUNT_REDUCTION_PCT
if has_update_data:
# Primary: update-phase only — do NOT fall through to total count
if original_update_render_count > 0:
count_reduction = (
(original_update_render_count - optimized_update_render_count) / original_update_render_count
)
count_improved = count_reduction >= render_count_threshold
elif original_render_count > 0:
# Fallback: total count when zero update-phase data exists
count_reduction = (original_render_count - optimized_render_count) / original_render_count
count_improved = count_reduction >= render_count_threshold
# Determine effective counts for best-candidate tracking
effective_opt_count = optimized_update_render_count if has_update_data else optimized_render_count
# Check render duration reduction (prefer update-phase duration)
# Skipped when trust_duration=False (jsdom actualDuration is noise)
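A worked example of the threshold change (counts invented): dropping from 40 to 30 update-phase renders is a 25% reduction, i.e. (40 - 30) / 40 = 0.25. With stable measurements that clears the 20% MIN_RENDER_COUNT_REDUCTION_PCT bar; with low_confidence=True the bar rises to 30%, so the same candidate is accepted only if another signal (render duration when trusted, DOM mutations, or child render reduction) clears its own threshold.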

View file

@ -329,6 +329,7 @@ def run_benchmarking_tests(
pytest_max_loops: int = 100_000,
js_project_root: Path | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, subprocess.CompletedProcess]:
logger.debug(f"run_benchmarking_tests called: framework={test_framework}, num_files={len(test_paths.test_files)}")
# Check if there's a language support for this test framework that implements run_benchmarking_tests
@ -344,6 +345,7 @@ def run_benchmarking_tests(
max_loops=pytest_max_loops,
target_duration_seconds=pytest_target_runtime_seconds,
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
if is_python(): # pytest runs both pytest and unittest tests
pytest_cmd_list = (

View file

@ -1114,6 +1114,7 @@ function captureRenderPerf(funcName, lineId, renderFn, Component, ...createEleme
const React = _getReact();
let renderCount = 0;
let renderCountAtLastBoundary = 0;
function onRender(id, phase, actualDuration, baseDuration) {
renderCount++;
console.log(`!######REACT_RENDER:${funcName}:${phase}:${actualDuration}:${baseDuration}:${renderCount}######!`);
@ -1155,6 +1156,17 @@ function captureRenderPerf(funcName, lineId, renderFn, Component, ...createEleme
};
}
// Per-interaction render tracking: records renders since last boundary
// and emits a REACT_INTERACTION_RENDERS marker for A/B comparison.
if (result) {
result._codeflashMarkInteraction = (label) => {
const rendersSinceLast = renderCount - renderCountAtLastBoundary;
console.log(`!######REACT_INTERACTION_RENDERS:${funcName}:${label}:${rendersSinceLast}######!`);
renderCountAtLastBoundary = renderCount;
return rendersSinceLast;
};
}
return Promise.resolve(result);
}
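A worked example of the boundary arithmetic (renderCount values invented): the boundary starts at 0, so if renderCount is 1 when the first marker fires, that marker reports 1 (the mount render) and moves the boundary to 1; if renderCount is 6 at the next marker, it reports 5, the renders accumulated between the two marks. Each REACT_INTERACTION_RENDERS line therefore carries only the renders since the previous marker.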

View file

@ -153,6 +153,8 @@ class TestRenderEfficiencyCritic:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=48,
optimized_update_render_count=8,
) is True
def test_rejects_insignificant_reduction(self):
@ -169,6 +171,8 @@ class TestRenderEfficiencyCritic:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=10.0,
original_update_render_count=8,
optimized_update_render_count=8,
) is True
def test_rejects_worse_than_best(self):
@ -187,6 +191,8 @@ class TestRenderEfficiencyCritic:
original_render_duration=100.0,
optimized_render_duration=10.0,
best_render_count_until_now=5,
original_update_render_count=48,
optimized_update_render_count=1,
) is True
def test_uses_update_phase_counts_when_available(self):
@ -202,8 +208,8 @@ class TestRenderEfficiencyCritic:
optimized_update_duration=10.0,
) is True
def test_falls_back_to_total_when_no_update_data(self):
# No update-phase data → uses total counts
def test_rejects_mount_only_when_no_secondary_signals(self):
# No update-phase data AND no DOM/child/interaction signals → rejected
assert render_efficiency_critic(
original_render_count=50,
optimized_render_count=10,
@ -211,6 +217,19 @@ class TestRenderEfficiencyCritic:
optimized_render_duration=100.0,
original_update_render_count=0,
optimized_update_render_count=0,
) is False
def test_falls_back_to_total_with_dom_signal(self):
# No update-phase data but DOM mutations present → uses total counts
assert render_efficiency_critic(
original_render_count=50,
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=0,
optimized_update_render_count=0,
original_dom_mutations=100,
optimized_dom_mutations=20,
) is True
@ -427,6 +446,8 @@ class TestRenderEfficiencyCriticTrustDuration:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=10.0,
original_update_render_count=8,
optimized_update_render_count=8,
trust_duration=True,
) is True
@ -437,6 +458,8 @@ class TestRenderEfficiencyCriticTrustDuration:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=48,
optimized_update_render_count=8,
trust_duration=False,
) is True

View file

@ -47,7 +47,7 @@ class TestPostProcessReactTests:
assert result.count("@testing-library/react") == 1
def test_adds_user_event_for_click(self):
source = "import { render } from '@testing-library/react';\ntest('clicks button', () => { click(button); });"
source = "import { render } from '@testing-library/react';\ntest('clicks button', () => { userEvent.click(button); });"
result = post_process_react_tests(source, _make_info())
assert "@testing-library/user-event" in result