react changes for interactive pattern

Sarthak Agarwal 2026-03-18 03:27:35 +05:30
parent 9df446b72f
commit 4bc89f2b9d
13 changed files with 584 additions and 26 deletions

View file

@ -15,7 +15,12 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from codeflash.languages.javascript.parse import DomMutationProfile, InteractionDurationProfile, RenderProfile
from codeflash.languages.javascript.parse import (
DomMutationProfile,
InteractionDurationProfile,
InteractionRenderProfile,
RenderProfile,
)
logger = logging.getLogger(__name__)
@ -52,6 +57,29 @@ def _aggregate_avg_duration(profiles: list[RenderProfile]) -> float:
return sum(p.actual_duration_ms for p in profiles) / len(profiles)
@dataclass(frozen=True)
class InteractionComparison:
"""Per-interaction render count comparison."""
interaction_label: str
original_render_count: int
optimized_render_count: int
@property
def reduction_pct(self) -> float:
if self.original_render_count == 0:
return 0.0
return (
(self.original_render_count - self.optimized_render_count)
/ self.original_render_count
* 100
)
@property
def improved(self) -> bool:
return self.optimized_render_count < self.original_render_count
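For example, an interaction whose render count drops from 10 to 4 after optimization (values invented for illustration) reports a 60% reduction:

ic = InteractionComparison(interaction_label="click_1", original_render_count=10, optimized_render_count=4)
ic.reduction_pct  # (10 - 4) / 10 * 100 == 60.0
ic.improved       # True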
@dataclass(frozen=True)
class RenderBenchmark:
"""Comparison of original vs optimized render metrics.
@ -83,6 +111,12 @@ class RenderBenchmark:
optimized_interaction_duration_ms: float = 0.0
original_burst_count: int = 0
optimized_burst_count: int = 0
# Per-interaction render comparisons
per_interaction_comparisons: tuple[InteractionComparison, ...] = ()
@property
def has_per_interaction_data(self) -> bool:
return len(self.per_interaction_comparisons) > 0
@property
def render_count_reduction_pct(self) -> float:
@ -161,6 +195,81 @@ class RenderBenchmark:
return self.original_interaction_duration_ms > 0 or self.optimized_interaction_duration_ms > 0
def validate_render_count_stability(runs: list[list[RenderProfile]]) -> str:
"""Compare render counts across multiple runs to assess measurement confidence.
Args:
runs: List of render profile lists, one per validation run.
Returns:
"high" if counts are identical across all runs,
"low" if any component's render count varies by >= 2 across runs.
Falls back to "high" if there's only 1 run or no profiles.
"""
if len(runs) <= 1:
return "high"
# Group by component across runs: {component_name: [max_render_count_per_run]}
per_component_counts: dict[str, list[int]] = {}
for run_profiles in runs:
by_comp = _group_by_component(run_profiles)
seen_components = set()
for comp_name, profiles in by_comp.items():
seen_components.add(comp_name)
count = _aggregate_render_count(profiles)
per_component_counts.setdefault(comp_name, []).append(count)
# Components not seen in this run get 0
for comp_name in per_component_counts:
if comp_name not in seen_components:
per_component_counts[comp_name].append(0)
for comp_name, counts in per_component_counts.items():
spread = max(counts) - min(counts)
if spread >= 2:
logger.warning(
"[REACT] Unstable render count for %s across %d runs: %s (spread=%d)",
comp_name,
len(runs),
counts,
spread,
)
return "low"
if spread == 1:
logger.info(
"[REACT] Minor render count variance for %s across %d runs: %s (±1)",
comp_name,
len(runs),
counts,
)
return "high"
def _build_interaction_comparisons(
original_profiles: list[InteractionRenderProfile],
optimized_profiles: list[InteractionRenderProfile],
) -> tuple[InteractionComparison, ...]:
"""Build per-interaction render comparisons from original and optimized profiles."""
orig_by_label: dict[str, int] = {}
for p in original_profiles:
orig_by_label[p.interaction_label] = orig_by_label.get(p.interaction_label, 0) + p.render_count
opt_by_label: dict[str, int] = {}
for p in optimized_profiles:
opt_by_label[p.interaction_label] = opt_by_label.get(p.interaction_label, 0) + p.render_count
all_labels = list(dict.fromkeys(list(orig_by_label.keys()) + list(opt_by_label.keys())))
comparisons: list[InteractionComparison] = []
for label in all_labels:
comparisons.append(
InteractionComparison(
interaction_label=label,
original_render_count=orig_by_label.get(label, 0),
optimized_render_count=opt_by_label.get(label, 0),
)
)
return tuple(comparisons)
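A sketch of the comparison construction with invented profiles; the InteractionRenderProfile fields match the dataclass added to parse.py in this commit.

from codeflash.languages.javascript.parse import InteractionRenderProfile

orig = [
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_1", render_count=5),
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_2", render_count=5),
]
opt = [
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_1", render_count=1),
    InteractionRenderProfile(component_name="TodoList", interaction_label="click_2", render_count=5),
]

for ic in _build_interaction_comparisons(orig, opt):
    print(ic.interaction_label, ic.reduction_pct, ic.improved)
# click_1 80.0 True
# click_2 0.0 False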
def compare_render_benchmarks(
original_profiles: list[RenderProfile],
optimized_profiles: list[RenderProfile],
@ -169,6 +278,8 @@ def compare_render_benchmarks(
target_component_name: str | None = None,
original_interaction_durations: list[InteractionDurationProfile] | None = None,
optimized_interaction_durations: list[InteractionDurationProfile] | None = None,
original_interaction_renders: list[InteractionRenderProfile] | None = None,
optimized_interaction_renders: list[InteractionRenderProfile] | None = None,
) -> RenderBenchmark | None:
"""Compare original and optimized render profiles with phase awareness.
@ -252,6 +363,13 @@ def compare_render_benchmarks(
)
opt_bursts = max((d.burst_count for d in optimized_interaction_durations), default=0)
# Build per-interaction render comparisons
interaction_comparisons: tuple[InteractionComparison, ...] = ()
if original_interaction_renders and optimized_interaction_renders:
interaction_comparisons = _build_interaction_comparisons(
original_interaction_renders, optimized_interaction_renders
)
return RenderBenchmark(
component_name=component_name,
original_render_count=orig_count,
@ -271,6 +389,7 @@ def compare_render_benchmarks(
optimized_interaction_duration_ms=opt_interaction_ms,
original_burst_count=orig_bursts,
optimized_burst_count=opt_bursts,
per_interaction_comparisons=interaction_comparisons,
)
@ -325,4 +444,18 @@ def format_render_benchmark_for_pr(benchmark: RenderBenchmark) -> str:
if benchmark.render_speedup_x > 1:
lines.append(f"\nRender time improved **{benchmark.render_speedup_x:.1f}x**.")
# Per-interaction breakdown table
if benchmark.has_per_interaction_data:
lines.append("")
lines.append("#### Per-Interaction Breakdown")
lines.append("")
lines.append("| Interaction | Before | After | Change |")
lines.append("|-------------|--------|-------|--------|")
for ic in benchmark.per_interaction_comparisons:
change = f"{ic.reduction_pct:.1f}% fewer" if ic.improved else "no change"
lines.append(
f"| {ic.interaction_label} | {ic.original_render_count} renders "
f"| {ic.optimized_render_count} renders | {change} |"
)
return "\n".join(lines)

View file

@ -236,6 +236,37 @@ def _extract_props_type(func: FunctionNode, source: str, analyzer: TreeSitterAna
return None
# Virtualization library imports that require real layout for meaningful benchmarks
_VIRTUALIZATION_IMPORTS = re.compile(
r"""(?:from|import)\s+['"](?:"""
r"react-window|react-virtuoso|react-virtual|@tanstack/react-virtual"
r"|react-virtualized|@tanstack/virtual-core"
r""")['"]""",
)
# Layout APIs that return zeros in jsdom
_LAYOUT_API_USAGE = re.compile(
r"\b(?:getBoundingClientRect|offsetWidth|offsetHeight|clientWidth|clientHeight"
r"|scrollTop|scrollHeight|scrollWidth|scrollLeft"
r"|IntersectionObserver|ResizeObserver)\b"
)
def needs_real_layout(source: str) -> bool:
"""Detect whether a component depends on real layout APIs unavailable in jsdom.
Returns True if the source imports virtualization libraries or uses layout
measurement APIs (getBoundingClientRect, offsetWidth, IntersectionObserver, etc.)
that return zeros/stubs in jsdom.
When True, jsdom-based render benchmarks may be inaccurate. Callers should
log a warning; Playwright support is deferred.
"""
if _VIRTUALIZATION_IMPORTS.search(source):
return True
return bool(_LAYOUT_API_USAGE.search(source))
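Rough examples of sources that trip each branch (snippets invented; only the matched tokens matter to the regexes above):

virtualized = "import { FixedSizeList } from 'react-window';"
measuring = "const rect = ref.current.getBoundingClientRect();"
plain = "export function Counter({ value }) { return <span>{value}</span>; }"

needs_real_layout(virtualized)  # True, via _VIRTUALIZATION_IMPORTS
needs_real_layout(measuring)    # True, via _LAYOUT_API_USAGE
needs_real_layout(plain)        # False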
def _is_wrapped_in_memo(func: FunctionNode, source: str) -> bool:
"""Check if the component is already wrapped in React.memo or memo()."""
# Check if the variable declaration wrapping this function uses memo()

View file

@ -164,6 +164,10 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
count=1,
)
# Auto-inject per-interaction render tracking markers around fireEvent/userEvent calls.
# This gives per-interaction A/B signal without the LLM needing to know about it.
result = inject_interaction_markers(result)
# Warn if no tests contain interaction calls — mount-phase-only markers are
# not useful for measuring optimization effectiveness.
if not has_react_test_interactions(result):
@ -173,6 +177,18 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
component_info.function_name,
)
# Check interaction density — fewer than MIN_INTERACTION_CALLS total interactions
# means the test is unlikely to produce enough update-phase renders for reliable measurement.
interaction_count = count_interaction_calls(result)
if interaction_count < MIN_INTERACTION_CALLS:
logger.error(
"[REACT] Generated tests for %s have only %d interaction calls (minimum %d). "
"Render count measurement will have low confidence.",
component_info.function_name,
interaction_count,
MIN_INTERACTION_CALLS,
)
# Warn if tests lack high-density interaction patterns (loops or 3+ sequential calls)
if not has_high_density_interactions(result):
logger.warning(
@ -184,6 +200,75 @@ def post_process_react_tests(test_source: str, component_info: ReactComponentInf
return result
# Pattern to find the variable assigned from captureRenderPerf (await or sync)
# Matches: const result = await codeflash.captureRenderPerf(...)
# const { container } = await codeflash.captureRenderPerf(...)
# let result = codeflash.captureRenderPerf(...)
_CAPTURE_RENDER_RESULT_PATTERN = re.compile(
r"(?:const|let|var)\s+(?:\{[^}]+\}|(\w+))\s*=\s*(?:await\s+)?(?:\w+\.)?captureRenderPerf\(",
)
# Pattern matching fireEvent.* or userEvent.* standalone calls (not in comments)
_INTERACTION_CALL_PATTERN = re.compile(
r"^(\s*)((?:await\s+)?(?:fireEvent\.\w+|userEvent\.\w+)\s*\([^)]*\))\s*;",
re.MULTILINE,
)
def _extract_interaction_label(call_text: str) -> str:
"""Extract a short label from an interaction call, e.g. 'click' from 'fireEvent.click(...)'."""
m = re.search(r"(?:fireEvent|userEvent)\.(\w+)", call_text)
return m.group(1) if m else "interaction"
def inject_interaction_markers(test_source: str) -> str:
"""Inject _codeflashMarkInteraction() calls before each fireEvent/userEvent call.
Only injects when captureRenderPerf is used (the result object has the method).
Assigns a label derived from the interaction type (click, change, type, etc.)
and a sequential counter for uniqueness.
"""
if "captureRenderPerf" not in test_source:
return test_source
# Find the result variable name from captureRenderPerf assignment
# Support both: const result = ... and const { container, ...rest } = ...
result_var = None
capture_match = _CAPTURE_RENDER_RESULT_PATTERN.search(test_source)
if capture_match:
# Group 1 is the simple variable name; for destructuring we need a different approach
result_var = capture_match.group(1)
if not result_var:
# First match was destructured; fall back to any simple-variable captureRenderPerf assignment
destr_match = re.search(
r"(?:const|let|var)\s+(\w+)\s*=\s*(?:await\s+)?(?:\w+\.)?captureRenderPerf\(",
test_source,
)
if destr_match:
result_var = destr_match.group(1)
if not result_var:
# Can't determine result variable — skip injection
return test_source
# Find all interaction calls and inject marker before each
interaction_counter: dict[str, int] = {}
lines = test_source.split("\n")
new_lines: list[str] = []
for line in lines:
m = _INTERACTION_CALL_PATTERN.match(line)
if m:
indent = m.group(1)
call_text = m.group(2)
label = _extract_interaction_label(call_text)
interaction_counter[label] = interaction_counter.get(label, 0) + 1
unique_label = f"{label}_{interaction_counter[label]}"
marker_line = f"{indent}{result_var}._codeflashMarkInteraction('{unique_label}');"
new_lines.append(marker_line)
new_lines.append(line)
return "\n".join(new_lines)
# Patterns that indicate a test triggers user interactions causing re-renders
_INTERACTION_PATTERNS = re.compile(
r"fireEvent\.|userEvent\.|\.rerender\(|rerender\(|act\("
@ -200,6 +285,24 @@ def has_react_test_interactions(test_source: str) -> bool:
return bool(_INTERACTION_PATTERNS.search(test_source))
# Minimum interaction calls for reliable render count measurement
MIN_INTERACTION_CALLS = 3
# Pattern matching individual interaction calls (fireEvent.*, userEvent.*, .rerender(), rerender())
_INTERACTION_CALL_COUNT_PATTERN = re.compile(
r"(?:fireEvent\.\w+|userEvent\.\w+|\.rerender\(|(?<!\.)rerender\()\s*\(",
)
def count_interaction_calls(test_source: str) -> int:
"""Count the number of interaction calls in a test source.
Counts fireEvent.*, userEvent.*, and rerender() calls. Used to assess
whether tests produce enough update-phase renders for reliable measurement.
"""
return len(_INTERACTION_CALL_COUNT_PATTERN.findall(test_source))
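A small example, assuming the pattern above counts rerender calls as its comment describes. The count is static, so a loop counts once; has_high_density_interactions (below) is what credits loops.

sample = (
    "for (let i = 0; i < 5; i++) { fireEvent.click(btn); }\n"
    "await userEvent.type(input, 'hello');\n"
    "result.rerender(<TodoList items={next} />);\n"
)
count_interaction_calls(sample)  # 3, which just meets MIN_INTERACTION_CALLS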
# Patterns for loops containing interaction calls
_LOOP_WITH_INTERACTION = re.compile(
r"for\s*\([^)]*\)\s*\{[^}]*(?:fireEvent\.|userEvent\.|rerender\()",

View file

@ -37,6 +37,9 @@ jest_end_pattern = re.compile(r"!######([^:]+):([^:]+):([^:]+):([^:]+):([^:]+):(
# Format: !######REACT_RENDER:{component}:{phase}:{actualDuration}:{baseDuration}:{renderCount}######!
REACT_RENDER_MARKER_PATTERN = re.compile(r"!######REACT_RENDER:([^:]+):([^:]+):([^:]+):([^:]+):(\d+)######!")
# Validation run boundary marker (separates output from multiple validation runs)
REACT_VALIDATION_RUN_BOUNDARY = "!######REACT_VALIDATION_RUN_BOUNDARY######!"
# DOM mutation marker pattern
# Format: !######DOM_MUTATIONS:{component}:{mutationCount}######!
DOM_MUTATION_MARKER_PATTERN = re.compile(r"!######DOM_MUTATIONS:([^:]+):(\d+)######!")
@ -45,6 +48,10 @@ DOM_MUTATION_MARKER_PATTERN = re.compile(r"!######DOM_MUTATIONS:([^:]+):(\d+)###
# Format: !######REACT_INTERACTION_DURATION:{component}:{durationMs}:{burstCount}######!
REACT_INTERACTION_DURATION_PATTERN = re.compile(r"!######REACT_INTERACTION_DURATION:([^:]+):([^:]+):(\d+)######!")
# Per-interaction render count marker pattern
# Format: !######REACT_INTERACTION_RENDERS:{component}:{label}:{renderCount}######!
REACT_INTERACTION_RENDERS_PATTERN = re.compile(r"!######REACT_INTERACTION_RENDERS:([^:]+):([^:]+):(\d+)######!")
@dataclass(frozen=True)
class RenderProfile:
@ -147,6 +154,50 @@ def parse_interaction_duration_markers(stdout: str) -> list[InteractionDurationP
return profiles
@dataclass(frozen=True)
class InteractionRenderProfile:
"""Per-interaction render count from a single boundary marker."""
component_name: str
interaction_label: str
render_count: int
def parse_interaction_render_markers(stdout: str) -> list[InteractionRenderProfile]:
"""Parse per-interaction render count markers from test output.
Returns a list of InteractionRenderProfile instances, one per marker found.
"""
profiles: list[InteractionRenderProfile] = []
for match in REACT_INTERACTION_RENDERS_PATTERN.finditer(stdout):
try:
profiles.append(
InteractionRenderProfile(
component_name=match.group(1),
interaction_label=match.group(2),
render_count=int(match.group(3)),
)
)
except (ValueError, IndexError) as e:
logger.debug("Failed to parse interaction render marker: %s", e)
return profiles
def parse_per_run_render_profiles(stdout: str) -> list[list[RenderProfile]]:
"""Split multi-run stdout by boundary markers and parse render profiles per run.
When ``n_validation_runs > 1``, the test runner inserts
``REACT_VALIDATION_RUN_BOUNDARY`` markers between runs. This function
splits on those boundaries and parses each segment independently.
Returns a list of render profile lists (one per validation run).
If no boundary markers are found, returns a single-element list with
the profiles from the entire stdout.
"""
segments = stdout.split(REACT_VALIDATION_RUN_BOUNDARY)
return [parse_react_render_markers(segment) for segment in segments]
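A sketch of the per-interaction marker round trip; the marker lines are invented but follow the format printed by _codeflashMarkInteraction in the Jest helper.

stdout = (
    "!######REACT_INTERACTION_RENDERS:TodoList:click_1:5######!\n"
    "!######REACT_INTERACTION_RENDERS:TodoList:click_2:5######!\n"
)
parse_interaction_render_markers(stdout)
# [InteractionRenderProfile(component_name='TodoList', interaction_label='click_1', render_count=5),
#  InteractionRenderProfile(component_name='TodoList', interaction_label='click_2', render_count=5)]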
def _extract_jest_console_output(suite_elem: Any) -> str:
"""Extract console output from Jest's JUnit XML system-out element.

View file

@ -2432,6 +2432,7 @@ class JavaScriptSupport:
target_duration_seconds: float = 10.0,
test_framework: str | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, Any]:
"""Run benchmarking tests using the detected test framework.
@ -2482,6 +2483,7 @@ class JavaScriptSupport:
max_loops=max_loops,
target_duration_ms=int(target_duration_seconds * 1000),
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
def run_line_profile_tests(

View file

@ -1064,6 +1064,7 @@ def run_jest_benchmarking_tests(
target_duration_ms: int = 10_000, # 10 seconds for benchmarking tests
stability_check: bool = True,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, subprocess.CompletedProcess[str]]:
"""Run Jest benchmarking tests with in-process session-level looping.
@ -1075,6 +1076,11 @@ def run_jest_benchmarking_tests(
- Timing data is collected per iteration
- Stability is checked within the runner
For React components with n_validation_runs > 1, runs the test suite
multiple times and concatenates all stdout. Each run's render markers
are separated by ``!######REACT_VALIDATION_RUN_BOUNDARY######!`` markers
so the caller can split and compare render counts across runs.
Args:
test_paths: TestFiles object containing test file information.
test_env: Environment variables for the test run.
@ -1085,6 +1091,10 @@ def run_jest_benchmarking_tests(
max_loops: Maximum number of loop iterations.
target_duration_ms: Target TOTAL duration in milliseconds for all loops.
stability_check: Whether to enable stability-based early stopping.
is_react_component: Whether the target is a React component.
n_validation_runs: Number of times to run the test suite for render
count validation (React only). Each run's output is concatenated
with boundary markers.
Returns:
Tuple of (result_file_path, subprocess_result with stdout from all iterations).
@ -1211,25 +1221,70 @@ def run_jest_benchmarking_tests(
f"target_duration={target_duration_ms}ms, stability_check={stability_check}"
)
# Determine effective number of validation runs (only >1 for React)
effective_validation_runs = n_validation_runs if is_react_component and n_validation_runs > 1 else 1
total_start_time = time.time()
try:
run_args = get_cross_platform_subprocess_run_args(
cwd=effective_cwd, env=jest_env, timeout=total_timeout, check=False, text=True, capture_output=True
)
result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
# Combine stderr into stdout for timing markers
stdout = result.stdout or ""
if result.stderr:
stdout = stdout + "\n" + result.stderr if stdout else result.stderr
if effective_validation_runs == 1:
result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
# Create result with combined stdout
result = subprocess.CompletedProcess(args=result.args, returncode=result.returncode, stdout=stdout, stderr="")
if result.returncode != 0:
logger.debug(f"Jest benchmarking failed with return code {result.returncode}")
logger.debug(f"Jest benchmarking stdout: {result.stdout}")
logger.debug(f"Jest benchmarking stderr: {result.stderr}")
stdout = result.stdout or ""
if result.stderr:
stdout = stdout + "\n" + result.stderr if stdout else result.stderr
result = subprocess.CompletedProcess(
args=result.args, returncode=result.returncode, stdout=stdout, stderr=""
)
if result.returncode != 0:
logger.debug(f"Jest benchmarking failed with return code {result.returncode}")
logger.debug(f"Jest benchmarking stdout: {result.stdout}")
logger.debug(f"Jest benchmarking stderr: {result.stderr}")
else:
# Multi-run validation for React: run N times, concatenate output with boundary markers
logger.debug(
f"Running {effective_validation_runs} validation runs for React render count stability"
)
combined_stdout_parts: list[str] = []
last_returncode = 0
last_args = jest_cmd
for run_idx in range(effective_validation_runs):
run_result = subprocess.run(jest_cmd, **run_args) # noqa: PLW1510
run_stdout = run_result.stdout or ""
if run_result.stderr:
run_stdout = run_stdout + "\n" + run_result.stderr if run_stdout else run_result.stderr
combined_stdout_parts.append(run_stdout)
# Add boundary marker between runs (not after the last one)
if run_idx < effective_validation_runs - 1:
combined_stdout_parts.append(
"\n!######REACT_VALIDATION_RUN_BOUNDARY######!\n"
)
last_returncode = run_result.returncode
last_args = run_result.args
if run_result.returncode != 0:
logger.debug(
f"Jest benchmarking run {run_idx + 1}/{effective_validation_runs} "
f"failed with return code {run_result.returncode}"
)
logger.debug(
f"Validation run {run_idx + 1}/{effective_validation_runs} complete"
)
combined_stdout = "".join(combined_stdout_parts)
result = subprocess.CompletedProcess(
args=last_args, returncode=last_returncode, stdout=combined_stdout, stderr=""
)
except subprocess.TimeoutExpired:
logger.warning(f"Jest benchmarking timed out after {total_timeout}s")

View file

@ -404,6 +404,8 @@ class OptimizedCandidateResult(BaseModel):
render_profiles: Optional[list[Any]] = None
dom_mutations: Optional[list[Any]] = None
interaction_durations: Optional[list[Any]] = None
interaction_render_profiles: Optional[list[Any]] = None
render_count_confidence: str = "high"
class GeneratedTests(BaseModel):
@ -640,6 +642,8 @@ class OriginalCodeBaseline(BaseModel):
render_profiles: Optional[list[Any]] = None
dom_mutations: Optional[list[Any]] = None
interaction_durations: Optional[list[Any]] = None
interaction_render_profiles: Optional[list[Any]] = None
render_count_confidence: str = "high"
class CoverageStatus(Enum):

View file

@ -575,7 +575,7 @@ class FunctionOptimizer:
if not self.is_react_component or not test_results.perf_stdout:
return None
try:
from codeflash.languages.javascript.parse import parse_react_render_markers
from codeflash.languages.javascript.parse import parse_react_render_markers # noqa: PLC0415
profiles = parse_react_render_markers(test_results.perf_stdout)
if profiles:
@ -589,6 +589,26 @@ class FunctionOptimizer:
logger.debug("Failed to parse React render markers", exc_info=True)
return None
def compute_render_count_confidence(self, test_results: TestResults) -> str:
"""Compute render count confidence from multi-run validation output.
Splits stdout by validation run boundaries and compares render counts
across runs. Returns "high" if counts are stable (within ±1 per component),
"low" if any component's count varies by >= 2.
"""
if not self.is_react_component or not test_results.perf_stdout:
return "high"
try:
from codeflash.languages.javascript.frameworks.react.benchmarking import validate_render_count_stability # noqa: PLC0415
from codeflash.languages.javascript.parse import parse_per_run_render_profiles # noqa: PLC0415
per_run_profiles = parse_per_run_render_profiles(test_results.perf_stdout)
if len(per_run_profiles) <= 1:
return "high"
return validate_render_count_stability(per_run_profiles)
except Exception:
logger.debug("Failed to compute render count confidence", exc_info=True)
return "high"
def parse_dom_mutations_from_results(self, test_results: TestResults) -> list | None:
"""Parse DOM mutation markers from test stdout."""
if not self.is_react_component or not test_results.perf_stdout:
@ -620,6 +640,21 @@ class FunctionOptimizer:
logger.debug("Failed to parse interaction duration markers", exc_info=True)
return None
def parse_interaction_render_profiles_from_results(self, test_results: TestResults) -> list | None:
"""Parse per-interaction render count markers from test stdout."""
if not self.is_react_component or not test_results.perf_stdout:
return None
try:
from codeflash.languages.javascript.parse import parse_interaction_render_markers # noqa: PLC0415
profiles = parse_interaction_render_markers(test_results.perf_stdout)
if profiles:
logger.debug(f"Parsed {len(profiles)} per-interaction render profiles from test output")
return profiles
except Exception:
logger.debug("Failed to parse interaction render markers", exc_info=True)
return None
def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]:
should_run_experiment = self.experiment_id is not None
logger.info(f"!lsp|Function Trace ID: {self.function_trace_id}")
@ -1022,6 +1057,8 @@ class FunctionOptimizer:
target_component_name=self.function_to_optimize.function_name,
original_interaction_durations=original_code_baseline.interaction_durations,
optimized_interaction_durations=candidate_result.interaction_durations,
original_interaction_renders=original_code_baseline.interaction_render_profiles,
optimized_interaction_renders=candidate_result.interaction_render_profiles,
)
best_optimization = BestOptimization(
@ -1228,6 +1265,10 @@ class FunctionOptimizer:
eval_ctx.record_successful_candidate(candidate.optimization_id, candidate_result.best_test_runtime, perf_gain)
# Check if this is a successful optimization
low_confidence = (
original_code_baseline.render_count_confidence == "low"
or candidate_result.render_count_confidence == "low"
)
is_successful_opt = speedup_critic(
candidate_result,
original_code_baseline.runtime,
@ -1237,6 +1278,7 @@ class FunctionOptimizer:
original_concurrency_metrics=original_code_baseline.concurrency_metrics,
best_concurrency_ratio_until_now=None,
original_render_profiles=original_code_baseline.render_profiles,
render_count_low_confidence=low_confidence,
) and quantity_of_tests_critic(candidate_result)
tree = self.build_runtime_info_tree(
@ -1974,6 +2016,27 @@ class FunctionOptimizer:
f"[REACT-TESTGEN] {len(tests_without_interactions)} tests still lack interactions after retries"
)
# Check interaction density across all perf tests — if total interaction calls
# are below the minimum, preemptively flag low confidence.
from codeflash.languages.javascript.frameworks.react.testgen import ( # noqa: PLC0415
MIN_INTERACTION_CALLS,
count_interaction_calls,
)
total_interactions = sum(
count_interaction_calls(t.instrumented_perf_test_source) for t in tests
)
if total_interactions < MIN_INTERACTION_CALLS:
logger.error(
"[REACT-TESTGEN] Total interaction calls across all perf tests: %d (minimum %d). "
"Render count confidence will be set to low.",
total_interactions,
MIN_INTERACTION_CALLS,
)
self.insufficient_test_interactions = True
else:
self.insufficient_test_interactions = False
if not tests:
logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
@ -2619,6 +2682,7 @@ class FunctionOptimizer:
enable_coverage=False,
code_context=code_context,
is_react_component=self.is_react_component,
n_validation_runs=3 if self.is_react_component else 1,
)
logger.debug(f"[BENCHMARK-DONE] Got {len(benchmarking_results.test_results)} benchmark results")
finally:
@ -2629,10 +2693,41 @@ class FunctionOptimizer:
self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
)
# Parse React render profiles, DOM mutations, and interaction durations from performance test stdout
# Parse React render profiles, DOM mutations, interaction durations, and per-interaction renders
original_render_profiles = self.parse_render_profiles_from_results(benchmarking_results)
original_dom_mutations = self.parse_dom_mutations_from_results(benchmarking_results)
original_interaction_durations = self.parse_interaction_durations_from_results(benchmarking_results)
original_interaction_renders = self.parse_interaction_render_profiles_from_results(benchmarking_results)
original_render_confidence = self.compute_render_count_confidence(benchmarking_results)
# Validate that baseline render profiles contain update-phase markers.
# Tests that only produce mount-phase markers cannot measure optimization effectiveness.
if self.is_react_component and original_render_profiles:
has_update_phase = any(p.phase == "update" for p in original_render_profiles)
if not has_update_phase:
logger.error(
"[REACT] Baseline render profiles contain zero update-phase markers. "
"Perf tests may lack interactions — render-based acceptance will require 30%% threshold."
)
original_render_confidence = "low"
# Propagate insufficient interaction count from testgen phase
if self.is_react_component and getattr(self, "insufficient_test_interactions", False):
original_render_confidence = "low"
# Warn if the component uses layout APIs that jsdom cannot measure
if self.is_react_component:
try:
source = self.function_to_optimize.file_path.read_text("utf-8")
from codeflash.languages.javascript.frameworks.react.discovery import needs_real_layout # noqa: PLC0415
if needs_real_layout(source):
logger.warning(
"[REACT] Component uses layout APIs (virtualization, getBoundingClientRect, etc.) "
"— jsdom benchmarks may be inaccurate. Playwright support is planned."
)
except Exception:
logger.debug("Failed to check layout API usage", exc_info=True)
console.print(
TestResults.report_to_tree(
@ -2702,6 +2797,8 @@ class FunctionOptimizer:
render_profiles=original_render_profiles,
dom_mutations=original_dom_mutations,
interaction_durations=original_interaction_durations,
interaction_render_profiles=original_interaction_renders,
render_count_confidence=original_render_confidence,
),
functions_to_remove,
)
@ -2891,6 +2988,7 @@ class FunctionOptimizer:
testing_time=total_looping_time,
enable_coverage=False,
is_react_component=self.is_react_component,
n_validation_runs=3 if self.is_react_component else 1,
)
finally:
self.restore_source_after_profiler(pre_profiler_source)
@ -2901,10 +2999,21 @@ class FunctionOptimizer:
candidate_fto_code, candidate_helper_code, self.function_to_optimize.file_path
)
# Parse React render profiles, DOM mutations, and interaction durations from candidate performance test stdout
# Parse React render profiles, DOM mutations, interaction durations, and per-interaction renders
candidate_render_profiles = self.parse_render_profiles_from_results(candidate_benchmarking_results)
candidate_dom_mutations = self.parse_dom_mutations_from_results(candidate_benchmarking_results)
candidate_interaction_durations = self.parse_interaction_durations_from_results(candidate_benchmarking_results)
candidate_interaction_renders = self.parse_interaction_render_profiles_from_results(candidate_benchmarking_results)
candidate_render_confidence = self.compute_render_count_confidence(candidate_benchmarking_results)
if self.is_react_component and candidate_render_profiles:
has_update_phase = any(p.phase == "update" for p in candidate_render_profiles)
if not has_update_phase:
logger.error(
"[REACT] Candidate render profiles contain zero update-phase markers. "
"Render-based acceptance will require 30%% threshold."
)
candidate_render_confidence = "low"
# Use effective_loop_count which represents the minimum number of timing samples
# across all test cases. This is more accurate for JavaScript tests where
# capturePerf does internal looping with potentially different iteration counts per test.
@ -2958,6 +3067,8 @@ class FunctionOptimizer:
render_profiles=candidate_render_profiles,
dom_mutations=candidate_dom_mutations,
interaction_durations=candidate_interaction_durations,
interaction_render_profiles=candidate_interaction_renders,
render_count_confidence=candidate_render_confidence,
)
)
@ -2975,6 +3086,7 @@ class FunctionOptimizer:
code_context: CodeOptimizationContext | None = None,
line_profiler_output_file: Path | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[TestResults | dict, CoverageData | None]:
coverage_database_file = None
coverage_config_file = None
@ -3017,6 +3129,7 @@ class FunctionOptimizer:
test_framework=self.test_cfg.test_framework,
js_project_root=self.test_cfg.js_project_root,
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
else:
msg = f"Unexpected testing type: {testing_type}"

View file

@ -73,6 +73,7 @@ def speedup_critic(
original_concurrency_metrics: ConcurrencyMetrics | None = None,
best_concurrency_ratio_until_now: float | None = None,
original_render_profiles: list | None = None,
render_count_low_confidence: bool = False,
) -> bool:
"""Take in a correct optimized Test Result and decide if the optimization should actually be surfaced to the user.
@ -131,6 +132,7 @@ def speedup_critic(
original_interaction_duration_ms=benchmark.original_interaction_duration_ms,
optimized_interaction_duration_ms=benchmark.optimized_interaction_duration_ms,
trust_duration=False,
low_confidence=render_count_low_confidence,
)
throughput_improved = True # Default to True if no throughput data
@ -312,6 +314,7 @@ def render_efficiency_critic(
original_interaction_duration_ms: float = 0.0,
optimized_interaction_duration_ms: float = 0.0,
trust_duration: bool = True,
low_confidence: bool = False,
) -> bool:
"""Evaluate whether a React optimization reduces re-renders, render time, or DOM mutations sufficiently.
@ -322,8 +325,12 @@ def render_efficiency_critic(
When ``trust_duration`` is False (e.g. jsdom where actualDuration is noise),
render duration is excluded from the acceptance criteria.
When ``low_confidence`` is True (render counts varied across validation
runs), the render count reduction threshold is raised from 20% to 30%
to reduce false positives from measurement noise.
Accepts if:
- Update render count reduced by >= 20% (primary), OR total render count reduced by >= 20% (fallback)
- Update render count reduced by >= threshold (primary), OR total render count reduced by >= threshold (fallback)
- OR render duration reduced by >= MIN_IMPROVEMENT_THRESHOLD (when trust_duration=True)
- OR DOM mutations reduced by >= 20%
- OR child component render reduction >= MIN_CHILD_RENDER_REDUCTION (captures useCallback/memo optimizations)
@ -333,16 +340,38 @@ def render_efficiency_critic(
if original_render_count == 0 and original_dom_mutations == 0 and child_render_reduction == 0:
return False
# Use update-phase counts as primary signal when available
# Use update-phase counts as primary signal when available.
# When the ONLY signal is mount-phase render count (no update-phase data, no DOM mutations,
# no child reduction, no interaction data), we cannot meaningfully evaluate the optimization.
# Mount count reductions are not a valid React optimization signal — memoization optimizations
# often *increase* mount cost while reducing update-phase renders.
# When update-phase data exists, ONLY use it for render count acceptance —
# total count (which includes mount) dilutes the signal.
has_update_data = original_update_render_count > 0 or optimized_update_render_count > 0
effective_orig_count = original_update_render_count if has_update_data else original_render_count
effective_opt_count = optimized_update_render_count if has_update_data else optimized_render_count
has_dom_signal = original_dom_mutations > 0
has_child_signal = child_render_reduction > 0
has_interaction_signal = original_interaction_duration_ms > 0
# Check render count reduction
if not has_update_data and not has_dom_signal and not has_child_signal and not has_interaction_signal:
return False
# Check render count reduction (higher threshold when confidence is low)
render_count_threshold = 0.30 if low_confidence else MIN_RENDER_COUNT_REDUCTION_PCT
count_improved = False
if effective_orig_count > 0:
count_reduction = (effective_orig_count - effective_opt_count) / effective_orig_count
count_improved = count_reduction >= MIN_RENDER_COUNT_REDUCTION_PCT
if has_update_data:
# Primary: update-phase only — do NOT fall through to total count
if original_update_render_count > 0:
count_reduction = (
(original_update_render_count - optimized_update_render_count) / original_update_render_count
)
count_improved = count_reduction >= render_count_threshold
elif original_render_count > 0:
# Fallback: total count when zero update-phase data exists
count_reduction = (original_render_count - optimized_render_count) / original_render_count
count_improved = count_reduction >= render_count_threshold
# Determine effective counts for best-candidate tracking
effective_opt_count = optimized_update_render_count if has_update_data else optimized_render_count
# Check render duration reduction (prefer update-phase duration)
# Skipped when trust_duration=False (jsdom actualDuration is noise)
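A worked example of the threshold change (counts invented): dropping from 40 to 30 update-phase renders is a 25% reduction, i.e. (40 - 30) / 40 = 0.25. With stable measurements that clears the 20% MIN_RENDER_COUNT_REDUCTION_PCT bar; with low_confidence=True the bar rises to 30%, so the same candidate is accepted only if another signal (render duration when trusted, DOM mutations, or child render reduction) clears its own threshold.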

View file

@ -329,6 +329,7 @@ def run_benchmarking_tests(
pytest_max_loops: int = 100_000,
js_project_root: Path | None = None,
is_react_component: bool = False,
n_validation_runs: int = 1,
) -> tuple[Path, subprocess.CompletedProcess]:
logger.debug(f"run_benchmarking_tests called: framework={test_framework}, num_files={len(test_paths.test_files)}")
# Check if there's a language support for this test framework that implements run_benchmarking_tests
@ -344,6 +345,7 @@ def run_benchmarking_tests(
max_loops=pytest_max_loops,
target_duration_seconds=pytest_target_runtime_seconds,
is_react_component=is_react_component,
n_validation_runs=n_validation_runs,
)
if is_python(): # pytest runs both pytest and unittest tests
pytest_cmd_list = (

View file

@ -1114,6 +1114,7 @@ function captureRenderPerf(funcName, lineId, renderFn, Component, ...createEleme
const React = _getReact();
let renderCount = 0;
let renderCountAtLastBoundary = 0;
function onRender(id, phase, actualDuration, baseDuration) {
renderCount++;
console.log(`!######REACT_RENDER:${funcName}:${phase}:${actualDuration}:${baseDuration}:${renderCount}######!`);
@ -1155,6 +1156,17 @@ function captureRenderPerf(funcName, lineId, renderFn, Component, ...createEleme
};
}
// Per-interaction render tracking: records renders since last boundary
// and emits a REACT_INTERACTION_RENDERS marker for A/B comparison.
if (result) {
result._codeflashMarkInteraction = (label) => {
const rendersSinceLast = renderCount - renderCountAtLastBoundary;
console.log(`!######REACT_INTERACTION_RENDERS:${funcName}:${label}:${rendersSinceLast}######!`);
renderCountAtLastBoundary = renderCount;
return rendersSinceLast;
};
}
return Promise.resolve(result);
}
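A worked example of the boundary arithmetic (renderCount values invented): the boundary starts at 0, so if renderCount is 1 when the first marker fires, that marker reports 1 (the mount render) and moves the boundary to 1; if renderCount is 6 at the next marker, it reports 5, the renders accumulated between the two marks. Each REACT_INTERACTION_RENDERS line therefore carries only the renders since the previous marker.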

View file

@ -153,6 +153,8 @@ class TestRenderEfficiencyCritic:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=48,
optimized_update_render_count=8,
) is True
def test_rejects_insignificant_reduction(self):
@ -169,6 +171,8 @@ class TestRenderEfficiencyCritic:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=10.0,
original_update_render_count=8,
optimized_update_render_count=8,
) is True
def test_rejects_worse_than_best(self):
@ -187,6 +191,8 @@ class TestRenderEfficiencyCritic:
original_render_duration=100.0,
optimized_render_duration=10.0,
best_render_count_until_now=5,
original_update_render_count=48,
optimized_update_render_count=1,
) is True
def test_uses_update_phase_counts_when_available(self):
@ -202,8 +208,8 @@ class TestRenderEfficiencyCritic:
optimized_update_duration=10.0,
) is True
def test_falls_back_to_total_when_no_update_data(self):
# No update-phase data → uses total counts
def test_rejects_mount_only_when_no_secondary_signals(self):
# No update-phase data AND no DOM/child/interaction signals → rejected
assert render_efficiency_critic(
original_render_count=50,
optimized_render_count=10,
@ -211,6 +217,19 @@ class TestRenderEfficiencyCritic:
optimized_render_duration=100.0,
original_update_render_count=0,
optimized_update_render_count=0,
) is False
def test_falls_back_to_total_with_dom_signal(self):
# No update-phase data but DOM mutations present → uses total counts
assert render_efficiency_critic(
original_render_count=50,
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=0,
optimized_update_render_count=0,
original_dom_mutations=100,
optimized_dom_mutations=20,
) is True
@ -427,6 +446,8 @@ class TestRenderEfficiencyCriticTrustDuration:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=10.0,
original_update_render_count=8,
optimized_update_render_count=8,
trust_duration=True,
) is True
@ -437,6 +458,8 @@ class TestRenderEfficiencyCriticTrustDuration:
optimized_render_count=10,
original_render_duration=100.0,
optimized_render_duration=100.0,
original_update_render_count=48,
optimized_update_render_count=8,
trust_duration=False,
) is True

View file

@ -47,7 +47,7 @@ class TestPostProcessReactTests:
assert result.count("@testing-library/react") == 1
def test_adds_user_event_for_click(self):
source = "import { render } from '@testing-library/react';\ntest('clicks button', () => { click(button); });"
source = "import { render } from '@testing-library/react';\ntest('clicks button', () => { userEvent.click(button); });"
result = post_process_react_tests(source, _make_info())
assert "@testing-library/user-event" in result