fix: pin PYTHONHASHSEED=0 in test env and enhance diff diagnostics

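Set PYTHONHASHSEED=0 in the test subprocess environment, unless that environment already defines PYTHONHASHSEED, so that original and candidate runs use identical hash behavior; this eliminates one source of non-deterministic return-value comparisons. Also improve diff diagnostics: the RETURN_VALUE diff log is raised from debug to info level and now reports the actual value types and truncated repr values, and new info-level [DIFF] log lines are added for DID_PASS and STDOUT diffs.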
2026-04-10 06:38:08 -05:00 · 2026-04-10 06:38:08 -05:00 · 986654b7e6
commit 986654b7e6
parent e191f74aa6
2 changed files with 24 additions and 7 deletions
--- a/codeflash/languages/function_optimizer.py
+++ b/codeflash/languages/function_optimizer.py
@ -3253,6 +3253,11 @@ class FunctionOptimizer:
        test_env["CODEFLASH_TEST_ITERATION"] = str(codeflash_test_iteration)
        test_env["CODEFLASH_TRACER_DISABLE"] = str(codeflash_tracer_disable)
        test_env["CODEFLASH_LOOP_INDEX"] = str(codeflash_loop_index)
+        # Pin PYTHONHASHSEED (unless already set) so original and candidate test processes use the same
+        # hash seed. Otherwise each subprocess gets a random seed, so str/bytes hashes, and with them set
+        # iteration order, can differ between runs and lead to flaky return-value comparisons.
+        if "PYTHONHASHSEED" not in test_env:
+            test_env["PYTHONHASHSEED"] = "0"
        return test_env

    def line_profiler_step(
--- a/codeflash/verification/equivalence.py
+++ b/codeflash/verification/equivalence.py
@ -111,6 +111,11 @@ def compare_test_results(
                    original_pytest_error=original_pytest_error,
                )
            )
+            logger.info(
+                f"[DIFF] scope=DID_PASS test_id={test_id} "
+                f"orig_pass={original_test_result.did_pass} cand_pass={cdd_test_result.did_pass} "
+                f"test_type={original_test_result.test_type} cand_error={cdd_pytest_error[:200] if cdd_pytest_error else 'none'}"
+            )

        elif not pass_fail_only and not comparator(
            original_test_result.return_value, cdd_test_result.return_value, superset_obj=superset_obj
@ -129,13 +134,15 @@ def compare_test_results(
            )

            try:
-                logger.debug(
-                    f"File Name: {original_test_result.file_name}\n"
-                    f"Test Type: {original_test_result.test_type}\n"
-                    f"Verification Type: {original_test_result.verification_type}\n"
-                    f"Invocation ID: {original_test_result.id}\n"
-                    f"Original return value: {original_test_result.return_value}\n"
-                    f"Candidate return value: {cdd_test_result.return_value}\n"
+                _orig_rv = original_test_result.return_value
+                _cand_rv = cdd_test_result.return_value
+                logger.info(
+                    f"[DIFF] scope=RETURN_VALUE test_id={test_id} "
+                    f"orig_type={type(_orig_rv).__name__} cand_type={type(_cand_rv).__name__} "
+                    f"orig_pass={original_test_result.did_pass} cand_pass={cdd_test_result.did_pass} "
+                    f"test_type={original_test_result.test_type} "
+                    f"orig_repr={safe_repr(_orig_rv)[:200]} "
+                    f"cand_repr={safe_repr(_cand_rv)[:200]}"
                )
            except Exception as e:
                logger.error(e)
@ -156,6 +163,11 @@ def compare_test_results(
                    original_pytest_error=original_pytest_error,
                )
            )
+            logger.info(
+                f"[DIFF] scope=STDOUT test_id={test_id} "
+                f"orig_stdout={str(original_test_result.stdout)[:200]} "
+                f"cand_stdout={str(cdd_test_result.stdout)[:200]}"
+            )

    sys.setrecursionlimit(original_recursion_limit)
    logger.info(