From 1e0956e60f0ded4183000b7b696bbbd7af4d28aa Mon Sep 17 00:00:00 2001
From: RD <92499101+iusedmyimagination@users.noreply.github.com>
Date: Fri, 17 Jan 2025 17:44:24 -0800
Subject: [PATCH] Bayesian analysis implementation
---
.idea/inspectionProfiles/Project_Default.xml | 7 +-
.../verification/bayesian_analysis.py | 112 +++++++++++++++
.../verification/statistical_analysis.py | 59 --------
cli/experiments/statistical_analysis.py | 131 ++++++++++++++++++
.../tests/test_statistical_analysis.py | 4 +-
cli/pyproject.toml | 2 +-
cli/tests/test_bayesian_analysis.py | 79 +++++++++++
mypy.ini | 11 +-
8 files changed, 341 insertions(+), 64 deletions(-)
create mode 100644 cli/codeflash/verification/bayesian_analysis.py
delete mode 100644 cli/codeflash/verification/statistical_analysis.py
create mode 100644 cli/experiments/statistical_analysis.py
rename cli/{ => experiments}/tests/test_statistical_analysis.py (92%)
create mode 100644 cli/tests/test_bayesian_analysis.py
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
index d30a543e2..e39bbc66c 100644
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -3,7 +3,7 @@
-
+
@@ -20,5 +20,10 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/cli/codeflash/verification/bayesian_analysis.py b/cli/codeflash/verification/bayesian_analysis.py
new file mode 100644
index 000000000..41ec1f972
--- /dev/null
+++ b/cli/codeflash/verification/bayesian_analysis.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numba as nb
+import numpy as np
+
+if TYPE_CHECKING:
+ import numpy.typing as npt
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bayesian_bootstrap_runtime_means(
+ runtimes: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ """Bayesian bootstrap for the mean of the runtimes list.
+
+ Returns an array of shape (bootstrap_size,) with draws from the posterior of the mean.
+ We draw random weights from Dirichlet(1,1,...,1) using the rngs random generators
+ (one per computation thread), and compute the weighted mean.
+ """
+ num_timings = len(runtimes)
+ np_runtimes = np.array(runtimes).astype(np.float64)
+ draws = np.empty(bootstrap_size, dtype=np.float64)
+
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for thread_id in nb.prange(num_threads):
+ thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+ for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+ # Dirichlet(1,...,1) is the normalized Gamma(1,1) distribution
+ weights = rngs[thread_id].gamma(1.0, 1.0, size=num_timings)
+ thread_draws[bootstrap_id] = np.dot(np_runtimes, weights / np.sum(weights))
+ return draws
+
+
+def compute_function_runtime_posterior_means(
+ function_runtime_data: list[list[int]], bootstrap_size: int
+) -> list[npt.NDArray[np.float64]]:
+ """For each list of runtimes associated to a function input, do a Bayesian bootstrap to get a posterior of the mean.
+
+ Returns an array of shape (bootstrap_size,) for each function input.
+ """
+ rng = np.random.default_rng()
+ return [
+ bayesian_bootstrap_runtime_means(input_runtime_data, tuple(rng.spawn(nb.get_num_threads())), bootstrap_size)
+ for input_runtime_data in function_runtime_data
+ ]
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_combined_function_input_runtime_means(
+ posterior_means: list[npt.NDArray[np.float64]], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ """Given a function, we have posterior draws for each input, and get an overall expected time across these inputs.
+
+ We make random draws from each input's distribution using the rngs random generators (one per computation thread),
+ and compute their arithmetic mean.
+ Returns an array of shape (bootstrap_size,).
+ """
+ num_inputs = len(posterior_means)
+ num_input_means = max([len(posterior_mean) for posterior_mean in posterior_means])
+ draws = np.empty(bootstrap_size, dtype=np.float64)
+
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for thread_id in nb.prange(num_threads):
+ thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+ for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+ thread_draws[bootstrap_id] = (
+ sum([input_means[rngs[thread_id].integers(0, num_input_means)] for input_means in posterior_means])
+ / num_inputs
+ )
+ return draws
+
+
+def compute_statistics(distribution: npt.NDArray[np.float64], gamma: float = 0.95) -> dict[str, np.float64]:
+ lower_p = (1.0 - gamma) / 2 * 100
+ return {
+ "median": np.median(distribution),
+ "credible_interval_lower_bound": np.percentile(distribution, lower_p),
+ "credible_interval_upper_bound": np.percentile(distribution, 100 - lower_p),
+ }
+
+
+def analyse_function_runtime_data(
+ function_runtime_data: list[list[int]], bootstrap_size: int
+) -> tuple[npt.NDArray[np.float64], dict[str, np.float64]]:
+ rng = np.random.default_rng()
+ function_runtime_distribution = bootstrap_combined_function_input_runtime_means(
+ compute_function_runtime_posterior_means(function_runtime_data, bootstrap_size),
+ tuple(rng.spawn(nb.get_num_threads())),
+ bootstrap_size,
+ )
+ return function_runtime_distribution, compute_statistics(function_runtime_distribution)
+
+
+def compare_function_runtime_distributions(
+ function1_runtime_distribution: npt.NDArray[np.float64], function2_runtime_distribution: npt.NDArray[np.float64]
+) -> tuple[dict[str, np.float64], np.float64]:
+ speedup_distribution = function1_runtime_distribution / function2_runtime_distribution
+ return compute_statistics(speedup_distribution), np.mean(speedup_distribution > 1)
diff --git a/cli/codeflash/verification/statistical_analysis.py b/cli/codeflash/verification/statistical_analysis.py
deleted file mode 100644
index a39041a46..000000000
--- a/cli/codeflash/verification/statistical_analysis.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import annotations
-
-import math
-from typing import TYPE_CHECKING
-
-import numpy as np
-from numba import get_num_threads, njit, prange
-
-if TYPE_CHECKING:
- import numpy.typing as npt
-
-TWO_SIGMA = 2
-
-
-@njit(parallel=True, fastmath=True, cache=True)
-def bootstrap_minima(
- series: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
-) -> npt.NDArray[np.int64]:
- num_threads = len(rngs)
- series_size = len(series)
- npseries = np.array(series)
- thread_remainder = bootstrap_size % num_threads
- num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
- [1] * thread_remainder + [0] * (num_threads - thread_remainder)
- )
- minima = np.empty(bootstrap_size)
- thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
-
- for i in prange(num_threads):
- thread_minima = minima[thread_idx[i] : thread_idx[i + 1]]
- for k in range(num_bootstraps_per_thread[i]):
- thread_minima[k] = min(npseries[rngs[i].integers(0, series_size, size=series_size)])
- return minima
-
-
-def bootstrap_noise_floor(series: list[int], bootstrap_size: int) -> np.float64:
- rng = np.random.default_rng()
- return np.std(bootstrap_minima(series, tuple(rng.spawn(get_num_threads())), bootstrap_size))
-
-
-def combined_series_noise_floor(series1: list[int], series2: list[int], bootstrap_size: int) -> np.float64:
- noise_floor1 = bootstrap_noise_floor(series1, bootstrap_size)
- noise_floor2 = bootstrap_noise_floor(series2, bootstrap_size)
- return math.sqrt(noise_floor1 * noise_floor1 + noise_floor2 * noise_floor2)
-
-
-def series2_faster_95_confidence(
- series1: list[int], series2: list[int], bootstrap_size: int
-) -> tuple[float, float] | None:
- min1 = min(series1)
- min_diff = min1 - min(series2)
- if min_diff <= 0:
- return None
- combined_noise_floor = combined_series_noise_floor(series1, series2, bootstrap_size)
- percent_diff = 100 * min_diff / min1
- uncertainty = TWO_SIGMA * 100 * combined_noise_floor / min1
- if combined_noise_floor == 0 or min_diff / combined_noise_floor > TWO_SIGMA:
- return percent_diff, uncertainty
- return None
diff --git a/cli/experiments/statistical_analysis.py b/cli/experiments/statistical_analysis.py
new file mode 100644
index 000000000..2f17de52c
--- /dev/null
+++ b/cli/experiments/statistical_analysis.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import math
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numba import get_num_threads, njit, prange
+
+if TYPE_CHECKING:
+ import numpy.typing as npt
+
+TWO_SIGMA = 2
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_minima(
+ series: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.int64]:
+ num_threads = len(rngs)
+ series_size = len(series)
+ npseries = np.array(series)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ minima = np.empty(bootstrap_size)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_minima = minima[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ thread_minima[k] = min(npseries[rngs[i].integers(0, series_size, size=series_size)])
+ return minima
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_minima_ratios(
+ series1: list[int], series2: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ num_threads = len(rngs)
+ series1_size = len(series1)
+ series2_size = len(series2)
+ npseries1 = np.array(series1)
+ npseries2 = np.array(series2)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ minima_ratios = np.empty(bootstrap_size, dtype=np.float64)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_minima_ratios = minima_ratios[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ min2 = min(npseries2[rngs[i].integers(0, series2_size, size=series2_size)])
+ if min2 == 0:
+ thread_minima_ratios[k] = np.inf
+ else:
+ thread_minima_ratios[k] = min(npseries1[rngs[i].integers(0, series1_size, size=series1_size)]) / min2
+ return minima_ratios
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_ratios_geomean(
+ ratio_series: list[npt.NDArray[np.float64]], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ num_series = len(ratio_series)
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ combined_ratios = np.empty(bootstrap_size, dtype=np.float64)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_combined_ratios = combined_ratios[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ sum_log = 0.0
+ for series in ratio_series:
+ ratio = series[rngs[i].integers(0, len(series))]
+ if ratio <= 0:
+ ratio = 1e-12
+ sum_log += np.log(ratio)
+ thread_combined_ratios[k] = np.exp(sum_log / num_series)
+ return combined_ratios
+
+
+def bootstrap_noise_floor(series: list[int], bootstrap_size: int) -> np.float64:
+ rng = np.random.default_rng()
+ return np.std(bootstrap_minima(series, tuple(rng.spawn(get_num_threads())), bootstrap_size))
+
+
+def combined_series_noise_floor(series1: list[int], series2: list[int], bootstrap_size: int) -> np.float64:
+ noise_floor1 = bootstrap_noise_floor(series1, bootstrap_size)
+ noise_floor2 = bootstrap_noise_floor(series2, bootstrap_size)
+ return math.sqrt(noise_floor1 * noise_floor1 + noise_floor2 * noise_floor2)
+
+
+def series2_faster_95_confidence(
+ series1: list[int], series2: list[int], bootstrap_size: int
+) -> tuple[float, float] | None:
+ min1 = min(series1)
+ min_diff = min1 - min(series2)
+ if min_diff <= 0:
+ return None
+ combined_noise_floor = combined_series_noise_floor(series1, series2, bootstrap_size)
+ percent_diff = 100 * min_diff / min1
+ uncertainty = TWO_SIGMA * 100 * combined_noise_floor / min1
+ if combined_noise_floor == 0 or min_diff / combined_noise_floor > TWO_SIGMA:
+ return percent_diff, uncertainty
+ return None
+
+
+def analyze_series_speedup(
+ multi_series1: list[list[int]], multi_series2: list[list[int]], bootstrap_size: int
+) -> tuple[np.float64, np.float64, np.float64, np.float64]:
+ rng = np.random.default_rng()
+ combined_ratios = bootstrap_ratios_geomean(
+ [
+ bootstrap_minima_ratios(series1, series2, tuple(rng.spawn(get_num_threads())), bootstrap_size)
+ for series1, series2 in zip(multi_series1, multi_series2)
+ ],
+ tuple(rng.spawn(get_num_threads())),
+ bootstrap_size,
+ )
+ lower_bound_95_confidence = np.percentile(combined_ratios, 2.5)
+ upper_bound_95_confidence = np.percentile(combined_ratios, 97.5)
+ mean = np.mean(combined_ratios)
+ probability_faster = np.mean(combined_ratios > 1.0)
+ return lower_bound_95_confidence, upper_bound_95_confidence, mean, probability_faster
diff --git a/cli/tests/test_statistical_analysis.py b/cli/experiments/tests/test_statistical_analysis.py
similarity index 92%
rename from cli/tests/test_statistical_analysis.py
rename to cli/experiments/tests/test_statistical_analysis.py
index 7d8abf572..9370f159f 100644
--- a/cli/tests/test_statistical_analysis.py
+++ b/cli/experiments/tests/test_statistical_analysis.py
@@ -14,5 +14,5 @@ def test_compare_timing_series() -> None:
optimized_timing_series = create_timing_series(10000, 1800, 48)
result = series2_faster_95_confidence(original_timing_series, optimized_timing_series, 10000)
assert result is not None
- assert 5 < result[0] < 15
- assert result[1] < 3
+ assert 4 < result[0] < 16
+ assert result[1] < 4
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
index 0b98357c6..b5e5164e9 100644
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -139,7 +139,7 @@ install_types = true
plugins = ["pydantic.mypy"]
[[tool.mypy.overrides]]
-module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes"]
+module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes", "numba"]
ignore_missing_imports = true
[tool.pydantic-mypy]
diff --git a/cli/tests/test_bayesian_analysis.py b/cli/tests/test_bayesian_analysis.py
new file mode 100644
index 000000000..aec61faf3
--- /dev/null
+++ b/cli/tests/test_bayesian_analysis.py
@@ -0,0 +1,79 @@
+import numpy as np
+from codeflash.verification.bayesian_analysis import (
+ analyse_function_runtime_data,
+ compare_function_runtime_distributions,
+)
+
+
+def test_bayesian_analysis() -> None:
+ functions = ["orig", "opt1", "opt2"] # original + 2 optimization candidates
+ inputs = ["inpA", "inpB", "inpC"] # 3 benchmarks
+ n = 5000 # repeated measurements per (function, input)
+
+ # Let's simulate some data in a dictionary:
+ # data[(fn, inp)] = array of shape (N,) with raw runtimes
+ rng = np.random.default_rng(42)
+ data = {}
+ for fn in functions:
+ for inp in inputs:
+ # We'll do random times with some big outliers for demonstration
+ base_time = 1.0
+ if fn == "orig":
+ factor = 1.0
+ elif fn == "opt1":
+ factor = 0.85 # 15% faster
+ else: # opt2
+ factor = 0.95 # 5% faster
+ # We'll also vary by input
+ if inp == "inpA":
+ factor_inp = 1.0
+ elif inp == "inpB":
+ factor_inp = 1.2
+ else: # inpC
+ factor_inp = 0.9
+
+ # final mean time = base_time * factor * factor_inp
+ mu = base_time * factor * factor_inp
+ # add noise, outliers
+ times = mu + rng.normal(0, 0.2 * mu, size=n)
+ times = np.clip(times, 0, None) # no negative times
+ # add some random big spikes
+ if rng.random() < 0.1:
+ times[rng.integers(0, n)] *= 5.0
+ data[(fn, inp)] = times
+
+ orig = [list(data[("orig", i)]) for i in inputs]
+ opt1 = [list(data[("opt1", i)]) for i in inputs]
+ opt2 = [list(data[("opt2", i)]) for i in inputs]
+
+ original_distribution, original_stats = analyse_function_runtime_data(orig, 10000)
+ optimized_distribution1, optimized_stats1 = analyse_function_runtime_data(opt1, 10000)
+ optimized_distribution2, optimized_stats2 = analyse_function_runtime_data(opt2, 10000)
+
+ speedup_stats1, faster_prob1 = compare_function_runtime_distributions(
+ original_distribution, optimized_distribution1
+ )
+ assert (
+ 1.162
+ < speedup_stats1["credible_interval_lower_bound"]
+ < 1.165
+ < speedup_stats1["median"]
+ < 1.171
+ < speedup_stats1["credible_interval_upper_bound"]
+ < 1.174
+ )
+ assert faster_prob1 == 1.0
+
+ speedup_stats2, faster_prob2 = compare_function_runtime_distributions(
+ original_distribution, optimized_distribution2
+ )
+ assert (
+ 1.046
+ < speedup_stats2["credible_interval_lower_bound"]
+ < 1.051
+ < speedup_stats2["median"]
+ < 1.054
+ < speedup_stats2["credible_interval_upper_bound"]
+ < 1.057
+ )
+ assert faster_prob2 == 1.0
diff --git a/mypy.ini b/mypy.ini
index 4dc81fe3b..afc4607c3 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -16,4 +16,13 @@ install_types = True
init_forbid_extra = True
init_typed = True
warn_required_dynamic_aliases = True
-warn_untyped_fields = True
\ No newline at end of file
+warn_untyped_fields = True
+
+[mypy-jedi.*]
+ignore_missing_imports = True
+
+[mypy-inquirer.*]
+ignore_missing_imports = True
+
+[mypy-numba.*]
+ignore_missing_imports = True
\ No newline at end of file