From 1e0956e60f0ded4183000b7b696bbbd7af4d28aa Mon Sep 17 00:00:00 2001
From: RD <92499101+iusedmyimagination@users.noreply.github.com>
Date: Fri, 17 Jan 2025 17:44:24 -0800
Subject: [PATCH] Bayesian analysis implementation
---
.idea/inspectionProfiles/Project_Default.xml | 7 +-
.../verification/bayesian_analysis.py | 112 +++++++++++++++
.../verification/statistical_analysis.py | 59 --------
cli/experiments/statistical_analysis.py | 131 ++++++++++++++++++
.../tests/test_statistical_analysis.py | 4 +-
cli/pyproject.toml | 2 +-
cli/tests/test_bayesian_analysis.py | 79 +++++++++++
mypy.ini | 11 +-
8 files changed, 341 insertions(+), 64 deletions(-)
create mode 100644 cli/codeflash/verification/bayesian_analysis.py
delete mode 100644 cli/codeflash/verification/statistical_analysis.py
create mode 100644 cli/experiments/statistical_analysis.py
rename cli/{ => experiments}/tests/test_statistical_analysis.py (92%)
create mode 100644 cli/tests/test_bayesian_analysis.py
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
index d30a543e2..e39bbc66c 100644
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -3,7 +3,7 @@
-
+
@@ -20,5 +20,10 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/cli/codeflash/verification/bayesian_analysis.py b/cli/codeflash/verification/bayesian_analysis.py
new file mode 100644
index 000000000..41ec1f972
--- /dev/null
+++ b/cli/codeflash/verification/bayesian_analysis.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numba as nb
+import numpy as np
+
+if TYPE_CHECKING:
+ import numpy.typing as npt
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bayesian_bootstrap_runtime_means(
+ runtimes: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ """Bayesian bootstrap for the mean of the runtimes list.
+
+ Returns an array of shape (bootstrap_size,) with draws from the posterior of the mean.
+ We draw random weights from Dirichlet(1,1,...,1) using the rngs random generators
+ (one per computation thread), and compute the weighted mean.
+ """
+ num_timings = len(runtimes)
+ np_runtimes = np.array(runtimes).astype(np.float64)
+ draws = np.empty(bootstrap_size, dtype=np.float64)
+
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for thread_id in nb.prange(num_threads):
+ thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+ for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+ # Dirichlet(1,...,1) is the normalized Gamma(1,1) distribution
+ weights = rngs[thread_id].gamma(1.0, 1.0, size=num_timings)
+ thread_draws[bootstrap_id] = np.dot(np_runtimes, weights / np.sum(weights))
+ return draws
+
+
+def compute_function_runtime_posterior_means(
+ function_runtime_data: list[list[int]], bootstrap_size: int
+) -> list[npt.NDArray[np.float64]]:
+ """For each list of runtimes associated to a function input, do a Bayesian bootstrap to get a posterior of the mean.
+
+ Returns an array of shape (bootstrap_size,) for each function input.
+ """
+ rng = np.random.default_rng()
+ return [
+ bayesian_bootstrap_runtime_means(input_runtime_data, tuple(rng.spawn(nb.get_num_threads())), bootstrap_size)
+ for input_runtime_data in function_runtime_data
+ ]
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_combined_function_input_runtime_means(
+ posterior_means: list[npt.NDArray[np.float64]], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ """Given a function, we have posterior draws for each input, and get an overall expected time across these inputs.
+
+ We make random draws from each input's distribution using the rngs random generators (one per computation thread),
+ and compute their arithmetic mean.
+ Returns an array of shape (bootstrap_size,).
+ """
+ num_inputs = len(posterior_means)
+ num_input_means = max([len(posterior_mean) for posterior_mean in posterior_means])
+ draws = np.empty(bootstrap_size, dtype=np.float64)
+
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for thread_id in nb.prange(num_threads):
+ thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+ for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+ thread_draws[bootstrap_id] = (
+ sum([input_means[rngs[thread_id].integers(0, num_input_means)] for input_means in posterior_means])
+ / num_inputs
+ )
+ return draws
+
+
+def compute_statistics(distribution: npt.NDArray[np.float64], gamma: float = 0.95) -> dict[str, np.float64]:
+ lower_p = (1.0 - gamma) / 2 * 100
+ return {
+ "median": np.median(distribution),
+ "credible_interval_lower_bound": np.percentile(distribution, lower_p),
+ "credible_interval_upper_bound": np.percentile(distribution, 100 - lower_p),
+ }
+
+
+def analyse_function_runtime_data(
+ function_runtime_data: list[list[int]], bootstrap_size: int
+) -> tuple[npt.NDArray[np.float64], dict[str, np.float64]]:
+ rng = np.random.default_rng()
+ function_runtime_distribution = bootstrap_combined_function_input_runtime_means(
+ compute_function_runtime_posterior_means(function_runtime_data, bootstrap_size),
+ tuple(rng.spawn(nb.get_num_threads())),
+ bootstrap_size,
+ )
+ return function_runtime_distribution, compute_statistics(function_runtime_distribution)
+
+
+def compare_function_runtime_distributions(
+ function1_runtime_distribution: npt.NDArray[np.float64], function2_runtime_distribution: npt.NDArray[np.float64]
+) -> tuple[dict[str, np.float64], np.float64]:
+ speedup_distribution = function1_runtime_distribution / function2_runtime_distribution
+ return compute_statistics(speedup_distribution), np.mean(speedup_distribution > 1)
diff --git a/cli/codeflash/verification/statistical_analysis.py b/cli/codeflash/verification/statistical_analysis.py
deleted file mode 100644
index a39041a46..000000000
--- a/cli/codeflash/verification/statistical_analysis.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import annotations
-
-import math
-from typing import TYPE_CHECKING
-
-import numpy as np
-from numba import get_num_threads, njit, prange
-
-if TYPE_CHECKING:
- import numpy.typing as npt
-
-TWO_SIGMA = 2
-
-
-@njit(parallel=True, fastmath=True, cache=True)
-def bootstrap_minima(
- series: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
-) -> npt.NDArray[np.int64]:
- num_threads = len(rngs)
- series_size = len(series)
- npseries = np.array(series)
- thread_remainder = bootstrap_size % num_threads
- num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
- [1] * thread_remainder + [0] * (num_threads - thread_remainder)
- )
- minima = np.empty(bootstrap_size)
- thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
-
- for i in prange(num_threads):
- thread_minima = minima[thread_idx[i] : thread_idx[i + 1]]
- for k in range(num_bootstraps_per_thread[i]):
- thread_minima[k] = min(npseries[rngs[i].integers(0, series_size, size=series_size)])
- return minima
-
-
-def bootstrap_noise_floor(series: list[int], bootstrap_size: int) -> np.float64:
- rng = np.random.default_rng()
- return np.std(bootstrap_minima(series, tuple(rng.spawn(get_num_threads())), bootstrap_size))
-
-
-def combined_series_noise_floor(series1: list[int], series2: list[int], bootstrap_size: int) -> np.float64:
- noise_floor1 = bootstrap_noise_floor(series1, bootstrap_size)
- noise_floor2 = bootstrap_noise_floor(series2, bootstrap_size)
- return math.sqrt(noise_floor1 * noise_floor1 + noise_floor2 * noise_floor2)
-
-
-def series2_faster_95_confidence(
- series1: list[int], series2: list[int], bootstrap_size: int
-) -> tuple[float, float] | None:
- min1 = min(series1)
- min_diff = min1 - min(series2)
- if min_diff <= 0:
- return None
- combined_noise_floor = combined_series_noise_floor(series1, series2, bootstrap_size)
- percent_diff = 100 * min_diff / min1
- uncertainty = TWO_SIGMA * 100 * combined_noise_floor / min1
- if combined_noise_floor == 0 or min_diff / combined_noise_floor > TWO_SIGMA:
- return percent_diff, uncertainty
- return None
diff --git a/cli/experiments/statistical_analysis.py b/cli/experiments/statistical_analysis.py
new file mode 100644
index 000000000..2f17de52c
--- /dev/null
+++ b/cli/experiments/statistical_analysis.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+import math
+from typing import TYPE_CHECKING
+
+import numpy as np
+from numba import get_num_threads, njit, prange
+
+if TYPE_CHECKING:
+ import numpy.typing as npt
+
+TWO_SIGMA = 2
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_minima(
+ series: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.int64]:
+ num_threads = len(rngs)
+ series_size = len(series)
+ npseries = np.array(series)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ minima = np.empty(bootstrap_size)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_minima = minima[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ thread_minima[k] = min(npseries[rngs[i].integers(0, series_size, size=series_size)])
+ return minima
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_minima_ratios(
+ series1: list[int], series2: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ num_threads = len(rngs)
+ series1_size = len(series1)
+ series2_size = len(series2)
+ npseries1 = np.array(series1)
+ npseries2 = np.array(series2)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ minima_ratios = np.empty(bootstrap_size, dtype=np.float64)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_minima_ratios = minima_ratios[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ min2 = min(npseries2[rngs[i].integers(0, series2_size, size=series2_size)])
+ if min2 == 0:
+ thread_minima_ratios[k] = np.inf
+ else:
+ thread_minima_ratios[k] = min(npseries1[rngs[i].integers(0, series1_size, size=series1_size)]) / min2
+ return minima_ratios
+
+
+@njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_ratios_geomean(
+ ratio_series: list[npt.NDArray[np.float64]], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+ num_series = len(ratio_series)
+ num_threads = len(rngs)
+ thread_remainder = bootstrap_size % num_threads
+ num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+ [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+ )
+ combined_ratios = np.empty(bootstrap_size, dtype=np.float64)
+ thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+ for i in prange(num_threads):
+ thread_combined_ratios = combined_ratios[thread_idx[i] : thread_idx[i + 1]]
+ for k in range(num_bootstraps_per_thread[i]):
+ sum_log = 0.0
+ for series in ratio_series:
+ ratio = series[rngs[i].integers(0, len(series))]
+ if ratio <= 0:
+ ratio = 1e-12
+ sum_log += np.log(ratio)
+ thread_combined_ratios[k] = np.exp(sum_log / num_series)
+ return combined_ratios
+
+
+def bootstrap_noise_floor(series: list[int], bootstrap_size: int) -> np.float64:
+ rng = np.random.default_rng()
+ return np.std(bootstrap_minima(series, tuple(rng.spawn(get_num_threads())), bootstrap_size))
+
+
+def combined_series_noise_floor(series1: list[int], series2: list[int], bootstrap_size: int) -> np.float64:
+ noise_floor1 = bootstrap_noise_floor(series1, bootstrap_size)
+ noise_floor2 = bootstrap_noise_floor(series2, bootstrap_size)
+ return math.sqrt(noise_floor1 * noise_floor1 + noise_floor2 * noise_floor2)
+
+
+def series2_faster_95_confidence(
+ series1: list[int], series2: list[int], bootstrap_size: int
+) -> tuple[float, float] | None:
+ min1 = min(series1)
+ min_diff = min1 - min(series2)
+ if min_diff <= 0:
+ return None
+ combined_noise_floor = combined_series_noise_floor(series1, series2, bootstrap_size)
+ percent_diff = 100 * min_diff / min1
+ uncertainty = TWO_SIGMA * 100 * combined_noise_floor / min1
+ if combined_noise_floor == 0 or min_diff / combined_noise_floor > TWO_SIGMA:
+ return percent_diff, uncertainty
+ return None
+
+
+def analyze_series_speedup(
+ multi_series1: list[list[int]], multi_series2: list[list[int]], bootstrap_size: int
+) -> tuple[np.float64, np.float64, np.float64, np.float64]:
+ rng = np.random.default_rng()
+ combined_ratios = bootstrap_ratios_geomean(
+ [
+ bootstrap_minima_ratios(series1, series2, tuple(rng.spawn(get_num_threads())), bootstrap_size)
+ for series1, series2 in zip(multi_series1, multi_series2)
+ ],
+ tuple(rng.spawn(get_num_threads())),
+ bootstrap_size,
+ )
+ lower_bound_95_confidence = np.percentile(combined_ratios, 2.5)
+ upper_bound_95_confidence = np.percentile(combined_ratios, 97.5)
+ mean = np.mean(combined_ratios)
+ probability_faster = np.mean(combined_ratios > 1.0)
+ return lower_bound_95_confidence, upper_bound_95_confidence, mean, probability_faster
diff --git a/cli/tests/test_statistical_analysis.py b/cli/experiments/tests/test_statistical_analysis.py
similarity index 92%
rename from cli/tests/test_statistical_analysis.py
rename to cli/experiments/tests/test_statistical_analysis.py
index 7d8abf572..9370f159f 100644
--- a/cli/tests/test_statistical_analysis.py
+++ b/cli/experiments/tests/test_statistical_analysis.py
@@ -14,5 +14,5 @@ def test_compare_timing_series() -> None:
optimized_timing_series = create_timing_series(10000, 1800, 48)
result = series2_faster_95_confidence(original_timing_series, optimized_timing_series, 10000)
assert result is not None
- assert 5 < result[0] < 15
- assert result[1] < 3
+ assert 4 < result[0] < 16
+ assert result[1] < 4
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
index 0b98357c6..b5e5164e9 100644
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -139,7 +139,7 @@ install_types = true
plugins = ["pydantic.mypy"]
[[tool.mypy.overrides]]
-module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes"]
+module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes", "numba"]
ignore_missing_imports = true
[tool.pydantic-mypy]
diff --git a/cli/tests/test_bayesian_analysis.py b/cli/tests/test_bayesian_analysis.py
new file mode 100644
index 000000000..aec61faf3
--- /dev/null
+++ b/cli/tests/test_bayesian_analysis.py
@@ -0,0 +1,79 @@
+import numpy as np
+from codeflash.verification.bayesian_analysis import (
+ analyse_function_runtime_data,
+ compare_function_runtime_distributions,
+)
+
+
+def test_bayesian_analysis() -> None:
+ functions = ["orig", "opt1", "opt2"] # original + 2 optimization candidates
+ inputs = ["inpA", "inpB", "inpC"] # 3 benchmarks
+ n = 5000 # repeated measurements per (function, input)
+
+ # Let's simulate some data in a dictionary:
+ # data[(fn, inp)] = array of shape (N,) with raw runtimes
+ rng = np.random.default_rng(42)
+ data = {}
+ for fn in functions:
+ for inp in inputs:
+ # We'll do random times with some big outliers for demonstration
+ base_time = 1.0
+ if fn == "orig":
+ factor = 1.0
+ elif fn == "opt1":
+ factor = 0.85 # 15% faster
+ else: # opt2
+ factor = 0.95 # 5% faster
+ # We'll also vary by input
+ if inp == "inpA":
+ factor_inp = 1.0
+ elif inp == "inpB":
+ factor_inp = 1.2
+ else: # inpC
+ factor_inp = 0.9
+
+ # final mean time = base_time * factor * factor_inp
+ mu = base_time * factor * factor_inp
+ # add noise, outliers
+ times = mu + rng.normal(0, 0.2 * mu, size=n)
+ times = np.clip(times, 0, None) # no negative times
+ # add some random big spikes
+ if rng.random() < 0.1:
+ times[rng.integers(0, n)] *= 5.0
+ data[(fn, inp)] = times
+
+ orig = [list(data[("orig", i)]) for i in inputs]
+ opt1 = [list(data[("opt1", i)]) for i in inputs]
+ opt2 = [list(data[("opt2", i)]) for i in inputs]
+
+ original_distribution, original_stats = analyse_function_runtime_data(orig, 10000)
+ optimized_distribution1, optimized_stats1 = analyse_function_runtime_data(opt1, 10000)
+ optimized_distribution2, optimized_stats2 = analyse_function_runtime_data(opt2, 10000)
+
+ speedup_stats1, faster_prob1 = compare_function_runtime_distributions(
+ original_distribution, optimized_distribution1
+ )
+ assert (
+ 1.162
+ < speedup_stats1["credible_interval_lower_bound"]
+ < 1.165
+ < speedup_stats1["median"]
+ < 1.171
+ < speedup_stats1["credible_interval_upper_bound"]
+ < 1.174
+ )
+ assert faster_prob1 == 1.0
+
+ speedup_stats2, faster_prob2 = compare_function_runtime_distributions(
+ original_distribution, optimized_distribution2
+ )
+ assert (
+ 1.046
+ < speedup_stats2["credible_interval_lower_bound"]
+ < 1.051
+ < speedup_stats2["median"]
+ < 1.054
+ < speedup_stats2["credible_interval_upper_bound"]
+ < 1.057
+ )
+ assert faster_prob2 == 1.0
diff --git a/mypy.ini b/mypy.ini
index 4dc81fe3b..afc4607c3 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -16,4 +16,13 @@ install_types = True
init_forbid_extra = True
init_typed = True
warn_required_dynamic_aliases = True
-warn_untyped_fields = True
\ No newline at end of file
+warn_untyped_fields = True
+
+[mypy-jedi.*]
+ignore_missing_imports = True
+
+[mypy-inquirer.*]
+ignore_missing_imports = True
+
+[mypy-numba.*]
+ignore_missing_imports = True
\ No newline at end of file