codeflash-internal/experiments/testgen_analysis.py
2025-05-09 19:42:20 -07:00

63 lines
2 KiB
Python

import os
from typing import Any, Dict, Optional
import pandas as pd
from pandas import DataFrame
from scipy.stats import hmean
from sqlalchemy import create_engine
import openpyxl
from openpyxl.styles import Alignment
def load_data(database_uri: Optional[str] = None) -> DataFrame:
    """Load the ``optimization_features`` rows that carry the single-prompt marker.

    Args:
        database_uri: SQLAlchemy connection URL. When omitted (or ``None``),
            the ``DATABASE_URL`` environment variable is read at call time
            rather than once at import time.

    Returns:
        A DataFrame holding every column of ``optimization_features`` for the
        hard-coded user id, restricted to rows whose ``generated_test`` array
        has at least one element matching the
        ``# TRY41 GENERATED WITH SINGLE PROMPT`` pattern.

    Raises:
        ValueError: If no connection URL was passed and ``DATABASE_URL`` is
            unset or empty.
    """
    if database_uri is None:
        database_uri = os.environ.get("DATABASE_URL")
    if not database_uri:
        raise ValueError(
            "No database URI provided: pass database_uri or set the "
            "DATABASE_URL environment variable."
        )

    # NOTE(review): the doubled '%%' is kept exactly as written; presumably it
    # escapes '%' for a driver using %-style parameters -- confirm before changing.
    query = """
        SELECT * FROM optimization_features WHERE
        user_id = 'github|4725571' AND EXISTS (SELECT 1 FROM unnest(generated_test) AS elem WHERE elem LIKE '%%# TRY41 GENERATED WITH SINGLE PROMPT%%')
    """

    engine = create_engine(database_uri)
    try:
        with engine.connect() as connection:
            # The result is fully materialised before the connection closes.
            return pd.read_sql_query(query, connection)
    finally:
        # Release the engine's pooled connections once the data is loaded.
        engine.dispose()
def split_data(df: DataFrame) -> DataFrame:
    """Return a copy of *df* with the generated tests split into two text columns.

    Every entry of the ``generated_test`` column is iterated as a sequence of
    strings. Elements are partitioned by whether they contain the text
    ``GENERATED WITH SINGLE PROMPT``; each selected element is followed by a
    newline and the pieces are concatenated.

    Args:
        df: Frame with a ``generated_test`` column. It is not modified.

    Returns:
        A copy of *df* with two extra columns:
        ``generated_test_a`` (elements WITHOUT the marker) and
        ``generated_test_b`` (elements WITH the marker).
    """
    marker = 'GENERATED WITH SINGLE PROMPT'

    def join_tests(tests, with_marker: bool) -> str:
        """Concatenate the elements whose marker presence equals *with_marker*."""
        return "".join(
            test + '\n' for test in tests if (marker in test) == with_marker
        )

    df_ret = df.copy()
    # Iterate the column directly instead of a row-wise DataFrame.apply: the
    # latter does not yield a Series for a frame with no rows, which makes the
    # column assignment fail when the query returns nothing.
    tests_column = df['generated_test']
    df_ret['generated_test_a'] = [join_tests(tests, False) for tests in tests_column]
    df_ret['generated_test_b'] = [join_tests(tests, True) for tests in tests_column]
    return df_ret
def main() -> None:
    """Export the split test data to ``output.xlsx`` and wrap multi-line cells.

    Loads the rows, adds the two split columns, writes everything except the
    ``created_at`` column to an Excel file, then reopens that file to turn on
    text wrapping for every string cell that contains a newline.
    """
    workbook_path = "output.xlsx"

    frame = split_data(load_data())
    # NOTE(review): 'created_at' is dropped before export; presumably its
    # datetime values do not round-trip to Excel -- confirm.
    exportable = frame.drop(columns=['created_at'])
    exportable.to_excel(workbook_path, index=False)

    # Reopen the freshly written workbook so cell formatting can be adjusted.
    workbook = openpyxl.load_workbook(workbook_path)
    sheet = workbook.active
    all_cells = (cell for cells in sheet.iter_rows() for cell in cells)
    for cell in all_cells:
        content = cell.value
        if not isinstance(content, str) or "\n" not in content:
            continue
        cell.alignment = Alignment(wrapText=True)
    workbook.save(workbook_path)
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()