undo structured output

2025-04-16 20:05:19 -07:00 · 2025-04-16 20:05:19 -07:00 · dacab4fef1
commit dacab4fef1
parent 01062bbbb8
3 changed files with 24 additions and 20 deletions
--- a/django/aiservice/optimizer/optimizer.py
+++ b/django/aiservice/optimizer/optimizer.py
@@ -2,7 +2,6 @@ from __future__ import annotations

 import ast
 import asyncio
-import json
 import re
 import uuid
 from pathlib import Path
@@ -128,23 +127,28 @@ async def optimize_python_code(
    results = [content for op in outputs if (content := op.choices[0].message.content)]
    optimized_code_and_explanations: list[CodeAndExplanation] = []
    for result in results:
-        json_blocks = re.findall(r"\{[^}]*\}", result)
-        if len(json_blocks)>1:
-            debug_log_sensitive_data("Ideally should just have one json block")
-        if len(json_blocks)==0:
-            debug_log_sensitive_data(f"No json block found in output:\n{result}")
-        if len(json_blocks)>0:
-            try:
-                json_dict = json.loads(json_blocks[0])
-            except json.JSONDecodeError as e:
-                debug_log_sensitive_data(f"Failed to parse json:\n{json_blocks[0]}")
-                debug_log_sensitive_data(f"Traceback: {e}")
-                continue
-            if "optimized_code" not in json_dict or "explanation" not in json_dict:
-                debug_log_sensitive_data(f"invalid json output from llm:\n{json_dict}")
-                continue
-            code = json_dict["optimized_code"]
-            explanation = json_dict["explanation"]
+        # json_blocks = re.findall(r"```markdown\s*([\s\S]*?)\s*```", result) + re.findall(r"```json\s*([\s\S]*?)\s*```", result)
+        # if len(json_blocks)>1:
+        #     debug_log_sensitive_data("Ideally should just have one json block")
+        # if len(json_blocks)==0:
+        #     debug_log_sensitive_data(f"No json block found in output:\n{result}")
+        # json_blocks = [result]
+        # if len(json_blocks)>0:
+        #     try:
+        #         json_dict = json.loads(json_blocks[0])
+        #     except json.JSONDecodeError as e:
+        #         debug_log_sensitive_data(f"Failed to parse json:\n{json_blocks[0]}\n{result}")
+        #         debug_log_sensitive_data(f"Traceback: {e}")
+        #         continue
+        #     if "optimized_code" not in json_dict or "explanation" not in json_dict:
+        #         debug_log_sensitive_data(f"invalid json output from llm:\n{json_dict}")
+        #         continue
+        #     code = json_dict["optimized_code"]
+        #     explanation = json_dict["explanation"]
+            match = re.match(r"(.*)```python(?:\n|\\n)(.*?)```(.*)", result, re.DOTALL | re.MULTILINE)
+            if match:
+                code = match.group(2)
+                explanation = match.group(1) + match.group(3)
            try:
                cst_module = parse_module_to_cst(code)
            except cst.ParserSyntaxError as e:
--- a/django/aiservice/optimizer/user_prompt.md
+++ b/django/aiservice/optimizer/user_prompt.md
@@ -1,4 +1,4 @@
-Rewrite this python program to run faster. Think step by step and explain your reasoning. Output in json format with the following keys, "optimized_code" and "explanation".
+Rewrite this python program to run faster. Explain your reasoning.
 ```python
 {source_code}
 ```
--- a/experiments/metrics_analysis.py
+++ b/experiments/metrics_analysis.py
@@ -273,7 +273,7 @@ def augment_with_best_correct_speedup_ratio(df: DataFrame) -> DataFrame:


 def main() -> None:
-    df = load_data("test_claude_aseem_sanity1")
+    df = load_data("test_gemini_aseem_apr16")
    non_orphan_ids = remove_orphans(df)
    df = df.iloc[non_orphan_ids]
    #df = process_column_pairs(df, "metadata")