codeflash-internal/experiments/optimization-factory/server/app.py
"""
Flask web server for Optimizer Factory - EC2-based Codeflash Management
Provides REST API and web UI for managing repositories and EC2-backed optimization jobs
This server manages the complete lifecycle of code optimization jobs:
1. Repository management (add/edit/delete repos from CSV)
2. Analysis submission and result retrieval
3. EC2 instance lifecycle (launch, configure, monitor, terminate)
4. Job execution and monitoring (logs, status, optimization results)
5. Web UI for managing the entire workflow
"""
# Standard library imports for core functionality
import os
import re
import json
import csv
import logging
import time
import subprocess
import tempfile
from pathlib import Path
from typing import List, Dict, Any, Optional
# Flask framework for web API and UI
from flask import Flask, jsonify, request, send_from_directory, Response, stream_with_context
# Environment variable management
from dotenv import load_dotenv
# Analysis module for repository analysis functionality
from .analyzer import submit_analysis, job_status as analysis_status, job_result as analysis_result, load_analysis_for_repo
# AWS SDK for EC2 instance management
import boto3
# SSH client for remote instance operations
import paramiko
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
# Configure structured logging with timestamps and service identification
# This helps track request flows, performance, and debugging across the application
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
# Create logger for this module - used throughout the application
logger = logging.getLogger(__name__)
def log_request_start(endpoint: str, **kwargs) -> None:
"""
Log the start of an API request with parameters
Used to track incoming requests and their parameters for debugging
"""
params = ', '.join(f"{k}={v}" for k, v in kwargs.items() if v is not None)
logger.info(f"🔄 [{endpoint}] Request started - {params}")
def log_request_success(endpoint: str, duration_ms: int, **kwargs) -> None:
"""
Log successful API request completion with timing information
Helps monitor performance and successful operations
"""
details = ', '.join(f"{k}={v}" for k, v in kwargs.items() if v is not None)
logger.info(f"✅ [{endpoint}] Request completed successfully in {duration_ms}ms - {details}")
def log_request_error(endpoint: str, error: str, duration_ms: int, **kwargs) -> None:
"""
Log failed API request with error details and timing
Critical for debugging failed operations and understanding error patterns
"""
details = ', '.join(f"{k}={v}" for k, v in kwargs.items() if v is not None)
logger.error(f"❌ [{endpoint}] Request failed in {duration_ms}ms - Error: {error} - {details}")
def log_service_operation(service: str, operation: str, **kwargs) -> None:
"""
Log service operations (AWS, file operations, SSH, etc.)
Tracks detailed operations across different services for comprehensive monitoring
"""
details = ', '.join(f"{k}={v}" for k, v in kwargs.items() if v is not None)
logger.info(f"🔧 [{service}] {operation} - {details}")
# =============================================================================
# CONFIGURATION AND PATHS
# =============================================================================
# Define core application paths for data persistence and configuration
# Base directory is the project root (parent of server/) - where all config files live
BASE_DIR = Path(__file__).resolve().parent.parent
# CSV file containing repository configurations - stores repo URLs, module roots, test roots, resource tiers
CONFIG_CSV = BASE_DIR / "config" / "repos.csv"
# Server data directory for storing job tracking - where runtime data is persisted
DATA_DIR = Path(__file__).resolve().parent
# JSON file mapping repository URLs to their latest job IDs - tracks which EC2 instance handles each repo
JOBS_JSON = DATA_DIR / "jobs.json"
# Local archive directory for saving job logs before shutdown
LOGS_ARCHIVE_DIR = DATA_DIR / "logs"
# Load environment variables from .env file if present
# This makes CODEFLASH_API_KEY, GITHUB_TOKEN, AWS credentials, etc. available
try:
load_dotenv()
except Exception:
pass
# AWS EC2 configuration from environment variables
# These control which AWS region, instance type, AMI, and security settings to use
AWS_REGION = os.getenv("AWS_REGION", "us-east-1") # AWS region for EC2 instances
AWS_KEY_NAME = os.getenv("AWS_KEY_NAME", "").strip() # EC2 key pair name for SSH access
AWS_SECURITY_GROUP = os.getenv("AWS_SECURITY_GROUP", "").strip() # Security group for EC2 instances
AWS_INSTANCE_TYPE = os.getenv("AWS_INSTANCE_TYPE", "c7i.xlarge").strip() # EC2 instance size
AWS_AMI_ID = os.getenv("AWS_AMI_ID", "").strip() # AMI ID for EC2 instances (Ubuntu with pre-installed tools)
SSH_KEY_PATH = os.getenv("SSH_KEY_PATH", "").strip() # Local path to SSH private key for EC2 access
# Log successful configuration loading for debugging
logger.info(
f"🚀 Server configuration loaded - AWS_REGION={AWS_REGION}, INSTANCE_TYPE={AWS_INSTANCE_TYPE}"
)
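# For reference, a minimal .env consumed by the os.getenv() calls above might look
# like the following (all values are illustrative placeholders, not real resources):
#
#   AWS_REGION=us-east-1
#   AWS_KEY_NAME=codeflash-opt-key
#   AWS_SECURITY_GROUP=sg-0123456789abcdef0
#   AWS_INSTANCE_TYPE=c7i.xlarge
#   AWS_AMI_ID=ami-0123456789abcdef0
#   SSH_KEY_PATH=~/.ssh/codeflash-opt-key.pem
#   CODEFLASH_API_KEY=<codeflash api key>
#   GITHUB_TOKEN=<github token>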
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def _ensure_files() -> None:
"""
Ensures required directories and files exist for application startup
Creates the jobs.json file if it doesn't exist to track repository-to-instance mappings
"""
logger.info("📁 Ensuring required directories and files exist")
DATA_DIR.mkdir(parents=True, exist_ok=True)
if not JOBS_JSON.exists():
JOBS_JSON.write_text(json.dumps({}, indent=2))
logger.info(f"📝 Created empty jobs index file: {JOBS_JSON}")
else:
logger.info(f"📝 Jobs index file exists: {JOBS_JSON}")
def _canon_repo_url(repo_url: str) -> str:
"""
Canonicalize repo URL for consistent indexing across the application
Removes trailing slashes and normalizes URLs to prevent duplicate entries
"""
return (repo_url or "").strip().rstrip("/")
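# Example (illustrative): _canon_repo_url("https://github.com/org/repo/") returns
# "https://github.com/org/repo", so URLs with and without a trailing slash share
# one jobs.json key.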
def _rsync_logs_from_instance(public_ip: str, local_dest: str) -> bool:
"""
Uses rsync to efficiently download all log files from a remote instance
to a local directory.
Args:
public_ip: The public IP of the EC2 instance.
local_dest: The local destination directory path.
Returns:
True if the rsync operation was successful, False otherwise.
"""
ssh_key = os.path.expanduser(SSH_KEY_PATH)
if not os.path.exists(ssh_key):
logger.error(f"❌ [RSYNC] SSH key not found at {ssh_key}")
return False
# Convert to absolute path for rsync
ssh_key_abs = os.path.abspath(ssh_key)
# Define rsync command with includes for all relevant log files and patterns
# This is more efficient than running multiple commands or listing files first.
rsync_command = [
"rsync",
"-avh", # Archive, verbose, human-readable
"--progress", # Show progress during transfer
"--compress", # Compress file data during the transfer
"-e", f"ssh -i {ssh_key_abs} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null",
"--include=optimization-*.log",
"--include=optimization.log",
"--include=launcher.log",
"--include=llm-setup-*.log",
"--include=llm-setup.log",
"--include=claude-setup-*.log",
"--include=claude-setup-round-*.log",
"--include=tests-*.log",
"--include=tests.log",
"--include=stage.jsonl",
"--include=job.exitcode",
"--include=optimization.pid",
"--exclude=*", # Exclude all other files from the directory
f"ubuntu@{public_ip}:/home/ubuntu/app/logs/", # Source directory
str(local_dest), # Destination directory
]
try:
log_service_operation("RSYNC", "Starting log download", ip=public_ip, dest=local_dest, ssh_key=ssh_key_abs)
logger.debug(f"[RSYNC] Command: {' '.join(rsync_command)}")
# We use subprocess.run to execute the command
result = subprocess.run(
rsync_command,
capture_output=True,
text=True,
check=True, # This will raise CalledProcessError if rsync returns a non-zero exit code
timeout=300 # 5-minute timeout for the rsync operation
)
logger.info(f"✅ [RSYNC] Logs successfully synced from {public_ip}.")
# Log the output from rsync for debugging progress
if result.stdout:
logger.debug(f"[RSYNC STDOUT]:\n{result.stdout}")
return True
except subprocess.CalledProcessError as e:
logger.error(f"❌ [RSYNC] Command failed with exit code {e.returncode}")
logger.error(f"❌ [RSYNC] Command: {' '.join(rsync_command)}")
if e.stderr:
logger.error(f"❌ [RSYNC] STDERR: {e.stderr}")
if e.stdout:
logger.error(f"❌ [RSYNC] STDOUT: {e.stdout}")
log_request_error("RSYNC", f"Rsync failed with exit code {e.returncode}", 0, ip=public_ip)
return False
except subprocess.TimeoutExpired:
log_request_error("RSYNC", "Rsync operation timed out", 0, ip=public_ip)
return False
except Exception as e:
log_request_error("RSYNC", f"An unexpected error occurred during rsync: {str(e)}", 0, ip=public_ip)
return False
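# For reference, the command assembled above is roughly equivalent to this shell
# invocation (key path and IP are illustrative placeholders; the full include list
# is shown in rsync_command above):
#
#   rsync -avh --progress --compress \
#     -e "ssh -i /path/to/key.pem -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" \
#     --include='optimization-*.log' --include='launcher.log' ... --exclude='*' \
#     ubuntu@<public-ip>:/home/ubuntu/app/logs/ <local-dest>/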
def _load_jobs_index() -> Dict[str, str]:
"""
Loads the job tracking index from JSON file
This maintains the mapping between repository URLs and their current EC2 instance IDs
Returns: Dictionary mapping repository URLs to their latest job IDs
"""
try:
data = json.loads(JOBS_JSON.read_text())
# Normalize keys as canonical URLs to prevent duplicate entries
if isinstance(data, dict):
normalized = {}
for k, v in data.items():
normalized[_canon_repo_url(k)] = v
if normalized != data:
JOBS_JSON.write_text(json.dumps(normalized, indent=2))
data = normalized
logger.debug(f"📖 Loaded jobs index with {len(data)} entries")
return data
except Exception as e:
logger.warning(f"⚠️ Failed to load jobs index: {e}, returning empty dict")
return {}
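# The jobs.json file loaded above is a flat object mapping canonical repo URLs to
# the most recent EC2 instance id, e.g. (instance id is illustrative):
#
#   {
#     "https://github.com/org/repo": "i-0abc123def4567890"
#   }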
def _save_jobs_index(data: Dict[str, str]) -> None:
"""
Saves the job tracking index to JSON file
Persists the repository-to-instance mapping for state management across server restarts
"""
try:
# Persist canonical URL keys only to maintain consistency
normalized = { _canon_repo_url(k): v for k, v in data.items() }
JOBS_JSON.write_text(json.dumps(normalized, indent=2))
logger.debug(f"💾 Saved jobs index with {len(data)} entries")
except Exception as e:
logger.error(f"❌ Failed to save jobs index: {e}")
def _set_repo_job(repo_url: str, instance_id: str) -> None:
"""
Update jobs index mapping for a repo to an EC2 instance id
This creates/updates the association between a repository and its current optimization instance
"""
try:
jobs_index = _load_jobs_index()
jobs_index[_canon_repo_url(repo_url)] = instance_id
_save_jobs_index(jobs_index)
log_service_operation("JOBS", "Set repo->instance mapping", repo_url=_canon_repo_url(repo_url), instance_id=instance_id)
except Exception as e:
logger.exception(f"Failed to update jobs index for {repo_url}: {e}")
def _read_csv() -> List[Dict[str, str]]:
"""
Reads repository configurations from CSV file
This loads all configured repositories with their optimization settings
Returns: List of dictionaries with repo_url, module_root, tests_root, resource_tier
"""
try:
rows: List[Dict[str, str]] = []
with CONFIG_CSV.open("r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
rows.append({
"repo_url": row.get("repo_url", "").strip(),
"module_root": row.get("module_root", "auto").strip(),
"tests_root": row.get("tests_root", "auto").strip(),
"resource_tier": row.get("resource_tier", "small").strip().lower() or "small",
})
logger.debug(f"📊 Read {len(rows)} repositories from CSV")
return rows
except Exception as e:
logger.error(f"❌ Failed to read CSV file: {e}")
return []
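# config/repos.csv uses the four columns read above; an illustrative file:
#
#   repo_url,module_root,tests_root,resource_tier
#   https://github.com/org/repo,src/mypkg,tests,small
#   https://github.com/org/other-repo,auto,auto,medium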
def _write_csv(rows: List[Dict[str, str]]) -> None:
"""
Writes repository configurations to CSV file
Persists the current repository list with all configuration changes
"""
try:
fieldnames = ["repo_url", "module_root", "tests_root", "resource_tier"]
with CONFIG_CSV.open("w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for r in rows:
writer.writerow({
"repo_url": r["repo_url"].strip(),
"module_root": r.get("module_root", "auto").strip(),
"tests_root": r.get("tests_root", "auto").strip(),
"resource_tier": r.get("resource_tier", "small").strip().lower() or "small",
})
logger.info(f"💾 Saved {len(rows)} repositories to CSV")
except Exception as e:
logger.error(f"❌ Failed to write CSV file: {e}")
def _find_row(rows: List[Dict[str, str]], repo_url: str) -> Optional[int]:
"""
Finds the index of a repository in the rows list
Used to locate specific repositories for updates, deletions, or job launches
Args:
rows: List of repository dictionaries
repo_url: Repository URL to find
Returns: Index of the repository or None if not found
"""
for idx, r in enumerate(rows):
if r.get("repo_url", "").rstrip("/") == repo_url.rstrip("/"):
logger.debug(f"🔍 Found repository {repo_url} at index {idx}")
return idx
logger.debug(f"🔍 Repository {repo_url} not found")
return None
# =============================================================================
# INITIALIZATION
# =============================================================================
# Application startup sequence - initialize all required components
logger.info("🔧 Initializing Flask server...")
# Ensure required files exist before starting the server
# This creates the jobs.json file and ensures data directories exist
_ensure_files()
# Initialize AWS EC2 client for instance management
# This client is used throughout the application for EC2 operations
try:
ec2 = boto3.client("ec2", region_name=AWS_REGION)
logger.info(f"☁️ AWS EC2 client initialized successfully - Region: {AWS_REGION}")
except Exception as e:
logger.error(f"❌ Failed to initialize AWS EC2 client: {e}")
raise
# Initialize Flask app with static file serving capability
# The static folder serves the web UI (HTML, CSS, JS files)
app = Flask(__name__, static_folder="static")
logger.info("🌐 Flask app initialized")
# =============================================================================
# EC2 MANAGER
# =============================================================================
class EC2Manager:
"""
Simple EC2 instance lifecycle and SSH utilities for running optimizations.
This class handles the complete lifecycle of EC2 instances used for code optimization:
1. Launch instances with pre-configured user data
2. SSH operations (upload files, execute commands, download logs)
3. Instance monitoring and termination
4. File management on remote instances
"""
def __init__(self, region: str, key_name: str, security_group: str, instance_type: str, ami_id: str, ssh_key_path: str) -> None:
"""
Initialize EC2 manager with AWS configuration
Sets up the EC2 client and stores configuration for instance operations
"""
self.ec2 = boto3.client("ec2", region_name=region)
self.region = region
self.key_name = key_name
self.security_group = security_group
self.instance_type = instance_type
self.ami_id = ami_id
self.ssh_key_path = os.path.expanduser(ssh_key_path)
def launch_instance(self, job_name: str, job_tag_value: str, tags: Optional[Dict[str, str]] = None) -> str:
"""
Launch an EC2 instance and return its instance_id.
This method:
1. Creates a user-data script that installs prerequisites (git, python, codeflash, etc.)
2. Launches the instance with proper tags and security settings
3. Waits for the instance to be running
4. Returns the instance ID for tracking
The instance is initialized via user-data to install prerequisites.
"""
user_data = """#!/bin/bash
set -u
# Log all output
exec > >(tee /var/log/user-data.log)
exec 2>&1
# Prepare application directories early regardless of package install success
mkdir -p /home/ubuntu/app/scripts /home/ubuntu/app/logs || true
chown -R ubuntu:ubuntu /home/ubuntu/app || true
ln -s /home/ubuntu/app /app 2>/dev/null || true
apt-get update || true
DEBIAN_FRONTEND=noninteractive apt-get upgrade -y || true
DEBIAN_FRONTEND=noninteractive apt-get install -y \
git curl wget ca-certificates build-essential \
python3 python3-venv python3-pip \
unzip jq software-properties-common cloud-guest-utils || true
# Ensure the root partition and filesystem use the full EBS volume
# This is safe to run multiple times and works for NVMe and Xen devices
ROOT_PART=$(lsblk -no SOURCE,MOUNTPOINT | awk '$2=="/"{print "/dev/"$1}')
DISK=$(echo "$ROOT_PART" | sed -E 's|p?[0-9]+$||')
PART_NUM=$(echo "$ROOT_PART" | sed -E 's|^.*p?([0-9]+)$|\\1|')
growpart "$DISK" "$PART_NUM" || true
FSTYPE=$(findmnt -n -o FSTYPE /)
if [ "$FSTYPE" = "xfs" ]; then
xfs_growfs /
else
resize2fs "$ROOT_PART" || true
fi
# Install GitHub CLI
type -p curl >/dev/null || (apt-get update && apt-get install -y curl || true)
curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | \
dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg
chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list
apt-get update || true
DEBIAN_FRONTEND=noninteractive apt-get install -y gh || true
# Install Codeflash
python3 -m pip install --upgrade pip || true
python3 -m pip install codeflash || true
echo "EC2 bootstrap complete for {JOB_NAME}" > /home/ubuntu/.bootstrap_done
""".replace("{JOB_NAME}", job_name)
log_service_operation("AWS_EC2", "Launching instance", instance_type=self.instance_type, ami_id=self.ami_id, key_name=self.key_name)
# Discover the AMI root device name so we resize the actual root volume instead of attaching an extra disk
try:
image_desc = self.ec2.describe_images(ImageIds=[self.ami_id])
root_device_name = image_desc["Images"][0].get("RootDeviceName", "/dev/sda1")
except Exception:
root_device_name = "/dev/sda1"
params = {
"ImageId": self.ami_id,
"InstanceType": self.instance_type,
"MinCount": 1,
"MaxCount": 1,
"UserData": user_data,
"TagSpecifications": [
{
"ResourceType": "instance",
"Tags": [
{"Key": "Name", "Value": job_name},
{"Key": "Job", "Value": job_tag_value},
{"Key": "Project", "Value": "OptimizerFactory"},
] + ([{"Key": k, "Value": v} for k, v in (tags or {}).items()] ),
}
],
"BlockDeviceMappings": [
{
"DeviceName": root_device_name,
"Ebs": {"VolumeSize": 50, "VolumeType": "gp3", "DeleteOnTermination": True},
}
],
}
if self.key_name:
params["KeyName"] = self.key_name
if self.security_group:
params["SecurityGroupIds"] = [self.security_group]
resp = self.ec2.run_instances(**params)
instance_id = resp["Instances"][0]["InstanceId"]
log_service_operation("AWS_EC2", "Instance launched", instance_id=instance_id)
waiter = self.ec2.get_waiter("instance_running")
waiter.wait(InstanceIds=[instance_id])
log_service_operation("AWS_EC2", "Instance is running", instance_id=instance_id)
return instance_id
def get_public_ip(self, instance_id: str) -> Optional[str]:
"""
Return the public IP address for an instance, if assigned.
Used to determine when an instance is ready for SSH connections.
"""
desc = self.ec2.describe_instances(InstanceIds=[instance_id])
try:
return desc["Reservations"][0]["Instances"][0].get("PublicIpAddress")
except Exception:
return None
def wait_for_ssh(self, public_ip: str, timeout: int = 600) -> bool:
"""
Wait until SSH is reachable on the instance.
This ensures the instance is fully booted and ready for remote operations.
"""
start = time.time()
while time.time() - start < timeout:
try:
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(public_ip, username="ubuntu", key_filename=self.ssh_key_path, timeout=15, banner_timeout=15, auth_timeout=15)
ssh.close()
return True
except Exception:
time.sleep(10)
return False
def open_ssh(self, public_ip: str) -> paramiko.SSHClient:
"""
Open and return an SSH client connection.
Creates a new SSH connection for remote operations on the instance.
"""
log_service_operation("SSH", "Connecting", ip=public_ip)
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(public_ip, username="ubuntu", key_filename=self.ssh_key_path)
return ssh
def upload_text(self, public_ip: str, content: str, remote_path: str) -> None:
"""
Upload text content to a remote file on the instance.
Used for uploading generated scripts (like the job wrapper script).
"""
log_service_operation("SSH", "Uploading text", ip=public_ip, remote_path=remote_path, size=len(content or ""))
ssh = self.open_ssh(public_ip)
try:
sftp = ssh.open_sftp()
# Ensure parent directory exists
parent = os.path.dirname(remote_path)
try:
sftp.stat(parent)
except FileNotFoundError:
# Try to create nested directories
parts = parent.strip('/').split('/')
cur = ''
for p in parts:
cur = f"{cur}/{p}" if cur else f"/{p}"
try:
sftp.stat(cur)
except FileNotFoundError:
try:
sftp.mkdir(cur)
except Exception:
pass
with sftp.file(remote_path, "w") as f:
f.write(content)
sftp.chmod(remote_path, 0o755)
log_service_operation("SSH", "Upload text complete", ip=public_ip, remote_path=remote_path)
finally:
ssh.close()
def upload_file(self, public_ip: str, local_path: str, remote_path: str) -> None:
"""
Upload a local file to the remote instance.
Used for uploading optimization scripts and other required files.
"""
log_service_operation("SSH", "Uploading file", ip=public_ip, local_path=local_path, remote_path=remote_path)
ssh = self.open_ssh(public_ip)
try:
sftp = ssh.open_sftp()
# Ensure parent directory exists
parent = os.path.dirname(remote_path)
try:
sftp.stat(parent)
except FileNotFoundError:
parts = parent.strip('/').split('/')
cur = ''
for p in parts:
cur = f"{cur}/{p}" if cur else f"/{p}"
try:
sftp.stat(cur)
except FileNotFoundError:
try:
sftp.mkdir(cur)
except Exception:
pass
sftp.put(local_path, remote_path)
sftp.chmod(remote_path, 0o755)
log_service_operation("SSH", "Upload file complete", ip=public_ip, remote_path=remote_path)
finally:
ssh.close()
def exec(self, public_ip: str, command: str, get_pty: bool = False) -> int:
"""
Execute a command on the remote instance and return exit status.
Used for running optimization scripts and other remote operations.
"""
log_service_operation("SSH", "Executing remote command", ip=public_ip, command=command[:120])
ssh = self.open_ssh(public_ip)
try:
_, stdout, stderr = ssh.exec_command(command, get_pty=get_pty)
status = stdout.channel.recv_exit_status()
log_service_operation("SSH", "Command completed", ip=public_ip, exit_status=status)
return status
finally:
ssh.close()
def exec_capture(self, public_ip: str, command: str, get_pty: bool = False):
"""
Execute a command and capture stdout/stderr.
Used for commands that need to return output (like grep for optimization links).
"""
log_service_operation("SSH", "Executing remote command (capture)", ip=public_ip, command=command[:120])
ssh = self.open_ssh(public_ip)
try:
_, stdout, stderr = ssh.exec_command(command, get_pty=get_pty)
out = stdout.read().decode("utf-8", errors="ignore")
err = stderr.read().decode("utf-8", errors="ignore")
code = stdout.channel.recv_exit_status()
log_service_operation("SSH", "Command completed (capture)", ip=public_ip, exit_status=code, out_len=len(out), err_len=len(err))
return code, out, err
finally:
ssh.close()
def read_file_tail(self, public_ip: str, remote_path: str, lines: int = 1000) -> List[str]:
"""
Read the last N lines of a remote file.
Used for fetching recent log entries from optimization runs.
"""
log_service_operation("SSH", "Tailing file", ip=public_ip, path=remote_path, lines=lines)
ssh = self.open_ssh(public_ip)
try:
cmd = f"bash -lc 'set -o pipefail; test -f {remote_path} && tail -n {lines} {remote_path} || true'"
# Request a pty so tail output is flushed promptly even for large files
_, stdout, _ = ssh.exec_command(cmd, get_pty=True)
content = stdout.read().decode("utf-8", errors="ignore")
return content.splitlines()
finally:
ssh.close()
def read_file_bytes(self, public_ip: str, remote_path: str) -> bytes:
"""
Download a remote file as bytes.
Used for downloading full log files for analysis.
"""
log_service_operation("SSH", "Downloading file", ip=public_ip, path=remote_path)
ssh = self.open_ssh(public_ip)
try:
sftp = ssh.open_sftp()
try:
with sftp.file(remote_path, "rb") as f:
data = f.read()
log_service_operation("SSH", "Download complete", ip=public_ip, path=remote_path, size=len(data))
return data
finally:
sftp.close()
finally:
ssh.close()
def file_exists(self, public_ip: str, remote_path: str) -> bool:
"""
Check if a file exists on the remote instance.
Used to verify job completion markers and log file availability.
"""
log_service_operation("SSH", "Checking file exists", ip=public_ip, path=remote_path)
ssh = self.open_ssh(public_ip)
try:
sftp = ssh.open_sftp()
try:
sftp.stat(remote_path)
log_service_operation("SSH", "File exists", ip=public_ip, path=remote_path, exists=True)
return True
except FileNotFoundError:
log_service_operation("SSH", "File not found", ip=public_ip, path=remote_path, exists=False)
return False
finally:
sftp.close()
finally:
ssh.close()
def terminate(self, instance_id: str) -> None:
"""
Terminate an EC2 instance.
Used for cleanup when jobs complete or are cancelled.
"""
try:
self.ec2.terminate_instances(InstanceIds=[instance_id])
log_service_operation("AWS_EC2", "Instance termination initiated", instance_id=instance_id)
except Exception as e:
logger.exception(f"Failed to terminate instance {instance_id}: {e}")
# Global EC2 manager instance - shared across the application
# This instance handles all EC2 operations throughout the application lifecycle
ec2_manager = EC2Manager(
region=AWS_REGION,
key_name=AWS_KEY_NAME,
security_group=AWS_SECURITY_GROUP,
instance_type=AWS_INSTANCE_TYPE,
ami_id=AWS_AMI_ID,
ssh_key_path=SSH_KEY_PATH,
)
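# Illustrative use of the manager (not executed here; names and IDs are placeholders):
#
#   iid = ec2_manager.launch_instance("codeflash-opt-org-repo", "codeflash-opt-org-repo")
#   ip = ec2_manager.get_public_ip(iid)
#   if ip and ec2_manager.wait_for_ssh(ip):
#       ec2_manager.exec(ip, "echo hello")
#   ec2_manager.terminate(iid)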
# Simple in-memory watcher registry to avoid duplicate watchers per instance
# Prevents multiple completion watchers from being started for the same instance
_watchers: Dict[str, bool] = {}
def _start_completion_watcher(instance_id: str, public_ip: str, repo_url: str, log_path: str = "/var/log/codeflash-optimization.log") -> None:
"""
Start a background thread that waits for job completion then terminates the instance.
This watcher:
1. Monitors for job completion markers on the remote instance
2. Checks instance state to detect if it's already terminated
3. Automatically terminates the instance when the job completes
4. Prevents resource waste by ensuring instances are cleaned up
"""
if instance_id in _watchers:
return
_watchers[instance_id] = True
def _watch() -> None:
try:
logger.info(f"👀 Starting completion watcher for {instance_id}")
# Wait up to 24 hours for job exitcode marker
deadline = time.time() + 24 * 3600
while time.time() < deadline:
try:
if ec2_manager.file_exists(public_ip, "/home/ubuntu/app/logs/job.exitcode"):
logger.info(f"✅ Remote job finished on {instance_id}; terminating")
# Attempt to archive remote logs locally before termination
try:
_archive_remote_logs_to_local(public_ip, repo_url)
except Exception as _e:
logger.warning(f"⚠️ Failed to archive logs for {instance_id}: {_e}")
break
except Exception:
pass
# If instance is no longer running, exit
try:
state = _describe_instance_state(instance_id)
if state and state.lower() in {"shutting-down", "terminated", "stopped", "stopping"}:
logger.info(f" Instance {instance_id} state={state}; stopping watcher")
# Best-effort archive if still reachable
try:
_archive_remote_logs_to_local(public_ip, repo_url)
except Exception:
pass
return
except Exception:
pass
time.sleep(30)
finally:
try:
ec2_manager.terminate(instance_id)
except Exception:
pass
import threading
t = threading.Thread(target=_watch, daemon=True)
t.start()
def _describe_instance_state(instance_id: str) -> Optional[str]:
"""
Get the current state of an EC2 instance.
Used to check if instances are running, stopped, terminated, etc.
"""
try:
resp = ec2.describe_instances(InstanceIds=[instance_id])
state = resp["Reservations"][0]["Instances"][0]["State"]["Name"]
return state
except Exception:
return None
def _archive_remote_logs_to_local(public_ip: str, repo_url: str) -> None:
"""
Download remote job logs to local disk before shutdown using rsync.
Saves into server/logs/<org-repo>_<YYYY-MM-DD_HH-MM>/
"""
LOGS_ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
# Determine slug from repo_url
slug = None
try:
parts = (repo_url or "").rstrip("/").split("/")
if len(parts) >= 2:
org = parts[-2]
name = parts[-1].replace(".git", "")
slug = f"{org}-{name}"
except Exception:
pass
if not slug:
slug = "job"
# Determine timestamp from latest optimization log if available
ts = ""
try:
cmd = "bash -lc 'basename $(ls -1 /home/ubuntu/app/logs/optimization-*.log 2>/dev/null | sort -r | head -n1)'"
_, out, _ = ec2_manager.exec_capture(public_ip, cmd)
base = (out or "").strip()
m = re.match(r"optimization-([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2})", base)
if m:
ts = m.group(1).replace("T", "_")
except Exception:
ts = ""
prefix = f"{slug}_{ts}" if ts else slug
dest_dir = LOGS_ARCHIVE_DIR / prefix
dest_dir.mkdir(parents=True, exist_ok=True)
# Use the new rsync helper to download all logs in one command
success = _rsync_logs_from_instance(public_ip, str(dest_dir))
if success:
log_service_operation("ARCHIVE", "Saved remote logs to local via rsync", dest=str(dest_dir), ip=public_ip)
else:
logger.error(f"❌ [ARCHIVE] Failed to save remote logs via rsync from {public_ip}")
# =============================================================================
# CUSTOM ERROR HANDLERS
# =============================================================================
@app.errorhandler(500)
def handle_server_error(e):
"""
Handle 500 Internal Server Error with JSON response for API calls
This handler provides user-friendly error messages for common AWS/EC2 issues:
- Authentication failures
- Missing key pairs
- Permission errors
- Other AWS configuration issues
"""
if request.path.startswith('/api/'):
# For API calls, return JSON error instead of HTML debug page
error_message = str(e.original_exception) if hasattr(e, 'original_exception') else "Internal server error"
# Extract more meaningful error from common AWS EC2 exceptions
if 'AuthFailure' in error_message:
error_message = "AWS EC2 Auth Failure: Check AWS credentials and region"
elif 'InvalidKeyPair' in error_message or 'InvalidKeyPair.NotFound' in error_message:
error_message = "AWS EC2 Key Pair Error: Check AWS_KEY_NAME exists in the region"
elif 'InvalidGroup' in error_message or 'UnauthorizedOperation' in error_message:
error_message = "AWS EC2 Permission Error: Check IAM permissions for ec2:RunInstances and security groups"
logger.error(f"❌ [ERROR_HANDLER] 500 Internal Server Error: {error_message}")
return jsonify({"error": error_message}), 500
# For non-API calls, use default HTML error page
return e
@app.errorhandler(404)
def handle_not_found(e):
"""
Handle 404 Not Found with JSON response for API calls
Provides consistent JSON error responses for API endpoints
"""
if request.path.startswith('/api/'):
logger.warning(f"⚠️ [ERROR_HANDLER] 404 Not Found: {request.path}")
return jsonify({"error": "Resource not found"}), 404
return e
@app.errorhandler(403)
def handle_forbidden(e):
"""
Handle 403 Forbidden with JSON response for API calls
Provides consistent JSON error responses for API endpoints
"""
if request.path.startswith('/api/'):
logger.warning(f"⚠️ [ERROR_HANDLER] 403 Forbidden: {request.path}")
return jsonify({"error": "Access forbidden"}), 403
return e
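# All three handlers above return the same JSON shape for /api/* paths, e.g.
# (message illustrative):
#
#   {"error": "AWS EC2 Key Pair Error: Check AWS_KEY_NAME exists in the region"}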
# =============================================================================
# FLASK ROUTES
# =============================================================================
@app.get("/")
def index() -> Any:
"""
Serves the main HTML page - the web UI for managing repositories and jobs
This is the entry point for the web interface where users can:
- View and manage repositories
- Start optimization jobs
- Monitor job progress
- View logs and results
"""
start_time = time.time()
log_request_start("GET /")
try:
result = send_from_directory(app.static_folder, "index.html")
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /", duration_ms)
return result
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /", str(e), duration_ms)
raise
@app.get("/static/<path:filename>")
def static_files(filename: str):
"""
Serves static files (CSS, JS, images) for the web UI
Handles all static assets needed by the frontend interface
"""
start_time = time.time()
log_request_start("GET /static", filename=filename)
try:
result = send_from_directory(app.static_folder, filename)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /static", duration_ms, filename=filename)
return result
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /static", str(e), duration_ms, filename=filename)
raise
@app.get("/health")
def health() -> Any:
"""
Health check endpoint for monitoring and load balancer health checks
Returns a simple JSON response indicating the server is running
"""
start_time = time.time()
log_request_start("GET /health")
try:
result = jsonify({"ok": True})
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /health", duration_ms)
return result
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /health", str(e), duration_ms)
raise
# =============================================================================
# REPOSITORY MANAGEMENT API
# =============================================================================
@app.get("/api/repos")
def list_repos() -> Any:
"""
GET /api/repos - Lists all repositories with their latest job IDs
This endpoint:
1. Reads all repository configurations from CSV
2. Loads the job index to get current job IDs for each repo
3. Returns a combined view showing repos and their current optimization status
Returns: JSON with items array containing repository configurations
"""
start_time = time.time()
log_request_start("GET /api/repos")
try:
log_service_operation("CSV", "Reading repository configurations")
rows = _read_csv()
log_service_operation("JOBS", "Loading job index")
jobs_index = _load_jobs_index()
# Add current job ID to each repository entry
for r in rows:
r["last_job_id"] = jobs_index.get(_canon_repo_url(r.get("repo_url", ""))) or ""
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/repos", duration_ms, repo_count=len(rows))
return jsonify({"items": rows})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/repos", str(e), duration_ms)
raise
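# Illustrative response shape for GET /api/repos (values are placeholders):
#
#   {"items": [{"repo_url": "https://github.com/org/repo", "module_root": "auto",
#               "tests_root": "auto", "resource_tier": "small",
#               "last_job_id": "i-0abc123def4567890"}]}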
@app.post("/api/repos")
def add_repo() -> Any:
"""
POST /api/repos - Adds a new repository to the optimization queue
This endpoint:
1. Validates the repository URL is provided
2. Checks for duplicate repositories
3. Adds the new repository with configuration settings
4. Saves the updated repository list to CSV
Body: {repo_url, module_root, tests_root, resource_tier}
Returns: JSON with success status
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
log_request_start("POST /api/repos", repo_url=repo_url)
try:
if not repo_url:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos", "repo_url is required", duration_ms)
return jsonify({"error": "repo_url is required"}), 400
log_service_operation("CSV", "Reading existing repositories")
rows = _read_csv()
# Check for duplicate repositories
if _find_row(rows, repo_url) is not None:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos", "repo already exists", duration_ms, repo_url=repo_url)
return jsonify({"error": "repo already exists"}), 400
# Create new repository entry with default values
new_repo = {
"repo_url": repo_url,
"module_root": (payload.get("module_root") or "auto").strip(),
"tests_root": (payload.get("tests_root") or "auto").strip(),
"resource_tier": (payload.get("resource_tier") or "small").strip().lower(),
}
rows.append(new_repo)
log_service_operation("CSV", "Saving updated repository list")
_write_csv(rows)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/repos", duration_ms, repo_url=repo_url,
resource_tier=new_repo["resource_tier"])
return jsonify({"ok": True})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos", str(e), duration_ms, repo_url=repo_url)
raise
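# Illustrative request body for the endpoint above (repo URL is a placeholder):
#
#   POST /api/repos
#   {"repo_url": "https://github.com/org/repo", "module_root": "auto",
#    "tests_root": "auto", "resource_tier": "small"}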
# =============================================================================
# ANALYZER API
# =============================================================================
@app.post("/api/analyze_repo")
def analyze_repo() -> Any:
"""
POST /api/analyze_repo - Submits a repository for analysis
This endpoint:
1. Validates the repository URL
2. Submits the repository to the analyzer service
3. Returns an analysis job ID for tracking progress
The analysis process examines the repository structure, dependencies,
and configuration to determine optimal settings for code optimization.
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
log_request_start("POST /api/analyze_repo", repo_url=repo_url)
try:
if not repo_url:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/analyze_repo", "repo_url is required", duration_ms)
return jsonify({"error": "repo_url is required"}), 400
log_service_operation("ANALYZER", "Submitting analysis job", repo_url=repo_url)
job_id = submit_analysis(repo_url)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/analyze_repo", duration_ms, repo_url=repo_url, analysis_id=job_id)
return jsonify({"ok": True, "analysis_id": job_id})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/analyze_repo", str(e), duration_ms, repo_url=repo_url)
raise
@app.get("/api/analyze_repo/status")
def analyze_status() -> Any:
"""
GET /api/analyze_repo/status - Check the status of a repository analysis
This endpoint:
1. Validates the analysis ID is provided
2. Checks the current status of the analysis job
3. Returns status information (pending, running, completed, failed)
Query params: analysis_id
Returns: JSON with status and message
"""
start_time = time.time()
analysis_id = (request.args.get("analysis_id") or "").strip()
log_request_start("GET /api/analyze_repo/status", analysis_id=analysis_id)
try:
if not analysis_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/status", "analysis_id is required", duration_ms)
return jsonify({"error": "analysis_id is required"}), 400
log_service_operation("ANALYZER", "Checking analysis status", analysis_id=analysis_id)
st = analysis_status(analysis_id)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/analyze_repo/status", duration_ms,
analysis_id=analysis_id, status=st.status)
return jsonify({
"ok": True,
"status": st.status,
"message": st.message,
})
except KeyError:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/status", "analysis not found", duration_ms,
analysis_id=analysis_id)
return jsonify({"error": "analysis not found"}), 404
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/status", str(e), duration_ms, analysis_id=analysis_id)
raise
@app.get("/api/analyze_repo/result")
def analyze_result() -> Any:
"""
GET /api/analyze_repo/result - Get the results of a completed analysis
This endpoint:
1. Validates the analysis ID is provided
2. Fetches the analysis results if completed
3. Returns detailed analysis data including recommended settings
Query params: analysis_id
Returns: JSON with analysis results or error if not ready
"""
start_time = time.time()
analysis_id = (request.args.get("analysis_id") or "").strip()
log_request_start("GET /api/analyze_repo/result", analysis_id=analysis_id)
try:
if not analysis_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/result", "analysis_id is required", duration_ms)
return jsonify({"error": "analysis_id is required"}), 400
log_service_operation("ANALYZER", "Fetching analysis result", analysis_id=analysis_id)
res = analysis_result(analysis_id)
if not res:
st = analysis_status(analysis_id)
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/result", f"not ready (status={st.status})",
duration_ms, analysis_id=analysis_id)
return jsonify({"error": f"not ready (status={st.status})"}), 400
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/analyze_repo/result", duration_ms, analysis_id=analysis_id)
return jsonify({"ok": True, "result": res})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/analyze_repo/result", str(e), duration_ms, analysis_id=analysis_id)
raise
@app.post("/api/apply_analysis")
def apply_analysis() -> Any:
"""
POST /api/apply_analysis - Apply analysis results to repository configuration
This endpoint:
1. Loads the analysis results for a repository
2. Applies selected analysis recommendations to the repository configuration
3. Updates the CSV with the new settings
4. Returns success with applied field information
Body: {repo_url, apply: {module_root?, tests_root?, resource_tier?}}
Returns: JSON with success status and applied fields
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
apply = payload.get("apply") or {}
log_request_start("POST /api/apply_analysis", repo_url=repo_url,
fields=list(apply.keys()) if apply else [])
try:
if not repo_url:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/apply_analysis", "repo_url is required", duration_ms)
return jsonify({"error": "repo_url is required"}), 400
log_service_operation("ANALYZER", "Loading analysis result", repo_url=repo_url)
result = load_analysis_for_repo(repo_url)
if not result:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/apply_analysis", "no saved analysis found for repo",
duration_ms, repo_url=repo_url)
return jsonify({"error": "no saved analysis found for repo"}), 404
log_service_operation("CSV", "Reading repository configurations")
rows = _read_csv()
idx = _find_row(rows, repo_url)
if idx is None:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/apply_analysis", "repo not found in CSV",
duration_ms, repo_url=repo_url)
return jsonify({"error": "repo not found in CSV"}), 404
# Apply selected fields from analysis to repository configuration
applied_fields = []
cf = result.get("codeflash", {})
if apply.get("module_root") and cf.get("module_root"):
rows[idx]["module_root"] = cf["module_root"]
applied_fields.append(f"module_root={cf['module_root']}")
if apply.get("tests_root") and cf.get("tests_root"):
rows[idx]["tests_root"] = cf["tests_root"]
applied_fields.append(f"tests_root={cf['tests_root']}")
if apply.get("resource_tier") and result.get("resources", {}).get("tier"):
rows[idx]["resource_tier"] = result["resources"]["tier"]
applied_fields.append(f"resource_tier={result['resources']['tier']}")
log_service_operation("CSV", "Saving updated repository configurations")
_write_csv(rows)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/apply_analysis", duration_ms, repo_url=repo_url,
applied_fields=applied_fields)
return jsonify({"ok": True})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/apply_analysis", str(e), duration_ms, repo_url=repo_url)
raise
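# Illustrative body for POST /api/apply_analysis (repo URL is a placeholder); each
# truthy flag under "apply" copies the corresponding recommendation into the CSV:
#
#   {"repo_url": "https://github.com/org/repo",
#    "apply": {"module_root": true, "tests_root": true, "resource_tier": false}}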
@app.put("/api/repos")
def update_repo() -> Any:
"""
PUT /api/repos - Updates an existing repository
Body: {repo_url, module_root, tests_root, resource_tier}
Returns: JSON with success status
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
log_request_start("PUT /api/repos", repo_url=repo_url)
try:
if not repo_url:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("PUT /api/repos", "repo_url is required", duration_ms)
return jsonify({"error": "repo_url is required"}), 400
log_service_operation("CSV", "Reading repository configurations")
rows = _read_csv()
idx = _find_row(rows, repo_url)
if idx is None:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("PUT /api/repos", "repo not found", duration_ms, repo_url=repo_url)
return jsonify({"error": "repo not found"}), 404
rows[idx] = {
"repo_url": repo_url,
"module_root": (payload.get("module_root") or "auto").strip(),
"tests_root": (payload.get("tests_root") or "auto").strip(),
"resource_tier": (payload.get("resource_tier") or "small").strip().lower(),
}
log_service_operation("CSV", "Saving updated repository configurations")
_write_csv(rows)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("PUT /api/repos", duration_ms, repo_url=repo_url)
return jsonify({"ok": True})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("PUT /api/repos", str(e), duration_ms, repo_url=repo_url)
raise
@app.delete("/api/repos")
def delete_repo() -> Any:
"""
DELETE /api/repos - Deletes a repository
Body: {repo_url}
Returns: JSON with success status
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
log_request_start("DELETE /api/repos", repo_url=repo_url)
try:
log_service_operation("CSV", "Reading repository configurations")
rows = _read_csv()
new_rows = [r for r in rows if _canon_repo_url(r.get("repo_url", "")) != _canon_repo_url(repo_url)]
if len(new_rows) == len(rows):
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("DELETE /api/repos", "repo not found", duration_ms, repo_url=repo_url)
return jsonify({"error": "repo not found"}), 404
log_service_operation("CSV", "Saving updated repository configurations")
_write_csv(new_rows)
# also clear job id if exists
log_service_operation("JOBS", "Cleaning up job index")
jobs_index = _load_jobs_index()
key = _canon_repo_url(repo_url)
if key in jobs_index:
jobs_index.pop(key, None)
_save_jobs_index(jobs_index)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("DELETE /api/repos", duration_ms, repo_url=repo_url)
return jsonify({"ok": True})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("DELETE /api/repos", str(e), duration_ms, repo_url=repo_url)
raise
@app.post("/api/repos/bulk")
def bulk_upload_repos() -> Any:
"""
POST /api/repos/bulk - Bulk upload repositories from CSV data
Body: {csv_data: string}
Returns: JSON with validation results and success/error counts
"""
start_time = time.time()
payload = request.get_json(force=True)
csv_data = (payload.get("csv_data") or "").strip()
log_request_start("POST /api/repos/bulk", data_size=len(csv_data) if csv_data else 0)
try:
if not csv_data:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos/bulk", "csv_data is required", duration_ms)
return jsonify({"error": "csv_data is required"}), 400
# Parse and validate CSV data
log_service_operation("CSV", "Parsing and validating bulk CSV data")
validation_results = []
valid_rows = []
# Split CSV data into lines and parse
lines = csv_data.strip().split('\n')
if not lines:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos/bulk", "Empty CSV data", duration_ms)
return jsonify({"error": "Empty CSV data"}), 400
expected_headers = ["repo_url", "module_root", "tests_root", "resource_tier"]
# Parse header
import io
csv_reader = csv.DictReader(io.StringIO(csv_data))
# Validate header
if not all(h in csv_reader.fieldnames for h in expected_headers):
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos/bulk", f"Invalid CSV header", duration_ms,
expected=expected_headers, found=csv_reader.fieldnames)
return jsonify({
"error": f"Invalid CSV header. Expected: {', '.join(expected_headers)}",
"found_headers": csv_reader.fieldnames
}), 400
# Get existing repositories to check for duplicates
log_service_operation("CSV", "Loading existing repositories for duplicate check")
existing_rows = _read_csv()
existing_urls = {r["repo_url"].rstrip("/") for r in existing_rows}
new_urls_in_csv = set()
# Validate each row
for row_idx, row in enumerate(csv_reader, start=2): # Start at 2 since header is line 1
validation_result = {
"line": row_idx,
"repo_url": row.get("repo_url", "").strip(),
"errors": [],
"warnings": []
}
# Validate repo_url
repo_url = row.get("repo_url", "").strip()
if not repo_url:
validation_result["errors"].append("repo_url is required")
elif not repo_url.startswith("https://github.com/"):
validation_result["errors"].append("repo_url must be a valid GitHub URL starting with https://github.com/")
elif repo_url.rstrip("/") in existing_urls:
validation_result["errors"].append("repository already exists in database")
elif repo_url.rstrip("/") in new_urls_in_csv:
validation_result["errors"].append("duplicate repository URL in CSV")
else:
new_urls_in_csv.add(repo_url.rstrip("/"))
# Validate module_root
module_root = row.get("module_root", "").strip()
if not module_root:
validation_result["warnings"].append("module_root is empty, will default to 'auto'")
module_root = "auto"
# Validate tests_root
tests_root = row.get("tests_root", "").strip()
if not tests_root:
validation_result["warnings"].append("tests_root is empty, will default to 'auto'")
tests_root = "auto"
# Validate resource_tier
resource_tier = row.get("resource_tier", "").strip().lower()
valid_tiers = ["small", "medium", "large"]
if not resource_tier:
validation_result["warnings"].append("resource_tier is empty, will default to 'small'")
resource_tier = "small"
elif resource_tier not in valid_tiers:
validation_result["errors"].append(f"resource_tier must be one of: {', '.join(valid_tiers)}")
resource_tier = "small" # fallback
validation_results.append(validation_result)
# If no errors, add to valid rows
if not validation_result["errors"]:
valid_rows.append({
"repo_url": repo_url,
"module_root": module_root,
"tests_root": tests_root,
"resource_tier": resource_tier
})
# Count results
total_rows = len(validation_results)
error_count = sum(1 for r in validation_results if r["errors"])
warning_count = sum(1 for r in validation_results if r["warnings"] and not r["errors"])
valid_count = len(valid_rows)
log_service_operation("CSV", "Bulk validation completed",
total_rows=total_rows, valid_count=valid_count,
warning_count=warning_count, error_count=error_count)
# If there are any errors, don't save anything
if error_count > 0:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos/bulk", f"Validation failed: {error_count} errors found",
duration_ms, error_count=error_count, valid_count=valid_count)
return jsonify({
"ok": False,
"message": f"Validation failed: {error_count} errors found",
"validation_results": validation_results,
"stats": {
"total_rows": total_rows,
"valid_count": valid_count,
"warning_count": warning_count,
"error_count": error_count
}
}), 400
# Save valid rows to CSV
if valid_rows:
existing_rows.extend(valid_rows)
log_service_operation("CSV", "Saving bulk uploaded repositories")
_write_csv(existing_rows)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/repos/bulk", duration_ms,
added_count=valid_count, warning_count=warning_count)
return jsonify({
"ok": True,
"message": f"Successfully added {valid_count} repositories",
"validation_results": validation_results,
"stats": {
"total_rows": total_rows,
"valid_count": valid_count,
"warning_count": warning_count,
"error_count": error_count
}
})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/repos/bulk", f"CSV parsing error: {str(e)}", duration_ms)
return jsonify({"error": f"CSV parsing error: {str(e)}"}), 400
# =============================================================================
# EC2 JOB MANAGEMENT
# =============================================================================
def _job_name_for_repo(repo_url: str) -> str:
"""
Generate a unique job name for a repository URL.
This creates a consistent naming scheme for EC2 instances based on the repository:
- Extracts organization and repository name from GitHub URL
- Creates a sanitized name suitable for AWS resource naming
- Ensures uniqueness across different repositories
"""
parts = repo_url.rstrip("/").split("/")
org = parts[-2] if len(parts) >= 2 else "repo"
name = parts[-1].replace(".", "-")
job_name = f"codeflash-opt-{org}-{name}".lower()
logger.debug(f"🏷️ Generated job name: {job_name} for repo: {repo_url}")
return job_name
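# Example (illustrative): _job_name_for_repo("https://github.com/org/my.repo")
# returns "codeflash-opt-org-my-repo".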
def _launch_ec2_job(row: Dict[str, str], launch_opts: Optional[Dict[str, str]] = None) -> str:
"""
Launch an EC2 instance, upload scripts, start optimization in background.
This is the core function that orchestrates the complete optimization workflow:
1. INSTANCE REUSE CHECK:
- Checks if there's already a running instance for this repository
- Reuses existing instances to avoid duplicate work and costs
2. INSTANCE LAUNCH:
- Launches a new EC2 instance with pre-configured user data
- User data installs prerequisites (git, python, codeflash, etc.)
- Tags the instance for tracking and cost management
3. INSTANCE PREPARATION:
- Waits for public IP assignment
- Waits for SSH connectivity
- Waits for bootstrap completion (user-data script)
4. SCRIPT UPLOAD:
- Uploads optimization scripts (run_optimization.sh, detect_roots.py)
- Sets proper permissions and ownership
- Ensures scripts are executable
5. ENVIRONMENT SETUP:
- Loads analysis results for the repository
- Builds environment variables from analysis data
- Creates a wrapper script with all necessary configuration
6. JOB EXECUTION:
- Uploads the wrapper script to the instance
- Starts the optimization job in the background
- Starts a completion watcher to monitor progress
7. CLEANUP:
- The completion watcher automatically terminates the instance when done
- Prevents resource waste and cost accumulation
Args:
row: Repository configuration from CSV (repo_url, module_root, tests_root, resource_tier)
launch_opts: Optional launch options (target_file, target_function for single-file optimization)
Returns: instance_id used as job_id for tracking
"""
repo_url = row["repo_url"] # Extract repository URL from configuration row
job_name = _job_name_for_repo(repo_url) # Generate unique job name for EC2 instance
log_service_operation("JOB", "Launch requested", repo_url=repo_url, job_name=job_name)
# If an instance is already associated with this repo and still running, reuse it
jobs_index = _load_jobs_index() # Load current job-to-instance mappings
canon = _canon_repo_url(repo_url) # Normalize URL for consistent lookup
log_service_operation("AWS_EC2", "Checking for reusable instance", repo_url=repo_url, canonical=canon)
existing_id = jobs_index.get(canon) # Check if we already have an instance for this repo
if existing_id: # If an instance exists, check if it's still usable
state = _describe_instance_state(existing_id) # Get current EC2 instance state
if state in {"pending", "running", "stopping", "stopped"}: # Instance is still active
log_service_operation("AWS_EC2", "Reusing existing instance", instance_id=existing_id, state=state)
return existing_id # Return existing instance ID to avoid duplicate work
else: # Instance is terminated or in unusable state
log_service_operation("AWS_EC2", "Existing instance not reusable", instance_id=existing_id, state=state)
# Otherwise launch a new instance
log_service_operation("AWS_EC2", "Launching new instance", job_name=job_name, canonical=canon)
instance_id = ec2_manager.launch_instance( # Launch new EC2 instance with user-data script
job_name=job_name, # Human-readable name for the instance
job_tag_value=job_name, # Tag value for identification
tags={"RepoUrl": canon}, # Additional tags for tracking
)
log_service_operation("AWS_EC2", "Instance launched", instance_id=instance_id, job_name=job_name)
# Save mapping early so UI reflects the new instance id promptly
log_service_operation("STATE", "Saving repo to instance mapping", repo_url=repo_url, instance_id=instance_id)
_set_repo_job(repo_url, instance_id) # Persist the repo-to-instance mapping
log_service_operation("STATE", "Mapping saved", canonical=canon, instance_id=instance_id)
# Resolve public IP and wait for bootstrap completion
public_ip: Optional[str] = None # Will store the public IP once assigned
ip_attempts_used = 0 # Track number of attempts for logging
for i in range(60): # Try for up to 5 minutes (60 * 5s = 300s)
ip_attempts_used = i + 1 # Count attempts starting from 1
public_ip = ec2_manager.get_public_ip(instance_id) # Check if public IP is assigned
if public_ip: # If we got the IP, we can proceed
break
# Log progress every ~30s to avoid log spam (6 * 5s)
if ip_attempts_used % 6 == 0: # Every 6 attempts = 30 seconds
log_service_operation("AWS_EC2", "Waiting for public IP assignment", instance_id=instance_id, attempts=ip_attempts_used)
time.sleep(5) # Wait 5 seconds before next attempt
if not public_ip: # If we never got a public IP
log_service_operation("AWS_EC2", "Public IP not assigned within timeout", instance_id=instance_id, attempts=ip_attempts_used)
return instance_id # Return instance ID anyway, job might still work later
else: # We successfully got the public IP
log_service_operation("AWS_EC2", "Public IP resolved", instance_id=instance_id, public_ip=public_ip, attempts=ip_attempts_used)
# Wait for SSH
log_service_operation("SSH", "Waiting for SSH reachability", instance_id=instance_id, public_ip=public_ip, timeout=600)
if not ec2_manager.wait_for_ssh(public_ip, timeout=600): # Wait up to 10 minutes for SSH
log_service_operation("AWS_EC2", "SSH not reachable within timeout", instance_id=instance_id, public_ip=public_ip, timeout=600)
return instance_id # Return instance ID, job might still work later
log_service_operation("SSH", "SSH reachable", instance_id=instance_id, ip=public_ip) # SSH is working
# Wait for bootstrap marker created by user-data
try:
log_service_operation("SSH", "Checking bootstrap completion", instance_id=instance_id, ip=public_ip)
ssh = ec2_manager.open_ssh(public_ip) # Open SSH connection to instance
try:
bootstrap_ready = False # Track if bootstrap is complete
bootstrap_attempts = 0 # Count bootstrap check attempts
for bootstrap_attempts in range(1, 121): # Try for up to 10 minutes (120 * 5s = 600s)
_, stdout, _ = ssh.exec_command("test -f /home/ubuntu/.bootstrap_done && echo READY || echo WAIT") # Check for bootstrap marker file
if (stdout.read() or b"").decode().strip() == "READY": # If marker file exists
bootstrap_ready = True # Bootstrap is complete
break
if bootstrap_attempts % 12 == 0: # every ~60s (12 * 5s = 60s)
log_service_operation("SSH", "Still waiting for bootstrap marker", instance_id=instance_id, ip=public_ip, attempts=bootstrap_attempts)
time.sleep(5) # Wait 5 seconds before next check
finally:
ssh.close() # Always close SSH connection
if bootstrap_ready: # Bootstrap completed successfully
log_service_operation("SSH", "Bootstrap completion detected", instance_id=instance_id, ip=public_ip, attempts=bootstrap_attempts)
else: # Bootstrap didn't complete in time
log_service_operation("SSH", "Bootstrap marker not detected within wait window", instance_id=instance_id, ip=public_ip, attempts=bootstrap_attempts)
except Exception as e: # Handle any SSH or bootstrap check errors
logger.exception(f"Error while checking bootstrap completion on {instance_id}@{public_ip}: {e}")
# Upload required scripts
try:
local_root = BASE_DIR # Get the project root directory
log_service_operation("SSH", "Uploading optimization scripts", instance_id=instance_id, ip=public_ip)
ec2_manager.upload_file(public_ip, str(local_root / "scripts" / "run_optimization.sh"), # Upload main optimization script
"/home/ubuntu/app/scripts/run_optimization.sh")
log_service_operation("SSH", "Uploaded script", instance_id=instance_id, ip=public_ip, filename="run_optimization.sh")
ec2_manager.upload_file(public_ip, str(local_root / "scripts" / "detect_roots.py"), # Upload root detection script
"/home/ubuntu/app/scripts/detect_roots.py")
log_service_operation("SSH", "Uploaded script", instance_id=instance_id, ip=public_ip, filename="detect_roots.py")
ec2_manager.upload_file(public_ip, str(local_root / "scripts" / "llm_setup_helper.py"), # Upload LLM setup helper
"/home/ubuntu/app/scripts/llm_setup_helper.py")
log_service_operation("SSH", "Uploaded script", instance_id=instance_id, ip=public_ip, filename="llm_setup_helper.py")
# Ensure ownership and execute bits
log_service_operation("SSH", "Setting script permissions and ownership", instance_id=instance_id, ip=public_ip)
ssh = ec2_manager.open_ssh(public_ip) # Open SSH for permission changes
try:
ssh.exec_command( # Set proper ownership and permissions
"sudo chown -R ubuntu:ubuntu /home/ubuntu/app && chmod +x /home/ubuntu/app/scripts/*.sh; sed -i 's/\\r$//' /home/ubuntu/app/scripts/*.sh || true"
) # Fix line endings and set execute permissions
log_service_operation("SSH", "Script permissions set", instance_id=instance_id, ip=public_ip)
finally:
ssh.close() # Always close SSH connection
except Exception as e: # Handle any script upload errors
logger.exception(f"Failed to upload scripts to {instance_id}@{public_ip}: {e}")
# Build environment and wrapper script
log_service_operation("ANALYSIS", "Loading analysis for repo", repo_url=repo_url)
analysis = load_analysis_for_repo(repo_url) or {} # Load repository analysis results
log_service_operation("ANALYSIS", "Analysis loaded", has_data=bool(analysis))
cf = analysis.get("codeflash", {}) if isinstance(analysis, dict) else {} # Extract codeflash configuration
tests_block = analysis.get("tests", {}) if isinstance(analysis, dict) else {} # Extract test configuration
sys_pkgs_list = analysis.get("system_packages") or [] # Get system packages to install
py_pkgs_list = analysis.get("python_packages") or [] # Get Python packages to install
install = analysis.get("install", {}) or {} # Get custom install commands
def _join_cmds(key: str) -> str: # Helper function to join command lists
cmds = install.get(key) or [] # Get commands for specific phase
if not isinstance(cmds, list) or not cmds: # Validate it's a non-empty list
return "" # Return empty string if no commands
return " && ".join(cmds) # Join commands with && for sequential execution
pre_cmds = _join_cmds("pre_install_cmds") # Commands to run before package installation
ins_cmds = _join_cmds("install_cmds") # Commands to run during package installation
post_cmds = _join_cmds("post_install_cmds") # Commands to run after package installation
log_service_operation(
"ANALYSIS",
"Install commands prepared",
has_pre=bool(pre_cmds), # Log whether we have pre-install commands
has_install=bool(ins_cmds), # Log whether we have install commands
has_post=bool(post_cmds), # Log whether we have post-install commands
)
# JSON for formatter cmds
try:
formatter_cmds_json = json.dumps(cf.get("formatter_cmds", [])) # Convert formatter commands to JSON
except Exception: # Handle JSON serialization errors
formatter_cmds_json = "[\"disabled\"]" # Default to disabled if serialization fails
num_formatter_cmds = len(cf.get("formatter_cmds", [])) if isinstance(cf.get("formatter_cmds", []), list) else 0 # Count formatter commands
# JSON for python packages list
pip_specs_count = 0 # Track number of pip packages
try:
def _to_spec(item: Any) -> Optional[str]: # Convert package item to pip spec string
if isinstance(item, str): # If it's already a string
return item.strip() or None # Return trimmed string or None if empty
if isinstance(item, dict): # If it's a dictionary with name/version
name = str(item.get("name") or "").strip() # Extract package name
spec = str(item.get("version_spec") or "").strip() # Extract version specification
if not name: # If no name provided
return None # Skip this package
return name + (spec if spec else "") # Combine name and version spec
return None # Skip unknown item types
pip_specs = [] # List to store pip package specifications
for it in py_pkgs_list: # Process each Python package
spec = _to_spec(it) # Convert to pip spec
if spec: # If conversion succeeded
pip_specs.append(spec) # Add to specifications list
pip_packages_json = json.dumps(pip_specs) # Convert to JSON string
pip_specs_count = len(pip_specs) # Count successful specifications
except Exception: # Handle any errors in package processing
pip_packages_json = "[]" # Default to empty list
pip_specs_count = 0 # Reset count
log_service_operation(
"ANALYSIS",
"Environment summary",
system_packages=len(sys_pkgs_list) if isinstance(sys_pkgs_list, list) else 0, # Count system packages
pip_packages=pip_specs_count, # Count Python packages
formatter_cmds=num_formatter_cmds, # Count formatter commands
has_pytest_cmd=bool(tests_block.get("test_command")), # Check if test command exists
)
# Compose wrapper script
# Export only non-empty secrets; otherwise, wrapper will print a helpful error
cf_key = os.getenv('CODEFLASH_API_KEY') or '' # Get Codeflash API key from environment
gh_token = os.getenv('GITHUB_TOKEN') or '' # Get GitHub token from environment
anthropic_key = os.getenv('ANTHROPIC_API_KEY') or '' # Anthropic API key for LLM setup helper
anthropic_model = os.getenv('ANTHROPIC_MODEL') or 'claude-3-5-haiku-20241022'
env_lines = [ # Start building environment variable exports
(f"export ANTHROPIC_API_KEY='{anthropic_key}'" if anthropic_key else "unset ANTHROPIC_API_KEY"), # Expose Anthropic key if provided
(f"export CODEFLASH_API_KEY='{cf_key}'" if cf_key else "unset CODEFLASH_API_KEY"), # Set or unset Codeflash key
(f"export GITHUB_TOKEN='{gh_token}'" if gh_token else "unset GITHUB_TOKEN"), # Set or unset GitHub token
(f"export GH_TOKEN='{gh_token}'" if gh_token else "unset GH_TOKEN"), # Also set GH_TOKEN alias
f"export ANTHROPIC_MODEL='{anthropic_model}'",
f"export GITHUB_REPO_URL='{repo_url}'", # Set repository URL
f"export MODULE_ROOT='{row.get('module_root', 'auto')}'", # Set module root path
f"export TESTS_ROOT='{row.get('tests_root', 'auto')}'", # Set tests root path
]
# Pass optional single-file optimization targets
if launch_opts: # If launch options are provided
tf = (launch_opts.get('target_file') or '').strip() # Get target file path
fn = (launch_opts.get('target_function') or '').strip() # Get target function name
if tf: # If target file is specified
env_lines.append(f"export CF_TARGET_FILE='{tf}'") # Set target file environment variable
if fn: # If target function is specified
env_lines.append(f"export CF_TARGET_FUNCTION='{fn}'") # Set target function environment variable
if cf.get("module_root"): # If analysis found a module root
env_lines.append(f"export LLM_MODULE_ROOT='{cf['module_root']}'") # Set LLM module root
if cf.get("tests_root"): # If analysis found a tests root
env_lines.append(f"export LLM_TESTS_ROOT='{cf['tests_root']}'") # Set LLM tests root
if tests_block.get("test_command"): # If test command is available
# Properly escape the test command for shell export
test_cmd = tests_block['test_command'].replace("'", "'\"'\"'") # Escape single quotes in test command
env_lines.append(f"export LLM_PYTEST_CMD='{test_cmd}'") # Set test command environment variable
env_lines.append(f"export LLM_FORMATTER_CMDS='{formatter_cmds_json}'") # Set formatter commands JSON
env_lines.append(f"export LLM_PIP_PACKAGES='{pip_packages_json}'") # Set pip packages JSON
system_packages = " ".join(sys_pkgs_list) if sys_pkgs_list else "" # Join system packages into space-separated string
if system_packages: # If we have system packages to install
env_lines.append(f"export SYSTEM_PACKAGES='{system_packages}'") # Set system packages environment variable
if pre_cmds: # If we have pre-install commands
# Properly escape commands for shell export
escaped_pre = pre_cmds.replace("'", "'\"'\"'") # Escape single quotes in pre-install commands
env_lines.append(f"export PRE_INSTALL_CMDS='{escaped_pre}'") # Set pre-install commands environment variable
if ins_cmds: # If we have install commands
escaped_ins = ins_cmds.replace("'", "'\"'\"'") # Escape single quotes in install commands
env_lines.append(f"export INSTALL_CMDS='{escaped_ins}'") # Set install commands environment variable
if post_cmds: # If we have post-install commands
escaped_post = post_cmds.replace("'", "'\"'\"'") # Escape single quotes in post-install commands
env_lines.append(f"export POST_INSTALL_CMDS='{escaped_post}'") # Set post-install commands environment variable
log_service_operation(
"ANALYSIS",
"Environment variables composed",
env_lines=len(env_lines), # Log total number of environment variables
has_system_packages=bool(sys_pkgs_list), # Log whether system packages are configured
has_pre_install=bool(pre_cmds), # Log whether pre-install commands exist
has_install=bool(ins_cmds), # Log whether install commands exist
has_post_install=bool(post_cmds), # Log whether post-install commands exist
)
# Non-secret env vars passthrough
try:
extra_env = (analysis.get("env", {}) or {}).get("non_secret_env_vars", {}) if isinstance(analysis, dict) else {} # Get additional environment variables from analysis
if isinstance(extra_env, dict): # If we have extra environment variables
for k, v in extra_env.items(): # Process each additional environment variable
try:
key = str(k) # Convert key to string
if not re.match(r"^[A-Z_][A-Z0-9_]*$", key): # Validate environment variable name format
continue # Skip invalid environment variable names
val = str(v) # Convert value to string
# Escape single quotes in value
val = val.replace("'", "'\\''") # Escape single quotes in environment variable value
env_lines.append(f"export {key}='{val}'") # Add environment variable export
except Exception: # Handle any errors in environment variable processing
continue # Skip problematic environment variables
except Exception: # Handle any errors in extra environment processing
pass # Continue if extra environment processing fails
wrapper = "\n".join([ # Create wrapper script by joining lines
"#!/bin/bash", # Shebang for bash script
"set -euo pipefail", # Exit on error, undefined vars, pipe failures
"LOG_DIR=/home/ubuntu/app/logs", # Set log directory path
"mkdir -p \"$LOG_DIR\"", # Create log directory if it doesn't exist
"TS=$(date -Is | sed 's/[:+]/-/g')", # Timestamp for log rotation (safe filename)
"LOG_FILE=$LOG_DIR/optimization-$TS.log", # Timestamped log file
"PID_FILE=$LOG_DIR/optimization.pid", # PID file for running job
"EXIT_FILE=$LOG_DIR/job.exitcode", # Exit code marker
"STAGE_FILE=$LOG_DIR/stage.jsonl", # Stage tracking file (JSONL)
"# Clean previous markers to ensure fresh run",
"rm -f \"$PID_FILE\" \"$EXIT_FILE\" || true",
"touch \"$LOG_FILE\" && chmod 666 \"$LOG_FILE\"", # Create log file with write permissions
"touch \"$STAGE_FILE\" && chmod 666 \"$STAGE_FILE\" || true",
"echo '=== Job start' $(date -Is) | tee -a \"$LOG_FILE\"", # Log job start timestamp
"echo USER=$(whoami) | tee -a \"$LOG_FILE\"", # Log current user
"echo HOME=$HOME | tee -a \"$LOG_FILE\"", # Log home directory
"echo PWD=$(pwd) | tee -a \"$LOG_FILE\"", # Log current working directory
"env | sort | sed -n '1,40p' | sed 's/.*/ENV: &/' | tee -a \"$LOG_FILE\"", # Log first 40 environment variables
"export WORK_DIR=/home/ubuntu/work", # Set working directory for optimization
"# Export markers for child processes (runner) to be able to update status",
"export LOG_FILE PID_FILE EXIT_FILE STAGE_FILE WORK_DIR",
"# Stage: wrapper_started",
"echo \"{\\\"ts\\\":\\\"$(date -Is)\\\",\\\"stage\\\":\\\"wrapper_started\\\"}\" >> \"$STAGE_FILE\" || true",
# Ensure a basic Python is available via 'python' fallback
"if ! command -v python >/dev/null 2>&1 && command -v python3 >/dev/null 2>&1; then ln -sf $(command -v python3) /home/ubuntu/python || true; export PATH=/home/ubuntu:$PATH; fi", # Create python symlink if needed
*env_lines, # Insert all environment variable exports
"if [ -n \"${SYSTEM_PACKAGES:-}\" ]; then apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y ${SYSTEM_PACKAGES} || true; fi", # Install system packages if specified
# Choose mode: single file or entire codebase
"if [ -n \"${CF_TARGET_FILE:-}\" ]; then export CF_MODE=single; else export CF_MODE=all; fi", # Set optimization mode based on target file
# Normalize line endings just in case the script was uploaded with CRLF
"sed -i 's/\\r$//' /home/ubuntu/app/scripts/run_optimization.sh || true", # Fix line endings in optimization script
"# Start the optimization script and record PID",
"( bash /home/ubuntu/app/scripts/run_optimization.sh >>\"$LOG_FILE\" 2>&1 ) & echo $! > \"$PID_FILE\"",
"# Wait for process and persist exit code",
"PID=$(cat \"$PID_FILE\" 2>/dev/null || echo 0)",
"if [ \"$PID\" -gt 0 ] && kill -0 \"$PID\" 2>/dev/null; then",
" wait \"$PID\"; RC=$?; echo $RC > \"$EXIT_FILE\";",
"else",
" echo 1 > \"$EXIT_FILE\"; RC=1;",
"fi",
"# Stage: wrapper_finished",
"echo \"{\\\"ts\\\":\\\"$(date -Is)\\\",\\\"stage\\\":\\\"wrapper_finished\\\",\\\"rc\\\":$RC}\" >> \"$STAGE_FILE\" || true",
"echo '=== Job end' $(date -Is) | tee -a \"$LOG_FILE\"", # Log job end timestamp
])
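# The wrapper leaves well-known marker files under /home/ubuntu/app/logs/ that the monitoring
# endpoints below read back (timestamp shown is illustrative):
#   optimization-<TS>.log   full run output, e.g. optimization-2025-01-01T12-30-00-00-00.log
#   optimization.pid        PID of the background optimization process
#   job.exitcode            final exit code, written when the run finishes
#   stage.jsonl             JSONL stage events, e.g. {"ts":"2025-01-01T12:30:00+00:00","stage":"wrapper_started"}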
log_service_operation("SSH", "Uploading wrapper script", instance_id=instance_id, ip=public_ip, remote_path="/home/ubuntu/app/run_job.sh")
try:
ec2_manager.upload_text(public_ip, wrapper, "/home/ubuntu/app/run_job.sh") # Upload the wrapper script to instance
# Verify files exist before starting
run_path = "/home/ubuntu/app/run_job.sh" # Path to wrapper script on remote instance
script_path = "/home/ubuntu/app/scripts/run_optimization.sh" # Path to optimization script on remote instance
has_runner = ec2_manager.file_exists(public_ip, run_path) # Check if wrapper script exists
has_script = ec2_manager.file_exists(public_ip, script_path) # Check if optimization script exists
log_service_operation("AWS_EC2", "Remote files present", run_job=has_runner, opt_script=has_script) # Log file existence status
# Run in background; fully detach so it survives SSH session
log_service_operation("SSH", "Starting background job", instance_id=instance_id, ip=public_ip, run_path=run_path)
ec2_manager.exec( # Execute the wrapper script in background
public_ip,
"bash -lc 'chmod +x /home/ubuntu/app/run_job.sh && setsid nohup /home/ubuntu/app/run_job.sh >> /home/ubuntu/app/logs/launcher.log 2>&1 & disown'", # Make executable and run in background
get_pty=False, # Don't allocate pseudo-terminal
)
# Start watcher for cleanup
log_service_operation("WATCHER", "Starting completion watcher", instance_id=instance_id, ip=public_ip) # Log watcher start
_start_completion_watcher(instance_id, public_ip, repo_url) # Start background thread to monitor job completion
except Exception as e: # Handle any errors in job execution setup
logger.exception(f"Failed to start remote job on {instance_id}@{public_ip}: {e}") # Log the error
log_service_operation("JOB", "Launch orchestration finished", instance_id=instance_id, repo_url=repo_url) # Log completion of launch process
return instance_id # Return the instance ID for tracking
# =============================================================================
# JOB EXECUTION API
# =============================================================================
@app.post("/api/run")
def run_single() -> Any:
"""
POST /api/run - Runs optimization for a single repository
This endpoint:
1. Validates the repository exists in the configuration
2. Optionally validates target_file for single-file optimization
3. Launches an EC2 instance and starts the optimization job
4. Updates the job index to track the new job
5. Returns the job ID for monitoring progress
Body: {repo_url, target_file?, target_function?}
Returns: JSON with job_id
"""
start_time = time.time()
payload = request.get_json(force=True)
repo_url = (payload.get("repo_url") or "").strip()
target_file = (payload.get("target_file") or "").strip()
target_function = (payload.get("target_function") or "").strip()
log_request_start("POST /api/run", repo_url=repo_url)
try:
log_service_operation("CSV", "Finding repository configuration")
rows = _read_csv()
idx = _find_row(rows, repo_url)
if idx is None:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/run", "repo not found", duration_ms, repo_url=repo_url)
return jsonify({"error": "repo not found"}), 404
# Minimal validation for single-file mode
if target_file:
if target_file.startswith(("/", "..")) or not target_file.endswith(".py"):
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/run", "invalid target_file", duration_ms, repo_url=repo_url, target_file=target_file)
return jsonify({"error": "target_file must be a relative .py path under repo"}), 400
job_id = _launch_ec2_job(rows[idx], {
"target_file": target_file,
"target_function": target_function,
})
log_service_operation("JOBS", "Updating job index")
jobs_index = _load_jobs_index()
jobs_index[_canon_repo_url(repo_url)] = job_id
_save_jobs_index(jobs_index)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/run", duration_ms, repo_url=repo_url, job_id=job_id)
return jsonify({"ok": True, "job_id": job_id})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/run", str(e), duration_ms, repo_url=repo_url)
raise
@app.post("/api/run_all")
def run_all() -> Any:
"""
POST /api/run_all - Runs optimization for all repositories
This endpoint:
1. Reads all configured repositories from CSV
2. Launches optimization jobs for each repository
3. Handles errors gracefully for individual repositories
4. Updates the job index with all successful launches
5. Returns a mapping of repositories to job IDs or error messages
This is useful for bulk optimization of multiple repositories at once.
Returns: JSON with results mapping repo URLs to job IDs or errors
"""
start_time = time.time()
log_request_start("POST /api/run_all")
try:
log_service_operation("CSV", "Reading all repository configurations")
rows = _read_csv()
jobs_index = _load_jobs_index()
results: Dict[str, str] = {}
successful_jobs = 0
failed_jobs = 0
for row in rows:
repo_url = row["repo_url"]
try:
job_id = _launch_ec2_job(row)
jobs_index[_canon_repo_url(repo_url)] = job_id
results[repo_url] = job_id
successful_jobs += 1
except Exception as e:
results[repo_url] = f"ERROR: {e}"
failed_jobs += 1
logger.error(f"❌ Failed to submit job for {repo_url}: {e}")
log_service_operation("JOBS", "Updating job index after bulk run")
_save_jobs_index(jobs_index)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/run_all", duration_ms,
total_repos=len(rows), successful_jobs=successful_jobs,
failed_jobs=failed_jobs)
return jsonify({"results": results})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/run_all", str(e), duration_ms)
raise
# =============================================================================
# JOB MONITORING API
# =============================================================================
def _resolve_job_id(args) -> Optional[str]:
"""
Resolves job ID from request arguments
This utility function handles flexible job ID resolution:
- If job_id is provided directly, use it
- If repo_url is provided, look up the job ID from the job index
- Returns None if neither is provided or job not found
Args:
args: Request arguments (job_id or repo_url)
Returns: Job ID or None if not found
"""
job_id = args.get("job_id")
if job_id:
logger.debug(f"🔍 Using direct job_id: {job_id}")
return job_id
repo_url = args.get("repo_url")
if not repo_url:
return None
jobs_index = _load_jobs_index()
resolved_job_id = jobs_index.get(_canon_repo_url(repo_url))
logger.debug(f"🔍 Resolved job_id for {repo_url}: {resolved_job_id}")
return resolved_job_id
@app.get("/api/job_status")
def job_status() -> Any:
"""
GET /api/job_status - Gets job status for EC2-backed job
This endpoint provides comprehensive job status information:
1. Resolves the job ID from request parameters
2. Checks the EC2 instance state (pending, running, stopped, etc.)
3. Determines if the optimization job has completed
4. Reads the exit code if available
5. Returns a unified status (running, succeeded, failed)
Query params: job_id or repo_url
Returns: JSON with job status information
"""
start_time = time.time()
job_id = _resolve_job_id(request.args)
repo_url = request.args.get("repo_url")
log_request_start("GET /api/job_status", job_id=job_id, repo_url=repo_url)
try:
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_status", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
# job_id is EC2 instance id
state = _describe_instance_state(job_id) or "unknown"
public_ip = ec2_manager.get_public_ip(job_id) or None
# If instance has SSH and job exit code file, consider finished
finished = False
exit_code: Optional[int] = None
if public_ip:
try:
if ec2_manager.file_exists(public_ip, "/home/ubuntu/app/logs/job.exitcode"):
finished = True
# Try to read exit code
log_service_operation("SSH", "Reading job exit code", job_id=job_id, ip=public_ip)
ssh = ec2_manager.open_ssh(public_ip)
try:
_, stdout, _ = ssh.exec_command("cat /home/ubuntu/app/logs/job.exitcode")
try:
exit_code = int((stdout.read() or b"0").decode().strip() or "0")
except Exception:
exit_code = 0
finally:
ssh.close()
except Exception:
pass
status = "running"
if state in {"pending", "running"}:
status = "running"
elif finished:
status = "succeeded" if (exit_code is not None and exit_code == 0) else "failed"
elif state in {"stopped", "stopping", "shutting-down", "terminated"}:
status = "failed"
simple = {
"jobId": job_id,
"jobName": repo_url or job_id,
"status": status,
"state": state,
"publicIp": public_ip,
"exitCode": exit_code,
}
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_status", duration_ms, job_id=job_id, status=status)
return jsonify({"job": simple})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_status", str(e), duration_ms, job_id=job_id)
raise
@app.get("/api/job_logs")
def job_logs() -> Any:
"""
GET /api/job_logs - Gets job logs from the EC2 instance
This endpoint provides real-time access to optimization logs:
1. Resolves the job ID and gets the instance public IP
2. Fetches the most recent log entries from the optimization run
3. Provides fallback to other log sources if main log is empty
4. Returns log lines for display in the web UI
The logs show the complete optimization process including:
- Repository cloning and setup
- Dependency installation
- Codeflash optimization execution
- Results and any errors encountered
Query params: job_id or repo_url
Returns: JSON with log events (lines) from remote log file
"""
start_time = time.time()
job_id = _resolve_job_id(request.args)
repo_url = request.args.get("repo_url")
log_request_start("GET /api/job_logs", job_id=job_id, repo_url=repo_url)
try:
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
public_ip = ec2_manager.get_public_ip(job_id)
if not public_ip:
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_logs", duration_ms, job_id=job_id, note="public ip not available yet")
return jsonify({"events": [], "note": "instance public ip not available yet"})
# Resolve latest optimization log by timestamp if available; fallback to legacy symlink
def _latest_log_path(ip: str) -> str:
try:
ssh = ec2_manager.open_ssh(ip)
try:
# List timestamped logs and pick the newest lexicographically
cmd = "bash -lc 'ls -1 /home/ubuntu/app/logs/optimization-*.log 2>/dev/null | sort -r | head -n1'"
_, stdout, _ = ssh.exec_command(cmd)
candidate = (stdout.read() or b"").decode().strip()
if candidate:
return candidate
finally:
ssh.close()
except Exception:
pass
return "/home/ubuntu/app/logs/optimization.log"
main_log_path = _latest_log_path(public_ip)
log_service_operation("SSH", "Opening SSH to tail optimization log", job_id=job_id, ip=public_ip, path=main_log_path)
events = ec2_manager.read_file_tail(public_ip, main_log_path, lines=1000)
# Fallbacks
fallback = []
if not events:
# Try launcher log (contains early wrapper prints)
log_service_operation("SSH", "optimization log empty, fetching launcher.log", job_id=job_id, ip=public_ip)
fallback = ec2_manager.read_file_tail(public_ip, "/home/ubuntu/app/logs/launcher.log", lines=200)
if not events and not fallback:
# Finally try cloud-init user data
log_service_operation("SSH", "launcher.log empty, falling back to user-data.log", job_id=job_id, ip=public_ip)
fallback = ec2_manager.read_file_tail(public_ip, "/var/log/user-data.log", lines=200)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_logs", duration_ms, job_id=job_id, event_count=len(events), fallback_count=len(fallback or []))
# Try to fetch current stage info
stage_lines: List[str] = []
try:
stage_lines = ec2_manager.read_file_tail(public_ip, "/home/ubuntu/app/logs/stage.jsonl", lines=50)
except Exception:
stage_lines = []
current_stage = None
if stage_lines:
try:
import json as _json
for line in reversed(stage_lines):
line = (line or "").strip()
if not line:
continue
try:
obj = _json.loads(line)
if isinstance(obj, dict) and obj.get("stage"):
current_stage = obj
break
except Exception:
continue
except Exception:
current_stage = None
resp = {"events": events}
if current_stage is not None:
resp["stage"] = current_stage
if fallback:
resp["bootstrap"] = fallback
return jsonify(resp)
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs", str(e), duration_ms, job_id=job_id)
return jsonify({"error": str(e)}), 500
@app.get("/api/job_logs/download")
def job_logs_download() -> Any:
"""
GET /api/job_logs/download - Downloads the full optimization log from the EC2 instance
This endpoint provides access to complete log files for analysis:
1. Resolves the job ID and gets the instance public IP
2. Streams the complete log file content to the client
3. Handles large log files efficiently with chunked streaming
4. Provides fallback to other log sources if main log is missing
5. Sets appropriate headers for file download
This is useful for:
- Downloading complete logs for offline analysis
- Debugging optimization issues
- Archiving optimization results
Query params: job_id or repo_url
Returns: text/plain content of the log file
"""
start_time = time.time()
job_id = _resolve_job_id(request.args)
repo_url = request.args.get("repo_url")
log_request_start("GET /api/job_logs/download", job_id=job_id, repo_url=repo_url)
try:
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
public_ip = ec2_manager.get_public_ip(job_id)
if not public_ip:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download", "instance public ip not available yet", duration_ms)
return jsonify({"error": "instance public ip not available yet"}), 400
# Determine latest optimization log first; then fallback to launcher and user-data
latest_cmd = "bash -lc 'ls -1 /home/ubuntu/app/logs/optimization-*.log 2>/dev/null | sort -r | head -n1'"
used_path = None
try:
ssh_tmp = ec2_manager.open_ssh(public_ip)
try:
_, stdout, _ = ssh_tmp.exec_command(latest_cmd)
candidate = (stdout.read() or b"").decode().strip()
if candidate:
used_path = candidate
finally:
ssh_tmp.close()
except Exception:
used_path = None
if not used_path:
# Prefer legacy symlink, then launcher, then user-data
paths = [
"/home/ubuntu/app/logs/optimization.log",
"/home/ubuntu/app/logs/launcher.log",
"/var/log/user-data.log",
]
try:
for p in paths:
if ec2_manager.file_exists(public_ip, p):
used_path = p
break
except Exception:
used_path = None
if not used_path:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download", "no logs available", duration_ms, job_id=job_id)
return jsonify({"error": "no logs available"}), 404
# Obtain file size for Content-Length so the frontend can show download progress
total_size = 0
try:
ssh_stat = ec2_manager.open_ssh(public_ip)
try:
sftp_stat = ssh_stat.open_sftp()
try:
total_size = sftp_stat.stat(used_path).st_size or 0
finally:
sftp_stat.close()
finally:
ssh_stat.close()
except Exception:
total_size = 0
def generate():
# Stream via SFTP in chunks
log_service_operation("SSH", "Streaming log file download", job_id=job_id, ip=public_ip, path=used_path)
ssh = ec2_manager.open_ssh(public_ip)
try:
sftp = ssh.open_sftp()
try:
with sftp.file(used_path, "rb") as f:
while True:
chunk = f.read(65536)
if not chunk:
break
yield chunk
finally:
sftp.close()
finally:
ssh.close()
# Build friendly filename: <org-repo>_<YYYY-MM-DD_HH-MM>.log
slug = None
try:
parts = (repo_url or "").rstrip("/").split("/")
if len(parts) >= 2:
org = parts[-2]
name = parts[-1].replace(".git", "")
slug = f"{org}-{name}"
except Exception:
pass
# Get timestamp from filename if present
stamp = ""
base = os.path.basename(used_path)
# base may look like optimization-2024-09-20T18-22-33.log or optimization.log
m = re.match(r"optimization-([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2})", base)
if m:
ts = m.group(1).replace("T", "_")
stamp = f"_{ts}"
filename = f"{slug or job_id}{stamp}.log"
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_logs/download", duration_ms, job_id=job_id, path=used_path)
resp = Response(stream_with_context(generate()), mimetype="text/plain")
# Force download with naming convention
resp.headers["Content-Disposition"] = f"attachment; filename={filename}"
if total_size:
resp.headers["Content-Length"] = str(total_size)
return resp
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download", str(e), duration_ms, job_id=job_id)
return jsonify({"error": str(e)}), 500
@app.get("/api/job_optimizations")
def job_optimizations() -> Any:
"""
GET /api/job_optimizations - Extract Codeflash optimization review links from logs
This endpoint extracts optimization review links from job logs:
1. Resolves the job ID and gets the instance public IP
2. Searches the optimization log for Codeflash review URLs
3. Extracts and deduplicates the review links
4. Returns the count and list of review URLs
These links provide direct access to:
- Codeflash optimization reviews
- Detailed optimization suggestions
- Before/after code comparisons
- Performance improvement recommendations
Query params: job_id or repo_url
Returns: { count, links }
"""
start_time = time.time()
job_id = _resolve_job_id(request.args)
repo_url = request.args.get("repo_url")
log_request_start("GET /api/job_optimizations", job_id=job_id, repo_url=repo_url)
try:
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_optimizations", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
public_ip = ec2_manager.get_public_ip(job_id)
if not public_ip:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_optimizations", "instance public ip not available yet", duration_ms)
return jsonify({"error": "instance public ip not available yet"}), 400
# Resolve latest log file path
log_path = None
try:
ssh = ec2_manager.open_ssh(public_ip)
try:
cmd = "bash -lc 'ls -1 /home/ubuntu/app/logs/optimization-*.log 2>/dev/null | sort -r | head -n1'"
_, stdout, _ = ssh.exec_command(cmd)
candidate = (stdout.read() or b"").decode().strip()
if candidate:
log_path = candidate
finally:
ssh.close()
except Exception:
log_path = None
if not log_path:
# fallback to legacy symlink
log_path = "/home/ubuntu/app/logs/optimization.log"
if not ec2_manager.file_exists(public_ip, log_path):
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_optimizations", duration_ms, job_id=job_id, links=0)
return jsonify({"count": 0, "links": []})
grep_cmd = (
"bash -lc \"grep -ao 'https://app.codeflash.ai/review-optimizations/[A-Za-z0-9-]*' "
f"{log_path} | sort -u\""
)
code, out, _ = ec2_manager.exec_capture(public_ip, grep_cmd, get_pty=False)
found = [line.strip() for line in out.splitlines() if line.strip()]
# Deduplicate while preserving order
seen = set()
links: list[str] = []
for url in found:
if url not in seen:
seen.add(url)
links.append(url)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_optimizations", duration_ms, job_id=job_id, links=len(links))
return jsonify({"count": len(links), "links": links})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_optimizations", str(e), duration_ms, job_id=job_id)
return jsonify({"error": str(e)}), 500
@app.post("/api/terminate")
def terminate_job() -> Any:
"""
POST /api/terminate - Terminates the EC2 instance for a job
This endpoint provides manual control over EC2 instances:
1. Resolves the job ID from request parameters
2. Terminates the EC2 instance to stop the optimization job
3. Cleans up the job index to remove the mapping
4. Prevents further costs from running instances
This is useful for:
- Stopping long-running optimization jobs
- Cleaning up failed or stuck instances
- Managing costs by terminating unused instances
Body: {repo_url?, job_id?}
"""
start_time = time.time()
payload = request.get_json(force=True) or {}
repo_url = (payload.get("repo_url") or "").strip()
job_id = (payload.get("job_id") or "").strip()
log_request_start("POST /api/terminate", repo_url=repo_url, job_id=job_id)
try:
# Resolve job_id from mapping if not provided
if not job_id and repo_url:
jobs_index = _load_jobs_index()
job_id = jobs_index.get(_canon_repo_url(repo_url)) or ""
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/terminate", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
# Terminate the instance
ec2_manager.terminate(job_id)
# Cleanup jobs index mapping
jobs_index = _load_jobs_index()
changed = False
if repo_url:
key = _canon_repo_url(repo_url)
if key in jobs_index and jobs_index[key] == job_id:
jobs_index.pop(key, None)
changed = True
else:
# Remove any mapping pointing to this job_id
to_del = [k for k, v in jobs_index.items() if v == job_id]
for k in to_del:
jobs_index.pop(k, None)
changed = True
if changed:
_save_jobs_index(jobs_index)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/terminate", duration_ms, job_id=job_id)
return jsonify({"ok": True})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/terminate", str(e), duration_ms, repo_url=repo_url, job_id=job_id)
return jsonify({"error": str(e)}), 500
@app.post("/api/restart")
def restart_job() -> Any:
"""
POST /api/restart - Stops the current optimization process on the instance and starts a new one.
Body: {repo_url}
Behavior:
- Resolves instance by repo_url mapping
- SSH in, attempts to kill running optimization via PID file if present
- Clears exitcode marker, re-runs the wrapper in background (nohup)
- Returns ok and job_id
"""
start_time = time.time()
payload = request.get_json(force=True) or {}
repo_url = (payload.get("repo_url") or "").strip()
log_request_start("POST /api/restart", repo_url=repo_url)
try:
if not repo_url:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/restart", "repo_url is required", duration_ms)
return jsonify({"error": "repo_url is required"}), 400
jobs_index = _load_jobs_index()
job_id = jobs_index.get(_canon_repo_url(repo_url))
if not job_id:
# If no running instance, just run a new one
rows = _read_csv()
idx = _find_row(rows, repo_url)
if idx is None:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/restart", "repo not found", duration_ms, repo_url=repo_url)
return jsonify({"error": "repo not found"}), 404
new_job = _launch_ec2_job(rows[idx])
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/restart", duration_ms, repo_url=repo_url, job_id=new_job)
return jsonify({"ok": True, "job_id": new_job, "action": "started_new"})
# We have an instance; attempt to stop current optimization and start a new one
# Resolve IP
public_ip = ec2_manager.get_public_ip(job_id)
if not public_ip:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/restart", "public IP not available", duration_ms, job_id=job_id)
return jsonify({"error": "instance public IP not available yet"}), 409
# Kill existing process if PID file exists
kill_cmds = [
"bash -lc 'set -e; PID_FILE=/home/ubuntu/app/logs/optimization.pid; EXIT_FILE=/home/ubuntu/app/logs/job.exitcode; "
"if [ -f \"$PID_FILE\" ]; then PID=$(cat \"$PID_FILE\" 2>/dev/null || echo 0); "
"if [ \"$PID\" -gt 0 ] && kill -0 \"$PID\" 2>/dev/null; then kill -TERM \"$PID\" || true; sleep 2; kill -KILL \"$PID\" || true; fi; fi; "
"rm -f \"$EXIT_FILE\" || true'"
]
for cmd in kill_cmds:
ec2_manager.exec(public_ip, cmd)
# Re-run the wrapper script in background
start_cmd = (
"bash -lc 'chmod +x /home/ubuntu/app/run_job.sh && "
"setsid nohup /home/ubuntu/app/run_job.sh >> /home/ubuntu/app/logs/launcher.log 2>&1 & disown'"
)
ec2_manager.exec(public_ip, start_cmd)
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("POST /api/restart", duration_ms, repo_url=repo_url, job_id=job_id)
return jsonify({"ok": True, "job_id": job_id, "action": "restarted"})
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("POST /api/restart", str(e), duration_ms, repo_url=repo_url)
return jsonify({"error": str(e)}), 500
@app.get("/api/job_logs/download_all")
def job_logs_download_all() -> Any:
"""
GET /api/job_logs/download_all - Download all related logs (optimization, launcher, LLM) as a zip
Query params: job_id or repo_url
Returns: application/zip with files named using <org-repo>_<YYYY-MM-DD_HH-MM> convention
"""
start_time = time.time()
job_id = _resolve_job_id(request.args)
repo_url = request.args.get("repo_url")
log_request_start("GET /api/job_logs/download_all", job_id=job_id, repo_url=repo_url)
try:
if not job_id:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download_all", "job_id or repo_url is required", duration_ms)
return jsonify({"error": "job_id or repo_url is required"}), 400
public_ip = ec2_manager.get_public_ip(job_id)
if not public_ip:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download_all", "instance public ip not available yet", duration_ms)
return jsonify({"error": "instance public ip not available yet"}), 400
# Determine naming prefix
slug = None
try:
parts = (repo_url or "").rstrip("/").split("/")
if len(parts) >= 2:
org = parts[-2]
name = parts[-1].replace(".git", "")
slug = f"{org}-{name}"
except Exception:
pass
if not slug:
slug = job_id
# Determine timestamp (prefer latest optimization log timestamp)
ts = ""
try:
ssh = ec2_manager.open_ssh(public_ip)
try:
cmd = "bash -lc 'basename $(ls -1 /home/ubuntu/app/logs/optimization-*.log 2>/dev/null | sort -r | head -n1)'"
_, stdout, _ = ssh.exec_command(cmd)
base = (stdout.read() or b"").decode().strip()
m = re.match(r"optimization-([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2})", base)
if m:
ts = m.group(1).replace("T", "_")
finally:
ssh.close()
except Exception:
ts = ""
prefix = f"{slug}_{ts}" if ts else slug
# Stream a zip archive containing all available logs
def generate_zip():
import io, zipfile, os
# Use a temporary directory to store logs before zipping
with tempfile.TemporaryDirectory() as temp_dir:
# 1. Download all logs using our rsync helper
success = _rsync_logs_from_instance(public_ip, temp_dir)
if not success:
# If rsync fails, we can still yield an empty zip or an error file
logger.warning(f"⚠️ [ZIP] Rsync failed for {public_ip}, creating zip with error message.")
mem = io.BytesIO()
with zipfile.ZipFile(mem, mode="w") as zf:
zf.writestr("rsync_error.txt", f"Failed to download logs from instance {job_id} at {public_ip}.")
mem.seek(0)
yield from mem
return
# 2. Zip the contents of the temporary directory
mem = io.BytesIO()
with zipfile.ZipFile(mem, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
for root, _, files in os.walk(temp_dir):
for file in files:
full_path = os.path.join(root, file)
# Only include files that actually exist
if os.path.exists(full_path):
# Create a relative name for the file inside the zip archive
arcname = f"{prefix}/{file}"
zf.write(full_path, arcname=arcname)
# 3. Stream the zip file
mem.seek(0)
yield from mem
duration_ms = int((time.time() - start_time) * 1000)
log_request_success("GET /api/job_logs/download_all", duration_ms, job_id=job_id)
resp = Response(stream_with_context(generate_zip()), mimetype="application/zip")
resp.headers["Content-Disposition"] = f"attachment; filename={prefix}.zip"
return resp
except Exception as e:
duration_ms = int((time.time() - start_time) * 1000)
log_request_error("GET /api/job_logs/download_all", str(e), duration_ms, job_id=job_id)
return jsonify({"error": str(e)}), 500
# =============================================================================
# SERVER STARTUP
# =============================================================================
if __name__ == "__main__":
"""
Start the Flask development server
This is the entry point for running the application in development mode.
The server provides:
- Web UI for managing repositories and jobs
- REST API for all optimization operations
- Real-time job monitoring and log access
- EC2 instance lifecycle management
Environment variables:
- PORT: Server port (default: 5000)
- AWS_REGION: AWS region for EC2 instances
- AWS_KEY_NAME: EC2 key pair name
- AWS_SECURITY_GROUP: Security group for instances
- AWS_INSTANCE_TYPE: EC2 instance type
- AWS_AMI_ID: AMI ID for instances
- SSH_KEY_PATH: Path to SSH private key
- CODEFLASH_API_KEY: Codeflash API key
- GITHUB_TOKEN: GitHub token for repository access
"""
port = int(os.getenv("PORT", "5000"))
logger.info(f"🚀 Starting Flask development server on port {port}")
app.run(host="0.0.0.0", port=port, debug=True)