#!/usr/bin/env python3
"""
Shared utilities for ScreenSpot-Pro benchmarking and interactive testing.
"""

# Load environment variables (API keys, etc.) before the agent imports below.
import dotenv

dotenv.load_dotenv()

import base64
import gc
import os
import subprocess as sp
import sys
from datetime import datetime
from io import BytesIO
from typing import List, Optional, Tuple, Union

import torch
from PIL import Image, ImageDraw
from tqdm import tqdm

# Add parent directory to path for imports
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from agent.agent import ComputerAgent
from models.base import ModelProtocol

def get_gpu_memory() -> List[int]:
    """
    Get GPU memory usage using nvidia-smi.

    Returns:
        List of free memory values in MB for each GPU
    """
    try:
        command = "nvidia-smi --query-gpu=memory.free --format=csv"
        # The CSV output is a header row followed by one line per GPU;
        # drop the trailing empty line ([:-1]) and the header ([1:]).
        memory_free_info = sp.check_output(command.split()).decode("ascii").split("\n")[:-1][1:]
        memory_free_values = [int(x.split()[0]) for x in memory_free_info]
        return memory_free_values
    except (sp.CalledProcessError, FileNotFoundError, IndexError):
        # Fall back to torch if nvidia-smi is not available
        if torch.cuda.is_available():
            device = torch.cuda.current_device()
            total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
            reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
            return [int(total - reserved)]
        return [0]

def get_vram_usage() -> dict:
    """
    Get current VRAM usage statistics.

    Returns:
        Dictionary with VRAM usage info (in MB)
    """
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        allocated = torch.cuda.memory_allocated(device) / 1024 / 1024  # Convert to MB
        reserved = torch.cuda.memory_reserved(device) / 1024 / 1024  # Convert to MB
        total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
        return {
            "allocated_mb": allocated,
            "reserved_mb": reserved,
            "total_mb": total,
            "free_mb": total - reserved,
        }
    else:
        return {"allocated_mb": 0.0, "reserved_mb": 0.0, "total_mb": 0.0, "free_mb": 0.0}
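
# Illustrative return values on a machine with a single 24 GB GPU
# (numbers are hypothetical):
#   get_gpu_memory() -> [15360]
#   get_vram_usage() -> {"allocated_mb": 8123.5, "reserved_mb": 9216.0,
#                        "total_mb": 24576.0, "free_mb": 15360.0}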

def get_available_models() -> List[Union[str, ModelProtocol]]:
    """
    Get list of available models for testing.

    Returns:
        List of model strings and model classes
    """
    local_provider = "huggingface-local/"  # Options: huggingface-local/ or mlx/

    # from models.gta1 import GTA1Model

    models = [
        # === ComputerAgent model strings ===
        "openai/computer-use-preview",
        "anthropic/claude-opus-4-20250514",
        # f"{local_provider}HelloKKMe/GTA1-7B",
        # f"{local_provider}HelloKKMe/GTA1-32B",
        "openai/computer-use-preview+openai/gpt-4o-mini",
        "anthropic/claude-opus-4-20250514+openai/gpt-4o-mini",
        # === Reference model classes ===
        # GTA1Model("HelloKKMe/GTA1-7B"),
        # GTA1Model("HelloKKMe/GTA1-32B"),
    ]

    return models

def is_click_in_bbox(click_coords: Optional[Tuple[int, int]], bbox: List[int]) -> bool:
    """
    Check if click coordinates are within the bounding box.

    Args:
        click_coords: (x, y) coordinates or None
        bbox: [x1, y1, x2, y2] bounding box

    Returns:
        True if click is within bbox, False otherwise
    """
    if click_coords is None:
        return False

    x, y = click_coords
    x1, y1, x2, y2 = bbox

    return x1 <= x <= x2 and y1 <= y <= y2
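
# Examples (illustrative); note the check is inclusive on all four edges:
#   is_click_in_bbox((50, 60), [40, 50, 100, 120])  -> True
#   is_click_in_bbox((40, 50), [40, 50, 100, 120])  -> True (top-left corner)
#   is_click_in_bbox(None,     [40, 50, 100, 120])  -> False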

def image_to_base64(image: Image.Image) -> str:
    """
    Convert PIL Image to base64 string.

    Args:
        image: PIL Image

    Returns:
        Base64 encoded image string
    """
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()
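
# Round-trip sketch (illustrative): the encoded string can be decoded back
# into a PIL image using only the imports already in this module:
#   img = Image.open(BytesIO(base64.b64decode(image_to_base64(img))))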

class ModelWrapper:
    """
    Wrapper to provide a unified interface for both ComputerAgent and custom models.
    """

    def __init__(self, model: Union[str, ModelProtocol]):
        self.model = model
        self.is_computer_agent = isinstance(model, str)
        self.agent: Optional[ComputerAgent] = None
        self.vram_usage_history: List[float] = []  # Track VRAM usage over time

        if self.is_computer_agent:
            self.model_name = str(model)
        else:
            self.model_name = (
                f"{model.__class__.__name__}('{getattr(model, 'model_name', 'unknown')}')"
            )

    async def load_model(self) -> None:
        """Load the model."""
        if self.is_computer_agent:
            self.agent = ComputerAgent(model=str(self.model))
        else:
            await self.model.load_model()  # type: ignore

        # Record initial VRAM usage after loading
        vram_info = get_vram_usage()
        self.vram_usage_history.append(vram_info["allocated_mb"])

    async def unload_model(self) -> None:
        """Unload the model."""
        if not self.is_computer_agent:
            await self.model.unload_model()  # type: ignore
        else:
            self.agent = None  # Drop the reference so GC can reclaim the agent
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Record VRAM usage after unloading
        vram_info = get_vram_usage()
        self.vram_usage_history.append(vram_info["allocated_mb"])

    def get_vram_stats(self) -> dict:
        """Get VRAM usage statistics for this model."""
        if not self.vram_usage_history:
            return {"max_mb": 0.0, "avg_mb": 0.0}

        return {
            "max_mb": max(self.vram_usage_history),
            "avg_mb": sum(self.vram_usage_history) / len(self.vram_usage_history),
        }

    async def predict_click(
        self, image: Image.Image, instruction: str
    ) -> Optional[Tuple[int, int]]:
        """Predict click coordinates."""
        # Record VRAM usage before prediction
        vram_info = get_vram_usage()
        self.vram_usage_history.append(vram_info["allocated_mb"])

        if self.is_computer_agent:
            if self.agent is None:
                await self.load_model()

            if self.agent is not None:
                image_b64 = image_to_base64(image)
                result = await self.agent.predict_click(
                    instruction=instruction, image_b64=image_b64
                )

                # Record VRAM usage after prediction
                vram_info = get_vram_usage()
                self.vram_usage_history.append(vram_info["allocated_mb"])

                return result
            return None
        else:
            result = await self.model.predict_click(image, instruction)  # type: ignore

            # Record VRAM usage after prediction
            vram_info = get_vram_usage()
            self.vram_usage_history.append(vram_info["allocated_mb"])

            return result
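
# Usage sketch (illustrative; assumes a valid model string and any required
# API keys in .env):
#
#   import asyncio
#
#   async def _demo() -> None:
#       wrapper = ModelWrapper("openai/computer-use-preview")
#       await wrapper.load_model()
#       screenshot = Image.new("RGB", (1920, 1080), "white")
#       coords = await wrapper.predict_click(screenshot, "Click the Save button")
#       print(coords, wrapper.get_vram_stats())
#       await wrapper.unload_model()
#
#   asyncio.run(_demo())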

def save_results_to_markdown(
    all_results: List[dict],
    output_file: str = "screenspot_pro_results.md",
    title: str = "ScreenSpot-Pro Benchmark Results",
) -> None:
    """
    Save evaluation results to a markdown table.

    Args:
        all_results: List of evaluation results for each model
        output_file: Output markdown file path
        title: Title for the generated markdown report
    """
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(f"**Evaluation Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Summary table
        f.write("## Summary\n\n")
        f.write(
            "| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n"
        )
        f.write(
            "|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n"
        )

        for result in all_results:
            model_name = result["model_name"]
            total = result["total_samples"]
            correct = result["correct_predictions"]
            errors = result["failed_predictions"]
            accuracy = result["accuracy"] * 100
            error_rate = result["failure_rate"] * 100
            avg_time = result.get("avg_prediction_time", 0.0)
            median_time = result.get("median_prediction_time", 0.0)
            min_time = result.get("min_prediction_time", 0.0)
            max_time = result.get("max_prediction_time", 0.0)
            time_range = f"{min_time:.2f} - {max_time:.2f}"
            vram_max = result.get("vram_max_mb", 0.0) / 1024
            vram_avg = result.get("vram_avg_mb", 0.0) / 1024

            f.write(
                f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n"
            )

        # Detailed results for each model
        for result in all_results:
            f.write(f"\n## {result['model_name']} - Detailed Results\n\n")
            f.write(
                "| Sample Index | Instruction | BBox | Predicted | Correct | Error | Time (s) |\n"
            )
            f.write("|--------------|-------------|------|-----------|---------|-------|----------|\n")

            for sample_result in result["results"][:10]:  # Show first 10 samples
                sample_idx = sample_result["sample_idx"]
                instruction = (
                    sample_result["instruction"][:50] + "..."
                    if len(sample_result["instruction"]) > 50
                    else sample_result["instruction"]
                )
                bbox = str(sample_result["bbox"])
                predicted = (
                    str(sample_result["predicted_coords"])
                    if sample_result["predicted_coords"]
                    else "None"
                )
                correct = "PASS" if sample_result["is_correct"] else "FAIL"
                error = "YES" if sample_result["failed"] else "NO"
                pred_time = sample_result.get("prediction_time", 0.0)

                f.write(
                    f"| {sample_idx} | {instruction} | {bbox} | {predicted} | {correct} | {error} | {pred_time:.2f} |\n"
                )

            if len(result["results"]) > 10:
                f.write(f"\n*Showing first 10 of {len(result['results'])} samples*\n")

    print(f"\nResults saved to: {output_file}")
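
# For reference, each entry in all_results (consumed above and by
# save_visualizations below) is expected to carry roughly these keys,
# inferred from the dictionary accesses in this file (values illustrative):
#
#   {
#       "model_name": "openai/computer-use-preview",
#       "total_samples": 100,
#       "correct_predictions": 87,
#       "failed_predictions": 3,
#       "accuracy": 0.87,
#       "failure_rate": 0.03,
#       "avg_prediction_time": 1.42,  # seconds
#       "median_prediction_time": 1.31,
#       "min_prediction_time": 0.88,
#       "max_prediction_time": 3.05,
#       "vram_max_mb": 16384.0,
#       "vram_avg_mb": 12288.0,
#       "results": [
#           {
#               "sample_idx": 0,
#               "instruction": "Click the Save button",
#               "bbox": [100, 200, 150, 240],
#               "predicted_coords": (120, 220),  # or None
#               "is_correct": True,
#               "failed": False,
#               "prediction_time": 1.2,
#           },
#           ...
#       ],
#   }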
def save_visualizations(all_results: List[dict], samples, output_dir: str = "output") -> None:
|
|
"""
|
|
Save visualizations of predicted coordinates vs bboxes to an output folder.
|
|
|
|
Args:
|
|
all_results: List of evaluation results for each model
|
|
samples: List of sample dicts with image, bbox, instruction keys
|
|
output_dir: Output directory path
|
|
"""
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
for result in all_results:
|
|
model_name = result["model_name"].replace("/", "_").replace("\\", "_")
|
|
model_dir = os.path.join(output_dir, model_name)
|
|
os.makedirs(model_dir, exist_ok=True)
|
|
|
|
print(f"Saving visualizations for {result['model_name']}...")
|
|
|
|
# Save first 10 samples for visualization
|
|
for i, sample_result in enumerate(
|
|
tqdm(result["results"][:10], desc=f"Saving {model_name} visualizations")
|
|
):
|
|
# Get sample data using index
|
|
sample_idx = sample_result["sample_idx"]
|
|
|
|
if sample_idx < len(samples):
|
|
sample = samples[sample_idx]
|
|
image = sample["image"].copy() # Make a copy to avoid modifying original
|
|
else:
|
|
print(f"Warning: Could not find sample at index {sample_idx}")
|
|
continue
|
|
|
|
bbox = sample_result["bbox"]
|
|
predicted_coords = sample_result["predicted_coords"]
|
|
is_correct = sample_result["is_correct"]
|
|
|
|
# Draw on image
|
|
draw = ImageDraw.Draw(image)
|
|
|
|
# Draw bounding box (ground truth) in green
|
|
x1, y1, x2, y2 = bbox
|
|
draw.rectangle([x1, y1, x2, y2], outline="green", width=3)
|
|
draw.text((x1, y1 - 20), "Ground Truth", fill="green")
|
|
|
|
# Draw predicted click in red or blue
|
|
if predicted_coords is not None:
|
|
px, py = predicted_coords
|
|
color = "blue" if is_correct else "red"
|
|
# Draw crosshair
|
|
crosshair_size = 15
|
|
draw.line(
|
|
[(px - crosshair_size, py), (px + crosshair_size, py)], fill=color, width=3
|
|
)
|
|
draw.line(
|
|
[(px, py - crosshair_size), (px, py + crosshair_size)], fill=color, width=3
|
|
)
|
|
draw.text((px + 10, py - 20), f"Predicted ({px},{py})", fill=color)
|
|
|
|
# Add status text
|
|
status = "CORRECT" if is_correct else "INCORRECT"
|
|
status_color = "blue" if is_correct else "red"
|
|
draw.text((10, 10), f"Status: {status}", fill=status_color)
|
|
draw.text(
|
|
(10, 30), f"Instruction: {sample_result['instruction'][:50]}...", fill="black"
|
|
)
|
|
|
|
# Save image
|
|
filename = f"sample_{i+1:02d}_idx{sample_idx}_{status.lower()}.png"
|
|
filepath = os.path.join(model_dir, filename)
|
|
image.save(filepath)
|
|
|
|
print(f"Visualizations saved to: {model_dir}")
|
|
|
|
|
|

def save_prediction_visualization(
    image: Image.Image,
    instruction: str,
    predictions: List[dict],
    output_file: str = "interactive_prediction.png",
) -> None:
    """
    Save visualization of multiple model predictions on a single image.

    Args:
        image: PIL Image to visualize
        instruction: Instruction text
        predictions: List of prediction dicts with keys: model_name, coords, error
        output_file: Output file path
    """
    # Create a copy of the image
    vis_image = image.copy()
    draw = ImageDraw.Draw(vis_image)

    # Colors for different models
    colors = ["red", "blue", "orange", "purple", "brown", "pink", "gray", "olive"]

    # Draw predictions
    for i, pred in enumerate(predictions):
        color = colors[i % len(colors)]
        model_name = pred["model_name"]
        coords = pred.get("coords")
        error = pred.get("error")

        if coords is not None:
            px, py = coords
            # Draw crosshair
            crosshair_size = 20
            draw.line([(px - crosshair_size, py), (px + crosshair_size, py)], fill=color, width=4)
            draw.line([(px, py - crosshair_size), (px, py + crosshair_size)], fill=color, width=4)
            # Draw model name
            draw.text((px + 15, py + 15), f"{model_name}: ({px},{py})", fill=color)
        else:
            # Draw error text
            draw.text((10, 50 + i * 20), f"{model_name}: ERROR - {error}", fill=color)

    # Add instruction at the top
    draw.text((10, 10), f"Instruction: {instruction}", fill="black")

    # Save image
    vis_image.save(output_file)
    print(f"Prediction visualization saved to: {output_file}")
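
# Expected shape of the predictions argument (illustrative call):
#
#   save_prediction_visualization(
#       take_screenshot(),
#       "Click the search box",
#       predictions=[
#           {"model_name": "openai/computer-use-preview", "coords": (812, 44)},
#           {"model_name": "anthropic/claude-opus-4-20250514", "coords": None,
#            "error": "timeout"},
#       ],
#   )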

def take_screenshot() -> Image.Image:
    """
    Take a screenshot of the current screen.

    Returns:
        PIL Image of the screenshot
    """
    try:
        import pyautogui

        screenshot = pyautogui.screenshot()
        return screenshot
    except ImportError:
        print("pyautogui not installed. Please install it with: pip install pyautogui")
        raise
    except Exception as e:
        print(f"Error taking screenshot: {e}")
        raise
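
if __name__ == "__main__":
    # Minimal smoke test (illustrative): capture the screen and report its
    # size. Requires a display and the optional pyautogui dependency.
    img = take_screenshot()
    print(f"Captured screenshot: {img.width}x{img.height}")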