From ffc88e203138ac6ef62d73361c800362815bf7f9 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Jul 2025 13:41:58 -0400 Subject: [PATCH] added agent benchmarks --- libs/python/agent/benchmarks/.gitignore | 1 + libs/python/agent/benchmarks/README.md | 177 +++++++++++++++++ libs/python/agent/benchmarks/models/gta1.py | 2 +- libs/python/agent/benchmarks/ss-pro.py | 128 +++++++------ libs/python/agent/benchmarks/ss-v2.py | 179 ++++++++++++++++++ libs/python/agent/benchmarks/utils.py | 200 +++++++++++++------- 6 files changed, 553 insertions(+), 134 deletions(-) create mode 100644 libs/python/agent/benchmarks/README.md create mode 100644 libs/python/agent/benchmarks/ss-v2.py diff --git a/libs/python/agent/benchmarks/.gitignore b/libs/python/agent/benchmarks/.gitignore index b9f463f1..a0aed392 100644 --- a/libs/python/agent/benchmarks/.gitignore +++ b/libs/python/agent/benchmarks/.gitignore @@ -1,2 +1,3 @@ output/ interactive_output/ +*_results.md \ No newline at end of file diff --git a/libs/python/agent/benchmarks/README.md b/libs/python/agent/benchmarks/README.md new file mode 100644 index 00000000..033f0157 --- /dev/null +++ b/libs/python/agent/benchmarks/README.md @@ -0,0 +1,177 @@ +# Computer Agent Benchmarks + +This directory contains benchmarks designed to test agent providers in the Computer Agent SDK against reference agent implementations. + +## Overview + +The benchmark system evaluates models on GUI grounding tasks, specifically click prediction accuracy. It supports both: +- **Computer Agent SDK providers** (using model strings like `"huggingface-local/HelloKKMe/GTA1-7B"`) +- **Reference agent implementations** (custom model classes implementing the `ModelProtocol`) + +## Available Benchmarks + +### 1. ScreenSpot-v2 (`ss-v2.py`) +- **Dataset**: ScreenSpot-v2 (click-only GUI grounding) +- **Format**: Standard resolution screenshots +- **Task**: Predict click coordinates given an instruction and image +- **Metrics**: Accuracy, Error Rate, Timing, VRAM usage + +### 2. ScreenSpot-Pro (`ss-pro.py`) +- **Dataset**: ScreenSpot-Pro (high-resolution click-only GUI grounding) +- **Format**: High-resolution screenshots +- **Task**: Predict click coordinates given an instruction and image +- **Metrics**: Accuracy, Error Rate, Timing, VRAM usage + +### 3. Interactive Testing (`interactive.py`) +- **Real-time testing**: Take screenshots and visualize model predictions +- **Commands**: + - Type instruction → screenshot + test all models + - `screenshot` → take screenshot without prediction + - `models` → list available models + - `quit`/`exit` → exit tool +- **Output**: Visual predictions with crosshairs for each model + +## Adding Reference Agent Implementations + +### 1. Implement the ModelProtocol + +Create a new file in `models/` directory implementing the `ModelProtocol`: + +```python +from models.base import ModelProtocol +from typing import Optional, Tuple +from PIL import Image + +class YourModelName(ModelProtocol): + def __init__(self, model_path: str): + self.model_path = model_path + self._model = None + + @property + def model_name(self) -> str: + return self.model_path + + async def load_model(self) -> None: + """Load the model into memory.""" + # Your model loading logic here + pass + + async def unload_model(self) -> None: + """Unload the model from memory.""" + # Your model cleanup logic here + pass + + async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]: + """ + Predict click coordinates for the given image and instruction. 
+ + Args: + image: PIL Image to analyze + instruction: Text instruction describing what to click + + Returns: + Tuple of (x, y) coordinates or None if prediction fails + """ + # Your prediction logic here + return (x, y) # Return predicted coordinates +``` + +### 2. Register Your Model + +Add your model to the `get_available_models()` function in `utils.py`: + +```python +def get_available_models() -> List[Union[str, ModelProtocol]]: + models = [ + # Computer Agent SDK providers + "huggingface-local/HelloKKMe/GTA1-7B", + + # Reference implementations + GTA1Model("HelloKKMe/GTA1-7B"), + YourModelName("path/to/your/model"), # Add your model here + ] + return models +``` + +## Running Benchmarks + +### 1. Configure Models +Edit `utils.py` to specify which models you want to test in `get_available_models()`. + +### 2. Set Sample Count +Edit the benchmark script to change the number of samples: +```python +max_samples = 50 # Set to None to evaluate on full dataset +``` + +### 3. Run Benchmark +```bash +# ScreenSpot-v2 benchmark +python ss-v2.py + +# ScreenSpot-Pro benchmark +python ss-pro.py + +# Interactive testing +python interactive.py +``` + +## Output + +### Console Output +``` +Model Results: + Accuracy: 85.50% + Correct: 171/200 + Errors: 5 + Error Rate: 2.50% + Avg Time: 1.23s + Time Range: 0.89s - 2.45s + VRAM Max: 4.5GB + VRAM Avg: 3.4GB +``` + +### Generated Files +- **Markdown Report**: `*_results.md` with detailed results tables +- **Visualizations**: `output/` directory with prediction visualizations +- **Interactive Output**: `interactive_output/` for interactive session results + +## Metrics Tracked + +- **Accuracy**: Percentage of clicks within bounding boxes +- **Error Rate**: Percentage of failed predictions +- **Timing**: Average, min, max prediction times +- **VRAM Usage**: Maximum and average GPU memory usage +- **Per-sample Results**: Detailed breakdown for debugging + +## Requirements + +- Python 3.8+ +- PyTorch (for VRAM tracking) +- PIL/Pillow (for image processing) +- datasets (for HuggingFace datasets) +- tqdm (for progress bars) +- Computer Agent SDK + +## Architecture + +The benchmark system is designed for: +- **Modularity**: Easy to add new models and benchmarks +- **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys +- **Performance**: VRAM tracking and timing analysis +- **Visualization**: Automatic generation of prediction visualizations +- **No Exception Handling**: Fails fast to surface real issues + +## Results Table + +| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg | +|-------|---------|----------|------------|----------|----------|----------| +| (coming soon) | | | | | | | + +## Contributing + +To add a new benchmark: +1. Create a new script following the pattern in `ss-v2.py` +2. Use the `evaluate_model()` function from utils +3. Ensure your dataset yields dicts with `image`, `bbox`, `instruction` keys +4. 
Update this README with benchmark details diff --git a/libs/python/agent/benchmarks/models/gta1.py b/libs/python/agent/benchmarks/models/gta1.py index 2bb4fe1d..a1dee599 100644 --- a/libs/python/agent/benchmarks/models/gta1.py +++ b/libs/python/agent/benchmarks/models/gta1.py @@ -117,7 +117,7 @@ Output the coordinate pair exactly: } # Process inputs - image_inputs, video_inputs = process_vision_info([system_message, user_message]) + image_inputs, video_inputs = process_vision_info([system_message, user_message]) # type: ignore text = self.processor.apply_chat_template( [system_message, user_message], tokenize=False, diff --git a/libs/python/agent/benchmarks/ss-pro.py b/libs/python/agent/benchmarks/ss-pro.py index 57f2c971..e1e54a1d 100644 --- a/libs/python/agent/benchmarks/ss-pro.py +++ b/libs/python/agent/benchmarks/ss-pro.py @@ -7,6 +7,7 @@ Supports both ComputerAgent model strings and custom model classes. """ import asyncio +import time from typing import Optional from datasets import load_dataset @@ -43,66 +44,67 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti total_samples = min(max_samples, total_samples) correct_predictions = 0 - failed_predictions = 0 + error_predictions = 0 results = [] - try: - for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"): - sample = dataset[i] - - # Extract sample data - image = sample['image'] - instruction = sample['instruction'] - bbox = sample['bbox'] # [x1, y1, x2, y2] - sample_id = sample['id'] - - # Predict click coordinates - try: - click_coords = await model_wrapper.predict_click(image, instruction) - - # Check if prediction is correct - is_correct = is_click_in_bbox(click_coords, bbox) - - if is_correct: - correct_predictions += 1 - - results.append({ - 'id': sample_id, - 'instruction': instruction, - 'bbox': bbox, - 'predicted_coords': click_coords, - 'is_correct': is_correct, - 'failed': False - }) - - except Exception as e: - print(f"\nError predicting sample {sample_id}: {e}") - failed_predictions += 1 - results.append({ - 'id': sample_id, - 'instruction': instruction, - 'bbox': bbox, - 'predicted_coords': None, - 'is_correct': False, - 'failed': True, - 'error': str(e) - }) + for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"): + sample = dataset[i] + + # Extract sample data + image = sample['image'] + instruction = sample['instruction'] + bbox = sample['bbox'] # [x1, y1, x2, y2] + sample_id = sample['img_filename'] + + # Predict click coordinates with timing + start_time = time.time() + click_coords = await model_wrapper.predict_click(image, instruction) + prediction_time = time.time() - start_time + + # Check if prediction is correct + is_correct = is_click_in_bbox(click_coords, bbox) + + if is_correct: + correct_predictions += 1 + + results.append({ + 'id': sample_id, + 'instruction': instruction, + 'bbox': bbox, + 'predicted_coords': click_coords, + 'is_correct': is_correct, + 'failed': False, + 'prediction_time': prediction_time + }) - finally: - # Unload model - await model_wrapper.unload_model() + # Unload model + await model_wrapper.unload_model() # Calculate metrics accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0 - failure_rate = failed_predictions / total_samples if total_samples > 0 else 0.0 + error_rate = error_predictions / total_samples if total_samples > 0 else 0.0 + + # Calculate timing statistics + successful_times = [r['prediction_time'] for r in results if not r['failed']] + avg_prediction_time = 
sum(successful_times) / len(successful_times) if successful_times else 0.0 + min_prediction_time = min(successful_times) if successful_times else 0.0 + max_prediction_time = max(successful_times) if successful_times else 0.0 + + # Get VRAM statistics + vram_stats = model_wrapper.get_vram_stats() return { 'model_name': model_wrapper.model_name, 'total_samples': total_samples, 'correct_predictions': correct_predictions, - 'failed_predictions': failed_predictions, + 'failed_predictions': error_predictions, 'accuracy': accuracy, - 'failure_rate': failure_rate, + 'failure_rate': error_rate, + 'avg_prediction_time': avg_prediction_time, + 'min_prediction_time': min_prediction_time, + 'max_prediction_time': max_prediction_time, + 'vram_max_mb': vram_stats['max_mb'], + 'vram_avg_mb': vram_stats['avg_mb'], 'results': results } @@ -123,26 +125,26 @@ async def main(): models = get_available_models() # Evaluation settings - max_samples = 5 # Set to None to evaluate on full dataset + max_samples = 300 # Set to None to evaluate on full dataset # Run evaluations all_results = [] for model in models: - try: - model_wrapper = ModelWrapper(model) - result = await evaluate_model(model_wrapper, dataset_list, max_samples) - all_results.append(result) - - # Print summary - print(f"\n{result['model_name']} Results:") - print(f" Accuracy: {result['accuracy']*100:.2f}%") - print(f" Correct: {result['correct_predictions']}/{result['total_samples']}") - print(f" Failed: {result['failed_predictions']}") - - except Exception as e: - print(f"\nError evaluating model {model}: {e}") - continue + model_wrapper = ModelWrapper(model) + result = await evaluate_model(model_wrapper, dataset_list, max_samples) + all_results.append(result) + + # Print summary + print(f"\n{result['model_name']} Results:") + print(f" Accuracy: {result['accuracy']*100:.2f}%") + print(f" Correct: {result['correct_predictions']}/{result['total_samples']}") + print(f" Errors: {result['failed_predictions']}") + print(f" Error Rate: {result['failure_rate']*100:.2f}%") + print(f" Avg Time: {result['avg_prediction_time']:.2f}s") + print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s") + print(f" VRAM Max: {result['vram_max_mb']:.1f}MB") + print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB") # Save results if all_results: diff --git a/libs/python/agent/benchmarks/ss-v2.py b/libs/python/agent/benchmarks/ss-v2.py new file mode 100644 index 00000000..919a1001 --- /dev/null +++ b/libs/python/agent/benchmarks/ss-v2.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +ScreenSpot-Pro Benchmark Script + +Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy. +Supports both ComputerAgent model strings and custom model classes. +""" + +import asyncio +import time +from typing import Optional + +from datasets import load_dataset +from tqdm import tqdm + +from utils import ( + ModelWrapper, + is_click_in_bbox, + save_results_to_markdown, + save_visualizations, + get_available_models +) + + +async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Optional[int] = None) -> dict: + """ + Evaluate a model on any iterable of samples. 
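Because this `evaluate_model()` only reads the `image`, `instruction`, and `bbox` keys, any iterable of such dicts can serve as a dataset. A minimal sketch of a hand-rolled sample source (the blank images, coordinates, and instruction text are placeholders, not part of this patch):

```python
# Hypothetical stand-in dataset: evaluate_model() only needs these three keys.
from PIL import Image

def toy_samples():
    for i in range(3):
        yield {
            'image': Image.new('RGB', (1920, 1080), 'white'),  # placeholder screenshot
            'instruction': f'Click button {i}',
            'bbox': [100, 200, 180, 240],                      # [x1, y1, x2, y2]
        }

# e.g. result = await evaluate_model(ModelWrapper(model), toy_samples(), max_samples=3)
```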
+ + Args: + model_wrapper: ModelWrapper instance + samples: Iterable of dicts with keys: image, bbox, instruction + max_samples: Maximum number of samples to evaluate (None for all) + + Returns: + Dictionary with evaluation results + """ + print(f"\nEvaluating model: {model_wrapper.model_name}") + + # Load model + await model_wrapper.load_model() + + # Convert to list if needed and limit samples + if hasattr(samples, '__len__'): + total_samples = len(samples) + if max_samples is not None: + total_samples = min(max_samples, total_samples) + sample_list = list(samples)[:total_samples] + else: + # For iterators, take max_samples or all + sample_list = list(samples) + if max_samples is not None: + sample_list = sample_list[:max_samples] + total_samples = len(sample_list) + + correct_predictions = 0 + error_predictions = 0 + results = [] + + for i, sample in enumerate(tqdm(sample_list, desc=f"Evaluating {model_wrapper.model_name}")): + # Extract required data (only these 3 keys matter) + image = sample['image'] + instruction = sample['instruction'] + bbox = sample['bbox'] # [x1, y1, x2, y2] + + # Predict click coordinates with timing + start_time = time.time() + click_coords = await model_wrapper.predict_click(image, instruction) + prediction_time = time.time() - start_time + + # Check if prediction is correct + is_correct = is_click_in_bbox(click_coords, bbox) + + if is_correct: + correct_predictions += 1 + + results.append({ + 'sample_idx': i, + 'instruction': instruction, + 'bbox': bbox, + 'predicted_coords': click_coords, + 'is_correct': is_correct, + 'failed': False, + 'prediction_time': prediction_time + }) + + # Unload model + await model_wrapper.unload_model() + + # Calculate metrics + accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0 + error_rate = error_predictions / total_samples if total_samples > 0 else 0.0 + + # Calculate timing statistics + successful_times = [r['prediction_time'] for r in results if not r['failed']] + avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0 + min_prediction_time = min(successful_times) if successful_times else 0.0 + max_prediction_time = max(successful_times) if successful_times else 0.0 + + # Get VRAM statistics + vram_stats = model_wrapper.get_vram_stats() + + return { + 'model_name': model_wrapper.model_name, + 'total_samples': total_samples, + 'correct_predictions': correct_predictions, + 'failed_predictions': error_predictions, + 'accuracy': accuracy, + 'failure_rate': error_rate, + 'avg_prediction_time': avg_prediction_time, + 'min_prediction_time': min_prediction_time, + 'max_prediction_time': max_prediction_time, + 'vram_max_mb': vram_stats['max_mb'], + 'vram_avg_mb': vram_stats['avg_mb'], + 'results': results + } + + +async def main(): + """ + Main function to run the benchmark. 
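The per-sample correctness check above relies on `is_click_in_bbox()` from `utils`, whose body is not shown in this diff. A minimal sketch of what such a check might look like, assuming the `[x1, y1, x2, y2]` box format used throughout these scripts:

```python
from typing import List, Optional, Tuple

def is_click_in_bbox(click_coords: Optional[Tuple[int, int]], bbox: List[float]) -> bool:
    """Return True if the predicted click lands inside the [x1, y1, x2, y2] box."""
    if click_coords is None:   # a failed or missing prediction counts as incorrect
        return False
    x, y = click_coords
    x1, y1, x2, y2 = bbox
    return x1 <= x <= x2 and y1 <= y <= y2
```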
+ """ + # Load dataset + print("Loading ScreenSpot-v2 dataset...") + ds = load_dataset("lmms-lab/ScreenSpot-v2") + dataset = ds['train'] # type: ignore + # Convert to simple list of dicts with only required keys + samples = [] + for item in dataset: + # Convert dataset item to dict if needed + item_dict = dict(item) if hasattr(item, 'keys') else item + + # Convert ScreenSpot-v2 bbox format [x, y, w, h] to [x1, y1, x2, y2] + bbox_xywh = item_dict['bbox'] # type: ignore + x, y, w, h = bbox_xywh + bbox_xyxy = [x, y, x + w, y + h] + + samples.append({ + 'image': item_dict['image'], # type: ignore + 'instruction': item_dict['instruction'], # type: ignore + 'bbox': bbox_xyxy + }) + print(f"Dataset loaded: {len(samples)} samples") + + # Get available models + models = get_available_models() + + # Evaluation settings + max_samples = 500 # Set to None to evaluate on full dataset + + # Run evaluations + all_results = [] + + for model in models: + model_wrapper = ModelWrapper(model) + result = await evaluate_model(model_wrapper, samples, max_samples) + all_results.append(result) + + # Print summary + print(f"\n{result['model_name']} Results:") + print(f" Accuracy: {result['accuracy']*100:.2f}%") + print(f" Correct: {result['correct_predictions']}/{result['total_samples']}") + print(f" Errors: {result['failed_predictions']}") + print(f" Error Rate: {result['failure_rate']*100:.2f}%") + print(f" Avg Time: {result['avg_prediction_time']:.2f}s") + print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s") + print(f" VRAM Max: {result['vram_max_mb']:.1f}MB") + print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB") + + # Save results + if all_results: + save_results_to_markdown(all_results, "screenspot_v2_results.md", title="ScreenSpot-v2 Benchmark Results") + save_visualizations(all_results, samples) + print("\nBenchmark completed successfully!") + else: + print("\nNo successful evaluations completed.") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/libs/python/agent/benchmarks/utils.py b/libs/python/agent/benchmarks/utils.py index c1fc41cf..099499a5 100644 --- a/libs/python/agent/benchmarks/utils.py +++ b/libs/python/agent/benchmarks/utils.py @@ -22,6 +22,33 @@ from agent.agent import ComputerAgent from models import GTA1Model from models.base import ModelProtocol +def get_vram_usage() -> dict: + """ + Get current VRAM usage statistics. + + Returns: + Dictionary with VRAM usage info (in MB) + """ + if torch.cuda.is_available(): + device = torch.cuda.current_device() + allocated = torch.cuda.memory_allocated(device) / 1024 / 1024 # Convert to MB + reserved = torch.cuda.memory_reserved(device) / 1024 / 1024 # Convert to MB + total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024 + return { + 'allocated_mb': allocated, + 'reserved_mb': reserved, + 'total_mb': total, + 'free_mb': total - reserved + } + else: + return { + 'allocated_mb': 0.0, + 'reserved_mb': 0.0, + 'total_mb': 0.0, + 'free_mb': 0.0 + } + + def get_available_models() -> List[Union[str, ModelProtocol]]: """ Get list of available models for testing. 
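The `[x, y, w, h]` to `[x1, y1, x2, y2]` conversion done inline in `main()` above, restated as a standalone helper with a worked example (the helper name is ours; the patch performs this conversion inline):

```python
def xywh_to_xyxy(bbox):
    """Convert a ScreenSpot-v2 [x, y, w, h] box to [x1, y1, x2, y2]."""
    x, y, w, h = bbox
    return [x, y, x + w, y + h]

# A 100x24 box whose top-left corner is at (32, 48):
assert xywh_to_xyxy([32, 48, 100, 24]) == [32, 48, 132, 72]
```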
@@ -34,11 +61,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]: models = [ # === ComputerAgent model strings === f"{local_provider}HelloKKMe/GTA1-7B", - # f"{local_provider}HelloKKMe/GTA1-32B", # Uncomment if you have this model + f"{local_provider}HelloKKMe/GTA1-32B", # === Reference model classes === GTA1Model("HelloKKMe/GTA1-7B"), - # GTA1Model("HelloKKMe/GTA1-32B"), # Uncomment if you have this model + GTA1Model("HelloKKMe/GTA1-32B"), ] return models @@ -88,11 +115,12 @@ class ModelWrapper: self.model = model self.is_computer_agent = isinstance(model, str) self.agent: Optional[ComputerAgent] = None + self.vram_usage_history: List[float] = [] # Track VRAM usage over time if self.is_computer_agent: self.model_name = str(model) else: - self.model_name = f"models.{model.__class__.__name__}" + self.model_name = f"{model.__class__.__name__}('{getattr(model, 'model_name', 'unknown')}')" async def load_model(self) -> None: """Load the model.""" @@ -100,6 +128,10 @@ class ModelWrapper: self.agent = ComputerAgent(model=str(self.model)) else: await self.model.load_model() # type: ignore + + # Record initial VRAM usage after loading + vram_info = get_vram_usage() + self.vram_usage_history.append(vram_info['allocated_mb']) async def unload_model(self) -> None: """Unload the model.""" @@ -111,10 +143,28 @@ class ModelWrapper: gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() + + # Record VRAM usage after unloading + vram_info = get_vram_usage() + self.vram_usage_history.append(vram_info['allocated_mb']) + + def get_vram_stats(self) -> dict: + """Get VRAM usage statistics for this model.""" + if not self.vram_usage_history: + return {'max_mb': 0.0, 'avg_mb': 0.0} + + return { + 'max_mb': max(self.vram_usage_history), + 'avg_mb': sum(self.vram_usage_history) / len(self.vram_usage_history) + } async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]: """Predict click coordinates.""" + # Record VRAM usage before prediction + vram_info = get_vram_usage() + self.vram_usage_history.append(vram_info['allocated_mb']) + if self.is_computer_agent: if self.agent is None: await self.load_model() @@ -122,13 +172,24 @@ class ModelWrapper: if self.agent is not None: image_b64 = image_to_base64(image) result = await self.agent.predict_click(instruction=instruction, image_b64=image_b64) + + # Record VRAM usage after prediction + vram_info = get_vram_usage() + self.vram_usage_history.append(vram_info['allocated_mb']) + return result return None else: - return await self.model.predict_click(image, instruction) # type: ignore + result = await self.model.predict_click(image, instruction) # type: ignore + + # Record VRAM usage after prediction + vram_info = get_vram_usage() + self.vram_usage_history.append(vram_info['allocated_mb']) + + return result -def save_results_to_markdown(all_results: List[dict], output_file: str = "screenspot_pro_results.md") -> None: +def save_results_to_markdown(all_results: List[dict],output_file: str = "screenspot_pro_results.md", title: str = "ScreenSpot-Pro Benchmark Results") -> None: """ Save evaluation results to a markdown table. 
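A usage sketch tying together the wrapper API introduced in this patch (load, predict, unload, VRAM stats); the screenshot path and instruction text are placeholders:

```python
import asyncio
from PIL import Image
from utils import ModelWrapper, get_available_models

async def demo():
    image = Image.open("screenshot.png")            # placeholder path
    for model in get_available_models():            # model strings and ModelProtocol instances alike
        wrapper = ModelWrapper(model)
        await wrapper.load_model()                  # records VRAM after loading
        coords = await wrapper.predict_click(image, "Click the search field")
        await wrapper.unload_model()                # empties the CUDA cache, records VRAM again
        stats = wrapper.get_vram_stats()            # {'max_mb': ..., 'avg_mb': ...}
        print(f"{wrapper.model_name}: {coords}, peak VRAM {stats['max_mb']:.1f}MB")

asyncio.run(demo())
```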
@@ -137,39 +198,46 @@ def save_results_to_markdown(all_results: List[dict], output_file: str = "screen output_file: Output markdown file path """ with open(output_file, 'w', encoding='utf-8') as f: - f.write("# ScreenSpot-Pro Benchmark Results\n\n") + f.write(f"# {title}\n\n") f.write(f"**Evaluation Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") # Summary table f.write("## Summary\n\n") - f.write("| Model | Total Samples | Correct | Failed | Accuracy | Failure Rate |\n") - f.write("|-------|---------------|---------|--------|----------|--------------|\n") + f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n") + f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n") for result in all_results: model_name = result['model_name'] total = result['total_samples'] correct = result['correct_predictions'] - failed = result['failed_predictions'] + errors = result['failed_predictions'] accuracy = result['accuracy'] * 100 - failure_rate = result['failure_rate'] * 100 + error_rate = result['failure_rate'] * 100 + avg_time = result.get('avg_prediction_time', 0.0) + min_time = result.get('min_prediction_time', 0.0) + max_time = result.get('max_prediction_time', 0.0) + time_range = f"{min_time:.2f} - {max_time:.2f}" + vram_max = result.get('vram_max_mb', 0.0) / 1024 + vram_avg = result.get('vram_avg_mb', 0.0) / 1024 - f.write(f"| {model_name} | {total} | {correct} | {failed} | {accuracy:.2f}% | {failure_rate:.2f}% |\n") + f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n") # Detailed results for each model for result in all_results: f.write(f"\n## {result['model_name']} - Detailed Results\n\n") - f.write("| Sample ID | Instruction | BBox | Predicted | Correct | Failed |\n") - f.write("|-----------|-------------|------|-----------|---------|--------|\n") + f.write("| Sample Index | Instruction | BBox | Predicted | Correct | Error | Time (s) |\n") + f.write("|-----------|-------------|------|-----------|---------|-------|----------|\n") for sample_result in result['results'][:10]: # Show first 10 samples - sample_id = sample_result['id'] + sample_idx = sample_result['sample_idx'] instruction = sample_result['instruction'][:50] + "..." 
if len(sample_result['instruction']) > 50 else sample_result['instruction'] bbox = str(sample_result['bbox']) predicted = str(sample_result['predicted_coords']) if sample_result['predicted_coords'] else "None" correct = "PASS" if sample_result['is_correct'] else "FAIL" - failed = "YES" if sample_result['failed'] else "NO" + error = "YES" if sample_result['failed'] else "NO" + pred_time = sample_result.get('prediction_time', 0.0) - f.write(f"| {sample_id} | {instruction} | {bbox} | {predicted} | {correct} | {failed} |\n") + f.write(f"| {sample_idx} | {instruction} | {bbox} | {predicted} | {correct} | {error} | {pred_time:.2f} |\n") if len(result['results']) > 10: f.write(f"\n*Showing first 10 of {len(result['results'])} samples*\n") @@ -177,76 +245,68 @@ def save_results_to_markdown(all_results: List[dict], output_file: str = "screen print(f"\nResults saved to: {output_file}") -def save_visualizations(all_results: List[dict], dataset_list, output_dir: str = "output") -> None: +def save_visualizations(all_results: List[dict], samples, output_dir: str = "output") -> None: """ Save visualizations of predicted coordinates vs bboxes to an output folder. Args: all_results: List of evaluation results for each model - dataset_list: List of dataset samples + samples: List of sample dicts with image, bbox, instruction keys output_dir: Output directory path """ - # Create output directory os.makedirs(output_dir, exist_ok=True) for result in all_results: - model_name = result['model_name'].replace('/', '_').replace('.', '_') + model_name = result['model_name'].replace('/', '_').replace('\\', '_') model_dir = os.path.join(output_dir, model_name) os.makedirs(model_dir, exist_ok=True) - print(f"\nSaving visualizations for {result['model_name']}...") + print(f"Saving visualizations for {result['model_name']}...") + # Save first 10 samples for visualization for i, sample_result in enumerate(tqdm(result['results'][:10], desc=f"Saving {model_name} visualizations")): - try: - # Find the original sample - sample_id = sample_result['id'] - sample = None - for s in dataset_list: - if s['id'] == sample_id: - sample = s - break - - if sample is None: - continue - - # Get image and data - image = sample['image'].copy() - bbox = sample_result['bbox'] # [x1, y1, x2, y2] - predicted_coords = sample_result['predicted_coords'] - is_correct = sample_result['is_correct'] - - # Draw on image - draw = ImageDraw.Draw(image) - - # Draw bounding box (ground truth) in green - x1, y1, x2, y2 = bbox - draw.rectangle([x1, y1, x2, y2], outline="green", width=3) - draw.text((x1, y1-20), "Ground Truth", fill="green") - - # Draw predicted click in red or blue - if predicted_coords is not None: - px, py = predicted_coords - color = "blue" if is_correct else "red" - # Draw crosshair - crosshair_size = 15 - draw.line([(px-crosshair_size, py), (px+crosshair_size, py)], fill=color, width=3) - draw.line([(px, py-crosshair_size), (px, py+crosshair_size)], fill=color, width=3) - draw.text((px+10, py-20), f"Predicted ({px},{py})", fill=color) - - # Add status text - status = "CORRECT" if is_correct else "INCORRECT" - status_color = "blue" if is_correct else "red" - draw.text((10, 10), f"Status: {status}", fill=status_color) - draw.text((10, 30), f"Instruction: {sample_result['instruction'][:50]}...", fill="black") - - # Save image - filename = f"sample_{i+1:02d}_{sample_id}_{status.lower()}.png" - filepath = os.path.join(model_dir, filename) - image.save(filepath) - - except Exception as e: - print(f"Error saving visualization for sample 
{sample_id}: {e}") + # Get sample data using index + sample_idx = sample_result['sample_idx'] + + if sample_idx < len(samples): + sample = samples[sample_idx] + image = sample['image'].copy() # Make a copy to avoid modifying original + else: + print(f"Warning: Could not find sample at index {sample_idx}") continue + + bbox = sample_result['bbox'] + predicted_coords = sample_result['predicted_coords'] + is_correct = sample_result['is_correct'] + + # Draw on image + draw = ImageDraw.Draw(image) + + # Draw bounding box (ground truth) in green + x1, y1, x2, y2 = bbox + draw.rectangle([x1, y1, x2, y2], outline="green", width=3) + draw.text((x1, y1-20), "Ground Truth", fill="green") + + # Draw predicted click in red or blue + if predicted_coords is not None: + px, py = predicted_coords + color = "blue" if is_correct else "red" + # Draw crosshair + crosshair_size = 15 + draw.line([(px-crosshair_size, py), (px+crosshair_size, py)], fill=color, width=3) + draw.line([(px, py-crosshair_size), (px, py+crosshair_size)], fill=color, width=3) + draw.text((px+10, py-20), f"Predicted ({px},{py})", fill=color) + + # Add status text + status = "CORRECT" if is_correct else "INCORRECT" + status_color = "blue" if is_correct else "red" + draw.text((10, 10), f"Status: {status}", fill=status_color) + draw.text((10, 30), f"Instruction: {sample_result['instruction'][:50]}...", fill="black") + + # Save image + filename = f"sample_{i+1:02d}_idx{sample_idx}_{status.lower()}.png" + filepath = os.path.join(model_dir, filename) + image.save(filepath) print(f"Visualizations saved to: {model_dir}")
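For reference, a hand-assembled illustration of the result structure that `save_results_to_markdown()` and `save_visualizations()` consume (normally produced by `evaluate_model()` in `ss-pro.py` / `ss-v2.py`; all values below are made up):

```python
from PIL import Image
from utils import save_results_to_markdown, save_visualizations

samples = [{
    'image': Image.new('RGB', (1280, 800), 'white'),   # placeholder screenshot
    'instruction': 'Click the Save button',
    'bbox': [100, 200, 180, 240],                      # [x1, y1, x2, y2]
}]

all_results = [{
    'model_name': 'demo-model',
    'total_samples': 1,
    'correct_predictions': 1,
    'failed_predictions': 0,
    'accuracy': 1.0,
    'failure_rate': 0.0,
    'avg_prediction_time': 0.42,
    'min_prediction_time': 0.42,
    'max_prediction_time': 0.42,
    'vram_max_mb': 4608.0,
    'vram_avg_mb': 3481.6,
    'results': [{
        'sample_idx': 0,
        'instruction': 'Click the Save button',
        'bbox': [100, 200, 180, 240],
        'predicted_coords': (140, 220),
        'is_correct': True,
        'failed': False,
        'prediction_time': 0.42,
    }],
}]

save_results_to_markdown(all_results, "demo_results.md", title="Demo Results")
save_visualizations(all_results, samples, output_dir="output")
```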