updated metrics

Dillon DuPont
2025-07-30 16:12:51 -04:00
parent ffc88e2031
commit 8aef7b8b1a
4 changed files with 96 additions and 45 deletions

View File

@@ -98,19 +98,13 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
### 1. Configure Models
Edit `utils.py` to specify which models you want to test in `get_available_models()`.
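A minimal sketch of the list returned by `get_available_models()` (the `local_provider` value below is a placeholder assumption; the model entries mirror those currently in `utils.py`, and you comment them in or out to control which models are benchmarked):
```python
from typing import List, Union

from models import GTA1Model              # reference model class shipped with the repo
from models.base import ModelProtocol

# Placeholder assumption; the real provider prefix is defined in utils.py.
local_provider = "huggingface-local/"

def get_available_models() -> List[Union[str, ModelProtocol]]:
    """Return the models to benchmark; comment entries in or out as needed."""
    return [
        f"{local_provider}HelloKKMe/GTA1-7B",      # ComputerAgent model string
        # f"{local_provider}HelloKKMe/GTA1-32B",   # larger variant, disabled by default
        GTA1Model("HelloKKMe/GTA1-7B"),            # reference model class
    ]
```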
### 2. Set Sample Count
Edit the benchmark script to change the number of samples:
```python
max_samples = 50 # Set to None to evaluate on full dataset
```
### 3. Run Benchmark
### 2. Run Benchmark
```bash
# ScreenSpot-v2 benchmark
python ss-v2.py
python ss-v2.py --samples 50
# ScreenSpot-Pro benchmark
python ss-pro.py
python ss-pro.py --samples 50
# Interactive testing
python interactive.py
@@ -121,14 +115,9 @@ python interactive.py
### Console Output
```
Model Results:
Accuracy: 85.50%
Correct: 171/200
Errors: 5
Error Rate: 2.50%
Avg Time: 1.23s
Time Range: 0.89s - 2.45s
VRAM Max: 4.5GB
VRAM Avg: 3.4GB
Accuracy: 85.50% (171/200)
Avg Time: 1.23s (0.89s - 2.45s)
VRAM Usage: 4.5GB (max) / 3.4GB (avg)
```
### Generated Files
@@ -139,20 +128,10 @@ Model Results:
## Metrics Tracked
- **Accuracy**: Percentage of clicks within bounding boxes (see the sketch after this list)
- **Error Rate**: Percentage of failed predictions
- **Timing**: Average, min, max prediction times
- **VRAM Usage**: Maximum and average GPU memory usage
- **Per-sample Results**: Detailed breakdown for debugging
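A minimal sketch of the bounding-box hit test behind the accuracy metric; the actual `is_click_in_bbox` in `utils.py` may differ in signature, and the `[x1, y1, x2, y2]` bbox convention is an assumption:
```python
def is_click_in_bbox(click_x: float, click_y: float, bbox: list) -> bool:
    """Return True if a predicted click falls inside the target bounding box."""
    x1, y1, x2, y2 = bbox  # assumed [x1, y1, x2, y2] pixel coordinates
    return x1 <= click_x <= x2 and y1 <= click_y <= y2

# Aggregate metrics are then simple ratios over the evaluated samples:
#   accuracy   = correct_predictions / total_samples
#   error_rate = failed_predictions  / total_samples
```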
## Requirements
- Python 3.8+
- PyTorch (for VRAM tracking)
- PIL/Pillow (for image processing)
- datasets (for HuggingFace datasets)
- tqdm (for progress bars)
- Computer Agent SDK
## Architecture
The benchmark system is designed for:
@@ -160,13 +139,6 @@ The benchmark system is designed for:
- **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys (example after this list)
- **Performance**: VRAM tracking and timing analysis
- **Visualization**: Automatic generation of prediction visualizations
- **No Exception Handling**: Fails fast to surface real issues
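For reference, a minimal example of one such sample dict (the field values and the bbox convention are illustrative assumptions, not taken from the datasets):
```python
from PIL import Image

sample = {
    "image": Image.open("screenshot.png"),   # full-screen UI screenshot
    "bbox": [120, 340, 260, 380],            # target element, assumed [x1, y1, x2, y2]
    "instruction": "Click the Save button",  # natural-language grounding instruction
}
```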
## Results Table
| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg |
|-------|---------|----------|------------|----------|----------|----------|
| (coming soon) | | | | | | |
## Contributing

View File

@@ -6,7 +6,10 @@ Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""
import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
is_click_in_bbox,
save_results_to_markdown,
save_visualizations,
get_available_models
get_available_models,
get_gpu_memory
)
@@ -87,6 +91,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
# Calculate timing statistics
successful_times = [r['prediction_time'] for r in results if not r['failed']]
avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
min_prediction_time = min(successful_times) if successful_times else 0.0
max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -101,6 +106,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
'accuracy': accuracy,
'failure_rate': error_rate,
'avg_prediction_time': avg_prediction_time,
'median_prediction_time': median_prediction_time,
'min_prediction_time': min_prediction_time,
'max_prediction_time': max_prediction_time,
'vram_max_mb': vram_stats['max_mb'],
@@ -113,6 +119,17 @@ async def main():
"""
Main function to run the benchmark.
"""
# Parse command line arguments
parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
parser.add_argument('--samples', type=int, default=300,
help='Number of samples to evaluate (default: 300)')
parser.add_argument('--seed', type=int, default=42,
help='Random seed for shuffling (default: 42)')
args = parser.parse_args()
# Set random seed
random.seed(args.seed)
# Load dataset
print("Loading ScreenSpot-Pro dataset...")
ds = load_dataset("lmms-lab/ScreenSpot-Pro")
@@ -121,11 +138,15 @@ async def main():
dataset_list = list(dataset)
print(f"Dataset loaded: {len(dataset_list)} samples")
# Shuffle dataset with seed
random.shuffle(dataset_list)
print(f"Dataset shuffled with seed {args.seed}")
# Get available models
models = get_available_models()
# Evaluation settings
max_samples = 300 # Set to None to evaluate on full dataset
max_samples = args.samples # Use command line argument
# Run evaluations
all_results = []
@@ -142,9 +163,15 @@ async def main():
print(f" Errors: {result['failed_predictions']}")
print(f" Error Rate: {result['failure_rate']*100:.2f}%")
print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
print(f" Median Time: {result['median_prediction_time']:.2f}s")
print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")
# Print GPU memory info
gpu_memory = get_gpu_memory()
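# get_gpu_memory() returns free memory in MB, one value per visible GPU; only the first GPU is reported here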
if gpu_memory and gpu_memory[0] > 0:
print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")
# Save results
if all_results:

View File

@@ -1,12 +1,15 @@
#!/usr/bin/env python3
"""
ScreenSpot-Pro Benchmark Script
ScreenSpot-v2 Benchmark Script
Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""
import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
is_click_in_bbox,
save_results_to_markdown,
save_visualizations,
get_available_models
get_available_models,
get_gpu_memory
)
@@ -93,6 +97,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
# Calculate timing statistics
successful_times = [r['prediction_time'] for r in results if not r['failed']]
avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
min_prediction_time = min(successful_times) if successful_times else 0.0
max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -107,6 +112,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
'accuracy': accuracy,
'failure_rate': error_rate,
'avg_prediction_time': avg_prediction_time,
'median_prediction_time': median_prediction_time,
'min_prediction_time': min_prediction_time,
'max_prediction_time': max_prediction_time,
'vram_max_mb': vram_stats['max_mb'],
@@ -119,6 +125,17 @@ async def main():
"""
Main function to run the benchmark.
"""
# Parse command line arguments
parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script')
parser.add_argument('--samples', type=int, default=500,
help='Number of samples to evaluate (default: 500)')
parser.add_argument('--seed', type=int, default=42,
help='Random seed for shuffling (default: 42)')
args = parser.parse_args()
# Set random seed
random.seed(args.seed)
# Load dataset
print("Loading ScreenSpot-v2 dataset...")
ds = load_dataset("lmms-lab/ScreenSpot-v2")
@@ -141,11 +158,15 @@ async def main():
})
print(f"Dataset loaded: {len(samples)} samples")
# Shuffle samples with seed
random.shuffle(samples)
print(f"Samples shuffled with seed {args.seed}")
# Get available models
models = get_available_models()
# Evaluation settings
max_samples = 500 # Set to None to evaluate on full dataset
max_samples = args.samples # Use command line argument
# Run evaluations
all_results = []
@@ -162,9 +183,15 @@ async def main():
print(f" Errors: {result['failed_predictions']}")
print(f" Error Rate: {result['failure_rate']*100:.2f}%")
print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
print(f" Median Time: {result['median_prediction_time']:.2f}s")
print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")
# Print GPU memory info
gpu_memory = get_gpu_memory()
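# get_gpu_memory() returns free memory in MB, one value per visible GPU; only the first GPU is reported here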
if gpu_memory and gpu_memory[0] > 0:
print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")
# Save results
if all_results:

View File

@@ -7,6 +7,8 @@ import asyncio
import base64
import os
import sys
import subprocess as sp
import statistics
from datetime import datetime
from io import BytesIO
from typing import List, Union, Tuple, Optional
@@ -22,6 +24,28 @@ from agent.agent import ComputerAgent
from models import GTA1Model
from models.base import ModelProtocol
def get_gpu_memory() -> List[int]:
"""
Get free GPU memory using nvidia-smi (falling back to torch if nvidia-smi is unavailable).
Returns:
List of free memory values in MB for each GPU
"""
try:
command = "nvidia-smi --query-gpu=memory.free --format=csv"
memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
memory_free_values = [int(x.split()[0]) for x in memory_free_info]
return memory_free_values
except (sp.CalledProcessError, FileNotFoundError, IndexError):
# Fallback to torch if nvidia-smi is not available
if torch.cuda.is_available():
device = torch.cuda.current_device()
total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
return [int(total - reserved)]
return [0]
def get_vram_usage() -> dict:
"""
Get current VRAM usage statistics.
@@ -61,11 +85,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
models = [
# === ComputerAgent model strings ===
f"{local_provider}HelloKKMe/GTA1-7B",
f"{local_provider}HelloKKMe/GTA1-32B",
# f"{local_provider}HelloKKMe/GTA1-32B",
# === Reference model classes ===
GTA1Model("HelloKKMe/GTA1-7B"),
GTA1Model("HelloKKMe/GTA1-32B"),
# GTA1Model("HelloKKMe/GTA1-32B"),
]
return models
@@ -203,8 +227,8 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
# Summary table
f.write("## Summary\n\n")
f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n")
f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
f.write("|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n")
for result in all_results:
model_name = result['model_name']
@@ -214,13 +238,14 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
accuracy = result['accuracy'] * 100
error_rate = result['failure_rate'] * 100
avg_time = result.get('avg_prediction_time', 0.0)
median_time = result.get('median_prediction_time', 0.0)
min_time = result.get('min_prediction_time', 0.0)
max_time = result.get('max_prediction_time', 0.0)
time_range = f"{min_time:.2f} - {max_time:.2f}"
vram_max = result.get('vram_max_mb', 0.0) / 1024
vram_avg = result.get('vram_avg_mb', 0.0) / 1024
f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
# Detailed results for each model
for result in all_results: