diff --git a/libs/python/agent/benchmarks/README.md b/libs/python/agent/benchmarks/README.md
index 033f0157..225fc30b 100644
--- a/libs/python/agent/benchmarks/README.md
+++ b/libs/python/agent/benchmarks/README.md
@@ -98,19 +98,13 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
 ### 1. Configure Models
 Edit `utils.py` to specify which models you want to test in `get_available_models()`.
 
-### 2. Set Sample Count
-Edit the benchmark script to change the number of samples:
-```python
-max_samples = 50 # Set to None to evaluate on full dataset
-```
-
-### 3. Run Benchmark
+### 2. Run Benchmark
 ```bash
 # ScreenSpot-v2 benchmark
-python ss-v2.py
+python ss-v2.py --samples 50
 
 # ScreenSpot-Pro benchmark
-python ss-pro.py
+python ss-pro.py --samples 50
 
 # Interactive testing
 python interactive.py
@@ -121,14 +115,9 @@ python interactive.py
 ### Console Output
 ```
 Model Results:
-  Accuracy: 85.50%
-  Correct: 171/200
-  Errors: 5
-  Error Rate: 2.50%
-  Avg Time: 1.23s
-  Time Range: 0.89s - 2.45s
-  VRAM Max: 4.5GB
-  VRAM Avg: 3.4GB
+  Accuracy: 85.50% (171/200)
+  Avg Time: 1.23s (0.89s - 2.45s)
+  VRAM Usage: 4.5GB (max) / 3.4GB (avg)
 ```
 
 ### Generated Files
@@ -139,20 +128,10 @@ Model Results:
 ## Metrics Tracked
 
 - **Accuracy**: Percentage of clicks within bounding boxes
-- **Error Rate**: Percentage of failed predictions
 - **Timing**: Average, min, max prediction times
 - **VRAM Usage**: Maximum and average GPU memory usage
 - **Per-sample Results**: Detailed breakdown for debugging
 
-## Requirements
-
-- Python 3.8+
-- PyTorch (for VRAM tracking)
-- PIL/Pillow (for image processing)
-- datasets (for HuggingFace datasets)
-- tqdm (for progress bars)
-- Computer Agent SDK
-
 ## Architecture
 
 The benchmark system is designed for:
@@ -160,13 +139,6 @@ The benchmark system is designed for:
 - **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys
 - **Performance**: VRAM tracking and timing analysis
 - **Visualization**: Automatic generation of prediction visualizations
-- **No Exception Handling**: Fails fast to surface real issues
-
-## Results Table
-
-| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg |
-|-------|---------|----------|------------|----------|----------|----------|
-| (coming soon) | | | | | | |
 
 ## Contributing
 
diff --git a/libs/python/agent/benchmarks/ss-pro.py b/libs/python/agent/benchmarks/ss-pro.py
index e1e54a1d..80e5e72f 100644
--- a/libs/python/agent/benchmarks/ss-pro.py
+++ b/libs/python/agent/benchmarks/ss-pro.py
@@ -6,7 +6,10 @@ Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
 Supports both ComputerAgent model strings and custom model classes.
 """
 
+import argparse
 import asyncio
+import random
+import statistics
 import time
 
 from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
     is_click_in_bbox,
     save_results_to_markdown,
     save_visualizations,
-    get_available_models
+    get_available_models,
+    get_gpu_memory
 )
 
 
@@ -87,6 +91,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
     # Calculate timing statistics
     successful_times = [r['prediction_time'] for r in results if not r['failed']]
     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
+    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
     min_prediction_time = min(successful_times) if successful_times else 0.0
     max_prediction_time = max(successful_times) if successful_times else 0.0
 
@@ -101,6 +106,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
         'accuracy': accuracy,
         'failure_rate': error_rate,
         'avg_prediction_time': avg_prediction_time,
+        'median_prediction_time': median_prediction_time,
         'min_prediction_time': min_prediction_time,
         'max_prediction_time': max_prediction_time,
         'vram_max_mb': vram_stats['max_mb'],
@@ -113,6 +119,17 @@ async def main():
     """
     Main function to run the benchmark.
     """
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
+    parser.add_argument('--samples', type=int, default=300,
+                        help='Number of samples to evaluate (default: 300)')
+    parser.add_argument('--seed', type=int, default=42,
+                        help='Random seed for shuffling (default: 42)')
+    args = parser.parse_args()
+
+    # Set random seed
+    random.seed(args.seed)
+
     # Load dataset
     print("Loading ScreenSpot-Pro dataset...")
     ds = load_dataset("lmms-lab/ScreenSpot-Pro")
@@ -121,11 +138,15 @@ async def main():
     dataset_list = list(dataset)
     print(f"Dataset loaded: {len(dataset_list)} samples")
 
+    # Shuffle dataset with seed
+    random.shuffle(dataset_list)
+    print(f"Dataset shuffled with seed {args.seed}")
+
     # Get available models
     models = get_available_models()
 
     # Evaluation settings
-    max_samples = 300 # Set to None to evaluate on full dataset
+    max_samples = args.samples # Use command line argument
 
     # Run evaluations
     all_results = []
@@ -142,9 +163,15 @@ async def main():
         print(f"  Errors: {result['failed_predictions']}")
         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
+        print(f"  Median Time: {result['median_prediction_time']:.2f}s")
         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
+
+        # Print GPU memory info
+        gpu_memory = get_gpu_memory()
+        if gpu_memory and gpu_memory[0] > 0:
+            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
 
     # Save results
     if all_results:
diff --git a/libs/python/agent/benchmarks/ss-v2.py b/libs/python/agent/benchmarks/ss-v2.py
index 919a1001..dab1d4b1 100644
--- a/libs/python/agent/benchmarks/ss-v2.py
+++ b/libs/python/agent/benchmarks/ss-v2.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
 """
-ScreenSpot-Pro Benchmark Script
+ScreenSpot-v2 Benchmark Script
 
-Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
+Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
 Supports both ComputerAgent model strings and custom model classes.
 """
 
+import argparse
 import asyncio
+import random
+import statistics
 import time
 
 from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
     is_click_in_bbox,
     save_results_to_markdown,
     save_visualizations,
-    get_available_models
+    get_available_models,
+    get_gpu_memory
 )
 
 
@@ -93,6 +97,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
     # Calculate timing statistics
     successful_times = [r['prediction_time'] for r in results if not r['failed']]
     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
+    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
     min_prediction_time = min(successful_times) if successful_times else 0.0
     max_prediction_time = max(successful_times) if successful_times else 0.0
 
@@ -107,6 +112,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
         'accuracy': accuracy,
         'failure_rate': error_rate,
         'avg_prediction_time': avg_prediction_time,
+        'median_prediction_time': median_prediction_time,
         'min_prediction_time': min_prediction_time,
         'max_prediction_time': max_prediction_time,
         'vram_max_mb': vram_stats['max_mb'],
@@ -119,6 +125,17 @@ async def main():
     """
     Main function to run the benchmark.
     """
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script')
+    parser.add_argument('--samples', type=int, default=500,
+                        help='Number of samples to evaluate (default: 500)')
+    parser.add_argument('--seed', type=int, default=42,
+                        help='Random seed for shuffling (default: 42)')
+    args = parser.parse_args()
+
+    # Set random seed
+    random.seed(args.seed)
+
     # Load dataset
     print("Loading ScreenSpot-v2 dataset...")
     ds = load_dataset("lmms-lab/ScreenSpot-v2")
@@ -141,11 +158,15 @@ async def main():
         })
     print(f"Dataset loaded: {len(samples)} samples")
 
+    # Shuffle samples with seed
+    random.shuffle(samples)
+    print(f"Samples shuffled with seed {args.seed}")
+
     # Get available models
     models = get_available_models()
 
     # Evaluation settings
-    max_samples = 500 # Set to None to evaluate on full dataset
+    max_samples = args.samples # Use command line argument
 
     # Run evaluations
     all_results = []
@@ -162,9 +183,15 @@ async def main():
         print(f"  Errors: {result['failed_predictions']}")
         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
+        print(f"  Median Time: {result['median_prediction_time']:.2f}s")
         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
+
+        # Print GPU memory info
+        gpu_memory = get_gpu_memory()
+        if gpu_memory and gpu_memory[0] > 0:
+            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
 
     # Save results
     if all_results:
diff --git a/libs/python/agent/benchmarks/utils.py b/libs/python/agent/benchmarks/utils.py
index 099499a5..7a3b70a3 100644
--- a/libs/python/agent/benchmarks/utils.py
+++ b/libs/python/agent/benchmarks/utils.py
@@ -7,6 +7,8 @@ import asyncio
 import base64
 import os
 import sys
+import subprocess as sp
+import statistics
 from datetime import datetime
 from io import BytesIO
 from typing import List, Union, Tuple, Optional
@@ -22,6 +24,28 @@ from agent.agent import ComputerAgent
 from models import GTA1Model
 from models.base import ModelProtocol
 
+def get_gpu_memory() -> List[int]:
+    """
+    Get GPU memory usage using nvidia-smi.
+
+    Returns:
+        List of free memory values in MB for each GPU
+    """
+    try:
+        command = "nvidia-smi --query-gpu=memory.free --format=csv"
+        memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
+        memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
+        return memory_free_values
+    except (sp.CalledProcessError, FileNotFoundError, IndexError):
+        # Fallback to torch if nvidia-smi is not available
+        if torch.cuda.is_available():
+            device = torch.cuda.current_device()
+            total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
+            reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
+            return [int(total - reserved)]
+        return [0]
+
+
 def get_vram_usage() -> dict:
     """
     Get current VRAM usage statistics.
@@ -61,11 +85,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
     models = [
         # === ComputerAgent model strings ===
         f"{local_provider}HelloKKMe/GTA1-7B",
-        f"{local_provider}HelloKKMe/GTA1-32B",
+        # f"{local_provider}HelloKKMe/GTA1-32B",
 
         # === Reference model classes ===
         GTA1Model("HelloKKMe/GTA1-7B"),
-        GTA1Model("HelloKKMe/GTA1-32B"),
+        # GTA1Model("HelloKKMe/GTA1-32B"),
     ]
 
     return models
@@ -203,8 +227,8 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
 
         # Summary table
        f.write("## Summary\n\n")
-        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
-        f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n")
+        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
+        f.write("|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n")
 
         for result in all_results:
             model_name = result['model_name']
@@ -214,13 +238,14 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
             accuracy = result['accuracy'] * 100
             error_rate = result['failure_rate'] * 100
             avg_time = result.get('avg_prediction_time', 0.0)
+            median_time = result.get('median_prediction_time', 0.0)
             min_time = result.get('min_prediction_time', 0.0)
             max_time = result.get('max_prediction_time', 0.0)
             time_range = f"{min_time:.2f} - {max_time:.2f}"
             vram_max = result.get('vram_max_mb', 0.0) / 1024
             vram_avg = result.get('vram_avg_mb', 0.0) / 1024
 
-            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
+            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
 
         # Detailed results for each model
         for result in all_results:
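For reference, a minimal usage sketch of what this patch adds: the `--samples`/`--seed` CLI flags and the `get_gpu_memory()` helper in `utils.py`. The flag values below are illustrative, and the snippet assumes it is run from `libs/python/agent/benchmarks/` with either `nvidia-smi` on the PATH or a CUDA-enabled PyTorch install.

```python
# Reproducible subset runs using the new CLI flags (values are illustrative):
#   python ss-v2.py --samples 100 --seed 42
#   python ss-pro.py --samples 50 --seed 42

# Free-memory check via the new helper: returns free MB per GPU from nvidia-smi,
# falls back to torch's view of the current device, or [0] if no GPU is found.
from utils import get_gpu_memory

free_mb = get_gpu_memory()
print(f"GPU free memory (MB per GPU): {free_mb}")
```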