updated metrics

Dillon DuPont
2025-07-30 16:12:51 -04:00
parent ffc88e2031
commit 8aef7b8b1a
4 changed files with 96 additions and 45 deletions

View File

@@ -98,19 +98,13 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
### 1. Configure Models
Edit `utils.py` to specify which models you want to test in `get_available_models()`.
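A minimal sketch of the list returned by `get_available_models()` (the `local_provider` value below is a placeholder assumption; the model entries mirror those currently in `utils.py`, and you comment them in or out to control which models are benchmarked):
```python
from typing import List, Union

from models import GTA1Model              # reference model class shipped with the repo
from models.base import ModelProtocol

# Placeholder assumption; the real provider prefix is defined in utils.py.
local_provider = "huggingface-local/"

def get_available_models() -> List[Union[str, ModelProtocol]]:
    """Return the models to benchmark; comment entries in or out as needed."""
    return [
        f"{local_provider}HelloKKMe/GTA1-7B",      # ComputerAgent model string
        # f"{local_provider}HelloKKMe/GTA1-32B",   # larger variant, disabled by default
        GTA1Model("HelloKKMe/GTA1-7B"),            # reference model class
    ]
```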
### 2. Set Sample Count
Edit the benchmark script to change the number of samples:
```python
max_samples = 50 # Set to None to evaluate on full dataset
```
### 3. Run Benchmark
### 2. Run Benchmark
```bash
# ScreenSpot-v2 benchmark
python ss-v2.py
python ss-v2.py --samples 50
# ScreenSpot-Pro benchmark
python ss-pro.py
python ss-pro.py --samples 50
# Interactive testing
python interactive.py
@@ -121,14 +115,9 @@ python interactive.py
### Console Output
```
Model Results:
Accuracy: 85.50%
Correct: 171/200
Errors: 5
Error Rate: 2.50%
Avg Time: 1.23s
Time Range: 0.89s - 2.45s
VRAM Max: 4.5GB
VRAM Avg: 3.4GB
Accuracy: 85.50% (171/200)
Avg Time: 1.23s (0.89s - 2.45s)
VRAM Usage: 4.5GB (max) / 3.4GB (avg)
```
### Generated Files
@@ -139,20 +128,10 @@ Model Results:
## Metrics Tracked
- **Accuracy**: Percentage of clicks within bounding boxes (see the sketch after this list)
- **Error Rate**: Percentage of failed predictions
- **Timing**: Average, min, max prediction times
- **VRAM Usage**: Maximum and average GPU memory usage
- **Per-sample Results**: Detailed breakdown for debugging
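A minimal sketch of the bounding-box hit test behind the accuracy metric; the actual `is_click_in_bbox` in `utils.py` may differ in signature, and the `[x1, y1, x2, y2]` bbox convention is an assumption:
```python
def is_click_in_bbox(click_x: float, click_y: float, bbox: list) -> bool:
    """Return True if a predicted click falls inside the target bounding box."""
    x1, y1, x2, y2 = bbox  # assumed [x1, y1, x2, y2] pixel coordinates
    return x1 <= click_x <= x2 and y1 <= click_y <= y2

# Aggregate metrics are then simple ratios over the evaluated samples:
#   accuracy   = correct_predictions / total_samples
#   error_rate = failed_predictions  / total_samples
```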
## Requirements
- Python 3.8+
- PyTorch (for VRAM tracking)
- PIL/Pillow (for image processing)
- datasets (for HuggingFace datasets)
- tqdm (for progress bars)
- Computer Agent SDK
## Architecture
The benchmark system is designed for:
@@ -160,13 +139,6 @@ The benchmark system is designed for:
- **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys (example after this list)
- **Performance**: VRAM tracking and timing analysis
- **Visualization**: Automatic generation of prediction visualizations
- **No Exception Handling**: Fails fast to surface real issues
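For reference, a minimal example of one such sample dict (the field values and the bbox convention are illustrative assumptions, not taken from the datasets):
```python
from PIL import Image

sample = {
    "image": Image.open("screenshot.png"),   # full-screen UI screenshot
    "bbox": [120, 340, 260, 380],            # target element, assumed [x1, y1, x2, y2]
    "instruction": "Click the Save button",  # natural-language grounding instruction
}
```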
## Results Table
| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg |
|-------|---------|----------|------------|----------|----------|----------|
| (coming soon) | | | | | | |
## Contributing

View File

@@ -6,7 +6,10 @@ Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""
import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
is_click_in_bbox,
save_results_to_markdown,
save_visualizations,
get_available_models
get_available_models,
get_gpu_memory
)
@@ -87,6 +91,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
# Calculate timing statistics
successful_times = [r['prediction_time'] for r in results if not r['failed']]
avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
min_prediction_time = min(successful_times) if successful_times else 0.0
max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -101,6 +106,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
'accuracy': accuracy,
'failure_rate': error_rate,
'avg_prediction_time': avg_prediction_time,
'median_prediction_time': median_prediction_time,
'min_prediction_time': min_prediction_time,
'max_prediction_time': max_prediction_time,
'vram_max_mb': vram_stats['max_mb'],
@@ -113,6 +119,17 @@ async def main():
"""
Main function to run the benchmark.
"""
# Parse command line arguments
parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
parser.add_argument('--samples', type=int, default=300,
help='Number of samples to evaluate (default: 300)')
parser.add_argument('--seed', type=int, default=42,
help='Random seed for shuffling (default: 42)')
args = parser.parse_args()
# Set random seed
random.seed(args.seed)
# Load dataset
print("Loading ScreenSpot-Pro dataset...")
ds = load_dataset("lmms-lab/ScreenSpot-Pro")
@@ -121,11 +138,15 @@ async def main():
dataset_list = list(dataset)
print(f"Dataset loaded: {len(dataset_list)} samples")
# Shuffle dataset with seed
random.shuffle(dataset_list)
print(f"Dataset shuffled with seed {args.seed}")
# Get available models
models = get_available_models()
# Evaluation settings
max_samples = 300 # Set to None to evaluate on full dataset
max_samples = args.samples # Use command line argument
# Run evaluations
all_results = []
@@ -142,9 +163,15 @@ async def main():
print(f" Errors: {result['failed_predictions']}")
print(f" Error Rate: {result['failure_rate']*100:.2f}%")
print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
print(f" Median Time: {result['median_prediction_time']:.2f}s")
print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")
# Print GPU memory info
gpu_memory = get_gpu_memory()
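# get_gpu_memory() returns free memory in MB, one value per visible GPU; only the first GPU is reported here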
if gpu_memory and gpu_memory[0] > 0:
print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")
# Save results
if all_results:

View File

@@ -1,12 +1,15 @@
#!/usr/bin/env python3
"""
ScreenSpot-Pro Benchmark Script
ScreenSpot-v2 Benchmark Script
Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""
import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
is_click_in_bbox,
save_results_to_markdown,
save_visualizations,
get_available_models
get_available_models,
get_gpu_memory
)
@@ -93,6 +97,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
# Calculate timing statistics
successful_times = [r['prediction_time'] for r in results if not r['failed']]
avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
min_prediction_time = min(successful_times) if successful_times else 0.0
max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -107,6 +112,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
'accuracy': accuracy,
'failure_rate': error_rate,
'avg_prediction_time': avg_prediction_time,
'median_prediction_time': median_prediction_time,
'min_prediction_time': min_prediction_time,
'max_prediction_time': max_prediction_time,
'vram_max_mb': vram_stats['max_mb'],
@@ -119,6 +125,17 @@ async def main():
"""
Main function to run the benchmark.
"""
# Parse command line arguments
parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script')
parser.add_argument('--samples', type=int, default=500,
help='Number of samples to evaluate (default: 500)')
parser.add_argument('--seed', type=int, default=42,
help='Random seed for shuffling (default: 42)')
args = parser.parse_args()
# Set random seed
random.seed(args.seed)
# Load dataset
print("Loading ScreenSpot-v2 dataset...")
ds = load_dataset("lmms-lab/ScreenSpot-v2")
@@ -141,11 +158,15 @@ async def main():
})
print(f"Dataset loaded: {len(samples)} samples")
# Shuffle samples with seed
random.shuffle(samples)
print(f"Samples shuffled with seed {args.seed}")
# Get available models
models = get_available_models()
# Evaluation settings
max_samples = 500 # Set to None to evaluate on full dataset
max_samples = args.samples # Use command line argument
# Run evaluations
all_results = []
@@ -162,9 +183,15 @@ async def main():
print(f" Errors: {result['failed_predictions']}")
print(f" Error Rate: {result['failure_rate']*100:.2f}%")
print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
print(f" Median Time: {result['median_prediction_time']:.2f}s")
print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")
# Print GPU memory info
gpu_memory = get_gpu_memory()
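# get_gpu_memory() returns free memory in MB, one value per visible GPU; only the first GPU is reported here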
if gpu_memory and gpu_memory[0] > 0:
print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")
# Save results
if all_results:

View File

@@ -7,6 +7,8 @@ import asyncio
import base64
import os
import sys
import subprocess as sp
import statistics
from datetime import datetime
from io import BytesIO
from typing import List, Union, Tuple, Optional
@@ -22,6 +24,28 @@ from agent.agent import ComputerAgent
from models import GTA1Model
from models.base import ModelProtocol
def get_gpu_memory() -> List[int]:
"""
Get free GPU memory using nvidia-smi (falling back to torch if nvidia-smi is unavailable).
Returns:
List of free memory values in MB for each GPU
"""
try:
command = "nvidia-smi --query-gpu=memory.free --format=csv"
memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
memory_free_values = [int(x.split()[0]) for x in memory_free_info]
return memory_free_values
except (sp.CalledProcessError, FileNotFoundError, IndexError):
# Fallback to torch if nvidia-smi is not available
if torch.cuda.is_available():
device = torch.cuda.current_device()
total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
return [int(total - reserved)]
return [0]
def get_vram_usage() -> dict:
"""
Get current VRAM usage statistics.
@@ -61,11 +85,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
models = [
# === ComputerAgent model strings ===
f"{local_provider}HelloKKMe/GTA1-7B",
f"{local_provider}HelloKKMe/GTA1-32B",
# f"{local_provider}HelloKKMe/GTA1-32B",
# === Reference model classes ===
GTA1Model("HelloKKMe/GTA1-7B"),
GTA1Model("HelloKKMe/GTA1-32B"),
# GTA1Model("HelloKKMe/GTA1-32B"),
]
return models
@@ -203,8 +227,8 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
# Summary table
f.write("## Summary\n\n")
f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n")
f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
f.write("|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n")
for result in all_results:
model_name = result['model_name']
@@ -214,13 +238,14 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
accuracy = result['accuracy'] * 100
error_rate = result['failure_rate'] * 100
avg_time = result.get('avg_prediction_time', 0.0)
median_time = result.get('median_prediction_time', 0.0)
min_time = result.get('min_prediction_time', 0.0)
max_time = result.get('max_prediction_time', 0.0)
time_range = f"{min_time:.2f} - {max_time:.2f}"
vram_max = result.get('vram_max_mb', 0.0) / 1024
vram_avg = result.get('vram_avg_mb', 0.0) / 1024
f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
# Detailed results for each model
for result in all_results: