mirror of https://github.com/trycua/computer.git
updated metrics
@@ -98,19 +98,13 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
### 1. Configure Models

Edit `utils.py` to specify which models you want to test in `get_available_models()`.
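For illustration only (not part of this commit), a trimmed `get_available_models()` might look like the sketch below. The `GTA1Model` entries and the `HelloKKMe/GTA1-7B` string mirror the list shown in the `utils.py` diff further down; the `local_provider` prefix value is an assumption and should be whatever `utils.py` actually defines.

```python
from typing import List, Union

from models import GTA1Model          # reference model class from this repo
from models.base import ModelProtocol


def get_available_models() -> List[Union[str, ModelProtocol]]:
    """Return the models to benchmark: ComputerAgent model strings or model objects."""
    local_provider = "huggingface-local/"  # assumed prefix; use whatever utils.py defines
    return [
        f"{local_provider}HelloKKMe/GTA1-7B",   # ComputerAgent model string
        GTA1Model("HelloKKMe/GTA1-7B"),         # reference model class
    ]
```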
### 2. Set Sample Count

Edit the benchmark script to change the number of samples:

```python
max_samples = 50 # Set to None to evaluate on full dataset
```

### 3. Run Benchmark

### 2. Run Benchmark

```bash
# ScreenSpot-v2 benchmark
python ss-v2.py
python ss-v2.py --samples 50

# ScreenSpot-Pro benchmark
python ss-pro.py
python ss-pro.py --samples 50

# Interactive testing
python interactive.py
```
@@ -121,14 +115,9 @@ python interactive.py
### Console Output

```
Model Results:
Accuracy: 85.50%
Correct: 171/200
Errors: 5
Error Rate: 2.50%
Avg Time: 1.23s
Time Range: 0.89s - 2.45s
VRAM Max: 4.5GB
VRAM Avg: 3.4GB
Accuracy: 85.50% (171/200)
Avg Time: 1.23s (0.89s - 2.45s)
VRAM Usage: 4.5GB (max) / 3.4GB (avg)
```

### Generated Files
@@ -139,20 +128,10 @@ Model Results:
## Metrics Tracked

- **Accuracy**: Percentage of clicks within bounding boxes
- **Error Rate**: Percentage of failed predictions
- **Timing**: Average, min, max prediction times
- **VRAM Usage**: Maximum and average GPU memory usage
- **Per-sample Results**: Detailed breakdown for debugging
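As a rough illustration of how the accuracy and error-rate metrics above can be computed (not part of this commit): the sketch assumes `bbox` is an `(x1, y1, x2, y2)` box, and `click_in_bbox` is a stand-in for the repo's `is_click_in_bbox` helper, whose exact signature may differ.

```python
from typing import Optional, Tuple

def click_in_bbox(click: Optional[Tuple[float, float]],
                  bbox: Tuple[float, float, float, float]) -> bool:
    """Return True if the predicted click lands inside the ground-truth box (assumed x1, y1, x2, y2)."""
    if click is None:  # failed prediction
        return False
    x, y = click
    x1, y1, x2, y2 = bbox
    return x1 <= x <= x2 and y1 <= y <= y2

# accuracy = correct clicks / total samples; error rate = failed predictions / total samples
results = [{"click": (10, 12), "bbox": (0, 0, 20, 20), "failed": False},
           {"click": None, "bbox": (30, 30, 60, 60), "failed": True}]
accuracy = sum(click_in_bbox(r["click"], r["bbox"]) for r in results) / len(results)
error_rate = sum(r["failed"] for r in results) / len(results)
```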
## Requirements

- Python 3.8+
- PyTorch (for VRAM tracking)
- PIL/Pillow (for image processing)
- datasets (for HuggingFace datasets)
- tqdm (for progress bars)
- Computer Agent SDK
## Architecture

The benchmark system is designed for:
@@ -160,13 +139,6 @@ The benchmark system is designed for:
- **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys
- **Performance**: VRAM tracking and timing analysis
- **Visualization**: Automatic generation of prediction visualizations
- **No Exception Handling**: Fails fast to surface real issues
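To make the "iterator of dicts" contract above concrete, here is a hedged sketch of what one benchmark sample might look like (values and field mapping are illustrative; the exact bbox convention is whatever the dataset provides):

```python
from PIL import Image

# One benchmark sample: a screenshot, a ground-truth box, and the instruction to ground.
sample = {
    "image": Image.new("RGB", (1920, 1080)),   # normally the dataset screenshot
    "bbox": [100, 200, 180, 240],              # ground-truth box, format per dataset
    "instruction": "Click the Save button",    # natural-language target
}

def iter_samples(dataset):
    """Any iterable yielding dicts with these three keys works as benchmark input."""
    for record in dataset:
        yield {"image": record["image"], "bbox": record["bbox"], "instruction": record["instruction"]}
```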
## Results Table

| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg |
|-------|---------|----------|------------|----------|----------|----------|
| (coming soon) | | | | | | |

## Contributing
@@ -6,7 +6,10 @@ Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""

import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
    is_click_in_bbox,
    save_results_to_markdown,
    save_visualizations,
    get_available_models
    get_available_models,
    get_gpu_memory
)
@@ -87,6 +91,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
    # Calculate timing statistics
    successful_times = [r['prediction_time'] for r in results if not r['failed']]
    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
    min_prediction_time = min(successful_times) if successful_times else 0.0
    max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -101,6 +106,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
        'accuracy': accuracy,
        'failure_rate': error_rate,
        'avg_prediction_time': avg_prediction_time,
        'median_prediction_time': median_prediction_time,
        'min_prediction_time': min_prediction_time,
        'max_prediction_time': max_prediction_time,
        'vram_max_mb': vram_stats['max_mb'],
@@ -113,6 +119,17 @@ async def main():
    """
    Main function to run the benchmark.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
    parser.add_argument('--samples', type=int, default=300,
                        help='Number of samples to evaluate (default: 300)')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed for shuffling (default: 42)')
    args = parser.parse_args()

    # Set random seed
    random.seed(args.seed)

    # Load dataset
    print("Loading ScreenSpot-Pro dataset...")
    ds = load_dataset("lmms-lab/ScreenSpot-Pro")
@@ -121,11 +138,15 @@ async def main():
    dataset_list = list(dataset)
    print(f"Dataset loaded: {len(dataset_list)} samples")

    # Shuffle dataset with seed
    random.shuffle(dataset_list)
    print(f"Dataset shuffled with seed {args.seed}")

    # Get available models
    models = get_available_models()

    # Evaluation settings
    max_samples = 300 # Set to None to evaluate on full dataset
    max_samples = args.samples # Use command line argument

    # Run evaluations
    all_results = []
@@ -142,9 +163,15 @@ async def main():
        print(f" Errors: {result['failed_predictions']}")
        print(f" Error Rate: {result['failure_rate']*100:.2f}%")
        print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
        print(f" Median Time: {result['median_prediction_time']:.2f}s")
        print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
        print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
        print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")

        # Print GPU memory info
        gpu_memory = get_gpu_memory()
        if gpu_memory and gpu_memory[0] > 0:
            print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")

    # Save results
    if all_results:
@@ -1,12 +1,15 @@
#!/usr/bin/env python3
"""
ScreenSpot-Pro Benchmark Script
ScreenSpot-v2 Benchmark Script

Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
Supports both ComputerAgent model strings and custom model classes.
"""

import argparse
import asyncio
import random
import statistics
import time
from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
    is_click_in_bbox,
    save_results_to_markdown,
    save_visualizations,
    get_available_models
    get_available_models,
    get_gpu_memory
)
@@ -93,6 +97,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
    # Calculate timing statistics
    successful_times = [r['prediction_time'] for r in results if not r['failed']]
    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
    min_prediction_time = min(successful_times) if successful_times else 0.0
    max_prediction_time = max(successful_times) if successful_times else 0.0
@@ -107,6 +112,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
        'accuracy': accuracy,
        'failure_rate': error_rate,
        'avg_prediction_time': avg_prediction_time,
        'median_prediction_time': median_prediction_time,
        'min_prediction_time': min_prediction_time,
        'max_prediction_time': max_prediction_time,
        'vram_max_mb': vram_stats['max_mb'],
@@ -119,6 +125,17 @@ async def main():
    """
    Main function to run the benchmark.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script')
    parser.add_argument('--samples', type=int, default=500,
                        help='Number of samples to evaluate (default: 500)')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed for shuffling (default: 42)')
    args = parser.parse_args()

    # Set random seed
    random.seed(args.seed)

    # Load dataset
    print("Loading ScreenSpot-v2 dataset...")
    ds = load_dataset("lmms-lab/ScreenSpot-v2")
@@ -141,11 +158,15 @@ async def main():
        })
    print(f"Dataset loaded: {len(samples)} samples")

    # Shuffle samples with seed
    random.shuffle(samples)
    print(f"Samples shuffled with seed {args.seed}")

    # Get available models
    models = get_available_models()

    # Evaluation settings
    max_samples = 500 # Set to None to evaluate on full dataset
    max_samples = args.samples # Use command line argument

    # Run evaluations
    all_results = []
@@ -162,9 +183,15 @@ async def main():
        print(f" Errors: {result['failed_predictions']}")
        print(f" Error Rate: {result['failure_rate']*100:.2f}%")
        print(f" Avg Time: {result['avg_prediction_time']:.2f}s")
        print(f" Median Time: {result['median_prediction_time']:.2f}s")
        print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
        print(f" VRAM Max: {result['vram_max_mb']:.1f}MB")
        print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB")

        # Print GPU memory info
        gpu_memory = get_gpu_memory()
        if gpu_memory and gpu_memory[0] > 0:
            print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB")

    # Save results
    if all_results:
@@ -7,6 +7,8 @@ import asyncio
import base64
import os
import sys
import subprocess as sp
import statistics
from datetime import datetime
from io import BytesIO
from typing import List, Union, Tuple, Optional
@@ -22,6 +24,28 @@ from agent.agent import ComputerAgent
from models import GTA1Model
from models.base import ModelProtocol


def get_gpu_memory() -> List[int]:
    """
    Get GPU memory usage using nvidia-smi.

    Returns:
        List of free memory values in MB for each GPU
    """
    try:
        command = "nvidia-smi --query-gpu=memory.free --format=csv"
        memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
        memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
        return memory_free_values
    except (sp.CalledProcessError, FileNotFoundError, IndexError):
        # Fallback to torch if nvidia-smi is not available
        if torch.cuda.is_available():
            device = torch.cuda.current_device()
            total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
            reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
            return [int(total - reserved)]
        return [0]
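A brief note on the parsing above (illustrative, not part of the commit): `nvidia-smi --query-gpu=memory.free --format=csv` prints a CSV header followed by one row per GPU, so the `[:-1][1:]` slice drops the trailing empty line and then the header before the values are converted to integers. A sketch of the same idea on canned output:

```python
# Example of the kind of text nvidia-smi emits (values are made up for illustration).
raw = "memory.free [MiB]\n16123 MiB\n8056 MiB\n"
rows = raw.split('\n')[:-1][1:]   # drop trailing empty string, then the CSV header
free_mb = [int(x.split()[0]) for x in rows]
print(free_mb)                    # [16123, 8056]
```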
def get_vram_usage() -> dict:
    """
    Get current VRAM usage statistics.
@@ -61,11 +85,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
    models = [
        # === ComputerAgent model strings ===
        f"{local_provider}HelloKKMe/GTA1-7B",
        f"{local_provider}HelloKKMe/GTA1-32B",
        # f"{local_provider}HelloKKMe/GTA1-32B",

        # === Reference model classes ===
        GTA1Model("HelloKKMe/GTA1-7B"),
        GTA1Model("HelloKKMe/GTA1-32B"),
        # GTA1Model("HelloKKMe/GTA1-32B"),
    ]

    return models
@@ -203,8 +227,8 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
        # Summary table
        f.write("## Summary\n\n")
        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
        f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n")
        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
        f.write("|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n")

        for result in all_results:
            model_name = result['model_name']
@@ -214,13 +238,14 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
            accuracy = result['accuracy'] * 100
            error_rate = result['failure_rate'] * 100
            avg_time = result.get('avg_prediction_time', 0.0)
            median_time = result.get('median_prediction_time', 0.0)
            min_time = result.get('min_prediction_time', 0.0)
            max_time = result.get('max_prediction_time', 0.0)
            time_range = f"{min_time:.2f} - {max_time:.2f}"
            vram_max = result.get('vram_max_mb', 0.0) / 1024
            vram_avg = result.get('vram_avg_mb', 0.0) / 1024

            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")

        # Detailed results for each model
        for result in all_results: