added agent benchmarks

Dillon DuPont
2025-07-30 13:41:58 -04:00
parent 2076ec7596
commit ffc88e2031
6 changed files with 553 additions and 134 deletions
+65 -63
@@ -7,6 +7,7 @@ Supports both ComputerAgent model strings and custom model classes.
"""
import asyncio
import time
from typing import Optional
from datasets import load_dataset
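The evaluation loop in the next hunk calls three methods on ModelWrapper: predict_click, unload_model, and get_vram_stats. As a reading aid, here is a minimal sketch of that assumed interface as a Protocol; the method names are taken from the calls visible in this diff, but the Protocol itself is illustrative, not the repo's actual class definition.

# Illustrative sketch only: the ModelWrapper surface that evaluate_model()
# appears to rely on, inferred from the calls visible in this diff.
from typing import Dict, Optional, Protocol, Tuple

class ClickPredictor(Protocol):
    model_name: str

    async def predict_click(self, image, instruction: str) -> Optional[Tuple[int, int]]:
        """Predict (x, y) screen coordinates for a natural-language instruction."""
        ...

    async def unload_model(self) -> None:
        """Release model weights (and GPU memory) once evaluation finishes."""
        ...

    def get_vram_stats(self) -> Dict[str, float]:
        """Report VRAM usage; the caller reads the 'max_mb' and 'avg_mb' keys."""
        ...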
@@ -43,66 +44,67 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
         total_samples = min(max_samples, total_samples)

     correct_predictions = 0
-    failed_predictions = 0
+    error_predictions = 0
     results = []

-    try:
-        for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
-            sample = dataset[i]
-
-            # Extract sample data
-            image = sample['image']
-            instruction = sample['instruction']
-            bbox = sample['bbox']  # [x1, y1, x2, y2]
-            sample_id = sample['id']
-
-            # Predict click coordinates
-            try:
-                click_coords = await model_wrapper.predict_click(image, instruction)
-
-                # Check if prediction is correct
-                is_correct = is_click_in_bbox(click_coords, bbox)
-
-                if is_correct:
-                    correct_predictions += 1
-
-                results.append({
-                    'id': sample_id,
-                    'instruction': instruction,
-                    'bbox': bbox,
-                    'predicted_coords': click_coords,
-                    'is_correct': is_correct,
-                    'failed': False
-                })
-            except Exception as e:
-                print(f"\nError predicting sample {sample_id}: {e}")
-                failed_predictions += 1
-                results.append({
-                    'id': sample_id,
-                    'instruction': instruction,
-                    'bbox': bbox,
-                    'predicted_coords': None,
-                    'is_correct': False,
-                    'failed': True,
-                    'error': str(e)
-                })
-    finally:
-        # Unload model
-        await model_wrapper.unload_model()
+    for i in tqdm(range(total_samples), desc=f"Evaluating {model_wrapper.model_name}"):
+        sample = dataset[i]
+
+        # Extract sample data
+        image = sample['image']
+        instruction = sample['instruction']
+        bbox = sample['bbox']  # [x1, y1, x2, y2]
+        sample_id = sample['img_filename']
+
+        # Predict click coordinates with timing
+        start_time = time.time()
+        click_coords = await model_wrapper.predict_click(image, instruction)
+        prediction_time = time.time() - start_time
+
+        # Check if prediction is correct
+        is_correct = is_click_in_bbox(click_coords, bbox)
+
+        if is_correct:
+            correct_predictions += 1
+
+        results.append({
+            'id': sample_id,
+            'instruction': instruction,
+            'bbox': bbox,
+            'predicted_coords': click_coords,
+            'is_correct': is_correct,
+            'failed': False,
+            'prediction_time': prediction_time
+        })
+
+    # Unload model
+    await model_wrapper.unload_model()

     # Calculate metrics
     accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
-    failure_rate = failed_predictions / total_samples if total_samples > 0 else 0.0
+    error_rate = error_predictions / total_samples if total_samples > 0 else 0.0
+
+    # Calculate timing statistics
+    successful_times = [r['prediction_time'] for r in results if not r['failed']]
+    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
+    min_prediction_time = min(successful_times) if successful_times else 0.0
+    max_prediction_time = max(successful_times) if successful_times else 0.0
+
+    # Get VRAM statistics
+    vram_stats = model_wrapper.get_vram_stats()

     return {
         'model_name': model_wrapper.model_name,
         'total_samples': total_samples,
         'correct_predictions': correct_predictions,
-        'failed_predictions': failed_predictions,
+        'failed_predictions': error_predictions,
         'accuracy': accuracy,
-        'failure_rate': failure_rate,
+        'failure_rate': error_rate,
+        'avg_prediction_time': avg_prediction_time,
+        'min_prediction_time': min_prediction_time,
+        'max_prediction_time': max_prediction_time,
+        'vram_max_mb': vram_stats['max_mb'],
+        'vram_avg_mb': vram_stats['avg_mb'],
         'results': results
     }
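The pass/fail decision above hinges on is_click_in_bbox, which is defined elsewhere in the file and not shown in this hunk. A plausible sketch, assuming the [x1, y1, x2, y2] convention noted in the comment and that a failed prediction may be None:

from typing import Optional, Sequence, Tuple

def is_click_in_bbox(click_coords: Optional[Tuple[float, float]],
                     bbox: Sequence[float]) -> bool:
    """Return True if the predicted click falls inside the [x1, y1, x2, y2] box."""
    if click_coords is None:
        return False
    x, y = click_coords
    x1, y1, x2, y2 = bbox
    return x1 <= x <= x2 and y1 <= y <= y2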
@@ -123,26 +125,26 @@ async def main():
     models = get_available_models()

     # Evaluation settings
-    max_samples = 5  # Set to None to evaluate on full dataset
+    max_samples = 300  # Set to None to evaluate on full dataset

     # Run evaluations
     all_results = []
     for model in models:
-        try:
-            model_wrapper = ModelWrapper(model)
-            result = await evaluate_model(model_wrapper, dataset_list, max_samples)
-            all_results.append(result)
-
-            # Print summary
-            print(f"\n{result['model_name']} Results:")
-            print(f"  Accuracy: {result['accuracy']*100:.2f}%")
-            print(f"  Correct: {result['correct_predictions']}/{result['total_samples']}")
-            print(f"  Failed: {result['failed_predictions']}")
-        except Exception as e:
-            print(f"\nError evaluating model {model}: {e}")
-            continue
+        model_wrapper = ModelWrapper(model)
+        result = await evaluate_model(model_wrapper, dataset_list, max_samples)
+        all_results.append(result)
+
+        # Print summary
+        print(f"\n{result['model_name']} Results:")
+        print(f"  Accuracy: {result['accuracy']*100:.2f}%")
+        print(f"  Correct: {result['correct_predictions']}/{result['total_samples']}")
+        print(f"  Errors: {result['failed_predictions']}")
+        print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
+        print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
+        print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
+        print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
+        print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")

     # Save results
     if all_results:
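The hunk is truncated at the save step. For completeness, a minimal sketch of what the body of that if could look like; the filename and JSON format are assumptions, not the commit's actual code:

import json

# Hypothetical continuation of the truncated hunk: persist per-model
# summaries and per-sample results for later comparison.
if all_results:
    with open("benchmark_results.json", "w") as f:
        # default=str guards fields that are not natively JSON-serializable
        json.dump(all_results, f, indent=2, default=str)
    print(f"\nSaved results for {len(all_results)} models to benchmark_results.json")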