diff --git a/libs/python/agent/benchmarks/README.md b/libs/python/agent/benchmarks/README.md
index 033f0157..225fc30b 100644
--- a/libs/python/agent/benchmarks/README.md
+++ b/libs/python/agent/benchmarks/README.md
@@ -98,19 +98,13 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
 ### 1. Configure Models
 Edit `utils.py` to specify which models you want to test in `get_available_models()`.
 
-### 2. Set Sample Count
-Edit the benchmark script to change the number of samples:
-```python
-max_samples = 50 # Set to None to evaluate on full dataset
-```
-
-### 3. Run Benchmark
+### 2. Run Benchmark
 ```bash
 # ScreenSpot-v2 benchmark
-python ss-v2.py
+python ss-v2.py --samples 50
 
 # ScreenSpot-Pro benchmark
-python ss-pro.py
+python ss-pro.py --samples 50
 
 # Interactive testing
 python interactive.py
@@ -121,14 +115,9 @@ python interactive.py
 ### Console Output
 ```
 Model Results:
-  Accuracy: 85.50%
-  Correct: 171/200
-  Errors: 5
-  Error Rate: 2.50%
-  Avg Time: 1.23s
-  Time Range: 0.89s - 2.45s
-  VRAM Max: 4.5GB
-  VRAM Avg: 3.4GB
+  Accuracy: 85.50% (171/200)
+  Avg Time: 1.23s (0.89s - 2.45s)
+  VRAM Usage: 4.5GB (max) / 3.4GB (avg)
 ```
 
 ### Generated Files
@@ -139,20 +128,10 @@ Model Results:
 ## Metrics Tracked
 
 - **Accuracy**: Percentage of clicks within bounding boxes
-- **Error Rate**: Percentage of failed predictions
 - **Timing**: Average, min, max prediction times
 - **VRAM Usage**: Maximum and average GPU memory usage
 - **Per-sample Results**: Detailed breakdown for debugging
 
-## Requirements
-
-- Python 3.8+
-- PyTorch (for VRAM tracking)
-- PIL/Pillow (for image processing)
-- datasets (for HuggingFace datasets)
-- tqdm (for progress bars)
-- Computer Agent SDK
-
 ## Architecture
 
 The benchmark system is designed for:
@@ -160,13 +139,6 @@ The benchmark system is designed for:
 - **Flexibility**: Works with any iterator of dicts with `image`, `bbox`, `instruction` keys
 - **Performance**: VRAM tracking and timing analysis
 - **Visualization**: Automatic generation of prediction visualizations
-- **No Exception Handling**: Fails fast to surface real issues
-
-## Results Table
-
-| Model | Dataset | Accuracy | Error Rate | Avg Time | VRAM Max | VRAM Avg |
-|-------|---------|----------|------------|----------|----------|----------|
-| (coming soon) | | | | | | |
 
 ## Contributing
 
diff --git a/libs/python/agent/benchmarks/ss-pro.py b/libs/python/agent/benchmarks/ss-pro.py
index e1e54a1d..80e5e72f 100644
--- a/libs/python/agent/benchmarks/ss-pro.py
+++ b/libs/python/agent/benchmarks/ss-pro.py
@@ -6,7 +6,10 @@ Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
 Supports both ComputerAgent model strings and custom model classes.
 """
 
+import argparse
 import asyncio
+import random
+import statistics
 import time
 
 from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
     is_click_in_bbox,
     save_results_to_markdown,
     save_visualizations,
-    get_available_models
+    get_available_models,
+    get_gpu_memory
 )
 
 
@@ -87,6 +91,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
     # Calculate timing statistics
     successful_times = [r['prediction_time'] for r in results if not r['failed']]
     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
+    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
     min_prediction_time = min(successful_times) if successful_times else 0.0
     max_prediction_time = max(successful_times) if successful_times else 0.0
 
@@ -101,6 +106,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, dataset, max_samples: Opti
         'accuracy': accuracy,
         'failure_rate': error_rate,
         'avg_prediction_time': avg_prediction_time,
+        'median_prediction_time': median_prediction_time,
         'min_prediction_time': min_prediction_time,
         'max_prediction_time': max_prediction_time,
         'vram_max_mb': vram_stats['max_mb'],
@@ -113,6 +119,17 @@ async def main():
     """
     Main function to run the benchmark.
     """
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='ScreenSpot-Pro Benchmark Script')
+    parser.add_argument('--samples', type=int, default=300,
+                        help='Number of samples to evaluate (default: 300)')
+    parser.add_argument('--seed', type=int, default=42,
+                        help='Random seed for shuffling (default: 42)')
+    args = parser.parse_args()
+
+    # Set random seed
+    random.seed(args.seed)
+
     # Load dataset
     print("Loading ScreenSpot-Pro dataset...")
     ds = load_dataset("lmms-lab/ScreenSpot-Pro")
@@ -121,11 +138,15 @@ async def main():
     dataset_list = list(dataset)
     print(f"Dataset loaded: {len(dataset_list)} samples")
 
+    # Shuffle dataset with seed
+    random.shuffle(dataset_list)
+    print(f"Dataset shuffled with seed {args.seed}")
+
     # Get available models
     models = get_available_models()
 
     # Evaluation settings
-    max_samples = 300 # Set to None to evaluate on full dataset
+    max_samples = args.samples # Use command line argument
 
     # Run evaluations
     all_results = []
@@ -142,9 +163,15 @@ async def main():
         print(f"  Errors: {result['failed_predictions']}")
         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
+        print(f"  Median Time: {result['median_prediction_time']:.2f}s")
         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
+
+        # Print GPU memory info
+        gpu_memory = get_gpu_memory()
+        if gpu_memory and gpu_memory[0] > 0:
+            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
 
     # Save results
     if all_results:
diff --git a/libs/python/agent/benchmarks/ss-v2.py b/libs/python/agent/benchmarks/ss-v2.py
index 919a1001..dab1d4b1 100644
--- a/libs/python/agent/benchmarks/ss-v2.py
+++ b/libs/python/agent/benchmarks/ss-v2.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
 """
-ScreenSpot-Pro Benchmark Script
+ScreenSpot-v2 Benchmark Script
 
-Evaluates models on the ScreenSpot-Pro dataset for click prediction accuracy.
+Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
 Supports both ComputerAgent model strings and custom model classes.
 """
 
+import argparse
 import asyncio
+import random
+import statistics
 import time
 
 from typing import Optional
@@ -18,7 +21,8 @@ from utils import (
     is_click_in_bbox,
     save_results_to_markdown,
     save_visualizations,
-    get_available_models
+    get_available_models,
+    get_gpu_memory
 )
 
 
@@ -93,6 +97,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
     # Calculate timing statistics
     successful_times = [r['prediction_time'] for r in results if not r['failed']]
     avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
+    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
     min_prediction_time = min(successful_times) if successful_times else 0.0
     max_prediction_time = max(successful_times) if successful_times else 0.0
 
@@ -107,6 +112,7 @@ async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Opti
         'accuracy': accuracy,
         'failure_rate': error_rate,
         'avg_prediction_time': avg_prediction_time,
+        'median_prediction_time': median_prediction_time,
         'min_prediction_time': min_prediction_time,
         'max_prediction_time': max_prediction_time,
         'vram_max_mb': vram_stats['max_mb'],
@@ -119,6 +125,17 @@ async def main():
     """
     Main function to run the benchmark.
     """
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script')
+    parser.add_argument('--samples', type=int, default=500,
+                        help='Number of samples to evaluate (default: 500)')
+    parser.add_argument('--seed', type=int, default=42,
+                        help='Random seed for shuffling (default: 42)')
+    args = parser.parse_args()
+
+    # Set random seed
+    random.seed(args.seed)
+
     # Load dataset
     print("Loading ScreenSpot-v2 dataset...")
     ds = load_dataset("lmms-lab/ScreenSpot-v2")
@@ -141,11 +158,15 @@ async def main():
         })
     print(f"Dataset loaded: {len(samples)} samples")
 
+    # Shuffle samples with seed
+    random.shuffle(samples)
+    print(f"Samples shuffled with seed {args.seed}")
+
     # Get available models
     models = get_available_models()
 
     # Evaluation settings
-    max_samples = 500 # Set to None to evaluate on full dataset
+    max_samples = args.samples # Use command line argument
 
     # Run evaluations
     all_results = []
@@ -162,9 +183,15 @@ async def main():
         print(f"  Errors: {result['failed_predictions']}")
         print(f"  Error Rate: {result['failure_rate']*100:.2f}%")
         print(f"  Avg Time: {result['avg_prediction_time']:.2f}s")
+        print(f"  Median Time: {result['median_prediction_time']:.2f}s")
         print(f"  Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s")
         print(f"  VRAM Max: {result['vram_max_mb']:.1f}MB")
         print(f"  VRAM Avg: {result['vram_avg_mb']:.1f}MB")
+
+        # Print GPU memory info
+        gpu_memory = get_gpu_memory()
+        if gpu_memory and gpu_memory[0] > 0:
+            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")
 
     # Save results
     if all_results:
diff --git a/libs/python/agent/benchmarks/utils.py b/libs/python/agent/benchmarks/utils.py
index 099499a5..7a3b70a3 100644
--- a/libs/python/agent/benchmarks/utils.py
+++ b/libs/python/agent/benchmarks/utils.py
@@ -7,6 +7,8 @@ import asyncio
 import base64
 import os
 import sys
+import subprocess as sp
+import statistics
 from datetime import datetime
 from io import BytesIO
 from typing import List, Union, Tuple, Optional
@@ -22,6 +24,28 @@ from agent.agent import ComputerAgent
 from models import GTA1Model
 from models.base import ModelProtocol
 
+def get_gpu_memory() -> List[int]:
+    """
+    Get GPU memory usage using nvidia-smi.
+
+    Returns:
+        List of free memory values in MB for each GPU
+    """
+    try:
+        command = "nvidia-smi --query-gpu=memory.free --format=csv"
+        memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
+        memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
+        return memory_free_values
+    except (sp.CalledProcessError, FileNotFoundError, IndexError):
+        # Fallback to torch if nvidia-smi is not available
+        if torch.cuda.is_available():
+            device = torch.cuda.current_device()
+            total = torch.cuda.get_device_properties(device).total_memory / 1024 / 1024
+            reserved = torch.cuda.memory_reserved(device) / 1024 / 1024
+            return [int(total - reserved)]
+        return [0]
+
+
 def get_vram_usage() -> dict:
     """
     Get current VRAM usage statistics.
@@ -61,11 +85,11 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
     models = [
         # === ComputerAgent model strings ===
         f"{local_provider}HelloKKMe/GTA1-7B",
-        f"{local_provider}HelloKKMe/GTA1-32B",
+        # f"{local_provider}HelloKKMe/GTA1-32B",
 
         # === Reference model classes ===
         GTA1Model("HelloKKMe/GTA1-7B"),
-        GTA1Model("HelloKKMe/GTA1-32B"),
+        # GTA1Model("HelloKKMe/GTA1-32B"),
     ]
 
     return models
@@ -203,8 +227,8 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
 
         # Summary table
        f.write("## Summary\n\n")
-        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
-        f.write("|-------|---------------|---------|--------|----------|------------|--------------|----------------|---------------|---------------|\n")
+        f.write("| Model | Total Samples | Correct | Errors | Accuracy | Error Rate | Avg Time (s) | Median Time (s) | Time Range (s) | VRAM Max (GB) | VRAM Avg (GB) |\n")
+        f.write("|-------|---------------|---------|--------|----------|------------|--------------|-----------------|----------------|---------------|---------------|\n")
 
         for result in all_results:
             model_name = result['model_name']
@@ -214,13 +238,14 @@ def save_results_to_markdown(all_results: List[dict],output_file: str = "screens
             accuracy = result['accuracy'] * 100
             error_rate = result['failure_rate'] * 100
             avg_time = result.get('avg_prediction_time', 0.0)
+            median_time = result.get('median_prediction_time', 0.0)
             min_time = result.get('min_prediction_time', 0.0)
             max_time = result.get('max_prediction_time', 0.0)
             time_range = f"{min_time:.2f} - {max_time:.2f}"
             vram_max = result.get('vram_max_mb', 0.0) / 1024
             vram_avg = result.get('vram_avg_mb', 0.0) / 1024
 
-            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
+            f.write(f"| {model_name} | {total} | {correct} | {errors} | {accuracy:.2f}% | {error_rate:.2f}% | {avg_time:.2f} | {median_time:.2f} | {time_range} | {vram_max:.1f} | {vram_avg:.1f} |\n")
 
         # Detailed results for each model
         for result in all_results:
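For reference, a minimal usage sketch of what this patch adds: the `--samples`/`--seed` CLI flags and the `get_gpu_memory()` helper in `utils.py`. The flag values below are illustrative, and the snippet assumes it is run from `libs/python/agent/benchmarks/` with either `nvidia-smi` on the PATH or a CUDA-enabled PyTorch install.

```python
# Reproducible subset runs using the new CLI flags (values are illustrative):
#   python ss-v2.py --samples 100 --seed 42
#   python ss-pro.py --samples 50 --seed 42

# Free-memory check via the new helper: returns free MB per GPU from nvidia-smi,
# falls back to torch's view of the current device, or [0] if no GPU is found.
from utils import get_gpu_memory

free_mb = get_gpu_memory()
print(f"GPU free memory (MB per GPU): {free_mb}")
```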