Add base (un-composed) models to the ScreenSpot-Pro benchmark model list

This commit is contained in:
Dillon DuPont
2025-08-05 12:45:00 -04:00
parent 3cc36905ff
commit 8eb662bf4d

View File

@@ -3,6 +3,9 @@
Shared utilities for ScreenSpot-Pro benchmarking and interactive testing.
"""
import dotenv
dotenv.load_dotenv()
import asyncio
import base64
import os
@@ -85,9 +88,12 @@ def get_available_models() -> List[Union[str, ModelProtocol]]:
models = [
# === ComputerAgent model strings ===
"openai/computer-use-preview",
"anthropic/claude-opus-4-20250514",
# f"{local_provider}HelloKKMe/GTA1-7B",
# f"{local_provider}HelloKKMe/GTA1-32B",
"openai/computer-use-preview+openai/gpt-4o-mini"
"openai/computer-use-preview+openai/gpt-4o-mini",
"anthropic/claude-opus-4-20250514+openai/gpt-4o-mini",
# === Reference model classes ===
# GTA1Model("HelloKKMe/GTA1-7B"),