From 312361abccdb616fe4681ce974abf561ba40c2cb Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 13:12:01 -0400 Subject: [PATCH 01/23] Added python funcs to computer --- libs/computer/computer/computer.py | 170 +++++++++++++++++++++++++++++ tests/pytest.ini | 4 + tests/venv.py | 151 +++++++++++++++++++++++++ 3 files changed, 325 insertions(+) create mode 100644 tests/pytest.ini create mode 100644 tests/venv.py diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index c25ad2bf..b77582ae 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -722,3 +722,173 @@ class Computer: tuple[float, float]: (x, y) coordinates in screenshot space """ return await self.interface.to_screenshot_coordinates(x, y) + + + # Add virtual environment management functions to computer interface + async def venv_install(self, venv_name: str, requirements: list[str]) -> tuple[str, str]: + """Install packages in a virtual environment. + + Args: + venv_name: Name of the virtual environment + requirements: List of package requirements to install + + Returns: + Tuple of (stdout, stderr) from the installation command + """ + requirements = requirements or [] + + # Create virtual environment if it doesn't exist + venv_path = f"~/.venvs/{venv_name}" + create_cmd = f"mkdir -p ~/.venvs && python3 -m venv {venv_path}" + + # Check if venv exists, if not create it + check_cmd = f"test -d {venv_path} || ({create_cmd})" + _, _ = await self.interface.run_command(check_cmd) + + # Install packages + requirements_str = " ".join(requirements) + install_cmd = f". {venv_path}/bin/activate && pip install {requirements_str}" + return await self.interface.run_command(install_cmd) + + async def venv_cmd(self, venv_name: str, command: str) -> tuple[str, str]: + """Execute a shell command in a virtual environment. + + Args: + venv_name: Name of the virtual environment + command: Shell command to execute in the virtual environment + + Returns: + Tuple of (stdout, stderr) from the command execution + """ + venv_path = f"~/.venvs/{venv_name}" + + # Check if virtual environment exists + check_cmd = f"test -d {venv_path}" + stdout, stderr = await self.interface.run_command(check_cmd) + + if stderr or "test:" in stdout: # venv doesn't exist + return "", f"Virtual environment '{venv_name}' does not exist. Create it first using venv_install." + + # Activate virtual environment and run command + full_command = f". {venv_path}/bin/activate && {command}" + return await self.interface.run_command(full_command) + + async def venv_exec(self, venv_name: str, python_func, *args, **kwargs): + """Execute Python function in a virtual environment using source code extraction. + + Args: + venv_name: Name of the virtual environment + python_func: A callable function to execute + *args: Positional arguments to pass to the function + **kwargs: Keyword arguments to pass to the function + + Returns: + The result of the function execution, or raises any exception that occurred + """ + import base64 + import inspect + import json + import textwrap + + try: + # Get function source code using inspect.getsource + source = inspect.getsource(python_func) + # Remove common leading whitespace (dedent) + func_source = textwrap.dedent(source).strip() + + # Get function name for execution + func_name = python_func.__name__ + + # Serialize args and kwargs as JSON (safer than dill for cross-version compatibility) + args_json = json.dumps(args, default=str) + kwargs_json = json.dumps(kwargs, default=str) + + except OSError as e: + raise Exception(f"Cannot retrieve source code for function {python_func.__name__}: {e}") + except Exception as e: + raise Exception(f"Failed to reconstruct function source: {e}") + + # Create Python code that will define and execute the function + python_code = f''' +import json +import traceback + +try: + # Define the function from source +{textwrap.indent(func_source, " ")} + + # Deserialize args and kwargs from JSON + args_json = """{args_json}""" + kwargs_json = """{kwargs_json}""" + args = json.loads(args_json) + kwargs = json.loads(kwargs_json) + + # Execute the function + result = {func_name}(*args, **kwargs) + + # Create success output payload + output_payload = {{ + "success": True, + "result": result, + "error": None + }} + +except Exception as e: + # Create error output payload + output_payload = {{ + "success": False, + "result": None, + "error": {{ + "type": type(e).__name__, + "message": str(e), + "traceback": traceback.format_exc() + }} + }} + +# Serialize the output payload as JSON +import json +output_json = json.dumps(output_payload, default=str) + +# Print the JSON output with markers +print(f"<<>>{{output_json}}<<>>") +''' + + # Encode the Python code in base64 to avoid shell escaping issues + encoded_code = base64.b64encode(python_code.encode('utf-8')).decode('ascii') + + # Execute the Python code in the virtual environment + python_command = f"python -c \"import base64; exec(base64.b64decode('{encoded_code}').decode('utf-8'))\"" + stdout, stderr = await self.venv_cmd(venv_name, python_command) + + # Parse the output to extract the payload + start_marker = "<<>>" + end_marker = "<<>>" + + # Print original stdout + print(stdout[:stdout.find(start_marker)]) + + if start_marker in stdout and end_marker in stdout: + start_idx = stdout.find(start_marker) + len(start_marker) + end_idx = stdout.find(end_marker) + + if start_idx < end_idx: + output_json = stdout[start_idx:end_idx] + + try: + # Decode and deserialize the output payload from JSON + output_payload = json.loads(output_json) + except Exception as e: + raise Exception(f"Failed to decode output payload: {e}") + + if output_payload["success"]: + return output_payload["result"] + else: + # Recreate and raise the original exception + error_info = output_payload["error"] + error_class = eval(error_info["type"]) + raise error_class(error_info["message"]) + else: + raise Exception("Invalid output format: markers found but no content between them") + else: + # Fallback: return stdout/stderr if no payload markers found + raise Exception(f"No output payload found. stdout: {stdout}, stderr: {stderr}") diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 00000000..998cbeaf --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +asyncio_mode = auto +markers = + asyncio: asyncio mark \ No newline at end of file diff --git a/tests/venv.py b/tests/venv.py new file mode 100644 index 00000000..4f9e3206 --- /dev/null +++ b/tests/venv.py @@ -0,0 +1,151 @@ +""" +Virtual Environment Testing Module +This module tests the ability to execute python code in a virtual environment within C/ua Containers. + +Required environment variables: +- CUA_API_KEY: API key for C/ua cloud provider +- CUA_CONTAINER_NAME: Name of the container to use +""" + +import os +import asyncio +import pytest +from pathlib import Path +import sys +import traceback + +# Load environment variables from .env file +project_root = Path(__file__).parent.parent +env_file = project_root / ".env" +print(f"Loading environment from: {env_file}") +from dotenv import load_dotenv + +load_dotenv(env_file) + +# Add paths to sys.path if needed +pythonpath = os.environ.get("PYTHONPATH", "") +for path in pythonpath.split(":"): + if path and path not in sys.path: + sys.path.insert(0, path) # Insert at beginning to prioritize + print(f"Added to sys.path: {path}") + +from computer.computer import Computer +from computer.providers.base import VMProviderType + + +@pytest.fixture(scope="session") +async def computer(): + """Shared Computer instance for all test cases.""" + # Create a remote Linux computer with C/ua + computer = Computer( + os_type="linux", + api_key=os.getenv("CUA_API_KEY"), + name=str(os.getenv("CUA_CONTAINER_NAME")), + provider_type=VMProviderType.CLOUD, + ) + + try: + await computer.run() + yield computer + finally: + await computer.stop() + + +# Sample test cases +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_install(computer): + """Test virtual environment creation and package installation.""" + # Create a test virtual environment and install requests + stdout, _ = await computer.venv_install("test_env", ["requests"]) + + # Check that installation was successful (no major errors) + assert "Successfully installed" in stdout or "Requirement already satisfied" in stdout + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_cmd(computer): + """Test executing shell commands in virtual environment.""" + # Test Python version check + stdout, _ = await computer.venv_cmd("test_env", "python --version") + + assert "Python" in stdout + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_exec(computer): + """Test executing Python functions in virtual environment.""" + def test_function(message="Hello World"): + import sys + return f"Python {sys.version_info.major}.{sys.version_info.minor}: {message}" + + result = await computer.venv_exec("test_env", test_function, message="Test successful!") + + assert "Python" in result + assert "Test successful!" in result + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_exec_with_package(computer): + """Test executing Python functions that use installed packages.""" + def test_requests(): + import requests + return f"requests version: {requests.__version__}" + + result = await computer.venv_exec("test_env", test_requests) + + assert "requests version:" in result + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_exec_error_handling(computer): + """Test error handling in venv_exec.""" + def test_error(): + raise ValueError("This is a test error") + + with pytest.raises(ValueError, match="This is a test error"): + await computer.venv_exec("test_env", test_error) + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_exec_with_args_kwargs(computer): + """Test executing Python functions with args and kwargs that return an object.""" + def create_data_object(name, age, *hobbies, **metadata): + return { + "name": name, + "age": age, + "hobbies": list(hobbies), + "metadata": metadata, + "status": "active" + } + + args = ["Alice", 25, "reading", "coding"] + kwargs = {"location": "New York", "department": "Engineering"} + + result = await computer.venv_exec( + "test_env", + create_data_object, + *args, + **kwargs + ) + + assert result["name"] == "Alice" + assert result["age"] == 25 + assert result["hobbies"] == ["reading", "coding"] + assert result["metadata"]["location"] == "New York" + assert result["status"] == "active" + +@pytest.mark.asyncio(loop_scope="session") +async def test_venv_exec_stdout_capture(computer, capfd): + """Test capturing stdout from Python functions executed in virtual environment.""" + def hello_world_function(): + print("Hello World!") + return "Function completed" + + # Execute the function in the virtual environment + result = await computer.venv_exec("test_env", hello_world_function) + + # Capture stdout and stderr + out, _ = capfd.readouterr() + + # Assert the stdout contains our expected output + assert out == "Hello World!\n\n" + assert result == "Function completed" + +if __name__ == "__main__": + # Run tests directly + pytest.main([__file__, "-v"]) From 420b67d2a85a1069944cef5fb40b0eff248f3a6c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 13:26:11 -0400 Subject: [PATCH 02/23] wiki-race evaluator --- examples/eval_examples.py | 149 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 examples/eval_examples.py diff --git a/examples/eval_examples.py b/examples/eval_examples.py new file mode 100644 index 00000000..c3504250 --- /dev/null +++ b/examples/eval_examples.py @@ -0,0 +1,149 @@ +import os +import asyncio +from pathlib import Path +import sys +import traceback +import time + +# Load environment variables from .env file +project_root = Path(__file__).parent.parent +env_file = project_root / ".env" +print(f"Loading environment from: {env_file}") +from dotenv import load_dotenv + +load_dotenv(env_file) + +# Add paths to sys.path if needed +pythonpath = os.environ.get("PYTHONPATH", "") +for path in pythonpath.split(":"): + if path and path not in sys.path: + sys.path.insert(0, path) # Insert at beginning to prioritize + print(f"Added to sys.path: {path}") + +from computer.computer import Computer +from computer.providers.base import VMProviderType +from computer.logger import LogLevel + +# Assuming these exist based on your request +from agent import ComputerAgent, LLM, AgentLoop, LLMProvider + +async def main(): + try: + print("\n=== Using cloud container ===") + # Create a remote Linux computer with CUA + computer = Computer( + os_type="linux", + api_key=os.getenv("CUA_API_KEY"), + name=str(os.getenv("CUA_CONTAINER_NAME")), + provider_type=VMProviderType.CLOUD, + ) + + try: + # Run the computer with default parameters + await computer.run() + + # Install required packages + await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) + + # Helper functions for wikirace + async def open_wiki(page): + await computer.interface.run_command(f"firefox https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") + await asyncio.sleep(2) # Wait for page to load + + # Remote functions for wikirace + def get_open_wikis(): + import pywinctl + titles = pywinctl.getAllTitles() + wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title] + return wiki_titles + + def get_current_wiki_page(): + import pywinctl + titles = pywinctl.getAllTitles() + wiki_titles = [title for title in titles if "Wikipedia" in title and "Mozilla Firefox" in title] + if wiki_titles: + return wiki_titles[0].split(" - Wikipedia")[0] + return None + + # Wikirace setup + start_page = "Albert Einstein" + target_page = "Pizza" + max_steps = 10 + + print(f"\nStarting Wikirace: {start_page} → {target_page}") + + # Open starting page + await open_wiki(start_page) + + # Create agent + agent = ComputerAgent( + computer=computer, + loop=AgentLoop.OPENAI, + model=LLM(LLMProvider.OPENAI) + ) + + # Run the wikirace + steps = 0 + success = False + start_time = time.time() + + prompt = f""" + You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" + by clicking only on Wikipedia links within articles. + + Rules: + 1. Only click on links within Wikipedia articles (blue underlined text) + 2. No using search, back button, or typing URLs + 3. Try to find the shortest path possible + 4. Current target: {target_page} + + Look at the current page and click on a link that might lead you closer to {target_page}. + """ + + async for step_result in agent.run(prompt): + steps += 1 + print(f"Step {steps}: {step_result}") + + # Check current page + current_page = await computer.venv_exec("eval_env", get_current_wiki_page) + print(f"Current page: {current_page}") + + # Check if we reached the target + if current_page and target_page.lower() in current_page.lower(): + success = True + print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") + break + + # Safety check + if steps >= max_steps: + print(f"❌ Failed: Reached maximum steps ({max_steps})") + break + + await asyncio.sleep(1) # Brief pause between steps + + end_time = time.time() + duration = end_time - start_time + + # Results + print(f"\n=== WIKIRACE RESULTS ===") + print(f"Start: {start_page}") + print(f"Target: {target_page}") + print(f"Steps taken: {steps}") + print(f"Success: {success}") + print(f"Duration: {duration:.2f} seconds") + + # Get final page list + final_wikis = await computer.venv_exec("eval_env", get_open_wikis) + print(f"Open Wikipedia pages: {final_wikis}") + + finally: + # Important to clean up resources + await computer.stop() + + except Exception as e: + print(f"Error in main: {e}") + traceback.print_exc() + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From a6397e9a9bd48ba42e517c258b068377c72c6076 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 14:10:48 -0400 Subject: [PATCH 03/23] Added decorator example --- examples/eval_examples.py | 85 ++++++++++++++++++++---------- libs/computer/computer/computer.py | 4 ++ 2 files changed, 62 insertions(+), 27 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index c3504250..1315aba4 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -4,6 +4,7 @@ from pathlib import Path import sys import traceback import time +from functools import wraps # Load environment variables from .env file project_root = Path(__file__).parent.parent @@ -27,16 +28,41 @@ from computer.logger import LogLevel # Assuming these exist based on your request from agent import ComputerAgent, LLM, AgentLoop, LLMProvider +# Global reference to computer instance (will be set in main) +_computer = None + +def remote(venv_name="eval_env"): + """ + Decorator that wraps a function to be executed remotely via computer.venv_exec + + Args: + venv_name: Name of the virtual environment to execute in + """ + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + if _computer is None: + raise RuntimeError("Computer instance not initialized. Call this after computer.run()") + return await _computer.venv_exec(venv_name, func, *args, **kwargs) + return wrapper + return decorator + async def main(): + global _computer, remote + try: print("\n=== Using cloud container ===") - # Create a remote Linux computer with CUA - computer = Computer( - os_type="linux", - api_key=os.getenv("CUA_API_KEY"), - name=str(os.getenv("CUA_CONTAINER_NAME")), - provider_type=VMProviderType.CLOUD, - ) + # # Create a remote Linux computer with CUA + # computer = Computer( + # os_type="linux", + # api_key=os.getenv("CUA_API_KEY"), + # name=str(os.getenv("CUA_CONTAINER_NAME")), + # provider_type=VMProviderType.CLOUD, + # ) + + # Connect to local macOS computer + computer = Computer() + _computer = computer # Set global reference try: # Run the computer with default parameters @@ -47,34 +73,41 @@ async def main(): # Helper functions for wikirace async def open_wiki(page): - await computer.interface.run_command(f"firefox https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") + await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") await asyncio.sleep(2) # Wait for page to load - # Remote functions for wikirace + # Remote functions for wikirace - using @remote decorator + @remote("eval_env") def get_open_wikis(): import pywinctl titles = pywinctl.getAllTitles() wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title] return wiki_titles + @remote("eval_env") def get_current_wiki_page(): import pywinctl titles = pywinctl.getAllTitles() - wiki_titles = [title for title in titles if "Wikipedia" in title and "Mozilla Firefox" in title] + wiki_titles = [title for title in titles if "Wikipedia" in title] if wiki_titles: return wiki_titles[0].split(" - Wikipedia")[0] return None # Wikirace setup + max_steps = 15 start_page = "Albert Einstein" target_page = "Pizza" - max_steps = 10 print(f"\nStarting Wikirace: {start_page} → {target_page}") # Open starting page await open_wiki(start_page) + # Check current page using decorated function + current_page = await get_current_wiki_page() + print(f"Starting page: {current_page}") + assert current_page == start_page, f"Expected {start_page}, got {current_page}" + # Create agent agent = ComputerAgent( computer=computer, @@ -100,26 +133,23 @@ async def main(): Look at the current page and click on a link that might lead you closer to {target_page}. """ - async for step_result in agent.run(prompt): + async for result in agent.run(prompt): steps += 1 - print(f"Step {steps}: {step_result}") - - # Check current page - current_page = await computer.venv_exec("eval_env", get_current_wiki_page) - print(f"Current page: {current_page}") - - # Check if we reached the target - if current_page and target_page.lower() in current_page.lower(): - success = True - print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") - break + print(f"Step {steps}: {result}") # Safety check if steps >= max_steps: print(f"❌ Failed: Reached maximum steps ({max_steps})") break - await asyncio.sleep(1) # Brief pause between steps + # Check current page using decorated function + current_page = await get_current_wiki_page() + print(f"Current page: {current_page}") + + # Check if we reached the target + if current_page and target_page.lower() in current_page.lower(): + success = True + print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") end_time = time.time() duration = end_time - start_time @@ -132,13 +162,14 @@ async def main(): print(f"Success: {success}") print(f"Duration: {duration:.2f} seconds") - # Get final page list - final_wikis = await computer.venv_exec("eval_env", get_open_wikis) + # Get final page list - now using decorated function + final_wikis = await get_open_wikis() print(f"Open Wikipedia pages: {final_wikis}") finally: # Important to clean up resources - await computer.stop() + # await computer.stop() + pass except Exception as e: print(f"Error in main: {e}") diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index b77582ae..2057215e 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -796,6 +796,10 @@ class Computer: # Remove common leading whitespace (dedent) func_source = textwrap.dedent(source).strip() + # Remove decorators + while func_source.lstrip().startswith("@"): + func_source = func_source.split("\n", 1)[1].strip() + # Get function name for execution func_name = python_func.__name__ From c5c91729a2421172bdbc14e98090b2ae6014b6e2 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 14:18:52 -0400 Subject: [PATCH 04/23] Added cleanup step --- examples/eval_examples.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index 1315aba4..38d9cae8 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -78,11 +78,15 @@ async def main(): # Remote functions for wikirace - using @remote decorator @remote("eval_env") - def get_open_wikis(): + def close_all_windows(): import pywinctl - titles = pywinctl.getAllTitles() - wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title] - return wiki_titles + windows = pywinctl.getAllWindows() + for window in windows: + try: + window.close() + except: + # Some windows might not be closeable or may have already closed + pass @remote("eval_env") def get_current_wiki_page(): @@ -94,12 +98,15 @@ async def main(): return None # Wikirace setup - max_steps = 15 + max_steps = 2 start_page = "Albert Einstein" target_page = "Pizza" print(f"\nStarting Wikirace: {start_page} → {target_page}") + # Close all windows + await close_all_windows() + # Open starting page await open_wiki(start_page) @@ -142,7 +149,7 @@ async def main(): print(f"❌ Failed: Reached maximum steps ({max_steps})") break - # Check current page using decorated function + # Check again current_page = await get_current_wiki_page() print(f"Current page: {current_page}") @@ -161,11 +168,6 @@ async def main(): print(f"Steps taken: {steps}") print(f"Success: {success}") print(f"Duration: {duration:.2f} seconds") - - # Get final page list - now using decorated function - final_wikis = await get_open_wikis() - print(f"Open Wikipedia pages: {final_wikis}") - finally: # Important to clean up resources # await computer.stop() From 89deb8111fbbed7f3156f88d2195c5104be92e77 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 14:22:30 -0400 Subject: [PATCH 05/23] Fixed agent stop --- examples/eval_examples.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index 38d9cae8..b4b207b6 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -146,8 +146,10 @@ async def main(): # Safety check if steps >= max_steps: - print(f"❌ Failed: Reached maximum steps ({max_steps})") - break + print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") + agent._loop.cancel() + + await asyncio.sleep(2) # Wait for recv to finish # Check again current_page = await get_current_wiki_page() From e16fb75ce81f4ab9fe786854d8b8b405a8292605 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 14:50:56 -0400 Subject: [PATCH 06/23] Added locks to websocket interface --- libs/computer/computer/interface/linux.py | 103 ++++++++++++---------- libs/computer/computer/interface/macos.py | 59 +++++++------ 2 files changed, 86 insertions(+), 76 deletions(-) diff --git a/libs/computer/computer/interface/linux.py b/libs/computer/computer/interface/linux.py index 401730ca..68ba5706 100644 --- a/libs/computer/computer/interface/linux.py +++ b/libs/computer/computer/interface/linux.py @@ -27,6 +27,7 @@ class LinuxComputerInterface(BaseComputerInterface): self._max_reconnect_delay = 30 # Maximum delay between reconnection attempts self._log_connection_attempts = True # Flag to control connection attempt logging self._authenticated = False # Track authentication status + self._command_lock = asyncio.Lock() # Lock to ensure only one command at a time # Set logger name for Linux interface self.logger = Logger("cua.interface.linux", LogLevel.NORMAL) @@ -193,58 +194,62 @@ class LinuxComputerInterface(BaseComputerInterface): retry_count = 0 last_error = None - while retry_count < max_retries: - try: - await self._ensure_connection() - if not self._ws: - raise ConnectionError("WebSocket connection is not established") + # Acquire lock to ensure only one command is processed at a time + async with self._command_lock: + self.logger.debug(f"Acquired lock for command: {command}") + while retry_count < max_retries: + try: + await self._ensure_connection() + if not self._ws: + raise ConnectionError("WebSocket connection is not established") - # Handle authentication if needed - if self.api_key and self.vm_name and not self._authenticated: - self.logger.info("Performing authentication handshake...") - auth_message = { - "command": "authenticate", - "params": { - "api_key": self.api_key, - "container_name": self.vm_name + # Handle authentication if needed + if self.api_key and self.vm_name and not self._authenticated: + self.logger.info("Performing authentication handshake...") + auth_message = { + "command": "authenticate", + "params": { + "api_key": self.api_key, + "container_name": self.vm_name + } } - } - await self._ws.send(json.dumps(auth_message)) - - # Wait for authentication response - auth_response = await asyncio.wait_for(self._ws.recv(), timeout=10) - auth_result = json.loads(auth_response) - - if not auth_result.get("success"): - error_msg = auth_result.get("error", "Authentication failed") - self.logger.error(f"Authentication failed: {error_msg}") - self._authenticated = False - raise ConnectionError(f"Authentication failed: {error_msg}") - - self.logger.info("Authentication successful") - self._authenticated = True + await self._ws.send(json.dumps(auth_message)) + + # Wait for authentication response + auth_response = await asyncio.wait_for(self._ws.recv(), timeout=10) + auth_result = json.loads(auth_response) + + if not auth_result.get("success"): + error_msg = auth_result.get("error", "Authentication failed") + self.logger.error(f"Authentication failed: {error_msg}") + self._authenticated = False + raise ConnectionError(f"Authentication failed: {error_msg}") + + self.logger.info("Authentication successful") + self._authenticated = True - message = {"command": command, "params": params or {}} - await self._ws.send(json.dumps(message)) - response = await asyncio.wait_for(self._ws.recv(), timeout=30) - return json.loads(response) - except Exception as e: - last_error = e - retry_count += 1 - if retry_count < max_retries: - # Only log at debug level for intermediate retries - self.logger.debug( - f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}" - ) - await asyncio.sleep(1) - continue - else: - # Only log at error level for the final failure - self.logger.error( - f"Failed to send command '{command}' after {max_retries} retries" - ) - self.logger.debug(f"Command failure details: {e}") - raise last_error if last_error else RuntimeError("Failed to send command") + message = {"command": command, "params": params or {}} + await self._ws.send(json.dumps(message)) + response = await asyncio.wait_for(self._ws.recv(), timeout=30) + self.logger.debug(f"Completed command: {command}") + return json.loads(response) + except Exception as e: + last_error = e + retry_count += 1 + if retry_count < max_retries: + # Only log at debug level for intermediate retries + self.logger.debug( + f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}" + ) + await asyncio.sleep(1) + continue + else: + # Only log at error level for the final failure + self.logger.error( + f"Failed to send command '{command}' after {max_retries} retries" + ) + self.logger.debug(f"Command failure details: {e}") + raise last_error if last_error else RuntimeError("Failed to send command") async def wait_for_ready(self, timeout: int = 60, interval: float = 1.0): """Wait for WebSocket connection to become available.""" diff --git a/libs/computer/computer/interface/macos.py b/libs/computer/computer/interface/macos.py index a96c44d1..3daa4fdf 100644 --- a/libs/computer/computer/interface/macos.py +++ b/libs/computer/computer/interface/macos.py @@ -26,6 +26,7 @@ class MacOSComputerInterface(BaseComputerInterface): self._reconnect_delay = 1 # Start with 1 second delay self._max_reconnect_delay = 30 # Maximum delay between reconnection attempts self._log_connection_attempts = True # Flag to control connection attempt logging + self._command_lock = asyncio.Lock() # Lock to ensure only one command at a time # Set logger name for macOS interface self.logger = Logger("cua.interface.macos", LogLevel.NORMAL) @@ -219,35 +220,39 @@ class MacOSComputerInterface(BaseComputerInterface): retry_count = 0 last_error = None - while retry_count < max_retries: - try: - await self._ensure_connection() - if not self._ws: - raise ConnectionError("WebSocket connection is not established") + # Acquire lock to ensure only one command is processed at a time + async with self._command_lock: + self.logger.debug(f"Acquired lock for command: {command}") + while retry_count < max_retries: + try: + await self._ensure_connection() + if not self._ws: + raise ConnectionError("WebSocket connection is not established") - message = {"command": command, "params": params or {}} - await self._ws.send(json.dumps(message)) - response = await asyncio.wait_for(self._ws.recv(), timeout=30) - return json.loads(response) - except Exception as e: - last_error = e - retry_count += 1 - if retry_count < max_retries: - # Only log at debug level for intermediate retries - self.logger.debug( - f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}" - ) - await asyncio.sleep(1) - continue - else: - # Only log at error level for the final failure - self.logger.error( - f"Failed to send command '{command}' after {max_retries} retries" - ) - self.logger.debug(f"Command failure details: {e}") - raise + message = {"command": command, "params": params or {}} + await self._ws.send(json.dumps(message)) + response = await asyncio.wait_for(self._ws.recv(), timeout=30) + self.logger.debug(f"Completed command: {command}") + return json.loads(response) + except Exception as e: + last_error = e + retry_count += 1 + if retry_count < max_retries: + # Only log at debug level for intermediate retries + self.logger.debug( + f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}" + ) + await asyncio.sleep(1) + continue + else: + # Only log at error level for the final failure + self.logger.error( + f"Failed to send command '{command}' after {max_retries} retries" + ) + self.logger.debug(f"Command failure details: {e}") + raise - raise last_error if last_error else RuntimeError("Failed to send command") + raise last_error if last_error else RuntimeError("Failed to send command") async def wait_for_ready(self, timeout: int = 60, interval: float = 1.0): """Wait for WebSocket connection to become available.""" From fa07ee444ad85e8b391cb5fbf16ac1d59e4a2037 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 14:51:11 -0400 Subject: [PATCH 07/23] More freq checks --- examples/eval_examples.py | 57 ++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index b4b207b6..a476e104 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -31,7 +31,8 @@ from agent import ComputerAgent, LLM, AgentLoop, LLMProvider # Global reference to computer instance (will be set in main) _computer = None -def remote(venv_name="eval_env"): + +def remote(venv_name="eval_env", max_retries=3): """ Decorator that wraps a function to be executed remotely via computer.venv_exec @@ -43,7 +44,14 @@ def remote(venv_name="eval_env"): async def wrapper(*args, **kwargs): if _computer is None: raise RuntimeError("Computer instance not initialized. Call this after computer.run()") - return await _computer.venv_exec(venv_name, func, *args, **kwargs) + for i in range(max_retries): + try: + return await _computer.venv_exec(venv_name, func, *args, **kwargs) + except Exception as e: + print(f"Attempt {i+1} failed: {e}") + await asyncio.sleep(1) + if i == max_retries - 1: + raise e return wrapper return decorator @@ -140,28 +148,33 @@ async def main(): Look at the current page and click on a link that might lead you closer to {target_page}. """ - async for result in agent.run(prompt): - steps += 1 - print(f"Step {steps}: {result}") - - # Safety check - if steps >= max_steps: - print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") - agent._loop.cancel() - - await asyncio.sleep(2) # Wait for recv to finish - - # Check again - current_page = await get_current_wiki_page() - print(f"Current page: {current_page}") - - # Check if we reached the target - if current_page and target_page.lower() in current_page.lower(): - success = True - print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") - + try: + async for result in agent.run(prompt): + steps += 1 + print(f"Step {steps}: {result}") + + # Check again + current_page = await get_current_wiki_page() + print(f"Current page: {current_page}") + + # Check if we reached the target + if current_page and target_page.lower() in current_page.lower(): + success = True + print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") + await agent._loop.cancel() + break + + # Safety check + if steps >= max_steps: + print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") + await agent._loop.cancel() + break + except asyncio.CancelledError: + print("Agent stopped") + end_time = time.time() duration = end_time - start_time + await asyncio.sleep(2) # Wait for agent to finish # Results print(f"\n=== WIKIRACE RESULTS ===") From a7e56ce64a439b12b04f6243f16a41012af8102e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 18:54:00 -0400 Subject: [PATCH 08/23] Added @sandboxed decorator --- examples/eval_examples.py | 39 +++------------- libs/computer/computer/computer.py | 5 +++ libs/computer/computer/helpers.py | 49 +++++++++++++++++++++ tests/venv.py | 71 ++++++++++++++++++++++++++---- 4 files changed, 122 insertions(+), 42 deletions(-) create mode 100644 libs/computer/computer/helpers.py diff --git a/examples/eval_examples.py b/examples/eval_examples.py index a476e104..48da31be 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -24,40 +24,12 @@ for path in pythonpath.split(":"): from computer.computer import Computer from computer.providers.base import VMProviderType from computer.logger import LogLevel +from computer.helpers import sandboxed # Assuming these exist based on your request from agent import ComputerAgent, LLM, AgentLoop, LLMProvider -# Global reference to computer instance (will be set in main) -_computer = None - - -def remote(venv_name="eval_env", max_retries=3): - """ - Decorator that wraps a function to be executed remotely via computer.venv_exec - - Args: - venv_name: Name of the virtual environment to execute in - """ - def decorator(func): - @wraps(func) - async def wrapper(*args, **kwargs): - if _computer is None: - raise RuntimeError("Computer instance not initialized. Call this after computer.run()") - for i in range(max_retries): - try: - return await _computer.venv_exec(venv_name, func, *args, **kwargs) - except Exception as e: - print(f"Attempt {i+1} failed: {e}") - await asyncio.sleep(1) - if i == max_retries - 1: - raise e - return wrapper - return decorator - -async def main(): - global _computer, remote - +async def main(): try: print("\n=== Using cloud container ===") # # Create a remote Linux computer with CUA @@ -70,7 +42,6 @@ async def main(): # Connect to local macOS computer computer = Computer() - _computer = computer # Set global reference try: # Run the computer with default parameters @@ -84,8 +55,8 @@ async def main(): await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") await asyncio.sleep(2) # Wait for page to load - # Remote functions for wikirace - using @remote decorator - @remote("eval_env") + # Remote functions for wikirace - using @sandboxed decorator + @sandboxed("eval_env") def close_all_windows(): import pywinctl windows = pywinctl.getAllWindows() @@ -96,7 +67,7 @@ async def main(): # Some windows might not be closeable or may have already closed pass - @remote("eval_env") + @sandboxed("eval_env") def get_current_wiki_page(): import pywinctl titles = pywinctl.getAllTitles() diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index 2057215e..191c611d 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -11,6 +11,7 @@ import json import logging from .telemetry import record_computer_initialization import os +from . import helpers # Import provider related modules from .providers.base import VMProviderType @@ -460,6 +461,10 @@ class Computer: # Set the initialization flag and clear the initializing flag self._initialized = True + + # Set this instance as the default computer for remote decorators + helpers.set_default_computer(self) + self.logger.info("Computer successfully initialized") except Exception as e: raise diff --git a/libs/computer/computer/helpers.py b/libs/computer/computer/helpers.py new file mode 100644 index 00000000..b472c047 --- /dev/null +++ b/libs/computer/computer/helpers.py @@ -0,0 +1,49 @@ +""" +Helper functions and decorators for the Computer module. +""" +import asyncio +from functools import wraps +from typing import Any, Callable, Optional, TypeVar, cast + +# Global reference to the default computer instance +_default_computer = None + +def set_default_computer(computer): + """ + Set the default computer instance to be used by the remote decorator. + + Args: + computer: The computer instance to use as default + """ + global _default_computer + _default_computer = computer + + +def sandboxed(venv_name: str = "default", computer: str = "default", max_retries: int = 3): + """ + Decorator that wraps a function to be executed remotely via computer.venv_exec + + Args: + venv_name: Name of the virtual environment to execute in + computer: The computer instance to use, or "default" to use the globally set default + max_retries: Maximum number of retries for the remote execution + """ + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + # Determine which computer instance to use + comp = computer if computer != "default" else _default_computer + + if comp is None: + raise RuntimeError("No computer instance available. Either specify a computer instance or call set_default_computer() first.") + + for i in range(max_retries): + try: + return await comp.venv_exec(venv_name, func, *args, **kwargs) + except Exception as e: + print(f"Attempt {i+1} failed: {e}") + await asyncio.sleep(1) + if i == max_retries - 1: + raise e + return wrapper + return decorator diff --git a/tests/venv.py b/tests/venv.py index 4f9e3206..8b78a78f 100644 --- a/tests/venv.py +++ b/tests/venv.py @@ -31,24 +31,29 @@ for path in pythonpath.split(":"): from computer.computer import Computer from computer.providers.base import VMProviderType +from computer.helpers import remote, set_default_computer @pytest.fixture(scope="session") async def computer(): """Shared Computer instance for all test cases.""" - # Create a remote Linux computer with C/ua - computer = Computer( - os_type="linux", - api_key=os.getenv("CUA_API_KEY"), - name=str(os.getenv("CUA_CONTAINER_NAME")), - provider_type=VMProviderType.CLOUD, - ) + # # Create a remote Linux computer with C/ua + # computer = Computer( + # os_type="linux", + # api_key=os.getenv("CUA_API_KEY"), + # name=str(os.getenv("CUA_CONTAINER_NAME")), + # provider_type=VMProviderType.CLOUD, + # ) + + # Create a local macOS computer with C/ua + computer = Computer() try: await computer.run() yield computer finally: - await computer.stop() + # await computer.stop() + pass # Sample test cases @@ -146,6 +151,56 @@ async def test_venv_exec_stdout_capture(computer, capfd): assert out == "Hello World!\n\n" assert result == "Function completed" +@pytest.mark.asyncio(loop_scope="session") +async def test_remote_decorator(computer): + """Test the remote decorator from computer.helpers module.""" + # Set the computer as default for the remote decorator + set_default_computer(computer) + + # Define a function with the remote decorator + @sandboxed("test_env") + def get_package_version(): + import sys + import platform + return { + "python_version": sys.version, + "platform": platform.platform(), + "success": True + } + + # Call the decorated function + result = await get_package_version() + + # Verify the function executed in the virtual environment + assert "python_version" in result + assert "platform" in result + assert result["success"] == True + +@pytest.mark.asyncio(loop_scope="session") +async def test_remote_decorator_with_custom_computer(computer): + """Test the remote decorator with explicitly specified computer instance.""" + # Define a function with the remote decorator that explicitly specifies the computer + @sandboxed("test_env", computer=computer) + def get_system_info(): + import os + import sys + return { + "python_version": sys.version, + "environment_vars": dict(os.environ), + "working_directory": os.getcwd() + } + + # Call the decorated function + result = await get_system_info() + + # Verify the function executed in the virtual environment + assert "python_version" in result + assert "environment_vars" in result + assert "working_directory" in result + # The virtual environment should have a different working directory + # than the current test process + assert result["working_directory"] != os.getcwd() + if __name__ == "__main__": # Run tests directly pytest.main([__file__, "-v"]) From 86d052d88278b4d58e1832d93a6d7fd50eaf6cf4 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 21:19:46 -0400 Subject: [PATCH 09/23] updated eval to use sandboxed decorator --- examples/eval_examples.py | 714 ++++++++++++++++++++++++++++++++------ 1 file changed, 606 insertions(+), 108 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index 48da31be..1978a897 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -1,13 +1,63 @@ import os import asyncio +import json +import random from pathlib import Path import sys import traceback import time from functools import wraps +import urllib.request +import datetime +from urllib.parse import quote + +# Wikirace prompt template +WIKIRACE_PROMPT_TEMPLATE = """ +You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" +by clicking only on Wikipedia links within articles. + +Rules: +1. Only click on links within Wikipedia articles (blue underlined text) +2. No using search, back button, or typing URLs +3. You MAY use cmd+f (or ctrl+f) to find text on the current page +4. Do NOT click any search icon or type into any search box unless it's a browser command +5. Try to find the shortest path possible +6. Current target: {target_page} +7. Do not maximize the window or use any other application +8. Avoid wasting actions by scrolling +9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps + +Look at the current page and click on a link that might lead you closer to {target_page}. +""" + +# Store original print function +_print = print + +# Define log file path +project_root = Path(__file__).parent.parent +log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt" + +# Custom print function that also logs to file +def print(*args, **kwargs): + # Call the original print function + _print(*args, **kwargs) + + # Format the output as a string + output = " ".join(str(arg) for arg in args) + if kwargs.get("end") is not None: + output += kwargs["end"] + else: + output += "\n" + + # Add timestamp + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + log_entry = f"[{timestamp}] {output}" + + # Append to log file + with open(log_file, "a") as f: + f.write(log_entry) # Load environment variables from .env file -project_root = Path(__file__).parent.parent env_file = project_root / ".env" print(f"Loading environment from: {env_file}") from dotenv import load_dotenv @@ -29,104 +79,283 @@ from computer.helpers import sandboxed # Assuming these exist based on your request from agent import ComputerAgent, LLM, AgentLoop, LLMProvider -async def main(): +articles = [] + +# Load from file +articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt" +with open(articles_file, "r") as f: + articles = [line.strip() for line in f] + + +def get_article_links(article_title): + """Get all links from a Wikipedia article's content""" try: - print("\n=== Using cloud container ===") - # # Create a remote Linux computer with CUA - # computer = Computer( - # os_type="linux", - # api_key=os.getenv("CUA_API_KEY"), - # name=str(os.getenv("CUA_CONTAINER_NAME")), - # provider_type=VMProviderType.CLOUD, - # ) + # Get the article content + url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json" - # Connect to local macOS computer - computer = Computer() + with urllib.request.urlopen(url) as response: + data = json.loads(response.read().decode()) + + pages = data.get('query', {}).get('pages', {}) + if not pages: + return [] + + # Get the first (and only) page + page = next(iter(pages.values())) + links = page.get('links', []) + + # Filter links to keep only main namespace articles (no special pages, files, etc.) + article_links = [] + for link in links: + title = link.get('title', '') + # Skip if title contains colons (indicates special pages, files, categories, etc.) + if ':' not in title and title.isascii() and len(title) < 50: + article_links.append(title) + + return article_links + + except Exception as e: + print(f"Error fetching links for {article_title}: {e}") + return [] + +def wikipedia_random_walk(start_article, depth=5): + """ + Perform a random walk through Wikipedia articles + + Args: + start_article (str): The article title to start from + depth (int): How many steps to take in the random walk + + Returns: + list: Path of article titles visited during the walk + """ + path = [start_article] + current_article = start_article + + for step in range(depth): + print(f"Step {step + 1}: Currently at '{current_article}'") + + # Get links from current article + links = get_article_links(current_article) + + if not links: + print(f"No valid links found in '{current_article}'. Ending walk.") + break + + # Randomly select next article + next_article = random.choice(links) + path.append(next_article) + current_article = next_article + + print(f" -> Moving to '{next_article}'") + + return path + +def get_article_pair(depth=5): + global articles + start_article = random.choice(articles) + target_article = wikipedia_random_walk(start_article, depth)[-1] + while target_article == start_article: + start_article = random.choice(articles) + target_article = wikipedia_random_walk(start_article, depth)[-1] + return start_article, target_article + +async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30): + """Run a specific evaluation scenario""" + + print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===") + + # Create computer instance with or without app-use experiment + experiments = ["app-use"] if use_app_use else [] + computer = Computer(experiments=experiments) + + try: + # Run the computer + await computer.run() + + # Install required packages + await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) + + # Run the specific scenario + if scenario_name == "messy_desktop": + await run_messy_desktop_scenario(computer, agent_configs, max_steps) + elif scenario_name == "parallel_agents": + await run_parallel_agents_scenario(computer, agent_configs, max_steps) + else: + print(f"Unknown scenario: {scenario_name}") + + except Exception as e: + print(f"Error in scenario {scenario_name}: {e}") + traceback.print_exc() + finally: + # Important to clean up resources + # await computer.stop() + pass + + +@sandboxed("eval_env") +def close_all_windows(): + """Close all open windows""" + import pywinctl + windows = pywinctl.getAllWindows() + for window in windows: + try: + window.close() + except: + # Some windows might not be closeable or may have already closed + pass + + +@sandboxed("eval_env") +def get_current_wiki_page(app_name=None): + """Get the title of the current Wikipedia page + + Args: + app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox') + """ + import pywinctl + windows = pywinctl.getAllWindows() + + # Filter windows by app name if provided + if app_name: + windows = [w for w in windows if w.getAppName() and app_name.lower() in w.getAppName().lower()] + + # Get titles from filtered windows + titles = [w.title for w in windows if w.title] + wiki_titles = [title for title in titles if "Wikipedia" in title] + + if wiki_titles: + return wiki_titles[0].split(" - Wikipedia")[0] + return None + + +@sandboxed("eval_env") +def get_open_app_names(): + """Get names of all open applications""" + import pywinctl + windows = pywinctl.getAllWindows() + return [window.getAppName() for window in windows if window.getAppName()] + +def _computer(): + """Get the default computer instance""" + from computer.helpers import _default_computer + return _default_computer + +async def open_app(app_name): + """Open a specific application""" + await _computer().interface.run_command(f"open -a '{app_name}'") + await asyncio.sleep(2) # Wait for app to open + + +async def open_wiki(page, app_name="Safari"): + """Open a specific Wikipedia page""" + await _computer().interface.run_command(f"open -a {app_name} https://en.wikipedia.org/wiki/{page.replace(' ', '_')}") + await asyncio.sleep(2) # Wait for page to load + + +async def run_messy_desktop_scenario(computer, agent_configs, max_steps): + """Run the messy desktop scenario with a single agent""" + # Get popular wiki articles + global articles + start_page, target_page = get_article_pair(depth=1) + + print(f"Wiki race: {start_page} → {target_page}") + + # Close all windows first + await close_all_windows() + + # Open starting Wikipedia page + await open_wiki(start_page) + + # Open 3 random apps to create a messy desktop + apps_to_open = ["Notes", "Terminal", "System Settings"] + for app in apps_to_open: + await open_app(app) + + # Verify apps are open + open_apps = await get_open_app_names() + print(f"Open applications: {open_apps}") + + # Create the agent's computer interface + # If app-use is enabled, create a desktop limited to Safari/Firefox + if "app-use" in (computer.experiments or []): + browser_desktop = computer.create_desktop_from_apps(["Safari"]) + agent_computer = browser_desktop + else: + agent_computer = computer + + # Run each agent configuration + for config_name, loop_provider, model_provider in agent_configs: + print(f"\n--- Testing Agent: {config_name} ---") + + # Create agent with the specified configuration + agent = ComputerAgent( + computer=agent_computer, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + # Run the wikirace + steps = 0 + success = False + start_time = time.time() + + # Use the template with formatting for this scenario + prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Safari", + start_page=start_page, + target_page=target_page + ) try: - # Run the computer with default parameters - await computer.run() - - # Install required packages - await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) - - # Helper functions for wikirace - async def open_wiki(page): - await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") - await asyncio.sleep(2) # Wait for page to load - - # Remote functions for wikirace - using @sandboxed decorator - @sandboxed("eval_env") - def close_all_windows(): - import pywinctl - windows = pywinctl.getAllWindows() - for window in windows: - try: - window.close() - except: - # Some windows might not be closeable or may have already closed - pass - - @sandboxed("eval_env") - def get_current_wiki_page(): - import pywinctl - titles = pywinctl.getAllTitles() - wiki_titles = [title for title in titles if "Wikipedia" in title] - if wiki_titles: - return wiki_titles[0].split(" - Wikipedia")[0] - return None - - # Wikirace setup - max_steps = 2 - start_page = "Albert Einstein" - target_page = "Pizza" - - print(f"\nStarting Wikirace: {start_page} → {target_page}") - - # Close all windows - await close_all_windows() - - # Open starting page - await open_wiki(start_page) - - # Check current page using decorated function - current_page = await get_current_wiki_page() - print(f"Starting page: {current_page}") - assert current_page == start_page, f"Expected {start_page}, got {current_page}" - - # Create agent - agent = ComputerAgent( - computer=computer, - loop=AgentLoop.OPENAI, - model=LLM(LLMProvider.OPENAI) - ) - - # Run the wikirace - steps = 0 - success = False - start_time = time.time() - - prompt = f""" - You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" - by clicking only on Wikipedia links within articles. - - Rules: - 1. Only click on links within Wikipedia articles (blue underlined text) - 2. No using search, back button, or typing URLs - 3. Try to find the shortest path possible - 4. Current target: {target_page} - - Look at the current page and click on a link that might lead you closer to {target_page}. - """ - - try: + while steps < max_steps and not success: async for result in agent.run(prompt): steps += 1 - print(f"Step {steps}: {result}") + print(f"Step {steps}") - # Check again - current_page = await get_current_wiki_page() + def process_result(): + if result.get("content"): + print(f"Agent: {result.get('content', '')}") + + else: + outputs = result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + print(f"Agent: {content_part.get('text', '')}") + + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + print(f"Agent: {summary_part.get('text', '')}") + + else: + summary_content = output.get("text", "") + if summary_content: + print(f"Agent: {summary_content}") + + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```") + + + # Process and print the result + process_result() + + # Check current page + current_page = await get_current_wiki_page("Safari") print(f"Current page: {current_page}") + print(f"Target: {target_page}") # Check if we reached the target if current_page and target_page.lower() in current_page.lower(): @@ -140,29 +369,298 @@ async def main(): print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") await agent._loop.cancel() break + except asyncio.CancelledError: + print("Agent stopped") + + end_time = time.time() + duration = end_time - start_time + await asyncio.sleep(2) # Wait for agent to finish + + # Results + print(f"\n=== WIKIRACE RESULTS: {config_name} ===") + print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") + print(f"Start: {start_page}") + print(f"Target: {target_page}") + print(f"Steps taken: {steps}") + print(f"Success: {success}") + print(f"Duration: {duration:.2f} seconds") + + +async def run_parallel_agents_scenario(computer, agent_configs, max_steps): + + """Run two agents in parallel, one using Safari and one using Firefox""" + # Get popular wiki articles + global articles + safari_start, safari_target = get_article_pair(depth=1) + firefox_start, firefox_target = get_article_pair(depth=1) + + print(f"Safari Wiki race: {safari_start} → {safari_target}") + print(f"Firefox Wiki race: {firefox_start} → {firefox_target}") + + # Close all windows first + await close_all_windows() + + # Open Safari with starting page + await open_wiki(safari_start, "Safari") + await asyncio.sleep(2) + + # Open Firefox with starting page + await open_wiki(firefox_start, "Firefox") + await asyncio.sleep(2) + + # Create agent configurations + for config_name, loop_provider, model_provider in agent_configs: + print(f"\n--- Testing Parallel Agents: {config_name} ---") + + # Create the agent interfaces + if "app-use" in (computer.experiments or []): + safari_desktop = computer.create_desktop_from_apps(["Safari"]) + firefox_desktop = computer.create_desktop_from_apps(["Firefox"]) + else: + safari_desktop = computer + firefox_desktop = computer + + # Save screenshots + screenshot_dir = project_root / "examples" / "evals" / "screenshots" + screenshot_dir.mkdir(exist_ok=True) + safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png" + firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png" + screenshot_bytes = await safari_desktop.interface.screenshot() + with open(safari_screenshot_path, "wb") as f: + f.write(screenshot_bytes) + screenshot_bytes = await firefox_desktop.interface.screenshot() + with open(firefox_screenshot_path, "wb") as f: + f.write(screenshot_bytes) + + # Create agents + safari_agent = ComputerAgent( + computer=safari_desktop, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + firefox_agent = ComputerAgent( + computer=firefox_desktop, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + # Create prompts using the template + safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Safari", + start_page=safari_start, + target_page=safari_target + ) + + firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Firefox", + start_page=firefox_start, + target_page=firefox_target + ) + + # Track results + safari_results = { + "steps": 0, + "success": False, + "start_time": time.time(), + "end_time": None + } + + firefox_results = { + "steps": 0, + "success": False, + "start_time": time.time(), + "end_time": None + } + + # Function to run a single agent + async def run_agent(agent, prompt, browser, start_page, target_page, results): + try: + while results["steps"] < max_steps and not results["success"]: + async for result in agent.run(prompt): + results["steps"] += 1 + print(f"{browser} Step {results['steps']}") + + def process_result(): + if result.get("content"): + print(f"{browser} Agent: {result.get('content', '')}") + + else: + outputs = result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + print(f"{browser} Agent: {content_part.get('text', '')}") + + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + print(f"{browser} Agent: {summary_part.get('text', '')}") + + else: + summary_content = output.get("text", "") + if summary_content: + print(f"{browser} Agent: {summary_content}") + + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```") + + + # Process and print the result + process_result() + + # Check current page + current_page = await get_current_wiki_page(browser) + print(f"{browser} current page: {current_page}") + print(f"{browser} target: {target_page}") + + # Check if we reached the target + if current_page and target_page.lower() in current_page.lower(): + results["success"] = True + print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!") + await agent._loop.cancel() + break + + # Check if we reached the maximum steps + if results["steps"] >= max_steps: + print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})") + await agent._loop.cancel() + break except asyncio.CancelledError: - print("Agent stopped") - - end_time = time.time() - duration = end_time - start_time - await asyncio.sleep(2) # Wait for agent to finish + print(f"{browser} agent stopped") + finally: + results["end_time"] = time.time() + + # Run both agents in parallel + await asyncio.gather( + run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results), + run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results) + ) + + # Wait for agents to finish + await asyncio.sleep(2) + + # Print results + print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===") + print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") + + print(f"\nSafari Results:") + print(f"Start: {safari_start}") + print(f"Target: {safari_target}") + print(f"Steps taken: {safari_results['steps']}") + print(f"Success: {safari_results['success']}") + print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds") + + print(f"\nFirefox Results:") + print(f"Start: {firefox_start}") + print(f"Target: {firefox_target}") + print(f"Steps taken: {firefox_results['steps']}") + print(f"Success: {firefox_results['success']}") + print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds") + + +async def main(): + try: + + # Define agent configurations to test + agent_configs = [ + ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), + ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), + # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) + ] + + # # Run the test scenario without any agents + # print("Running test scenario for sandboxed functions") + # await run_test_scenario() + + # Set maximum steps for each agent run + max_steps = 50 + runs = 5 + + # run all scenarios + for _ in range(runs): + # Scenario 1: Messy desktop without App-Use + await run_scenario("messy_desktop", False, agent_configs, max_steps) - # Results - print(f"\n=== WIKIRACE RESULTS ===") - print(f"Start: {start_page}") - print(f"Target: {target_page}") - print(f"Steps taken: {steps}") - print(f"Success: {success}") - print(f"Duration: {duration:.2f} seconds") - finally: - # Important to clean up resources - # await computer.stop() - pass + # Scenario 1: Messy desktop with App-Use + await run_scenario("messy_desktop", True, agent_configs, max_steps) + + # Scenario 2: Parallel agents without App-Use + await run_scenario("parallel_agents", False, agent_configs, max_steps) + + # Scenario 2: Parallel agents with App-Use + await run_scenario("parallel_agents", True, agent_configs, max_steps) except Exception as e: print(f"Error in main: {e}") traceback.print_exc() +async def run_test_scenario(max_iterations=5): + """Test sandboxed functions by opening the same pages in Safari and Firefox and checking if they match + + This function opens the same Wikipedia pages in both browsers and verifies that + the get_current_wiki_page function returns the same result for both browsers. + It does this for the specified number of iterations. + """ + + # Create computer instance + computer = Computer() + await computer.run() + + # Get popular wiki articles + global articles + selected_articles = random.sample(articles, max_iterations) + + print(f"\n--- Running Test Scenario for {max_iterations} iterations ---") + + # Close all windows first + await close_all_windows() + + # Open both browsers + await open_app("Safari") + await open_app("Firefox") + + # Verify browsers are open + open_apps = await get_open_app_names() + print(f"Open applications: {open_apps}") + + # Run test iterations + for i, article in enumerate(selected_articles): + print(f"\nIteration {i+1}/{max_iterations}: Testing with article '{article}'") + + # Open the same Wikipedia page in both browsers + await open_wiki(article, "Safari") + await open_wiki(article, "Firefox") + await asyncio.sleep(3) # Give a bit more time for both pages to load + + # Check if both browsers show the same page + safari_page = await get_current_wiki_page("Safari") + firefox_page = await get_current_wiki_page("Firefox") + + print(f"Safari page: {safari_page}") + print(f"Firefox page: {firefox_page}") + + if safari_page == firefox_page: + print(f"✅ MATCH: Both browsers show '{safari_page}'") + else: + print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'") + + await asyncio.sleep(1) # Brief pause between iterations + + print("\n--- Test Scenario Completed ---") + + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 79bcf9d05d2347f4a03ba190f526e1c12e192448 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 21:37:58 -0400 Subject: [PATCH 10/23] Added top wiki links --- examples/evals/wikipedia_most_linked.txt | 1000 ++++++++++++++++++++++ 1 file changed, 1000 insertions(+) create mode 100644 examples/evals/wikipedia_most_linked.txt diff --git a/examples/evals/wikipedia_most_linked.txt b/examples/evals/wikipedia_most_linked.txt new file mode 100644 index 00000000..877909d2 --- /dev/null +++ b/examples/evals/wikipedia_most_linked.txt @@ -0,0 +1,1000 @@ +ISBN (identifier) +United States +Main Page +Tilde +Doi (identifier) +Fair use +Association football +Years +Wayback Machine +ISSN (identifier) +India +Wikimedia Foundation +Wikidata +Animal +Taxonomy (biology) +Australia +France +Eukaryote +IP address +U.S. state +Time zone +City +Copyright +Canada +Town +ASCII +Greek alphabet +Typographic ligature +Diacritical mark +Wikipedia +Germany +Human settlement +Open Tree of Life +IMDb (identifier) +United Kingdom +Catalogue of Life +Insect +Russia +Japan +Italy +Arthropod +Television show +Public domain +INaturalist +Poland +England +PMID (identifier) +Daylight saving time +S2CID (identifier) +China +Encyclopedia of Life +Spain +OCLC (identifier) +Plant +Flickr +Wikispecies +Africa +Song +Record label +Lepidoptera +Iran +English language +Music genre +News aggregator +Web feed +Proxy server +X-Forwarded-For +College football +World War II +Brazil +Sweden +Politics +Olympics +Netherlands +Record producer +California +New York City +Surname +The New York Times +London +New Zealand +PMC (identifier) +Logo +Synonym (taxonomy) +Switzerland +Turkey +Sport +Video game +Architecture +Norway +Bibcode (identifier) +Mexico +Botany +JSTOR (identifier) +Rail transport +Field hockey +Ireland +Scotland +Belgium +South Africa +Common name +Professional sports +Sport governing body +Sport industry +Olympic games +Election +Austria +Ukraine +Anthroponymy +Pakistan +Baseball +Denmark +Christianity +Philippines +Woman +Romania +Czech Republic +Album +Godzilla Minus One +Single (music) +Electoral reform +Nofollow +Basketball +New York (state) +Argentina +Finland +Soviet Union +Greece +Russian language +Historic site +Free content +YouTube +Catholic Church +Hungary +Kingdom Hearts +Beetle +Company +Tetris +Portugal +BioShock +Abandonware +Deus Ex (video game) +4A Engine +Yoshi's New Island +Kaboom! (video game) +Rain World +Juno (Overwatch) +Crash Team Rumble +Vault 101 +Tales of Commons +NHL Hockey +Clutch Gaming +Haseo +Allin Kempthorne +Ilyas El Maliki +Ratalaika Games +3D mousepad +HaptX +Walid Sultan Midani +Rustler (video game) +Look Outside +Ducks Ahoy! +Fusion Engine +Cricket +Geography +Chordate +The Guardian +Israel +Billboard (magazine) +Ice hockey +Given name +Chicago +World War I +Pennsylvania +Indonesia +Alma mater +Vascular plant +Amorphea +Wikimedia Commons +Novel +Village +Visual arts +Film poster +Flowering plant +Opisthokont +Obazoa +County seat +Short story +First-class cricket +Law +Europe +University +Croatia +Sport of athletics +Holozoa +Choanozoa +Filozoa +German language +Tennis +Eumetazoa +Serbia +ParaHoxozoa +Thailand +History +Midfielder +Bilateria +Unincorporated area +French language +AllMusic +Astronomy +Nephrozoa +Novella +Ship +Twitter +Character (arts) +College +Malaysia +Conflict of interest +Higher education +IUCN Red List +Rock music +Gastropoda +Creative Commons +Wales +Bulgaria +UTC+2 +Paris +Species +Illinois +HTML element +South Korea +BBC +Persian language +Moth +Conservation status +Pop music +Colombia +Wicket +American football +Jazz +World Flora Online +Los Angeles +Songwriter +Hong Kong +Hdl (identifier) +Genus +Spanish language +Egypt +Not out +Slovenia +Chile +Korea +Tropicos +Slovakia +Bishop +Family (biology) +Rugby union +Women's history +Nigeria +College basketball +Sports Reference +Washington, D.C. +GFDL +Afghanistan +Sri Lanka +Newspapers.com +UTC+1 +Eudicots +Estonia +Los Angeles Times +Olympedia +Bangladesh +Peru +Singapore +Typographical error +UTC +Virginia +Taiwan +Fast bowling +COVID-19 pandemic +Food +Fish +River +Republic of Ireland +Beer +Caribbean +Michigan +Drink +Chinese language +Business +Leg break +Women's Test cricket +Women's cricket +Innings +New Jersey +Protostome +Spin bowling +Sugar +Underarm bowling +Roger Federer +Googly +Apple +Comics +Cricket Australia XI +Fair and unfair play +Anime +Rafael Nadal +Leander Paes +Kazakhstan +Capital city +Blessed Virgin Mary +Venezuela +Case sensitivity +Arabic language +North America +Texas +Burger King +The Plant List +Justine Henin +Sushi +Angelus +Beef +Sanctification +Cuthbert Tunstall +Bread +Saint Mungo +Incumbent +Americanism (heresy) +Curry +Ensoulment +Associated Press +Adolph John Paschang +French cuisine +Altar Society +UTC-5 +Philadelphia +Bill Mallon +Yogurt +Soy sauce +Open Era (tennis) +Belarus +Manga +English Wikipedia +Islam +Trademark +ISO 4 +Wisconsin +Lithuania +The Washington Post +Agaricus bisporus +Reptile +Sociology +Organizations +Death +Ham and eggs +Asia +Swimming (sport) +South America +Northern Ireland +Observation.org +European Union +Astronomical object +Georgia (U.S. state) +Gmina +Provinces of Iran +Computing +Counties of Iran +Discogs +Mathematics +Powiat +Missouri +Bachelor of Arts +Iran Standard Time +Florida +Bakhsh +Minnesota +Oregon +Nepal +Variety (magazine) +Japanese language +Journalism +Rome +Computer +Ohio +Ontario +Internet Archive +Latvia +Comedy +Azerbaijan +BBC News +Morocco +Ecdysozoa +Print-on-demand +Bengali language +A5 paper +Pedia Press +Education +Mollusca +American Civil War +Berlin +Taxon +Maryland +Panarthropoda +Hebrew language +Toronto +Tactopoda +Episode +Cuba +Country music +Religion +Rotten Tomatoes +Georgia (country) +Classical music +Month +Puerto Rico +GEOnet Names Server +Sydney +The Times +Iraq +Polyphaga +Derivative work +Lisbon +Syria +Ecuador +Uzbekistan +Greek language +Latin +United Nations +Literature +Animation +Physics +Amphibian +Romanize +List of countries +Moscow +Politician +Philosophy +Metacritic +Mammal +Pinyin +Open access +New South Wales +Theatre +Allmusic +Syntax +Women in music +Fly +Colorado +Academic journal +LGBTQ +Seal (emblem) +Rolling Stone +Saudi Arabia +Science fiction +Tweet (social media) +Heavy metal music +Boston +Vietnam +Molecular biology +Facebook +Iceland +Albania +Cycling +Tennessee +Armenia +Massachusetts +Mandibulata +United States Navy +Communes of France +Census +Algeria +United States Army +Wikilink +Pancrustacea +Alternative rock +American English +Radio stations +History of Romania +Endemism +San Francisco +Award +Ghana +Judaism +Alabama +Blog +The Independent +Melbourne +Cantons of France +Lebanon +West Germany +Quotation mark +Regions of France +Chernivtsi Oblast +Tokyo +Italian language +Connecticut +Country +Screenshot +Ghost town +Iran Daylight Time +NatureServe +Mongolia +Cyprus +Northern Bukovina +Rugby league +Northern Bessarabia +State highway +Harvard University +Yorkshire +Pterygota +Slash (punctuation) +Prize +Science +Asian Games +Eastern Time Zone +Myanmar +Nazi Germany +Ottoman Empire +Quebec +Billboard Hot 100 +United Arab Emirates +Neoptera +Hexapoda +Least Concern +Type species +EPPO Code +Wikisource +Kyrgyzstan +Allotriocarida +Volleyball +Geology +Second World War +British Columbia +Socialism +Zoology +The Daily Telegraph +Paleontology +Vienna +Dicondylia +BugGuide +United States Senate +Hermit crab +Paraphrase +CNN +Royal Navy +Indian Standard Time +Billboard 200 +Kenya +DVD +Sipuncula +Tajikistan +National park +Economics +Heterocyathus +Uruguay +Heteropsammia +Road +Spanish name +Luxembourg +Korean language +UK Singles Chart +Queensland +Montreal +New York Times +Bolivia +CP/M +Timestamp +Electronic music +INSEE code +ArXiv (identifier) +PubMed +SVG +USA Today +Omnivore +Tunisia +Psychology +ESPN +UEFA +Hawaii +Gastropod +Aliyah +North Carolina +Russian Empire +Tibet +Fungi +Oklahoma +Fauna Europaea +Turkmenistan +British English +The London Gazette +Civil township +Boxing +Barack Obama +Animal Diversity Web +Reuters +Eumetabola +Voter turnout +Transport +False positive +Donald Trump +Kansas +Antarctica +Lake +Ethiopia +Time (magazine) +Marriage +NBC +Beijing +Vertebrate +Czechoslovakia +Protected area +Energy +Poetry +Archaeology +Columbia University +Poverty line +Alaska +Computing platform +British Empire +University of Oxford +Costa Rica +Dublin +A-side and B-side +ZIP code +Actinopterygii +UTC-6 +Photoperiodism +Mayor +Sphaeriidae +Animal suicide +Atka mackerel +Starling +Arizona +Entertainment Weekly +Sphaerium beckmani +Junqueira cow +Zaniolepis frenata +Campocraspedon +Zimbabwe +Motorsport +Bird flight +Cnemophilidae +Hinduism +Phalarope +Indiana +Museums +Holometabola +Pytilia +North Macedonia +Malta +Cathartiformes +Darter +Saker falcon +Cathartes +Avian malaria +Coal tit +Magpie duck +Video game developer +Bird bath +Vesper sparrow +Gouldian finch +Debeaking +Vector graphics +Semiplumbeous hawk +Scottish crossbill +Bullfinch +Fregata +Nidicolous +Plushcap +Pallid scops owl +Hip-hop +Blyth's frogmouth +Sunda scops owl +Argus (bird) +Operation Migration +Nik Borrow +Per capita income +Guy Oseary +Madrid +Buddhism +Drainage basin +Sephardic Haredim +Rami Kleinstein +Guy Bavli +David Bar-Hayim +Levin Kipnis +Edna Arbel +Prisoner of Zion +Ayala Procaccia +Nachum Heiman +Zman Tel Aviv +CBS +ARIA Charts +Cucujiformia +Away colours +Regex +2019 African Games +1962 Asian Games +1958 Asian Games +Chemistry +Olympic Games +The Middle Ages +Central Asia +Bengalis +Southeast Asia +Find a Grave +Microsoft Windows +Swing (politics) +White (U.S. Census) +Roman Catholic +Maine +The Times of India +Season (sports) +Jamaica +Video game genre +Munich +Asterids +Rosids +Golf +Language +Hangul +Atlanta +Glasgow +UTC+3 +Library of Congress +Deuterostome +COVID-19 +Video game publisher +Montenegro +ESPNcricinfo +Brand +UTC-4 +IGN +Stockholm +Istanbul +NASA +Gnathostomata +Ukrainian language +Human rights +Chicago Tribune +ProQuest +IMDb +River mouth +Hip hop music +Gene +Netflix +Moldova +Barcelona +Paraguay +Olfactores +Labour Party (UK) +United States dollar +Qatar +Photography +Guatemala +Summit +Cold War +Running +First World War +Precipitation +Edinburgh +Amsterdam +Lima +New Eskaton +Computer program +Xinjiang +Women in science +Manhattan +Warsaw +Magazine +Horror film +Deadline Hollywood +Jordan +Aparaglossata +Agriculture +Internet +Prague +The Hindu +Cretaceous +Latino (U.S. Census) +Vietnam War +Music download +Encyclopedia +Chemical compounds +Pittsburgh +Soap opera +Budapest +George W. Bush +Seattle +Extended play +Washington (state) +Listed building +Palestine +LCCN (identifier) +Portland, Oregon +Panama +Plagiarism +Brooklyn +Teleostomi +Manchester +Bird +Mollusk +Automobile +Historic England +Linguistics +Dependent territory +Athens +Civil engineering +Sea snail +Population density +Finance +Disaster management +Tanzania +Jurassic +Districts of Russia +Western Australia +Louisiana +Portuguese language +Anatomy +The Beatles +Tamil language +Milan +Uganda +Natural environment +FIFA +Cameroon +Blu-ray +Mexico City +Chemical formula +Jimmy Wales +Papua New Guinea +Diaphoretickes +UNESCO +Forbes +Technology +Buenos Aires +Vancouver +Dominican Republic +2007 +Species description +East Germany +Folk music +Kentucky +Multimedia +Monocotyledon +Rio de Janeiro +Automated +Hindi +Houston +Google +Devonian +Member of Parliament +Bible +Mumbai +FishBase +African diaspora +Carboniferous +Cambrian +Triassic +Montana +Handball +Ordovician +San Diego +Archive.today +Stanford University +British Army +Middle Ages +Frequency +Ultratop +Permian +Detroit +Earth +Precambrian +Hamburg +Alberta +Tamil Nadu +Madagascar +Lancashire +Guitar +Trade union +Instagram +Engineering +2006 +Silurian +NPR +Railway station +CAS Registry Number +Yemen +Noctuoidea +Fiji +Haiti +Rowing (sport) +New Orleans +NME +Alternative media +North Korea +Microsoft +Jerusalem +Paleogene +Audery Mill Creek +Horse racing +Post town +Piano +Bavaria +Polish language +Horror fiction +Neogene +Kerala +Copenhagen +Google Books +Central Time Zone +Island +Birmingham +Anglicanism +Software +Mountain range +Investment +Brussels +Muhammad Ali +Asian (U.S. Census) +Video game culture +Brisbane +Church of England +Kosovo +Bachelor of Science +Molar mass +Arachnid +Own goal +Yale University +Caenogastropoda +Auckland +World Athletics +Trinidad and Tobago +Hanyu Pinyin +Sound bite +Time +El Salvador +Microbiology +Columbia Records +Seoul +Cerambycidae +Maharashtra +Chelicerata +Fungus +Media influence +South Carolina +Radio +Telenovela +FA Cup +Senegal +Internet trolling +Nashville, Tennessee +Demonym +Standard Chinese +Sculpture +Liverpool +Thesis +Bass guitar +Chess +Women artists +Icon (computing) +PubChem +UK Albums Chart +Head coach +Roman Empire +Grand Slam (tennis) +JSmol +Formula One +Biology +Kent +Ancient Rome +Inner Carniola +Oslo +Dutch language +Wingspan +Archaeplastida +MTV +Edvard Ravnikar +ITunes +Feminism +German Empire +Pacific Ocean +Atlantic Ocean +Pharmacology +Track gauge +ChemSpider +Doctor of Philosophy +Regions of England +Districts of England +Christmas +Pavel Golia +Predjama Castle +Overtime (sports) +Forum +Swiss Hitparade +Stumped +Majority +Male +Shanghai +Siddharta (band) \ No newline at end of file From 58a453dc496f2f70396016948aded91ac166ca5f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 09:05:20 -0400 Subject: [PATCH 11/23] App-usage stability fixes --- .../computer_server/diorama/diorama.py | 28 ++++- libs/computer/computer/diorama_computer.py | 2 +- libs/computer/computer/ui/gradio/app.py | 111 +++++++++++++----- 3 files changed, 107 insertions(+), 34 deletions(-) diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py index bf30a018..e781395c 100644 --- a/libs/computer-server/computer_server/diorama/diorama.py +++ b/libs/computer-server/computer_server/diorama/diorama.py @@ -36,11 +36,21 @@ class Diorama: cls._ensure_scheduler() return cls(args).computer + # Dictionary to store cursor positions for each unique app_list hash + _cursor_positions = {} + def __init__(self, app_list): self.app_list = app_list self.interface = self.Interface(self) self.computer = DioramaComputer(self) self.focus_context = None + + # Create a hash for this app_list to use as a key + self.app_list_hash = hash(tuple(sorted(app_list))) + + # Initialize cursor position for this app_list if it doesn't exist + if self.app_list_hash not in Diorama._cursor_positions: + Diorama._cursor_positions[self.app_list_hash] = (0, 0) @classmethod def _ensure_scheduler(cls): @@ -67,10 +77,11 @@ class Diorama: frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger) + app_list_hash = hash(tuple(sorted(app_whitelist))) + with focus_context: try: if action == "screenshot": - app_whitelist = list(args["app_list"]) logger.info(f"Taking screenshot for apps: {app_whitelist}") result, img = capture_all_apps( app_whitelist=app_whitelist, @@ -82,8 +93,15 @@ class Diorama: future.set_result((result, img)) # Mouse actions elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]: - x = args.get("x") - y = args.get("y") + # Get last cursor position for this app_list hash + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + + x = args.get("x", last_pos[0]) + y = args.get("y", last_pos[1]) + + # Update the cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + duration = args.get("duration", 0.5) if action == "left_click": await automation_handler.left_click(x, y) @@ -98,6 +116,10 @@ class Diorama: if future: future.set_result(None) elif action in ["scroll_up", "scroll_down"]: + # Move cursor to last known position for this app_list hash + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + await automation_handler.move_cursor(*last_pos) + clicks = args.get("clicks", 1) if action == "scroll_up": await automation_handler.scroll_up(clicks) diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py index 608e6721..5cad0006 100644 --- a/libs/computer/computer/diorama_computer.py +++ b/libs/computer/computer/diorama_computer.py @@ -37,7 +37,7 @@ class DioramaComputerInterface: raise RuntimeError("Computer interface not initialized. Call run() first.") result = await iface.diorama_cmd(action, arguments) if not result.get("success"): - raise RuntimeError(f"Diorama command failed: {result.get('error')}") + raise RuntimeError(f"Diorama command failed: {result.get('error')}\n{result.get('trace')}") return result.get("result") async def screenshot(self, as_bytes=True): diff --git a/libs/computer/computer/ui/gradio/app.py b/libs/computer/computer/ui/gradio/app.py index 1a93b27d..b1d131d9 100644 --- a/libs/computer/computer/ui/gradio/app.py +++ b/libs/computer/computer/ui/gradio/app.py @@ -463,7 +463,7 @@ async def execute(name, action, arguments): elif action == "left_click": if "x" in arguments and "y" in arguments: await computer.interface.move_cursor(arguments["x"], arguments["y"]) - await computer.interface.left_click() + await computer.interface.left_click(arguments["x"], arguments["y"]) await asyncio.sleep(0.5) elif action == "right_click": if "x" in arguments and "y" in arguments: @@ -528,43 +528,75 @@ async def execute(name, action, arguments): return results -async def handle_init_computer(os_choice: str): - """Initialize the computer instance and tools for macOS or Ubuntu""" +async def handle_init_computer(os_choice: str, app_list=None, provider="lume"): + """Initialize the computer instance and tools for macOS or Ubuntu + + Args: + os_choice: The OS to use ("macOS" or "Ubuntu") + app_list: Optional list of apps to focus on using the app-use experiment + provider: The provider to use ("lume" or "self") + """ global computer, tool_call_logs, tools - + + # Check if we should enable app-use experiment + use_app_experiment = app_list and len(app_list) > 0 + experiments = ["app-use"] if use_app_experiment else None + + # Determine if we should use host computer server + use_host_computer_server = provider == "self" + if os_choice == "Ubuntu": - computer = Computer( - image="ubuntu-noble-vanilla:latest", - os_type="linux", - provider_type=VMProviderType.LUME, - display="1024x768", - memory="8GB", - cpu="4" - ) os_type_str = "linux" image_str = "ubuntu-noble-vanilla:latest" else: + os_type_str = "macos" + image_str = "macos-sequoia-cua:latest" + + # Create computer instance with appropriate configuration + if use_host_computer_server: computer = Computer( - image="macos-sequoia-cua:latest", - os_type="macos", + os_type=os_type_str, + use_host_computer_server=True, + experiments=experiments + ) + else: + computer = Computer( + image=image_str, + os_type=os_type_str, provider_type=VMProviderType.LUME, display="1024x768", memory="8GB", - cpu="4" + cpu="4", + experiments=experiments ) - os_type_str = "macos" - image_str = "macos-sequoia-cua:latest" await computer.run() + + # If app list is provided, create desktop from apps + if use_app_experiment: + computer = computer.create_desktop_from_apps(app_list) # Log computer initialization as a tool call - result = await execute("computer", "initialize", { + init_params = { "os": os_type_str, - "image": image_str, - "display": "1024x768", - "memory": "8GB", - "cpu": "4" - }) + "provider": provider + } + + # Add VM-specific parameters if not using host computer server + if not use_host_computer_server: + init_params.update({ + "image": image_str, + "display": "1024x768", + "memory": "8GB", + "cpu": "4" + }) + + # Add app list to the log if provided + if use_app_experiment: + init_params["apps"] = app_list + init_params["experiments"] = ["app-use"] + + result = await execute("computer", "initialize", init_params) return result["screenshot"], json.dumps(tool_call_logs, indent=2) @@ -1029,12 +1061,31 @@ def create_gradio_ui(): setup_status = gr.Textbox(label="Setup Status", value="") with gr.Group(): - os_choice = gr.Radio( - label="OS", - choices=["macOS", "Ubuntu"], - value="macOS", - interactive=False # disable until the ubuntu image is ready - ) + with gr.Accordion("Computer Configuration", open=False): + with gr.Row(): + os_choice = gr.Radio( + label="OS", + choices=["macOS", "Ubuntu"], + value="macOS", + interactive=False # disable until the ubuntu image is ready + ) + + # Provider selection radio + provider_choice = gr.Radio( + label="Provider", + choices=["lume", "self"], + value="lume", + info="'lume' uses a VM, 'self' uses the host computer server" + ) + + # App filtering dropdown for app-use experiment + app_filter = gr.Dropdown( + label="Filter by apps (App-Use)", + multiselect=True, + allow_custom_value=True, + info="When apps are selected, the computer will focus on those apps using the app-use experiment" + ) + start_btn = gr.Button("Initialize Computer") with gr.Group(): @@ -1199,7 +1250,7 @@ def create_gradio_ui(): ) img.select(handle_click, inputs=[img, click_type], outputs=[img, action_log]) - start_btn.click(handle_init_computer, inputs=[os_choice], outputs=[img, action_log]) + start_btn.click(handle_init_computer, inputs=[os_choice, app_filter, provider_choice], outputs=[img, action_log]) wait_btn.click(handle_wait, outputs=[img, action_log]) # DONE and FAIL buttons just do a placeholder action From c2302eb6c607f074916051f411b3eeaec54aa38f Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 09:46:09 -0400 Subject: [PATCH 12/23] Added results table --- examples/eval_examples.py | 74 ++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index 1978a897..b3d163ca 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -11,13 +11,16 @@ import urllib.request import datetime from urllib.parse import quote +# Global variable to track all results +all_results = [] + # Wikirace prompt template WIKIRACE_PROMPT_TEMPLATE = """ You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" by clicking only on Wikipedia links within articles. Rules: -1. Only click on links within Wikipedia articles (blue underlined text) +1. Only click on links within Wikipedia articles (blue text) 2. No using search, back button, or typing URLs 3. You MAY use cmd+f (or ctrl+f) to find text on the current page 4. Do NOT click any search icon or type into any search box unless it's a browser command @@ -26,6 +29,7 @@ Rules: 7. Do not maximize the window or use any other application 8. Avoid wasting actions by scrolling 9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps +10. Stay on the English Wikipedia Look at the current page and click on a link that might lead you closer to {target_page}. """ @@ -36,6 +40,7 @@ _print = print # Define log file path project_root = Path(__file__).parent.parent log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt" +results_file = project_root / "examples" / "evals" / "eval_appuse_results.md" # Custom print function that also logs to file def print(*args, **kwargs): @@ -160,6 +165,36 @@ def get_article_pair(depth=5): target_article = wikipedia_random_walk(start_article, depth)[-1] return start_article, target_article + +def save_results_to_markdown(): + """Save all results to a markdown table""" + global all_results + + if not all_results: + print("No results to save") + return + + # Create header for the markdown table + header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |" + separator = "|---|---|---|---|---|---|---|---|---|---|" + + # Create rows for each result + rows = [] + for result in all_results: + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row = f"| {timestamp} | {result['scenario']} | {result['app_use']} | {result['browser']} | {result['config']} | {result['start']} | {result['target']} | {result['steps']} | {result['success']} | {result['duration']:.2f} |" + rows.append(row) + + # Combine header, separator, and rows + table = "\n".join([header, separator] + rows) + + # Write to file (append mode) + with open(results_file, "a") as f: + f.write(f"\n\n## Results Update - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + f.write(table) + + print(f"Results saved to {results_file}") + async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30): """Run a specific evaluation scenario""" @@ -254,6 +289,7 @@ async def open_wiki(page, app_name="Safari"): async def run_messy_desktop_scenario(computer, agent_configs, max_steps): + global all_results """Run the messy desktop scenario with a single agent""" # Get popular wiki articles global articles @@ -292,7 +328,8 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps): agent = ComputerAgent( computer=agent_computer, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_appuse" ) # Run the wikirace @@ -387,6 +424,7 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps): async def run_parallel_agents_scenario(computer, agent_configs, max_steps): + global all_results """Run two agents in parallel, one using Safari and one using Firefox""" # Get popular wiki articles @@ -436,13 +474,15 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps): safari_agent = ComputerAgent( computer=safari_desktop, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_parallel_safari" ) firefox_agent = ComputerAgent( computer=firefox_desktop, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_parallel_firefox" ) # Create prompts using the template @@ -525,6 +565,24 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps): print(f"{browser} current page: {current_page}") print(f"{browser} target: {target_page}") + # Add result to global tracking + global all_results + current_result = { + 'scenario': 'parallel_agents', + 'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No', + 'browser': browser, + 'config': config_name, + 'start': start_page, + 'target': target_page, + 'steps': results['steps'], + 'success': results['success'], + 'duration': time.time() - results['start_time'] + } + all_results.append(current_result) + + # Save results after each step + save_results_to_markdown() + # Check if we reached the target if current_page and target_page.lower() in current_page.lower(): results["success"] = True @@ -575,9 +633,9 @@ async def main(): # Define agent configurations to test agent_configs = [ - ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), - ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), - # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) + # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), + # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), + ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) ] # # Run the test scenario without any agents @@ -585,7 +643,7 @@ async def main(): # await run_test_scenario() # Set maximum steps for each agent run - max_steps = 50 + max_steps = 15 runs = 5 # run all scenarios From 4693f2f0eb69dfb78af560abd7e20675db7dde2d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 09:58:13 -0400 Subject: [PATCH 13/23] Fixed 'max() iterable argument is empty' --- libs/computer-server/computer_server/diorama/draw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/computer-server/computer_server/diorama/draw.py b/libs/computer-server/computer_server/diorama/draw.py index ac90106e..9fce809f 100644 --- a/libs/computer-server/computer_server/diorama/draw.py +++ b/libs/computer-server/computer_server/diorama/draw.py @@ -377,7 +377,7 @@ def draw_desktop_screenshot(app_whitelist: List[str] = None, all_windows: List[D dock_orientation = "side" if dock_bounds["width"] < dock_bounds["height"] else "bottom" - menubar_length = max(item["bounds"]["x"] + item["bounds"]["width"] for item in menubar_items) + menubar_length = max(item["bounds"]["x"] + item["bounds"]["width"] for item in menubar_items) if menubar_items else 0 # Calculate bounds of app windows app_bounds = { From aa4dc71b9c6b9b7621b8614ad8d56f346f776a37 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 11:06:43 -0400 Subject: [PATCH 14/23] Mouse bug fixes --- .../computer_server/diorama/diorama.py | 69 +++++++++++++++---- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py index e781395c..20f5b1fb 100644 --- a/libs/computer-server/computer_server/diorama/diorama.py +++ b/libs/computer-server/computer_server/diorama/diorama.py @@ -77,8 +77,6 @@ class Diorama: frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger) - app_list_hash = hash(tuple(sorted(app_whitelist))) - with focus_context: try: if action == "screenshot": @@ -93,11 +91,8 @@ class Diorama: future.set_result((result, img)) # Mouse actions elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]: - # Get last cursor position for this app_list hash - last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) - - x = args.get("x", last_pos[0]) - y = args.get("y", last_pos[1]) + x = args.get("x") + y = args.get("y") # Update the cursor position for this app_list hash Diorama._cursor_positions[app_list_hash] = (x, y) @@ -116,9 +111,10 @@ class Diorama: if future: future.set_result(None) elif action in ["scroll_up", "scroll_down"]: - # Move cursor to last known position for this app_list hash - last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) - await automation_handler.move_cursor(*last_pos) + x = args.get("x") + y = args.get("y") + if x is not None and y is not None: + await automation_handler.move_cursor(x, y) clicks = args.get("clicks", 1) if action == "scroll_up": @@ -197,22 +193,57 @@ class Diorama: return img async def left_click(self, x, y): + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = x or last_pos[0], y or last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + sx, sy = await self.to_screen_coordinates(x, y) await self._send_cmd("left_click", {"x": sx, "y": sy}) async def right_click(self, x, y): + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = x or last_pos[0], y or last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + sx, sy = await self.to_screen_coordinates(x, y) await self._send_cmd("right_click", {"x": sx, "y": sy}) async def double_click(self, x, y): + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = x or last_pos[0], y or last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + sx, sy = await self.to_screen_coordinates(x, y) await self._send_cmd("double_click", {"x": sx, "y": sy}) async def move_cursor(self, x, y): + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = x or last_pos[0], y or last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + sx, sy = await self.to_screen_coordinates(x, y) await self._send_cmd("move_cursor", {"x": sx, "y": sy}) async def drag_to(self, x, y, duration=0.5): + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = x or last_pos[0], y or last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + sx, sy = await self.to_screen_coordinates(x, y) await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration}) @@ -229,10 +260,24 @@ class Diorama: await self._send_cmd("hotkey", {"keys": list(keys)}) async def scroll_up(self, clicks: int = 1): - await self._send_cmd("scroll_up", {"clicks": clicks}) + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = last_pos[0], last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + + await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y}) async def scroll_down(self, clicks: int = 1): - await self._send_cmd("scroll_down", {"clicks": clicks}) + # Get last cursor position for this app_list hash + app_list_hash = hash(tuple(sorted(self._diorama.app_list))) + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + x, y = last_pos[0], last_pos[1] + # Update cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + + await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y}) async def get_screen_size(self) -> dict[str, int]: if not self._scene_size: From 92ca6d2923e9dc426444d9175542a1ed9f2e4576 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 12:03:06 -0400 Subject: [PATCH 15/23] Bugfixes --- libs/computer-server/computer_server/diorama/diorama.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py index 20f5b1fb..fc426a7c 100644 --- a/libs/computer-server/computer_server/diorama/diorama.py +++ b/libs/computer-server/computer_server/diorama/diorama.py @@ -94,9 +94,6 @@ class Diorama: x = args.get("x") y = args.get("y") - # Update the cursor position for this app_list hash - Diorama._cursor_positions[app_list_hash] = (x, y) - duration = args.get("duration", 0.5) if action == "left_click": await automation_handler.left_click(x, y) @@ -264,8 +261,6 @@ class Diorama: app_list_hash = hash(tuple(sorted(self._diorama.app_list))) last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) x, y = last_pos[0], last_pos[1] - # Update cursor position for this app_list hash - Diorama._cursor_positions[app_list_hash] = (x, y) await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y}) @@ -274,8 +269,6 @@ class Diorama: app_list_hash = hash(tuple(sorted(self._diorama.app_list))) last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) x, y = last_pos[0], last_pos[1] - # Update cursor position for this app_list hash - Diorama._cursor_positions[app_list_hash] = (x, y) await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y}) From 08ce9c67c17724b75e238cc2d8808caa00592087 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 14:07:24 -0400 Subject: [PATCH 16/23] Replaced pyautogui with pynput on macOS (#125, #165) --- .../computer_server/handlers/macos.py | 85 ++++++++++++------- libs/computer-server/pyproject.toml | 1 + 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py index 1e5c5ceb..8afc5cc1 100644 --- a/libs/computer-server/computer_server/handlers/macos.py +++ b/libs/computer-server/computer_server/handlers/macos.py @@ -1,4 +1,7 @@ import pyautogui +from pynput.mouse import Button, Controller as MouseController +from pynput.keyboard import Key, Controller as KeyboardController +import time import base64 from io import BytesIO from typing import Optional, Dict, Any, List, Tuple @@ -336,7 +339,6 @@ class UIElement: "position": position, "size": size, "enabled": self.enabled, - "focused": self.focused, "bbox": self.bbox, "visible_bbox": self.visible_bbox, "children": children_to_dict(self.children), @@ -527,11 +529,14 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler): class MacOSAutomationHandler(BaseAutomationHandler): # Mouse Actions + mouse = MouseController() + keyboard = KeyboardController() + async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: try: if x is not None and y is not None: - pyautogui.moveTo(x, y) - pyautogui.click() + self.mouse.position = (x, y) + self.mouse.click(Button.left, 1) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} @@ -539,8 +544,8 @@ class MacOSAutomationHandler(BaseAutomationHandler): async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: try: if x is not None and y is not None: - pyautogui.moveTo(x, y) - pyautogui.rightClick() + self.mouse.position = (x, y) + self.mouse.click(Button.right, 1) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} @@ -550,15 +555,15 @@ class MacOSAutomationHandler(BaseAutomationHandler): ) -> Dict[str, Any]: try: if x is not None and y is not None: - pyautogui.moveTo(x, y) - pyautogui.doubleClick(interval=0.1) + self.mouse.position = (x, y) + self.mouse.click(Button.left, 2) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} async def move_cursor(self, x: int, y: int) -> Dict[str, Any]: try: - pyautogui.moveTo(x, y) + self.mouse.position = (x, y) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} @@ -567,9 +572,26 @@ class MacOSAutomationHandler(BaseAutomationHandler): self, x: int, y: int, button: str = "left", duration: float = 0.5 ) -> Dict[str, Any]: try: - pyautogui.dragTo(x, y, button=button, duration=duration) + btn = Button.left if button == "left" else Button.right + # Press + self.mouse.press(btn) + # Move with sleep to simulate drag duration + start = self.mouse.position + steps = 20 + start_x, start_y = start + dx = (x - start_x) / steps + dy = (y - start_y) / steps + for i in range(steps): + self.mouse.position = (int(start_x + dx * (i + 1)), int(start_y + dy * (i + 1))) + time.sleep(duration / steps) + # Release + self.mouse.release(btn) return {"success": True} except Exception as e: + try: + self.mouse.release(btn) + except: + pass return {"success": False, "error": str(e)} async def drag( @@ -578,29 +600,19 @@ class MacOSAutomationHandler(BaseAutomationHandler): try: if not path or len(path) < 2: return {"success": False, "error": "Path must contain at least 2 points"} - + btn = Button.left if button == "left" else Button.right # Move to the first point - start_x, start_y = path[0] - pyautogui.moveTo(start_x, start_y) - - # Press the mouse button - pyautogui.mouseDown(button=button) - - # Calculate time between points to distribute duration evenly + self.mouse.position = path[0] + self.mouse.press(btn) step_duration = duration / (len(path) - 1) if len(path) > 1 else duration - - # Move through each subsequent point for x, y in path[1:]: - pyautogui.moveTo(x, y, duration=step_duration) - - # Release the mouse button - pyautogui.mouseUp(button=button) - + self.mouse.position = (x, y) + time.sleep(step_duration) + self.mouse.release(btn) return {"success": True} except Exception as e: - # Make sure to release the mouse button if an error occurs try: - pyautogui.mouseUp(button=button) + self.mouse.release(btn) except: pass return {"success": False, "error": str(e)} @@ -608,21 +620,28 @@ class MacOSAutomationHandler(BaseAutomationHandler): # Keyboard Actions async def type_text(self, text: str) -> Dict[str, Any]: try: - pyautogui.write(text) + self.keyboard.type(text) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} async def press_key(self, key: str) -> Dict[str, Any]: try: - pyautogui.press(key) + # Try to map string to Key else use as char + k = getattr(Key, key, key) + self.keyboard.press(k) + self.keyboard.release(k) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} async def hotkey(self, keys: List[str]) -> Dict[str, Any]: try: - pyautogui.hotkey(*keys) + key_objs = [getattr(Key, k, k) for k in keys] + for k in key_objs: + self.keyboard.press(k) + for k in reversed(key_objs): + self.keyboard.release(k) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} @@ -630,14 +649,14 @@ class MacOSAutomationHandler(BaseAutomationHandler): # Scrolling Actions async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]: try: - pyautogui.scroll(-clicks) + self.mouse.scroll(0, -clicks) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]: try: - pyautogui.scroll(clicks) + self.mouse.scroll(0, clicks) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} @@ -668,8 +687,8 @@ class MacOSAutomationHandler(BaseAutomationHandler): async def get_cursor_position(self) -> Dict[str, Any]: try: - pos = pyautogui.position() - return {"success": True, "position": {"x": pos.x, "y": pos.y}} + x, y = self.mouse.position + return {"success": True, "position": {"x": x, "y": y}} except Exception as e: return {"success": False, "error": str(e)} diff --git a/libs/computer-server/pyproject.toml b/libs/computer-server/pyproject.toml index b5480f0f..cbf9821a 100644 --- a/libs/computer-server/pyproject.toml +++ b/libs/computer-server/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "uvicorn[standard]>=0.27.0", "pydantic>=2.0.0", "pyautogui>=0.9.54", + "pynput>=1.8.1", "pillow>=10.2.0", "aiohttp>=3.9.1" ] From dd717764e3be295c01599f7fe5ff7aaa23a65ec1 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 14:26:42 -0400 Subject: [PATCH 17/23] Revert presskey and hotkey to pyautogui --- .../computer-server/computer_server/handlers/macos.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py index 8afc5cc1..713ac371 100644 --- a/libs/computer-server/computer_server/handlers/macos.py +++ b/libs/computer-server/computer_server/handlers/macos.py @@ -627,21 +627,14 @@ class MacOSAutomationHandler(BaseAutomationHandler): async def press_key(self, key: str) -> Dict[str, Any]: try: - # Try to map string to Key else use as char - k = getattr(Key, key, key) - self.keyboard.press(k) - self.keyboard.release(k) + pyautogui.press(key) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} async def hotkey(self, keys: List[str]) -> Dict[str, Any]: try: - key_objs = [getattr(Key, k, k) for k in keys] - for k in key_objs: - self.keyboard.press(k) - for k in reversed(key_objs): - self.keyboard.release(k) + pyautogui.hotkey(*keys) return {"success": True} except Exception as e: return {"success": False, "error": str(e)} From 8f5f72ab213ef48cde32ea7461d923072a2521f1 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 14:47:46 -0400 Subject: [PATCH 18/23] Replace example --- examples/eval_examples.py | 724 ----------------------- examples/sandboxed_functions_examples.py | 54 ++ 2 files changed, 54 insertions(+), 724 deletions(-) delete mode 100644 examples/eval_examples.py create mode 100644 examples/sandboxed_functions_examples.py diff --git a/examples/eval_examples.py b/examples/eval_examples.py deleted file mode 100644 index b3d163ca..00000000 --- a/examples/eval_examples.py +++ /dev/null @@ -1,724 +0,0 @@ -import os -import asyncio -import json -import random -from pathlib import Path -import sys -import traceback -import time -from functools import wraps -import urllib.request -import datetime -from urllib.parse import quote - -# Global variable to track all results -all_results = [] - -# Wikirace prompt template -WIKIRACE_PROMPT_TEMPLATE = """ -You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" -by clicking only on Wikipedia links within articles. - -Rules: -1. Only click on links within Wikipedia articles (blue text) -2. No using search, back button, or typing URLs -3. You MAY use cmd+f (or ctrl+f) to find text on the current page -4. Do NOT click any search icon or type into any search box unless it's a browser command -5. Try to find the shortest path possible -6. Current target: {target_page} -7. Do not maximize the window or use any other application -8. Avoid wasting actions by scrolling -9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps -10. Stay on the English Wikipedia - -Look at the current page and click on a link that might lead you closer to {target_page}. -""" - -# Store original print function -_print = print - -# Define log file path -project_root = Path(__file__).parent.parent -log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt" -results_file = project_root / "examples" / "evals" / "eval_appuse_results.md" - -# Custom print function that also logs to file -def print(*args, **kwargs): - # Call the original print function - _print(*args, **kwargs) - - # Format the output as a string - output = " ".join(str(arg) for arg in args) - if kwargs.get("end") is not None: - output += kwargs["end"] - else: - output += "\n" - - # Add timestamp - timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - log_entry = f"[{timestamp}] {output}" - - # Append to log file - with open(log_file, "a") as f: - f.write(log_entry) - -# Load environment variables from .env file -env_file = project_root / ".env" -print(f"Loading environment from: {env_file}") -from dotenv import load_dotenv - -load_dotenv(env_file) - -# Add paths to sys.path if needed -pythonpath = os.environ.get("PYTHONPATH", "") -for path in pythonpath.split(":"): - if path and path not in sys.path: - sys.path.insert(0, path) # Insert at beginning to prioritize - print(f"Added to sys.path: {path}") - -from computer.computer import Computer -from computer.providers.base import VMProviderType -from computer.logger import LogLevel -from computer.helpers import sandboxed - -# Assuming these exist based on your request -from agent import ComputerAgent, LLM, AgentLoop, LLMProvider - -articles = [] - -# Load from file -articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt" -with open(articles_file, "r") as f: - articles = [line.strip() for line in f] - - -def get_article_links(article_title): - """Get all links from a Wikipedia article's content""" - try: - # Get the article content - url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json" - - with urllib.request.urlopen(url) as response: - data = json.loads(response.read().decode()) - - pages = data.get('query', {}).get('pages', {}) - if not pages: - return [] - - # Get the first (and only) page - page = next(iter(pages.values())) - links = page.get('links', []) - - # Filter links to keep only main namespace articles (no special pages, files, etc.) - article_links = [] - for link in links: - title = link.get('title', '') - # Skip if title contains colons (indicates special pages, files, categories, etc.) - if ':' not in title and title.isascii() and len(title) < 50: - article_links.append(title) - - return article_links - - except Exception as e: - print(f"Error fetching links for {article_title}: {e}") - return [] - -def wikipedia_random_walk(start_article, depth=5): - """ - Perform a random walk through Wikipedia articles - - Args: - start_article (str): The article title to start from - depth (int): How many steps to take in the random walk - - Returns: - list: Path of article titles visited during the walk - """ - path = [start_article] - current_article = start_article - - for step in range(depth): - print(f"Step {step + 1}: Currently at '{current_article}'") - - # Get links from current article - links = get_article_links(current_article) - - if not links: - print(f"No valid links found in '{current_article}'. Ending walk.") - break - - # Randomly select next article - next_article = random.choice(links) - path.append(next_article) - current_article = next_article - - print(f" -> Moving to '{next_article}'") - - return path - -def get_article_pair(depth=5): - global articles - start_article = random.choice(articles) - target_article = wikipedia_random_walk(start_article, depth)[-1] - while target_article == start_article: - start_article = random.choice(articles) - target_article = wikipedia_random_walk(start_article, depth)[-1] - return start_article, target_article - - -def save_results_to_markdown(): - """Save all results to a markdown table""" - global all_results - - if not all_results: - print("No results to save") - return - - # Create header for the markdown table - header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |" - separator = "|---|---|---|---|---|---|---|---|---|---|" - - # Create rows for each result - rows = [] - for result in all_results: - timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - row = f"| {timestamp} | {result['scenario']} | {result['app_use']} | {result['browser']} | {result['config']} | {result['start']} | {result['target']} | {result['steps']} | {result['success']} | {result['duration']:.2f} |" - rows.append(row) - - # Combine header, separator, and rows - table = "\n".join([header, separator] + rows) - - # Write to file (append mode) - with open(results_file, "a") as f: - f.write(f"\n\n## Results Update - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") - f.write(table) - - print(f"Results saved to {results_file}") - -async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30): - """Run a specific evaluation scenario""" - - print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===") - - # Create computer instance with or without app-use experiment - experiments = ["app-use"] if use_app_use else [] - computer = Computer(experiments=experiments) - - try: - # Run the computer - await computer.run() - - # Install required packages - await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) - - # Run the specific scenario - if scenario_name == "messy_desktop": - await run_messy_desktop_scenario(computer, agent_configs, max_steps) - elif scenario_name == "parallel_agents": - await run_parallel_agents_scenario(computer, agent_configs, max_steps) - else: - print(f"Unknown scenario: {scenario_name}") - - except Exception as e: - print(f"Error in scenario {scenario_name}: {e}") - traceback.print_exc() - finally: - # Important to clean up resources - # await computer.stop() - pass - - -@sandboxed("eval_env") -def close_all_windows(): - """Close all open windows""" - import pywinctl - windows = pywinctl.getAllWindows() - for window in windows: - try: - window.close() - except: - # Some windows might not be closeable or may have already closed - pass - - -@sandboxed("eval_env") -def get_current_wiki_page(app_name=None): - """Get the title of the current Wikipedia page - - Args: - app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox') - """ - import pywinctl - windows = pywinctl.getAllWindows() - - # Filter windows by app name if provided - if app_name: - windows = [w for w in windows if w.getAppName() and app_name.lower() in w.getAppName().lower()] - - # Get titles from filtered windows - titles = [w.title for w in windows if w.title] - wiki_titles = [title for title in titles if "Wikipedia" in title] - - if wiki_titles: - return wiki_titles[0].split(" - Wikipedia")[0] - return None - - -@sandboxed("eval_env") -def get_open_app_names(): - """Get names of all open applications""" - import pywinctl - windows = pywinctl.getAllWindows() - return [window.getAppName() for window in windows if window.getAppName()] - -def _computer(): - """Get the default computer instance""" - from computer.helpers import _default_computer - return _default_computer - -async def open_app(app_name): - """Open a specific application""" - await _computer().interface.run_command(f"open -a '{app_name}'") - await asyncio.sleep(2) # Wait for app to open - - -async def open_wiki(page, app_name="Safari"): - """Open a specific Wikipedia page""" - await _computer().interface.run_command(f"open -a {app_name} https://en.wikipedia.org/wiki/{page.replace(' ', '_')}") - await asyncio.sleep(2) # Wait for page to load - - -async def run_messy_desktop_scenario(computer, agent_configs, max_steps): - global all_results - """Run the messy desktop scenario with a single agent""" - # Get popular wiki articles - global articles - start_page, target_page = get_article_pair(depth=1) - - print(f"Wiki race: {start_page} → {target_page}") - - # Close all windows first - await close_all_windows() - - # Open starting Wikipedia page - await open_wiki(start_page) - - # Open 3 random apps to create a messy desktop - apps_to_open = ["Notes", "Terminal", "System Settings"] - for app in apps_to_open: - await open_app(app) - - # Verify apps are open - open_apps = await get_open_app_names() - print(f"Open applications: {open_apps}") - - # Create the agent's computer interface - # If app-use is enabled, create a desktop limited to Safari/Firefox - if "app-use" in (computer.experiments or []): - browser_desktop = computer.create_desktop_from_apps(["Safari"]) - agent_computer = browser_desktop - else: - agent_computer = computer - - # Run each agent configuration - for config_name, loop_provider, model_provider in agent_configs: - print(f"\n--- Testing Agent: {config_name} ---") - - # Create agent with the specified configuration - agent = ComputerAgent( - computer=agent_computer, - loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, - trajectory_dir="examples/evals/trajectories/eval_appuse" - ) - - # Run the wikirace - steps = 0 - success = False - start_time = time.time() - - # Use the template with formatting for this scenario - prompt = WIKIRACE_PROMPT_TEMPLATE.format( - browser="Safari", - start_page=start_page, - target_page=target_page - ) - - try: - while steps < max_steps and not success: - async for result in agent.run(prompt): - steps += 1 - print(f"Step {steps}") - - def process_result(): - if result.get("content"): - print(f"Agent: {result.get('content', '')}") - - else: - outputs = result.get("output", []) - for output in outputs: - if output.get("type") == "message": - content = output.get("content", []) - for content_part in content: - if content_part.get("text"): - print(f"Agent: {content_part.get('text', '')}") - - elif output.get("type") == "reasoning": - # if it's openAI, we only have access to a summary of the reasoning - summary_content = output.get("summary", []) - if summary_content: - for summary_part in summary_content: - if summary_part.get("type") == "summary_text": - print(f"Agent: {summary_part.get('text', '')}") - - else: - summary_content = output.get("text", "") - if summary_content: - print(f"Agent: {summary_content}") - - elif output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - if action_type: - action_title = f"🛠️ Performing {action_type}" - if action.get("x") and action.get("y"): - action_title += f" at ({action['x']}, {action['y']})" - print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```") - - - # Process and print the result - process_result() - - # Check current page - current_page = await get_current_wiki_page("Safari") - print(f"Current page: {current_page}") - print(f"Target: {target_page}") - - # Check if we reached the target - if current_page and target_page.lower() in current_page.lower(): - success = True - print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!") - await agent._loop.cancel() - break - - # Safety check - if steps >= max_steps: - print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") - await agent._loop.cancel() - break - except asyncio.CancelledError: - print("Agent stopped") - - end_time = time.time() - duration = end_time - start_time - await asyncio.sleep(2) # Wait for agent to finish - - # Results - print(f"\n=== WIKIRACE RESULTS: {config_name} ===") - print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") - print(f"Start: {start_page}") - print(f"Target: {target_page}") - print(f"Steps taken: {steps}") - print(f"Success: {success}") - print(f"Duration: {duration:.2f} seconds") - - -async def run_parallel_agents_scenario(computer, agent_configs, max_steps): - global all_results - - """Run two agents in parallel, one using Safari and one using Firefox""" - # Get popular wiki articles - global articles - safari_start, safari_target = get_article_pair(depth=1) - firefox_start, firefox_target = get_article_pair(depth=1) - - print(f"Safari Wiki race: {safari_start} → {safari_target}") - print(f"Firefox Wiki race: {firefox_start} → {firefox_target}") - - # Close all windows first - await close_all_windows() - - # Open Safari with starting page - await open_wiki(safari_start, "Safari") - await asyncio.sleep(2) - - # Open Firefox with starting page - await open_wiki(firefox_start, "Firefox") - await asyncio.sleep(2) - - # Create agent configurations - for config_name, loop_provider, model_provider in agent_configs: - print(f"\n--- Testing Parallel Agents: {config_name} ---") - - # Create the agent interfaces - if "app-use" in (computer.experiments or []): - safari_desktop = computer.create_desktop_from_apps(["Safari"]) - firefox_desktop = computer.create_desktop_from_apps(["Firefox"]) - else: - safari_desktop = computer - firefox_desktop = computer - - # Save screenshots - screenshot_dir = project_root / "examples" / "evals" / "screenshots" - screenshot_dir.mkdir(exist_ok=True) - safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png" - firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png" - screenshot_bytes = await safari_desktop.interface.screenshot() - with open(safari_screenshot_path, "wb") as f: - f.write(screenshot_bytes) - screenshot_bytes = await firefox_desktop.interface.screenshot() - with open(firefox_screenshot_path, "wb") as f: - f.write(screenshot_bytes) - - # Create agents - safari_agent = ComputerAgent( - computer=safari_desktop, - loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, - trajectory_dir="examples/evals/trajectories/eval_parallel_safari" - ) - - firefox_agent = ComputerAgent( - computer=firefox_desktop, - loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, - trajectory_dir="examples/evals/trajectories/eval_parallel_firefox" - ) - - # Create prompts using the template - safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format( - browser="Safari", - start_page=safari_start, - target_page=safari_target - ) - - firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format( - browser="Firefox", - start_page=firefox_start, - target_page=firefox_target - ) - - # Track results - safari_results = { - "steps": 0, - "success": False, - "start_time": time.time(), - "end_time": None - } - - firefox_results = { - "steps": 0, - "success": False, - "start_time": time.time(), - "end_time": None - } - - # Function to run a single agent - async def run_agent(agent, prompt, browser, start_page, target_page, results): - try: - while results["steps"] < max_steps and not results["success"]: - async for result in agent.run(prompt): - results["steps"] += 1 - print(f"{browser} Step {results['steps']}") - - def process_result(): - if result.get("content"): - print(f"{browser} Agent: {result.get('content', '')}") - - else: - outputs = result.get("output", []) - for output in outputs: - if output.get("type") == "message": - content = output.get("content", []) - for content_part in content: - if content_part.get("text"): - print(f"{browser} Agent: {content_part.get('text', '')}") - - elif output.get("type") == "reasoning": - # if it's openAI, we only have access to a summary of the reasoning - summary_content = output.get("summary", []) - if summary_content: - for summary_part in summary_content: - if summary_part.get("type") == "summary_text": - print(f"{browser} Agent: {summary_part.get('text', '')}") - - else: - summary_content = output.get("text", "") - if summary_content: - print(f"{browser} Agent: {summary_content}") - - elif output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - if action_type: - action_title = f"🛠️ Performing {action_type}" - if action.get("x") and action.get("y"): - action_title += f" at ({action['x']}, {action['y']})" - print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```") - - - # Process and print the result - process_result() - - # Check current page - current_page = await get_current_wiki_page(browser) - print(f"{browser} current page: {current_page}") - print(f"{browser} target: {target_page}") - - # Add result to global tracking - global all_results - current_result = { - 'scenario': 'parallel_agents', - 'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No', - 'browser': browser, - 'config': config_name, - 'start': start_page, - 'target': target_page, - 'steps': results['steps'], - 'success': results['success'], - 'duration': time.time() - results['start_time'] - } - all_results.append(current_result) - - # Save results after each step - save_results_to_markdown() - - # Check if we reached the target - if current_page and target_page.lower() in current_page.lower(): - results["success"] = True - print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!") - await agent._loop.cancel() - break - - # Check if we reached the maximum steps - if results["steps"] >= max_steps: - print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})") - await agent._loop.cancel() - break - except asyncio.CancelledError: - print(f"{browser} agent stopped") - finally: - results["end_time"] = time.time() - - # Run both agents in parallel - await asyncio.gather( - run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results), - run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results) - ) - - # Wait for agents to finish - await asyncio.sleep(2) - - # Print results - print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===") - print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") - - print(f"\nSafari Results:") - print(f"Start: {safari_start}") - print(f"Target: {safari_target}") - print(f"Steps taken: {safari_results['steps']}") - print(f"Success: {safari_results['success']}") - print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds") - - print(f"\nFirefox Results:") - print(f"Start: {firefox_start}") - print(f"Target: {firefox_target}") - print(f"Steps taken: {firefox_results['steps']}") - print(f"Success: {firefox_results['success']}") - print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds") - - -async def main(): - try: - - # Define agent configurations to test - agent_configs = [ - # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), - # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), - ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) - ] - - # # Run the test scenario without any agents - # print("Running test scenario for sandboxed functions") - # await run_test_scenario() - - # Set maximum steps for each agent run - max_steps = 15 - runs = 5 - - # run all scenarios - for _ in range(runs): - # Scenario 1: Messy desktop without App-Use - await run_scenario("messy_desktop", False, agent_configs, max_steps) - - # Scenario 1: Messy desktop with App-Use - await run_scenario("messy_desktop", True, agent_configs, max_steps) - - # Scenario 2: Parallel agents without App-Use - await run_scenario("parallel_agents", False, agent_configs, max_steps) - - # Scenario 2: Parallel agents with App-Use - await run_scenario("parallel_agents", True, agent_configs, max_steps) - - except Exception as e: - print(f"Error in main: {e}") - traceback.print_exc() - - -async def run_test_scenario(max_iterations=5): - """Test sandboxed functions by opening the same pages in Safari and Firefox and checking if they match - - This function opens the same Wikipedia pages in both browsers and verifies that - the get_current_wiki_page function returns the same result for both browsers. - It does this for the specified number of iterations. - """ - - # Create computer instance - computer = Computer() - await computer.run() - - # Get popular wiki articles - global articles - selected_articles = random.sample(articles, max_iterations) - - print(f"\n--- Running Test Scenario for {max_iterations} iterations ---") - - # Close all windows first - await close_all_windows() - - # Open both browsers - await open_app("Safari") - await open_app("Firefox") - - # Verify browsers are open - open_apps = await get_open_app_names() - print(f"Open applications: {open_apps}") - - # Run test iterations - for i, article in enumerate(selected_articles): - print(f"\nIteration {i+1}/{max_iterations}: Testing with article '{article}'") - - # Open the same Wikipedia page in both browsers - await open_wiki(article, "Safari") - await open_wiki(article, "Firefox") - await asyncio.sleep(3) # Give a bit more time for both pages to load - - # Check if both browsers show the same page - safari_page = await get_current_wiki_page("Safari") - firefox_page = await get_current_wiki_page("Firefox") - - print(f"Safari page: {safari_page}") - print(f"Firefox page: {firefox_page}") - - if safari_page == firefox_page: - print(f"✅ MATCH: Both browsers show '{safari_page}'") - else: - print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'") - - await asyncio.sleep(1) # Brief pause between iterations - - print("\n--- Test Scenario Completed ---") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/sandboxed_functions_examples.py b/examples/sandboxed_functions_examples.py new file mode 100644 index 00000000..caa733b9 --- /dev/null +++ b/examples/sandboxed_functions_examples.py @@ -0,0 +1,54 @@ +from pathlib import Path +import os +import sys + +# Load environment variables from .env file +project_root = Path(__file__).parent.parent +env_file = project_root / ".env" +print(f"Loading environment from: {env_file}") +from dotenv import load_dotenv + +load_dotenv(env_file) + +# Add paths to sys.path if needed +pythonpath = os.environ.get("PYTHONPATH", "") +for path in pythonpath.split(":"): + if path and path not in sys.path: + sys.path.insert(0, path) # Insert at beginning to prioritize + print(f"Added to sys.path: {path}") + +import asyncio +from computer.computer import Computer +from computer.helpers import sandboxed + +async def main(): + # Initialize the computer in a C/ua Container + computer = Computer() + await computer.run() + + # Install a package in a virtual environment in the container + await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) + + # Open Safari + await computer.interface.run_command("open -a Safari") + await asyncio.sleep(2) + + # Define a sandboxed function + # This function will run inside the C/ua Container + @sandboxed("demo_venv") + def greet_and_print(name): + # get .html of the current Safari tab + import PyXA + safari = PyXA.Application("Safari") + current_doc = safari.current_document + html = current_doc.source() + print(f"Hello from inside the container, {name}!") + print("Safari HTML length:", len(html)) + return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]} + + # Call with args and kwargs + result = await greet_and_print("C/ua") + print("Result from sandboxed function:", result) + +if __name__ == "__main__": + asyncio.run(main()) From a4250f57fa2bf93b6fe8dc555ac473d0dbfeafaf Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 14:53:54 -0400 Subject: [PATCH 19/23] Added to README.md --- README.md | 32 ++++++++++++++++++++++++++++++-- tests/venv.py | 2 +- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5f67d69f..29041fba 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,7 @@ pip install "cua-computer[all]" "cua-agent[all]" ### Step 4: Use in Your Code ```python +from computer.helpers import sandboxed from computer import Computer from agent import ComputerAgent, LLM @@ -163,9 +164,31 @@ async def main(): loop="uitars", model=LLM(provider="mlxvlm", name="mlx-community/UI-TARS-1.5-7B-6bit") ) - await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide") + async for result in agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide"): + print(result) -main() + # Example: Use sandboxed functions to execute code in a C/ua Container + # 1. Install a package in a Python virtual environment + await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) + + # 2. Define a sandboxed function + @sandboxed("demo_venv") + def greet_and_print(name): + # get .html of the current Safari tab + import PyXA + safari = PyXA.Application("Safari") + current_doc = safari.current_document + html = current_doc.source() + print(f"Hello from inside the container, {name}!") + print("Safari HTML length:", len(html)) + return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]} + + # 3. Run the function in the container in the agent's environment + result = await greet_and_print("C/ua") + print("Result from sandboxed function:", result) + +if __name__ == "__main__": + asyncio.run(main()) ``` For ready-to-use examples, check out our [Notebooks](./notebooks/) collection. @@ -273,6 +296,11 @@ await computer.interface.run_command(cmd) # Run shell command # Accessibility await computer.interface.get_accessibility_tree() # Get accessibility tree + +# Python Virtual Environment Operations +await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment +await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests.get(`https://httpbin.org/ip`).json())'") # Run a shell command in a virtual environment +await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception ``` ## ComputerAgent Reference diff --git a/tests/venv.py b/tests/venv.py index 8b78a78f..8463fa4d 100644 --- a/tests/venv.py +++ b/tests/venv.py @@ -31,7 +31,7 @@ for path in pythonpath.split(":"): from computer.computer import Computer from computer.providers.base import VMProviderType -from computer.helpers import remote, set_default_computer +from computer.helpers import sandboxed, set_default_computer @pytest.fixture(scope="session") From 3e6cb3465e3a7c246babd939b20a624960e57efc Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 15:20:17 -0400 Subject: [PATCH 20/23] Moved @sandboxed example --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 29041fba..f3965a4a 100644 --- a/README.md +++ b/README.md @@ -167,26 +167,6 @@ async def main(): async for result in agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide"): print(result) - # Example: Use sandboxed functions to execute code in a C/ua Container - # 1. Install a package in a Python virtual environment - await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) - - # 2. Define a sandboxed function - @sandboxed("demo_venv") - def greet_and_print(name): - # get .html of the current Safari tab - import PyXA - safari = PyXA.Application("Safari") - current_doc = safari.current_document - html = current_doc.source() - print(f"Hello from inside the container, {name}!") - print("Safari HTML length:", len(html)) - return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]} - - # 3. Run the function in the container in the agent's environment - result = await greet_and_print("C/ua") - print("Result from sandboxed function:", result) - if __name__ == "__main__": asyncio.run(main()) ``` @@ -301,6 +281,25 @@ await computer.interface.get_accessibility_tree() # Get accessibility tree await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests.get(`https://httpbin.org/ip`).json())'") # Run a shell command in a virtual environment await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception + +# Example: Use sandboxed functions to execute code in a C/ua Container +# 1. Install a package in a Python virtual environment +await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) + +# 2. Define a sandboxed function +@sandboxed("demo_venv") +def greet_and_print(name, html_snippet_length=200): + # get .html of the current Safari tab + import PyXA + safari = PyXA.Application("Safari") + html = safari.current_document.source() + print(f"Hello from inside the container, {name}!") + print("Safari HTML length:", len(html)) + return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:html_snippet_length]} + +# 3. Run the function in the container in the agent's environment +result = await greet_and_print("C/ua", html_snippet_length=100) +print("Result from sandboxed function:", result) ``` ## ComputerAgent Reference From 3ab5b65a257d62d0c6bb2cc5eb59f1584f95c680 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 15:50:02 -0400 Subject: [PATCH 21/23] Fixes for hotkey and claude scrolling --- .../agent/providers/anthropic/tools/computer.py | 10 ++-------- libs/computer/computer/diorama_computer.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/libs/agent/agent/providers/anthropic/tools/computer.py b/libs/agent/agent/providers/anthropic/tools/computer.py index ecf232bd..2bb944ea 100644 --- a/libs/agent/agent/providers/anthropic/tools/computer.py +++ b/libs/agent/agent/providers/anthropic/tools/computer.py @@ -478,17 +478,11 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): if direction == "down": # Scroll down (Page Down on macOS) self.logger.info(f"Scrolling down, amount: {amount}") - # Use fn+down for page down on macOS - for _ in range(amount): - await self.computer.interface.hotkey("fn", "down") - await asyncio.sleep(0.1) + await self.computer.interface.scroll_down(amount) else: # Scroll up (Page Up on macOS) self.logger.info(f"Scrolling up, amount: {amount}") - # Use fn+up for page up on macOS - for _ in range(amount): - await self.computer.interface.hotkey("fn", "up") - await asyncio.sleep(0.1) + await self.computer.interface.scroll_up(amount) # Wait briefly for UI changes await asyncio.sleep(0.5) diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py index 5cad0006..dfb541b9 100644 --- a/libs/computer/computer/diorama_computer.py +++ b/libs/computer/computer/diorama_computer.py @@ -87,7 +87,17 @@ class DioramaComputerInterface: await self._send_cmd("press_key", {"key": key}) async def hotkey(self, *keys): - await self._send_cmd("hotkey", {"keys": list(keys)}) + actual_keys = [] + for key in keys: + if isinstance(key, Key): + actual_keys.append(key.value) + elif isinstance(key, str): + # Try to convert to enum if it matches a known key + key_or_enum = Key.from_string(key) + actual_keys.append(key_or_enum.value if isinstance(key_or_enum, Key) else key_or_enum) + else: + raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.") + await self._send_cmd("hotkey", {"keys": actual_keys}) async def to_screen_coordinates(self, x, y): return await self._send_cmd("to_screen_coordinates", {"x": x, "y": y}) From 8599da0d43f3f5ea534e82972e05eefa74c5f0c3 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 15:53:16 -0400 Subject: [PATCH 22/23] Add missing imports --- libs/computer/computer/diorama_computer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py index dfb541b9..2eee77f0 100644 --- a/libs/computer/computer/diorama_computer.py +++ b/libs/computer/computer/diorama_computer.py @@ -1,4 +1,5 @@ import asyncio +from .interface.models import KeyType, Key class DioramaComputer: """ From e63c5fd81fce768f36f4b14b1f3faba6b0ad3969 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 4 Jun 2025 15:54:02 -0400 Subject: [PATCH 23/23] Finalized README --- README.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f3965a4a..f10bae77 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,6 @@ pip install "cua-computer[all]" "cua-agent[all]" ### Step 4: Use in Your Code ```python -from computer.helpers import sandboxed from computer import Computer from agent import ComputerAgent, LLM @@ -283,10 +282,7 @@ await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception # Example: Use sandboxed functions to execute code in a C/ua Container -# 1. Install a package in a Python virtual environment -await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) - -# 2. Define a sandboxed function +from computer.helpers import sandboxed @sandboxed("demo_venv") def greet_and_print(name, html_snippet_length=200): # get .html of the current Safari tab @@ -296,9 +292,7 @@ def greet_and_print(name, html_snippet_length=200): print(f"Hello from inside the container, {name}!") print("Safari HTML length:", len(html)) return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:html_snippet_length]} - -# 3. Run the function in the container in the agent's environment -result = await greet_and_print("C/ua", html_snippet_length=100) +result = await greet_and_print("C/ua", html_snippet_length=100) # Executes in the container print("Result from sandboxed function:", result) ```