From 312361abccdb616fe4681ce974abf561ba40c2cb Mon Sep 17 00:00:00 2001
From: Dillon DuPont <v-ddupont@microsoft.com>
Date: Tue, 3 Jun 2025 13:12:01 -0400
Subject: [PATCH 01/23] Added python funcs to computer

---
 libs/computer/computer/computer.py | 170 +++++++++++++++++++++++++++++
 tests/pytest.ini                   |   4 +
 tests/venv.py                      | 151 +++++++++++++++++++++++++
 3 files changed, 325 insertions(+)
 create mode 100644 tests/pytest.ini
 create mode 100644 tests/venv.py

diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py
index c25ad2bf..b77582ae 100644
--- a/libs/computer/computer/computer.py
+++ b/libs/computer/computer/computer.py
@@ -722,3 +722,173 @@ class Computer:
             tuple[float, float]: (x, y) coordinates in screenshot space
         """
         return await self.interface.to_screenshot_coordinates(x, y)
+
+
+    # Add virtual environment management functions to computer interface
+    async def venv_install(self, venv_name: str, requirements: list[str]) -> tuple[str, str]:
+        """Install packages in a virtual environment, creating the environment if needed.
+        
+        Args:
+            venv_name: Name of the virtual environment
+            requirements: List of package requirements to install (None or empty installs nothing)
+            
+        Returns:
+            Tuple of (stdout, stderr) from the last command run (pip, or venv setup if nothing to install)
+        """
+        import shlex
+
+        # Create virtual environment if it doesn't exist
+        venv_path = f"~/.venvs/{venv_name}"
+        create_cmd = f"mkdir -p ~/.venvs && python3 -m venv {venv_path}"
+        check_cmd = f"test -d {venv_path} || ({create_cmd})"
+        setup_result = await self.interface.run_command(check_cmd)
+        if not requirements:
+            return setup_result  # a bare `pip install` with no arguments is a usage error
+        
+        # Quote each requirement so the shell cannot treat e.g. the ">" in "pkg>=1.0" as a redirect
+        requirements_str = " ".join(shlex.quote(req) for req in requirements)
+        install_cmd = f". {venv_path}/bin/activate && pip install {requirements_str}"
+        return await self.interface.run_command(install_cmd)
+    
+    async def venv_cmd(self, venv_name: str, command: str) -> tuple[str, str]:
+        """Execute a shell command in a virtual environment.
+        
+        Args:
+            venv_name: Name of the virtual environment
+            command: Shell command to execute in the virtual environment
+            
+        Returns:
+            Tuple of (stdout, stderr) from the command, or ("", message) if the venv is missing
+        """
+        venv_path = f"~/.venvs/{venv_name}"
+        
+        # `test -d` prints nothing when it fails, so echo a sentinel if the directory is absent
+        check_cmd = f"test -d {venv_path} || echo VENV_MISSING"
+        stdout, stderr = await self.interface.run_command(check_cmd)
+        
+        if stderr or "test:" in stdout or "VENV_MISSING" in stdout:  # venv doesn't exist
+            return "", f"Virtual environment '{venv_name}' does not exist. Create it first using venv_install."
+        
+        # Activate virtual environment and run command
+        full_command = f". {venv_path}/bin/activate && {command}"
+        return await self.interface.run_command(full_command)
+    
+    async def venv_exec(self, venv_name: str, python_func, *args, **kwargs):
+        """Execute Python function in a virtual environment using source code extraction.
+        
+        Args:
+            venv_name: Name of the virtual environment
+            python_func: A callable function to execute
+            *args: Positional arguments to pass to the function
+            **kwargs: Keyword arguments to pass to the function
+            
+        Returns:
+            The function's result after a JSON round trip; a remote error is re-raised locally instead
+        """
+        import base64
+        import builtins
+        import inspect
+        import json
+        import textwrap
+        try:
+            # Get function source code using inspect.getsource
+            source = inspect.getsource(python_func)
+            # Remove common leading whitespace (dedent)
+            func_source = textwrap.dedent(source).strip()
+            
+            # Get function name for execution
+            func_name = python_func.__name__
+            
+            # Serialize args and kwargs as JSON (safer than dill for cross-version compatibility)
+            args_json = json.dumps(args, default=str)
+            kwargs_json = json.dumps(kwargs, default=str)
+            
+        except OSError as e:
+            raise Exception(f"Cannot retrieve source code for function {python_func.__name__}: {e}") from e
+        except Exception as e:
+            raise Exception(f"Failed to reconstruct function source: {e}") from e
+        
+        # Build the remote script; the JSON is embedded via repr() so quotes and backslashes survive
+        python_code = f'''
+import json
+import traceback
+
+try:
+    # Define the function from source
+{textwrap.indent(func_source, "    ")}
+    
+    # Deserialize args and kwargs from JSON
+    args_json = {args_json!r}
+    kwargs_json = {kwargs_json!r}
+    args = json.loads(args_json)
+    kwargs = json.loads(kwargs_json)
+    
+    # Execute the function
+    result = {func_name}(*args, **kwargs)
+
+    # Create success output payload
+    output_payload = {{
+        "success": True,
+        "result": result,
+        "error": None
+    }}
+    
+except Exception as e:
+    # Create error output payload
+    output_payload = {{
+        "success": False,
+        "result": None,
+        "error": {{
+            "type": type(e).__name__,
+            "message": str(e),
+            "traceback": traceback.format_exc()
+        }}
+    }}
+
+# Serialize the output payload as JSON
+output_json = json.dumps(output_payload, default=str)
+
+# Print the JSON output with markers
+print(f"<<<VENV_EXEC_START>>>{{output_json}}<<<VENV_EXEC_END>>>")
+'''
+        
+        # Encode the Python code in base64 to avoid shell escaping issues
+        encoded_code = base64.b64encode(python_code.encode('utf-8')).decode('ascii')
+        
+        # Execute the Python code in the virtual environment
+        python_command = f"python -c \"import base64; exec(base64.b64decode('{encoded_code}').decode('utf-8'))\""
+        stdout, stderr = await self.venv_cmd(venv_name, python_command)
+        
+        # Parse the output to extract the payload
+        start_marker = "<<<VENV_EXEC_START>>>"
+        end_marker = "<<<VENV_EXEC_END>>>"
+        marker_pos = stdout.find(start_marker)
+        # Print original stdout (all of it if the marker is missing, where find() gives -1)
+        print(stdout[:marker_pos] if marker_pos != -1 else stdout)
+        
+        if marker_pos != -1 and end_marker in stdout:
+            start_idx = marker_pos + len(start_marker)
+            end_idx = stdout.find(end_marker)
+            
+            if start_idx < end_idx:
+                output_json = stdout[start_idx:end_idx]
+                try:
+                    # Decode and deserialize the output payload from JSON
+                    output_payload = json.loads(output_json)
+                except Exception as e:
+                    raise Exception(f"Failed to decode output payload: {e}") from e
+                
+                if output_payload["success"]:
+                    return output_payload["result"]
+                else:
+                    # Re-raise by builtin exception name only; never eval() remote-supplied text
+                    error_info = output_payload["error"]
+                    error_class = getattr(builtins, error_info["type"], None)
+                    if not (isinstance(error_class, type) and issubclass(error_class, Exception)):
+                        raise Exception(f"{error_info['type']}: {error_info['message']}")
+                    raise error_class(error_info["message"])
+            else:
+                raise Exception("Invalid output format: markers found but no content between them")
+        else:
+            # Fallback: return stdout/stderr if no payload markers found
+            raise Exception(f"No output payload found. stdout: {stdout}, stderr: {stderr}")
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 00000000..998cbeaf
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+asyncio_mode = auto
+markers =
+    asyncio: asyncio mark
\ No newline at end of file
diff --git a/tests/venv.py b/tests/venv.py
new file mode 100644
index 00000000..4f9e3206
--- /dev/null
+++ b/tests/venv.py
@@ -0,0 +1,151 @@
+"""
+Virtual Environment Testing Module
+This module tests the ability to execute python code in a virtual environment within C/ua Containers.
+
+Required environment variables:
+- CUA_API_KEY: API key for C/ua cloud provider
+- CUA_CONTAINER_NAME: Name of the container to use
+"""
+
+import os
+import asyncio
+import pytest
+from pathlib import Path
+import sys
+import traceback
+
+# Load environment variables from .env file
+project_root = Path(__file__).parent.parent
+env_file = project_root / ".env"
+print(f"Loading environment from: {env_file}")
+from dotenv import load_dotenv
+
+load_dotenv(env_file)
+
+# Add paths to sys.path if needed
+pythonpath = os.environ.get("PYTHONPATH", "")
+for path in pythonpath.split(":"):
+    if path and path not in sys.path:
+        sys.path.insert(0, path)  # Insert at beginning to prioritize
+        print(f"Added to sys.path: {path}")
+
+from computer.computer import Computer
+from computer.providers.base import VMProviderType
+
+
+@pytest.fixture(scope="session")
+async def computer():
+    """Shared Computer instance for all test cases."""
+    # Create a remote Linux computer with C/ua
+    computer = Computer(
+        os_type="linux",
+        api_key=os.getenv("CUA_API_KEY"),
+        name=str(os.getenv("CUA_CONTAINER_NAME")),
+        provider_type=VMProviderType.CLOUD,
+    )
+    
+    try:
+        await computer.run()
+        yield computer
+    finally:
+        await computer.stop()
+
+
+# Sample test cases
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_install(computer):
+    """Test virtual environment creation and package installation."""
+    # Create a test virtual environment and install requests
+    stdout, _ = await computer.venv_install("test_env", ["requests"])
+    
+    # Check that installation was successful (no major errors)
+    assert "Successfully installed" in stdout or "Requirement already satisfied" in stdout
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_cmd(computer):
+    """Test executing shell commands in virtual environment."""
+    # Test Python version check
+    stdout, _ = await computer.venv_cmd("test_env", "python --version")
+    
+    assert "Python" in stdout
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_exec(computer):
+    """Test executing Python functions in virtual environment."""
+    def test_function(message="Hello World"):
+        import sys
+        return f"Python {sys.version_info.major}.{sys.version_info.minor}: {message}"
+    
+    result = await computer.venv_exec("test_env", test_function, message="Test successful!")
+    
+    assert "Python" in result
+    assert "Test successful!" in result
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_exec_with_package(computer):
+    """Test executing Python functions that use installed packages."""
+    def test_requests():
+        import requests
+        return f"requests version: {requests.__version__}"
+    
+    result = await computer.venv_exec("test_env", test_requests)
+    
+    assert "requests version:" in result
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_exec_error_handling(computer):
+    """Test error handling in venv_exec."""
+    def test_error():
+        raise ValueError("This is a test error")
+    
+    with pytest.raises(ValueError, match="This is a test error"):
+        await computer.venv_exec("test_env", test_error)
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_exec_with_args_kwargs(computer):
+    """Test executing Python functions with args and kwargs that return an object."""
+    def create_data_object(name, age, *hobbies, **metadata):
+        return {
+            "name": name,
+            "age": age,
+            "hobbies": list(hobbies),
+            "metadata": metadata,
+            "status": "active"
+        }
+    
+    args = ["Alice", 25, "reading", "coding"]
+    kwargs = {"location": "New York", "department": "Engineering"}
+
+    result = await computer.venv_exec(
+        "test_env", 
+        create_data_object, 
+        *args, 
+        **kwargs
+    )
+    
+    assert result["name"] == "Alice"
+    assert result["age"] == 25
+    assert result["hobbies"] == ["reading", "coding"]
+    assert result["metadata"]["location"] == "New York"
+    assert result["status"] == "active"
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_venv_exec_stdout_capture(computer, capfd):
+    """Test capturing stdout from Python functions executed in virtual environment."""
+    def hello_world_function():
+        print("Hello World!")
+        return "Function completed"
+    
+    # Execute the function in the virtual environment
+    result = await computer.venv_exec("test_env", hello_world_function)
+    
+    # Capture stdout and stderr
+    out, _ = capfd.readouterr()
+    
+    # Assert the stdout contains our expected output
+    assert out == "Hello World!\n\n"
+    assert result == "Function completed"
+
+if __name__ == "__main__":
+    # Run tests directly
+    pytest.main([__file__, "-v"])

From 420b67d2a85a1069944cef5fb40b0eff248f3a6c Mon Sep 17 00:00:00 2001
From: Dillon DuPont <v-ddupont@microsoft.com>
Date: Tue, 3 Jun 2025 13:26:11 -0400
Subject: [PATCH 02/23] wiki-race evaluator

---
 examples/eval_examples.py | 149 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 examples/eval_examples.py

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
new file mode 100644
index 00000000..c3504250
--- /dev/null
+++ b/examples/eval_examples.py
@@ -0,0 +1,149 @@
+import os
+import asyncio
+from pathlib import Path
+import sys
+import traceback
+import time
+
+# Load environment variables from .env file
+project_root = Path(__file__).parent.parent
+env_file = project_root / ".env"
+print(f"Loading environment from: {env_file}")
+from dotenv import load_dotenv
+
+load_dotenv(env_file)
+
+# Add paths to sys.path if needed
+pythonpath = os.environ.get("PYTHONPATH", "")
+for path in pythonpath.split(":"):
+    if path and path not in sys.path:
+        sys.path.insert(0, path)  # Insert at beginning to prioritize
+        print(f"Added to sys.path: {path}")
+
+from computer.computer import Computer
+from computer.providers.base import VMProviderType
+from computer.logger import LogLevel
+
+# Assuming these exist based on your request
+from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
+
+async def main():
+    try:
+        print("\n=== Using cloud container ===")
+        # Create a remote Linux computer with CUA
+        computer = Computer(
+            os_type="linux",
+            api_key=os.getenv("CUA_API_KEY"),
+            name=str(os.getenv("CUA_CONTAINER_NAME")),
+            provider_type=VMProviderType.CLOUD,
+        )
+        
+        try:
+            # Run the computer with default parameters
+            await computer.run()
+            
+            # Install required packages
+            await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
+
+            # Helper functions for wikirace
+            async def open_wiki(page):
+                await computer.interface.run_command(f"firefox https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
+                await asyncio.sleep(2)  # Wait for page to load
+
+            # Remote functions for wikirace
+            def get_open_wikis():
+                import pywinctl
+                titles = pywinctl.getAllTitles()
+                wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title]
+                return wiki_titles
+
+            def get_current_wiki_page():
+                import pywinctl
+                titles = pywinctl.getAllTitles()
+                wiki_titles = [title for title in titles if "Wikipedia" in title and "Mozilla Firefox" in title]
+                if wiki_titles:
+                    return wiki_titles[0].split(" - Wikipedia")[0]
+                return None
+
+            # Wikirace setup
+            start_page = "Albert Einstein"
+            target_page = "Pizza"
+            max_steps = 10
+            
+            print(f"\nStarting Wikirace: {start_page} → {target_page}")
+            
+            # Open starting page
+            await open_wiki(start_page)
+            
+            # Create agent
+            agent = ComputerAgent(
+                computer=computer,
+                loop=AgentLoop.OPENAI,
+                model=LLM(LLMProvider.OPENAI)
+            )
+            
+            # Run the wikirace
+            steps = 0
+            success = False
+            start_time = time.time()
+            
+            prompt = f"""
+            You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" 
+            by clicking only on Wikipedia links within articles.
+            
+            Rules:
+            1. Only click on links within Wikipedia articles (blue underlined text)
+            2. No using search, back button, or typing URLs
+            3. Try to find the shortest path possible
+            4. Current target: {target_page}
+            
+            Look at the current page and click on a link that might lead you closer to {target_page}.
+            """
+            
+            async for step_result in agent.run(prompt):
+                steps += 1
+                print(f"Step {steps}: {step_result}")
+                
+                # Check current page
+                current_page = await computer.venv_exec("eval_env", get_current_wiki_page)
+                print(f"Current page: {current_page}")
+                
+                # Check if we reached the target
+                if current_page and target_page.lower() in current_page.lower():
+                    success = True
+                    print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
+                    break
+                
+                # Safety check
+                if steps >= max_steps:
+                    print(f"❌ Failed: Reached maximum steps ({max_steps})")
+                    break
+                
+                await asyncio.sleep(1)  # Brief pause between steps
+            
+            end_time = time.time()
+            duration = end_time - start_time
+            
+            # Results
+            print(f"\n=== WIKIRACE RESULTS ===")
+            print(f"Start: {start_page}")
+            print(f"Target: {target_page}")
+            print(f"Steps taken: {steps}")
+            print(f"Success: {success}")
+            print(f"Duration: {duration:.2f} seconds")
+            
+            # Get final page list
+            final_wikis = await computer.venv_exec("eval_env", get_open_wikis)
+            print(f"Open Wikipedia pages: {final_wikis}")
+
+        finally:
+            # Important to clean up resources
+            await computer.stop()
+            
+    except Exception as e:
+        print(f"Error in main: {e}")
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file

From a6397e9a9bd48ba42e517c258b068377c72c6076 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 14:10:48 -0400
Subject: [PATCH 03/23] Added decorator example

---
 examples/eval_examples.py          | 85 ++++++++++++++++++++----------
 libs/computer/computer/computer.py |  4 ++
 2 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index c3504250..1315aba4 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import sys
 import traceback
 import time
+from functools import wraps
 
 # Load environment variables from .env file
 project_root = Path(__file__).parent.parent
@@ -27,16 +28,41 @@ from computer.logger import LogLevel
 # Assuming these exist based on your request
 from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
 
+# Global reference to computer instance (will be set in main)
+_computer = None
+
+def remote(venv_name="eval_env"):
+    """
+    Decorator that wraps a function to be executed remotely via computer.venv_exec
+    
+    Args:
+        venv_name: Name of the virtual environment to execute in
+    """
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            if _computer is None:
+                raise RuntimeError("Computer instance not initialized. Call this after computer.run()")
+            return await _computer.venv_exec(venv_name, func, *args, **kwargs)
+        return wrapper
+    return decorator
+
 async def main():
+    global _computer, remote
+    
     try:
         print("\n=== Using cloud container ===")
-        # Create a remote Linux computer with CUA
-        computer = Computer(
-            os_type="linux",
-            api_key=os.getenv("CUA_API_KEY"),
-            name=str(os.getenv("CUA_CONTAINER_NAME")),
-            provider_type=VMProviderType.CLOUD,
-        )
+        # # Create a remote Linux computer with CUA
+        # computer = Computer(
+        #     os_type="linux",
+        #     api_key=os.getenv("CUA_API_KEY"),
+        #     name=str(os.getenv("CUA_CONTAINER_NAME")),
+        #     provider_type=VMProviderType.CLOUD,
+        # )
+        
+        # Connect to local macOS computer
+        computer = Computer()
+        _computer = computer  # Set global reference
         
         try:
             # Run the computer with default parameters
@@ -47,34 +73,41 @@ async def main():
 
             # Helper functions for wikirace
             async def open_wiki(page):
-                await computer.interface.run_command(f"firefox https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
+                await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
                 await asyncio.sleep(2)  # Wait for page to load
 
-            # Remote functions for wikirace
+            # Remote functions for wikirace - using @remote decorator
+            @remote("eval_env")
             def get_open_wikis():
                 import pywinctl
                 titles = pywinctl.getAllTitles()
                 wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title]
                 return wiki_titles
 
+            @remote("eval_env")
             def get_current_wiki_page():
                 import pywinctl
                 titles = pywinctl.getAllTitles()
-                wiki_titles = [title for title in titles if "Wikipedia" in title and "Mozilla Firefox" in title]
+                wiki_titles = [title for title in titles if "Wikipedia" in title]
                 if wiki_titles:
                     return wiki_titles[0].split(" - Wikipedia")[0]
                 return None
 
             # Wikirace setup
+            max_steps = 15
             start_page = "Albert Einstein"
             target_page = "Pizza"
-            max_steps = 10
             
             print(f"\nStarting Wikirace: {start_page} → {target_page}")
             
             # Open starting page
             await open_wiki(start_page)
             
+            # Check current page using decorated function
+            current_page = await get_current_wiki_page()
+            print(f"Starting page: {current_page}")
+            assert current_page == start_page, f"Expected {start_page}, got {current_page}"
+            
             # Create agent
             agent = ComputerAgent(
                 computer=computer,
@@ -100,26 +133,23 @@ async def main():
             Look at the current page and click on a link that might lead you closer to {target_page}.
             """
             
-            async for step_result in agent.run(prompt):
+            async for result in agent.run(prompt):    
                 steps += 1
-                print(f"Step {steps}: {step_result}")
-                
-                # Check current page
-                current_page = await computer.venv_exec("eval_env", get_current_wiki_page)
-                print(f"Current page: {current_page}")
-                
-                # Check if we reached the target
-                if current_page and target_page.lower() in current_page.lower():
-                    success = True
-                    print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
-                    break
+                print(f"Step {steps}: {result}")
                 
                 # Safety check
                 if steps >= max_steps:
                     print(f"❌ Failed: Reached maximum steps ({max_steps})")
                     break
                 
-                await asyncio.sleep(1)  # Brief pause between steps
+            # Check current page using decorated function
+            current_page = await get_current_wiki_page()
+            print(f"Current page: {current_page}")
+            
+            # Check if we reached the target
+            if current_page and target_page.lower() in current_page.lower():
+                success = True
+                print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
             
             end_time = time.time()
             duration = end_time - start_time
@@ -132,13 +162,14 @@ async def main():
             print(f"Success: {success}")
             print(f"Duration: {duration:.2f} seconds")
             
-            # Get final page list
-            final_wikis = await computer.venv_exec("eval_env", get_open_wikis)
+            # Get final page list - now using decorated function
+            final_wikis = await get_open_wikis()
             print(f"Open Wikipedia pages: {final_wikis}")
 
         finally:
             # Important to clean up resources
-            await computer.stop()
+            # await computer.stop()
+            pass
             
     except Exception as e:
         print(f"Error in main: {e}")
diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py
index b77582ae..2057215e 100644
--- a/libs/computer/computer/computer.py
+++ b/libs/computer/computer/computer.py
@@ -796,6 +796,10 @@ class Computer:
             # Remove common leading whitespace (dedent)
             func_source = textwrap.dedent(source).strip()
             
+            # Remove decorators
+            while func_source.lstrip().startswith("@"):
+                func_source = func_source.split("\n", 1)[1].strip()
+            
             # Get function name for execution
             func_name = python_func.__name__
             

From c5c91729a2421172bdbc14e98090b2ae6014b6e2 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 14:18:52 -0400
Subject: [PATCH 04/23] Added cleanup step

---
 examples/eval_examples.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index 1315aba4..38d9cae8 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -78,11 +78,15 @@ async def main():
 
             # Remote functions for wikirace - using @remote decorator
             @remote("eval_env")
-            def get_open_wikis():
+            def close_all_windows():
                 import pywinctl
-                titles = pywinctl.getAllTitles()
-                wiki_titles = [title.split(" - Wikipedia")[0] for title in titles if "Wikipedia" in title]
-                return wiki_titles
+                windows = pywinctl.getAllWindows()
+                for window in windows:
+                    try:
+                        window.close()
+                    except:
+                        # Some windows might not be closeable or may have already closed
+                        pass
 
             @remote("eval_env")
             def get_current_wiki_page():
@@ -94,12 +98,15 @@ async def main():
                 return None
 
             # Wikirace setup
-            max_steps = 15
+            max_steps = 2
             start_page = "Albert Einstein"
             target_page = "Pizza"
             
             print(f"\nStarting Wikirace: {start_page} → {target_page}")
             
+            # Close all windows
+            await close_all_windows()
+            
             # Open starting page
             await open_wiki(start_page)
             
@@ -142,7 +149,7 @@ async def main():
                     print(f"❌ Failed: Reached maximum steps ({max_steps})")
                     break
                 
-            # Check current page using decorated function
+            # Check again
             current_page = await get_current_wiki_page()
             print(f"Current page: {current_page}")
             
@@ -161,11 +168,6 @@ async def main():
             print(f"Steps taken: {steps}")
             print(f"Success: {success}")
             print(f"Duration: {duration:.2f} seconds")
-            
-            # Get final page list - now using decorated function
-            final_wikis = await get_open_wikis()
-            print(f"Open Wikipedia pages: {final_wikis}")
-
         finally:
             # Important to clean up resources
             # await computer.stop()

From 89deb8111fbbed7f3156f88d2195c5104be92e77 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 14:22:30 -0400
Subject: [PATCH 05/23] Fixed agent stop

---
 examples/eval_examples.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index 38d9cae8..b4b207b6 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -146,8 +146,10 @@ async def main():
                 
                 # Safety check
                 if steps >= max_steps:
-                    print(f"❌ Failed: Reached maximum steps ({max_steps})")
-                    break
+                    print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
+                    agent._loop.cancel()
+                
+            await asyncio.sleep(2) # Wait for recv to finish
                 
             # Check again
             current_page = await get_current_wiki_page()

From e16fb75ce81f4ab9fe786854d8b8b405a8292605 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 14:50:56 -0400
Subject: [PATCH 06/23] Added locks to websocket interface

---
 libs/computer/computer/interface/linux.py | 103 ++++++++++++----------
 libs/computer/computer/interface/macos.py |  59 +++++++------
 2 files changed, 86 insertions(+), 76 deletions(-)

diff --git a/libs/computer/computer/interface/linux.py b/libs/computer/computer/interface/linux.py
index 401730ca..68ba5706 100644
--- a/libs/computer/computer/interface/linux.py
+++ b/libs/computer/computer/interface/linux.py
@@ -27,6 +27,7 @@ class LinuxComputerInterface(BaseComputerInterface):
         self._max_reconnect_delay = 30  # Maximum delay between reconnection attempts
         self._log_connection_attempts = True  # Flag to control connection attempt logging
         self._authenticated = False  # Track authentication status
+        self._command_lock = asyncio.Lock()  # Lock to ensure only one command at a time
 
         # Set logger name for Linux interface
         self.logger = Logger("cua.interface.linux", LogLevel.NORMAL)
@@ -193,58 +194,62 @@ class LinuxComputerInterface(BaseComputerInterface):
         retry_count = 0
         last_error = None
 
-        while retry_count < max_retries:
-            try:
-                await self._ensure_connection()
-                if not self._ws:
-                    raise ConnectionError("WebSocket connection is not established")
+        # Acquire lock to ensure only one command is processed at a time
+        async with self._command_lock:
+            self.logger.debug(f"Acquired lock for command: {command}")
+            while retry_count < max_retries:
+                try:
+                    await self._ensure_connection()
+                    if not self._ws:
+                        raise ConnectionError("WebSocket connection is not established")
 
-                # Handle authentication if needed
-                if self.api_key and self.vm_name and not self._authenticated:
-                    self.logger.info("Performing authentication handshake...")
-                    auth_message = {
-                        "command": "authenticate",
-                        "params": {
-                            "api_key": self.api_key,
-                            "container_name": self.vm_name
+                    # Handle authentication if needed
+                    if self.api_key and self.vm_name and not self._authenticated:
+                        self.logger.info("Performing authentication handshake...")
+                        auth_message = {
+                            "command": "authenticate",
+                            "params": {
+                                "api_key": self.api_key,
+                                "container_name": self.vm_name
+                            }
                         }
-                    }
-                    await self._ws.send(json.dumps(auth_message))
-                    
-                    # Wait for authentication response
-                    auth_response = await asyncio.wait_for(self._ws.recv(), timeout=10)
-                    auth_result = json.loads(auth_response)
-                    
-                    if not auth_result.get("success"):
-                        error_msg = auth_result.get("error", "Authentication failed")
-                        self.logger.error(f"Authentication failed: {error_msg}")
-                        self._authenticated = False
-                        raise ConnectionError(f"Authentication failed: {error_msg}")
-                    
-                    self.logger.info("Authentication successful")
-                    self._authenticated = True
+                        await self._ws.send(json.dumps(auth_message))
+                        
+                        # Wait for authentication response
+                        auth_response = await asyncio.wait_for(self._ws.recv(), timeout=10)
+                        auth_result = json.loads(auth_response)
+                        
+                        if not auth_result.get("success"):
+                            error_msg = auth_result.get("error", "Authentication failed")
+                            self.logger.error(f"Authentication failed: {error_msg}")
+                            self._authenticated = False
+                            raise ConnectionError(f"Authentication failed: {error_msg}")
+                        
+                        self.logger.info("Authentication successful")
+                        self._authenticated = True
 
-                message = {"command": command, "params": params or {}}
-                await self._ws.send(json.dumps(message))
-                response = await asyncio.wait_for(self._ws.recv(), timeout=30)
-                return json.loads(response)
-            except Exception as e:
-                last_error = e
-                retry_count += 1
-                if retry_count < max_retries:
-                    # Only log at debug level for intermediate retries
-                    self.logger.debug(
-                        f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}"
-                    )
-                    await asyncio.sleep(1)
-                    continue
-                else:
-                    # Only log at error level for the final failure
-                    self.logger.error(
-                        f"Failed to send command '{command}' after {max_retries} retries"
-                    )
-                    self.logger.debug(f"Command failure details: {e}")
-                raise last_error if last_error else RuntimeError("Failed to send command")
+                    message = {"command": command, "params": params or {}}
+                    await self._ws.send(json.dumps(message))
+                    response = await asyncio.wait_for(self._ws.recv(), timeout=30)
+                    self.logger.debug(f"Completed command: {command}")
+                    return json.loads(response)
+                except Exception as e:
+                    last_error = e
+                    retry_count += 1
+                    if retry_count < max_retries:
+                        # Only log at debug level for intermediate retries
+                        self.logger.debug(
+                            f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}"
+                        )
+                        await asyncio.sleep(1)
+                        continue
+                    else:
+                        # Only log at error level for the final failure
+                        self.logger.error(
+                            f"Failed to send command '{command}' after {max_retries} retries"
+                        )
+                        self.logger.debug(f"Command failure details: {e}")
+                        raise last_error if last_error else RuntimeError("Failed to send command")
 
     async def wait_for_ready(self, timeout: int = 60, interval: float = 1.0):
         """Wait for WebSocket connection to become available."""
diff --git a/libs/computer/computer/interface/macos.py b/libs/computer/computer/interface/macos.py
index a96c44d1..3daa4fdf 100644
--- a/libs/computer/computer/interface/macos.py
+++ b/libs/computer/computer/interface/macos.py
@@ -26,6 +26,7 @@ class MacOSComputerInterface(BaseComputerInterface):
         self._reconnect_delay = 1  # Start with 1 second delay
         self._max_reconnect_delay = 30  # Maximum delay between reconnection attempts
         self._log_connection_attempts = True  # Flag to control connection attempt logging
+        self._command_lock = asyncio.Lock()  # Lock to ensure only one command at a time
 
         # Set logger name for macOS interface
         self.logger = Logger("cua.interface.macos", LogLevel.NORMAL)
@@ -219,35 +220,39 @@ class MacOSComputerInterface(BaseComputerInterface):
         retry_count = 0
         last_error = None
 
-        while retry_count < max_retries:
-            try:
-                await self._ensure_connection()
-                if not self._ws:
-                    raise ConnectionError("WebSocket connection is not established")
+        # Acquire lock to ensure only one command is processed at a time
+        async with self._command_lock:
+            self.logger.debug(f"Acquired lock for command: {command}")
+            while retry_count < max_retries:
+                try:
+                    await self._ensure_connection()
+                    if not self._ws:
+                        raise ConnectionError("WebSocket connection is not established")
 
-                message = {"command": command, "params": params or {}}
-                await self._ws.send(json.dumps(message))
-                response = await asyncio.wait_for(self._ws.recv(), timeout=30)
-                return json.loads(response)
-            except Exception as e:
-                last_error = e
-                retry_count += 1
-                if retry_count < max_retries:
-                    # Only log at debug level for intermediate retries
-                    self.logger.debug(
-                        f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}"
-                    )
-                    await asyncio.sleep(1)
-                    continue
-                else:
-                    # Only log at error level for the final failure
-                    self.logger.error(
-                        f"Failed to send command '{command}' after {max_retries} retries"
-                    )
-                    self.logger.debug(f"Command failure details: {e}")
-                    raise
+                    message = {"command": command, "params": params or {}}
+                    await self._ws.send(json.dumps(message))
+                    response = await asyncio.wait_for(self._ws.recv(), timeout=30)
+                    self.logger.debug(f"Completed command: {command}")
+                    return json.loads(response)
+                except Exception as e:
+                    last_error = e
+                    retry_count += 1
+                    if retry_count < max_retries:
+                        # Only log at debug level for intermediate retries
+                        self.logger.debug(
+                            f"Command '{command}' failed (attempt {retry_count}/{max_retries}): {e}"
+                        )
+                        await asyncio.sleep(1)
+                        continue
+                    else:
+                        # Only log at error level for the final failure
+                        self.logger.error(
+                            f"Failed to send command '{command}' after {max_retries} retries"
+                        )
+                        self.logger.debug(f"Command failure details: {e}")
+                        raise
 
-        raise last_error if last_error else RuntimeError("Failed to send command")
+            raise last_error if last_error else RuntimeError("Failed to send command")
 
     async def wait_for_ready(self, timeout: int = 60, interval: float = 1.0):
         """Wait for WebSocket connection to become available."""

From fa07ee444ad85e8b391cb5fbf16ac1d59e4a2037 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 14:51:11 -0400
Subject: [PATCH 07/23] More freq checks

---
 examples/eval_examples.py | 57 ++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index b4b207b6..a476e104 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -31,7 +31,8 @@ from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
 # Global reference to computer instance (will be set in main)
 _computer = None
 
-def remote(venv_name="eval_env"):
+
+def remote(venv_name="eval_env", max_retries=3):
     """
     Decorator that wraps a function to be executed remotely via computer.venv_exec
     
@@ -43,7 +44,14 @@ def remote(venv_name="eval_env"):
         async def wrapper(*args, **kwargs):
             if _computer is None:
                 raise RuntimeError("Computer instance not initialized. Call this after computer.run()")
-            return await _computer.venv_exec(venv_name, func, *args, **kwargs)
+            for i in range(max_retries):
+                try:
+                    return await _computer.venv_exec(venv_name, func, *args, **kwargs)
+                except Exception as e:
+                    print(f"Attempt {i+1} failed: {e}")
+                    await asyncio.sleep(1)
+                    if i == max_retries - 1:
+                        raise e
         return wrapper
     return decorator
 
@@ -140,28 +148,33 @@ async def main():
             Look at the current page and click on a link that might lead you closer to {target_page}.
             """
             
-            async for result in agent.run(prompt):    
-                steps += 1
-                print(f"Step {steps}: {result}")
-                
-                # Safety check
-                if steps >= max_steps:
-                    print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
-                    agent._loop.cancel()
-                
-            await asyncio.sleep(2) # Wait for recv to finish
-                
-            # Check again
-            current_page = await get_current_wiki_page()
-            print(f"Current page: {current_page}")
-            
-            # Check if we reached the target
-            if current_page and target_page.lower() in current_page.lower():
-                success = True
-                print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
-            
+            try: 
+                async for result in agent.run(prompt):    
+                    steps += 1
+                    print(f"Step {steps}: {result}")
+                    
+                    # Check again
+                    current_page = await get_current_wiki_page()
+                    print(f"Current page: {current_page}")
+                    
+                    # Check if we reached the target
+                    if current_page and target_page.lower() in current_page.lower():
+                        success = True
+                        print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
+                        await agent._loop.cancel()
+                        break
+                    
+                    # Safety check
+                    if steps >= max_steps:
+                        print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
+                        await agent._loop.cancel()
+                        break
+            except asyncio.CancelledError:
+                print("Agent stopped")
+                            
             end_time = time.time()
             duration = end_time - start_time
+            await asyncio.sleep(2) # Wait for agent to finish
             
             # Results
             print(f"\n=== WIKIRACE RESULTS ===")

From a7e56ce64a439b12b04f6243f16a41012af8102e Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 18:54:00 -0400
Subject: [PATCH 08/23] Added @sandboxed decorator

---
 examples/eval_examples.py          | 39 +++-------------
 libs/computer/computer/computer.py |  5 +++
 libs/computer/computer/helpers.py  | 49 +++++++++++++++++++++
 tests/venv.py                      | 71 ++++++++++++++++++++++++++----
 4 files changed, 122 insertions(+), 42 deletions(-)
 create mode 100644 libs/computer/computer/helpers.py

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index a476e104..48da31be 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -24,40 +24,12 @@ for path in pythonpath.split(":"):
 from computer.computer import Computer
 from computer.providers.base import VMProviderType
 from computer.logger import LogLevel
+from computer.helpers import sandboxed
 
 # Assuming these exist based on your request
 from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
 
-# Global reference to computer instance (will be set in main)
-_computer = None
-
-
-def remote(venv_name="eval_env", max_retries=3):
-    """
-    Decorator that wraps a function to be executed remotely via computer.venv_exec
-    
-    Args:
-        venv_name: Name of the virtual environment to execute in
-    """
-    def decorator(func):
-        @wraps(func)
-        async def wrapper(*args, **kwargs):
-            if _computer is None:
-                raise RuntimeError("Computer instance not initialized. Call this after computer.run()")
-            for i in range(max_retries):
-                try:
-                    return await _computer.venv_exec(venv_name, func, *args, **kwargs)
-                except Exception as e:
-                    print(f"Attempt {i+1} failed: {e}")
-                    await asyncio.sleep(1)
-                    if i == max_retries - 1:
-                        raise e
-        return wrapper
-    return decorator
-
-async def main():
-    global _computer, remote
-    
+async def main():    
     try:
         print("\n=== Using cloud container ===")
         # # Create a remote Linux computer with CUA
@@ -70,7 +42,6 @@ async def main():
         
         # Connect to local macOS computer
         computer = Computer()
-        _computer = computer  # Set global reference
         
         try:
             # Run the computer with default parameters
@@ -84,8 +55,8 @@ async def main():
                 await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
                 await asyncio.sleep(2)  # Wait for page to load
 
-            # Remote functions for wikirace - using @remote decorator
-            @remote("eval_env")
+            # Remote functions for wikirace - using @sandboxed decorator
+            @sandboxed("eval_env")
             def close_all_windows():
                 import pywinctl
                 windows = pywinctl.getAllWindows()
@@ -96,7 +67,7 @@ async def main():
                         # Some windows might not be closeable or may have already closed
                         pass
 
-            @remote("eval_env")
+            @sandboxed("eval_env")
             def get_current_wiki_page():
                 import pywinctl
                 titles = pywinctl.getAllTitles()
diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py
index 2057215e..191c611d 100644
--- a/libs/computer/computer/computer.py
+++ b/libs/computer/computer/computer.py
@@ -11,6 +11,7 @@ import json
 import logging
 from .telemetry import record_computer_initialization
 import os
+from . import helpers
 
 # Import provider related modules
 from .providers.base import VMProviderType
@@ -460,6 +461,10 @@ class Computer:
 
             # Set the initialization flag and clear the initializing flag
             self._initialized = True
+            
+            # Set this instance as the default computer for remote decorators
+            helpers.set_default_computer(self)
+            
             self.logger.info("Computer successfully initialized")
         except Exception as e:
             raise
diff --git a/libs/computer/computer/helpers.py b/libs/computer/computer/helpers.py
new file mode 100644
index 00000000..b472c047
--- /dev/null
+++ b/libs/computer/computer/helpers.py
@@ -0,0 +1,49 @@
+"""
+Helper functions and decorators for the Computer module.
+"""
+import asyncio
+from functools import wraps
+from typing import Any, Callable, Optional, TypeVar, cast
+
+# Global reference to the default computer instance
+_default_computer = None
+
+def set_default_computer(computer):
+    """
+    Set the default computer instance used by the sandboxed decorator.
+    
+    Args:
+        computer: The computer instance to use as default
+    """
+    global _default_computer
+    _default_computer = computer
+
+
+def sandboxed(venv_name: str = "default", computer: Any = "default", max_retries: int = 3):
+    """
+    Decorator that runs the wrapped function remotely via computer.venv_exec.
+
+    Args:
+        venv_name: Name of the virtual environment to execute in
+        computer: The computer instance to use, or "default" to use the globally set default
+        max_retries: Total number of attempts (at least one is always made)
+    """
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            # Resolve at call time so a default registered after decoration is used
+            comp = computer if computer != "default" else _default_computer
+            if comp is None:
+                raise RuntimeError("No computer instance available. Either specify a computer instance or call set_default_computer() first.")
+            attempts = max(1, max_retries)
+            for i in range(attempts):
+                try:
+                    return await comp.venv_exec(venv_name, func, *args, **kwargs)
+                except Exception as e:
+                    print(f"Attempt {i+1} failed: {e}")
+                    if i == attempts - 1:
+                        # Out of attempts: re-raise immediately, no trailing sleep
+                        raise
+                    await asyncio.sleep(1)
+        return wrapper
+    return decorator
diff --git a/tests/venv.py b/tests/venv.py
index 4f9e3206..8b78a78f 100644
--- a/tests/venv.py
+++ b/tests/venv.py
@@ -31,24 +31,29 @@ for path in pythonpath.split(":"):
 
 from computer.computer import Computer
 from computer.providers.base import VMProviderType
+from computer.helpers import sandboxed, set_default_computer
 
 
 @pytest.fixture(scope="session")
 async def computer():
     """Shared Computer instance for all test cases."""
-    # Create a remote Linux computer with C/ua
-    computer = Computer(
-        os_type="linux",
-        api_key=os.getenv("CUA_API_KEY"),
-        name=str(os.getenv("CUA_CONTAINER_NAME")),
-        provider_type=VMProviderType.CLOUD,
-    )
+    # # Create a remote Linux computer with C/ua
+    # computer = Computer(
+    #     os_type="linux",
+    #     api_key=os.getenv("CUA_API_KEY"),
+    #     name=str(os.getenv("CUA_CONTAINER_NAME")),
+    #     provider_type=VMProviderType.CLOUD,
+    # )
+    
+    # Create a local macOS computer with C/ua
+    computer = Computer()
     
     try:
         await computer.run()
         yield computer
     finally:
-        await computer.stop()
+        # await computer.stop()
+        pass
 
 
 # Sample test cases
@@ -146,6 +151,56 @@ async def test_venv_exec_stdout_capture(computer, capfd):
     assert out == "Hello World!\n\n"
     assert result == "Function completed"
 
+@pytest.mark.asyncio(loop_scope="session")
+async def test_remote_decorator(computer):
+    """Test the sandboxed decorator from computer.helpers using the default computer."""
+    # Set the computer as default for the sandboxed decorator
+    set_default_computer(computer)
+    
+    # Define a function with the sandboxed decorator
+    @sandboxed("test_env")
+    def get_package_version():
+        import sys
+        import platform
+        return {
+            "python_version": sys.version,
+            "platform": platform.platform(),
+            "success": True
+        }
+    
+    # Call the decorated function
+    result = await get_package_version()
+    
+    # Verify the call returned the payload built by the decorated function
+    assert "python_version" in result
+    assert "platform" in result
+    assert result["success"] == True
+
+@pytest.mark.asyncio(loop_scope="session")
+async def test_remote_decorator_with_custom_computer(computer):
+    """Test the sandboxed decorator with an explicitly specified computer instance."""
+    # Define a function with the sandboxed decorator that explicitly specifies the computer
+    @sandboxed("test_env", computer=computer)
+    def get_system_info():
+        import os
+        import sys
+        return {
+            "python_version": sys.version,
+            "environment_vars": dict(os.environ),
+            "working_directory": os.getcwd()
+        }
+    
+    # Call the decorated function
+    result = await get_system_info()
+    
+    # Verify the call returned the payload built by the decorated function
+    assert "python_version" in result
+    assert "environment_vars" in result
+    assert "working_directory" in result
+    # NOTE(review): assumes the remote process runs with a different working
+    # directory than the current test process - confirm for local computers
+    assert result["working_directory"] != os.getcwd()
+
 if __name__ == "__main__":
     # Run tests directly
     pytest.main([__file__, "-v"])

From 86d052d88278b4d58e1832d93a6d7fd50eaf6cf4 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 21:19:46 -0400
Subject: [PATCH 09/23] updated eval to use sandboxed decorator

---
 examples/eval_examples.py | 714 ++++++++++++++++++++++++++++++++------
 1 file changed, 606 insertions(+), 108 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index 48da31be..1978a897 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -1,13 +1,63 @@
 import os
 import asyncio
+import json
+import random
 from pathlib import Path
 import sys
 import traceback
 import time
 from functools import wraps
+import urllib.request
+import datetime
+from urllib.parse import quote
+
+# Wikirace prompt template
+WIKIRACE_PROMPT_TEMPLATE = """
+You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" 
+by clicking only on Wikipedia links within articles.
+
+Rules:
+1. Only click on links within Wikipedia articles (blue underlined text)
+2. No using search, back button, or typing URLs
+3. You MAY use cmd+f (or ctrl+f) to find text on the current page
+4. Do NOT click any search icon or type into any search box unless it's a browser command
+5. Try to find the shortest path possible
+6. Current target: {target_page}
+7. Do not maximize the window or use any other application
+8. Avoid wasting actions by scrolling
+9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps
+
+Look at the current page and click on a link that might lead you closer to {target_page}.
+"""
+
+# Store original print function
+_print = print
+
+# Define log file path
+project_root = Path(__file__).parent.parent
+log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"
+
+# Custom print function that also logs to file
+def print(*args, **kwargs):
+    """Print like the builtin and append a timestamped copy to ``log_file``."""
+    # Call the original print function
+    _print(*args, **kwargs)
+
+    # Rebuild the text as builtin print does: None for sep/end means "default"
+    sep, end = kwargs.get("sep"), kwargs.get("end")
+    output = (" " if sep is None else sep).join(str(arg) for arg in args)
+    output += "\n" if end is None else end
+
+    # Add timestamp
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    log_entry = f"[{timestamp}] {output}"
+
+    # Append to log file; explicit UTF-8 because messages contain emoji
+    log_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(log_file, "a", encoding="utf-8") as f:
+        f.write(log_entry)
 
 # Load environment variables from .env file
-project_root = Path(__file__).parent.parent
 env_file = project_root / ".env"
 print(f"Loading environment from: {env_file}")
 from dotenv import load_dotenv
@@ -29,104 +79,283 @@ from computer.helpers import sandboxed
 # Assuming these exist based on your request
 from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
 
-async def main():    
+# Pool of start articles, loaded from a file with one Wikipedia title per line.
+articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt"
+with open(articles_file, "r", encoding="utf-8") as f:
+    # Skip blank lines (e.g. a trailing newline) so random.choice(articles)
+    # can never yield an empty title.
+    articles = [title for title in (line.strip() for line in f) if title]
+
+
+def get_article_links(article_title):
+    """Return titles of articles linked from a Wikipedia article.
+
+    Makes one MediaWiki API request (at most 500 links, no continuation) and keeps
+    ASCII titles shorter than 50 characters that contain no colon.  Returns [] on any error.
+    """
     try:
-        print("\n=== Using cloud container ===")
-        # # Create a remote Linux computer with CUA
-        # computer = Computer(
-        #     os_type="linux",
-        #     api_key=os.getenv("CUA_API_KEY"),
-        #     name=str(os.getenv("CUA_CONTAINER_NAME")),
-        #     provider_type=VMProviderType.CLOUD,
-        # )
+        # Ask the API for the article's outgoing links (prop=links)
+        url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json"
         
-        # Connect to local macOS computer
-        computer = Computer()
+        # Time out instead of letting a stalled request hang the whole evaluation
+        with urllib.request.urlopen(url, timeout=10) as response:
+            data = json.loads(response.read().decode())
+
+        pages = data.get('query', {}).get('pages', {})
+        if not pages:
+            return []
+
+        # Get the first (and only) page
+        page = next(iter(pages.values()))
+        # A colon indicates special pages, files, categories, etc.; skip those
+        return [
+            title
+            for title in (link.get('title', '') for link in page.get('links', []))
+            if ':' not in title and title.isascii() and len(title) < 50
+        ]
+
+    except Exception as e:
+        print(f"Error fetching links for {article_title}: {e}")
+        return []
+
+def wikipedia_random_walk(start_article, depth=5):
+    """Follow randomly chosen article links, starting from ``start_article``.
+
+    Progress is reported through ``print`` at every hop.
+
+    Args:
+        start_article (str): Title of the article the walk begins at
+        depth (int): Maximum number of link hops to attempt
+
+    Returns:
+        list: Titles visited in order, beginning with ``start_article``; shorter
+            than ``depth + 1`` when an article with no usable links is reached
+    """
+    visited = [start_article]
+
+    for hop in range(1, depth + 1):
+        here = visited[-1]
+        print(f"Step {hop}: Currently at '{here}'")
+
+        # Outgoing links of the article the walk is currently on
+        candidates = get_article_links(here)
+        if candidates:
+            # Hop to a randomly chosen outgoing link
+            there = random.choice(candidates)
+            visited.append(there)
+            print(f"  -> Moving to '{there}'")
+        else:
+            # Dead end: nothing left to follow
+            print(f"No valid links found in '{here}'. Ending walk.")
+            break
+
+    return visited
+
+def get_article_pair(depth=5, max_attempts=20):
+    """Return (start, target): a random start article and the distinct end of a random walk from it."""
+    for _ in range(max_attempts):  # bounded: a walk that finds no links ends at its start, so never spin forever
+        start_article = random.choice(articles)
+        target_article = wikipedia_random_walk(start_article, depth)[-1]
+        if target_article != start_article:
+            return start_article, target_article
+    raise RuntimeError(f"No distinct article pair found after {max_attempts} random walks")
+
+async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
+    """Start a computer and run one named evaluation scenario on it.
+
+    Args:
+        scenario_name: "messy_desktop" or "parallel_agents" (other names are reported and skipped)
+        use_app_use: When true, the computer is created with the "app-use" experiment
+        agent_configs: (config_name, loop_provider, model_provider) tuples to evaluate
+        max_steps: Step budget handed to the scenario
+
+    Errors raised while running are printed with a traceback, not propagated.
+    The computer is not stopped afterwards.
+    """
+    print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===")
+
+    # Resolve the runner before starting a computer so a misspelt name fails fast.
+    scenario_runners = {
+        "messy_desktop": run_messy_desktop_scenario,
+        "parallel_agents": run_parallel_agents_scenario,
+    }
+    if scenario_name not in scenario_runners:
+        print(f"Unknown scenario: {scenario_name}")
+        return
+
+    computer = Computer(experiments=["app-use"] if use_app_use else [])
+    try:
+        await computer.run()
+        await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
+        await scenario_runners[scenario_name](computer, agent_configs, max_steps)
+    except Exception as e:
+        print(f"Error in scenario {scenario_name}: {e}")
+        traceback.print_exc()
+
+
+@sandboxed("eval_env")
+def close_all_windows():
+    """Close every window pywinctl reports, skipping any that fail to close."""
+    import pywinctl
+    for window in pywinctl.getAllWindows():
+        try:
+            window.close()
+        except Exception:
+            # Best effort: the window may be uncloseable or already gone.  A bare
+            # except would also swallow KeyboardInterrupt and SystemExit.
+            pass
+
+
+@sandboxed("eval_env")
+def get_current_wiki_page(app_name=None):
+    """Return the article name from the first window whose title contains "Wikipedia", or None.
+
+    Args:
+        app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox');
+            matched case-insensitively as a substring of the window's app name
+    """
+    import pywinctl
+    wanted = app_name.lower() if app_name else None
+
+    for window in pywinctl.getAllWindows():
+        if wanted is not None:
+            owner = window.getAppName()
+            if not owner or wanted not in owner.lower():
+                continue
+        title = window.title
+        if title and "Wikipedia" in title:
+            # Everything before " - Wikipedia" is taken as the article name
+            return title.split(" - Wikipedia")[0]
+    return None
+
+
+@sandboxed("eval_env")
+def get_open_app_names():
+    """Return the app name of every open window that reports one (one entry per window)."""
+    import pywinctl
+    app_names = (window.getAppName() for window in pywinctl.getAllWindows())
+    return [app_name for app_name in app_names if app_name]
+
+def _computer():
+    """Return the current value of ``computer.helpers._default_computer``."""
+    import computer.helpers as _helpers
+    return _helpers._default_computer
+
+async def open_app(app_name):
+    """Launch the named application with `open -a`, then pause two seconds while it starts."""
+    await _computer().interface.run_command("open -a '{}'".format(app_name))
+    await asyncio.sleep(2)
+
+
+async def open_wiki(page, app_name="Safari"):
+    """Open a Wikipedia article in `app_name`; the title is percent-encoded so "(", ")" and "'" cannot break the shell command."""
+    await _computer().interface.run_command(f"open -a '{app_name}' https://en.wikipedia.org/wiki/{quote(page.replace(' ', '_'))}")
+    await asyncio.sleep(2)  # Wait for page to load
+
+
+async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
+    """Run the messy desktop scenario with a single agent"""
+    # Get popular wiki articles
+    global articles
+    start_page, target_page = get_article_pair(depth=1)
+    
+    print(f"Wiki race: {start_page} → {target_page}")
+    
+    # Close all windows first
+    await close_all_windows()
+    
+    # Open starting Wikipedia page
+    await open_wiki(start_page)
+    
+    # Open 3 random apps to create a messy desktop
+    apps_to_open = ["Notes", "Terminal", "System Settings"]
+    for app in apps_to_open:
+        await open_app(app)
+    
+    # Verify apps are open
+    open_apps = await get_open_app_names()
+    print(f"Open applications: {open_apps}")
+    
+    # Create the agent's computer interface
+    # If app-use is enabled, create a desktop limited to Safari/Firefox
+    if "app-use" in (computer.experiments or []):
+        browser_desktop = computer.create_desktop_from_apps(["Safari"])
+        agent_computer = browser_desktop
+    else:
+        agent_computer = computer
+    
+    # Run each agent configuration
+    for config_name, loop_provider, model_provider in agent_configs:
+        print(f"\n--- Testing Agent: {config_name} ---")
+        
+        # Create agent with the specified configuration
+        agent = ComputerAgent(
+            computer=agent_computer,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+        
+        # Run the wikirace
+        steps = 0
+        success = False
+        start_time = time.time()
+        
+        # Use the template with formatting for this scenario
+        prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Safari",
+            start_page=start_page,
+            target_page=target_page
+        )
         
         try:
-            # Run the computer with default parameters
-            await computer.run()
-            
-            # Install required packages
-            await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
-
-            # Helper functions for wikirace
-            async def open_wiki(page):
-                await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
-                await asyncio.sleep(2)  # Wait for page to load
-
-            # Remote functions for wikirace - using @sandboxed decorator
-            @sandboxed("eval_env")
-            def close_all_windows():
-                import pywinctl
-                windows = pywinctl.getAllWindows()
-                for window in windows:
-                    try:
-                        window.close()
-                    except:
-                        # Some windows might not be closeable or may have already closed
-                        pass
-
-            @sandboxed("eval_env")
-            def get_current_wiki_page():
-                import pywinctl
-                titles = pywinctl.getAllTitles()
-                wiki_titles = [title for title in titles if "Wikipedia" in title]
-                if wiki_titles:
-                    return wiki_titles[0].split(" - Wikipedia")[0]
-                return None
-
-            # Wikirace setup
-            max_steps = 2
-            start_page = "Albert Einstein"
-            target_page = "Pizza"
-            
-            print(f"\nStarting Wikirace: {start_page} → {target_page}")
-            
-            # Close all windows
-            await close_all_windows()
-            
-            # Open starting page
-            await open_wiki(start_page)
-            
-            # Check current page using decorated function
-            current_page = await get_current_wiki_page()
-            print(f"Starting page: {current_page}")
-            assert current_page == start_page, f"Expected {start_page}, got {current_page}"
-            
-            # Create agent
-            agent = ComputerAgent(
-                computer=computer,
-                loop=AgentLoop.OPENAI,
-                model=LLM(LLMProvider.OPENAI)
-            )
-            
-            # Run the wikirace
-            steps = 0
-            success = False
-            start_time = time.time()
-            
-            prompt = f"""
-            You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" 
-            by clicking only on Wikipedia links within articles.
-            
-            Rules:
-            1. Only click on links within Wikipedia articles (blue underlined text)
-            2. No using search, back button, or typing URLs
-            3. Try to find the shortest path possible
-            4. Current target: {target_page}
-            
-            Look at the current page and click on a link that might lead you closer to {target_page}.
-            """
-            
-            try: 
+            while steps < max_steps and not success: 
                 async for result in agent.run(prompt):    
                     steps += 1
-                    print(f"Step {steps}: {result}")
+                    print(f"Step {steps}")
                     
-                    # Check again
-                    current_page = await get_current_wiki_page()
+                    def process_result():
+                        if result.get("content"):
+                            print(f"Agent: {result.get('content', '')}")
+
+                        else:
+                            outputs = result.get("output", [])
+                            for output in outputs:
+                                if output.get("type") == "message":
+                                    content = output.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            print(f"Agent: {content_part.get('text', '')}")
+
+                                elif output.get("type") == "reasoning":
+                                    # if it's openAI, we only have access to a summary of the reasoning
+                                    summary_content = output.get("summary", [])
+                                    if summary_content:
+                                        for summary_part in summary_content:
+                                            if summary_part.get("type") == "summary_text":
+                                                print(f"Agent: {summary_part.get('text', '')}")
+
+                                    else:
+                                        summary_content = output.get("text", "")
+                                        if summary_content:
+                                            print(f"Agent: {summary_content}")
+
+                                elif output.get("type") == "computer_call":
+                                    action = output.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        if action.get("x") and action.get("y"):
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
+
+                    
+                    # Process and print the result
+                    process_result()
+                    
+                    # Check current page
+                    current_page = await get_current_wiki_page("Safari")
                     print(f"Current page: {current_page}")
+                    print(f"Target: {target_page}")
                     
                     # Check if we reached the target
                     if current_page and target_page.lower() in current_page.lower():
@@ -140,29 +369,298 @@ async def main():
                         print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
                         await agent._loop.cancel()
                         break
+        except asyncio.CancelledError:
+            print("Agent stopped")
+                        
+        end_time = time.time()
+        duration = end_time - start_time
+        await asyncio.sleep(2)  # Wait for agent to finish
+        
+        # Results
+        print(f"\n=== WIKIRACE RESULTS: {config_name} ===")
+        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
+        print(f"Start: {start_page}")
+        print(f"Target: {target_page}")
+        print(f"Steps taken: {steps}")
+        print(f"Success: {success}")
+        print(f"Duration: {duration:.2f} seconds")
+
+
+async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
+    """Run two agents in parallel, one using Safari and one using Firefox.
+
+    Each agent plays its own wikirace (start/target pair) with up to `max_steps` steps.
+    """
+    safari_start, safari_target = get_article_pair(depth=1)
+    firefox_start, firefox_target = get_article_pair(depth=1)
+
+    print(f"Safari Wiki race: {safari_start} → {safari_target}")
+    print(f"Firefox Wiki race: {firefox_start} → {firefox_target}")
+
+    # Close all windows first
+    await close_all_windows()
+
+    # Open Safari with starting page
+    await open_wiki(safari_start, "Safari")
+    await asyncio.sleep(2)
+
+    # Open Firefox with starting page
+    await open_wiki(firefox_start, "Firefox")
+    await asyncio.sleep(2)
+
+    # Evaluate each agent configuration in turn
+    for config_name, loop_provider, model_provider in agent_configs:
+        print(f"\n--- Testing Parallel Agents: {config_name} ---")
+
+        # Create the agent interfaces
+        if "app-use" in (computer.experiments or []):
+            safari_desktop = computer.create_desktop_from_apps(["Safari"])
+            firefox_desktop = computer.create_desktop_from_apps(["Firefox"])
+        else:
+            safari_desktop = computer
+            firefox_desktop = computer
+
+        # Save screenshots
+        screenshot_dir = project_root / "examples" / "evals" / "screenshots"
+        screenshot_dir.mkdir(exist_ok=True)
+        safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png"
+        firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png"
+        screenshot_bytes = await safari_desktop.interface.screenshot()
+        with open(safari_screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        screenshot_bytes = await firefox_desktop.interface.screenshot()
+        with open(firefox_screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+
+        # Create agents
+        safari_agent = ComputerAgent(
+            computer=safari_desktop,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+
+        firefox_agent = ComputerAgent(
+            computer=firefox_desktop,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+
+        # Create prompts using the template
+        safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Safari",
+            start_page=safari_start,
+            target_page=safari_target
+        )
+
+        firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Firefox",
+            start_page=firefox_start,
+            target_page=firefox_target
+        )
+
+        # Track results
+        safari_results = {
+            "steps": 0,
+            "success": False,
+            "start_time": time.time(),
+            "end_time": None
+        }
+
+        firefox_results = {
+            "steps": 0,
+            "success": False,
+            "start_time": time.time(),
+            "end_time": None
+        }
+
+        # Function to run a single agent
+        async def run_agent(agent, prompt, browser, start_page, target_page, results):
+            try:
+                while results["steps"] < max_steps and not results["success"]:
+                    async for result in agent.run(prompt):
+                        results["steps"] += 1
+                        print(f"{browser} Step {results['steps']}")
+
+                        def process_result():
+                            if result.get("content"):
+                                print(f"{browser} Agent: {result.get('content', '')}")
+
+                            else:
+                                outputs = result.get("output", [])
+                                for output in outputs:
+                                    if output.get("type") == "message":
+                                        content = output.get("content", [])
+                                        for content_part in content:
+                                            if content_part.get("text"):
+                                                print(f"{browser} Agent: {content_part.get('text', '')}")
+
+                                    elif output.get("type") == "reasoning":
+                                        # if it's openAI, we only have access to a summary of the reasoning
+                                        summary_content = output.get("summary", [])
+                                        if summary_content:
+                                            for summary_part in summary_content:
+                                                if summary_part.get("type") == "summary_text":
+                                                    print(f"{browser} Agent: {summary_part.get('text', '')}")
+
+                                        else:
+                                            summary_content = output.get("text", "")
+                                            if summary_content:
+                                                print(f"{browser} Agent: {summary_content}")
+
+                                    elif output.get("type") == "computer_call":
+                                        action = output.get("action", {})
+                                        action_type = action.get("type", "")
+                                        if action_type:
+                                            action_title = f"🛠️ Performing {action_type}"
+                                            # Compare with None: a coordinate of 0 is valid but falsy
+                                            if action.get("x") is not None and action.get("y") is not None:
+                                                action_title += f" at ({action['x']}, {action['y']})"
+                                            print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
+
+                        # Process and print the result
+                        process_result()
+
+                        # Check current page
+                        current_page = await get_current_wiki_page(browser)
+                        print(f"{browser} current page: {current_page}")
+                        print(f"{browser} target: {target_page}")
+
+                        # Check if we reached the target
+                        if current_page and target_page.lower() in current_page.lower():
+                            results["success"] = True
+                            print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!")
+                            await agent._loop.cancel()
+                            break
+
+                        # Check if we reached the maximum steps
+                        if results["steps"] >= max_steps:
+                            print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})")
+                            await agent._loop.cancel()
+                            break
             except asyncio.CancelledError:
-                print("Agent stopped")
-                            
-            end_time = time.time()
-            duration = end_time - start_time
-            await asyncio.sleep(2) # Wait for agent to finish
+                print(f"{browser} agent stopped")
+            finally:
+                results["end_time"] = time.time()
+
+        # Run both agents in parallel
+        await asyncio.gather(
+            run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results),
+            run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results)
+        )
+
+        # Wait for agents to finish
+        await asyncio.sleep(2)
+
+        # Print results
+        print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===")
+        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
+
+        print(f"\nSafari Results:")
+        print(f"Start: {safari_start}")
+        print(f"Target: {safari_target}")
+        print(f"Steps taken: {safari_results['steps']}")
+        print(f"Success: {safari_results['success']}")
+        print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds")
+
+        print(f"\nFirefox Results:")
+        print(f"Start: {firefox_start}")
+        print(f"Target: {firefox_target}")
+        print(f"Steps taken: {firefox_results['steps']}")
+        print(f"Success: {firefox_results['success']}")
+        print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds")
+
+
+async def main():
+    """Run every scenario, with and without App-Use, `runs` times over all agent configs.
+
+    Exceptions are printed with a traceback rather than propagated.
+    """
+    try:
+        # Define agent configurations to test
+        agent_configs = [
+            ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
+            ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
+            # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
+        ]
+
+        # To test the sandboxed functions without any agents: await run_test_scenario()
+
+        # Set maximum steps for each agent run, and how often the whole suite repeats
+        max_steps = 50
+        runs = 5
+        # run all scenarios
+        for _ in range(runs):
+            # Scenario 1: Messy desktop without App-Use
+            await run_scenario("messy_desktop", False, agent_configs, max_steps)
             
-            # Results
-            print(f"\n=== WIKIRACE RESULTS ===")
-            print(f"Start: {start_page}")
-            print(f"Target: {target_page}")
-            print(f"Steps taken: {steps}")
-            print(f"Success: {success}")
-            print(f"Duration: {duration:.2f} seconds")
-        finally:
-            # Important to clean up resources
-            # await computer.stop()
-            pass
+            # Scenario 1: Messy desktop with App-Use
+            await run_scenario("messy_desktop", True, agent_configs, max_steps)
+            
+            # Scenario 2: Parallel agents without App-Use
+            await run_scenario("parallel_agents", False, agent_configs, max_steps)
+            
+            # Scenario 2: Parallel agents with App-Use
+            await run_scenario("parallel_agents", True, agent_configs, max_steps)
             
     except Exception as e:
         print(f"Error in main: {e}")
         traceback.print_exc()
 
 
+async def run_test_scenario(max_iterations=5):
+    """Smoke-test the sandboxed helpers by comparing Safari and Firefox.
+
+    Opens the same randomly sampled Wikipedia articles in both browsers and
+    reports, for each one, whether get_current_wiki_page returns the same
+    value for Safari and for Firefox.
+
+    Args:
+        max_iterations: Number of articles to sample and compare
+    """
+    # Bring up a computer with default settings
+    computer = Computer()
+    await computer.run()
+
+    # Draw the test articles (without replacement) from the loaded pool
+    selected_articles = random.sample(articles, max_iterations)
+
+    print(f"\n--- Running Test Scenario for {max_iterations} iterations ---")
+
+    # Start from a clean desktop with both browsers running
+    await close_all_windows()
+    for browser in ("Safari", "Firefox"):
+        await open_app(browser)
+
+    # Verify browsers are open
+    open_apps = await get_open_app_names()
+    print(f"Open applications: {open_apps}")
+
+    for iteration, article in enumerate(selected_articles, start=1):
+        print(f"\nIteration {iteration}/{max_iterations}: Testing with article '{article}'")
+
+        # Load the article in each browser, then allow extra time to render
+        for browser in ("Safari", "Firefox"):
+            await open_wiki(article, browser)
+        await asyncio.sleep(3)
+
+        # Read back what each browser reports
+        safari_page = await get_current_wiki_page("Safari")
+        firefox_page = await get_current_wiki_page("Firefox")
+
+        print(f"Safari page: {safari_page}")
+        print(f"Firefox page: {firefox_page}")
+
+        # Report whether the two readings agree
+        if safari_page != firefox_page:
+            print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'")
+        else:
+            print(f"✅ MATCH: Both browsers show '{safari_page}'")
+
+        # Brief pause between iterations
+        await asyncio.sleep(1)
+
+    print("\n--- Test Scenario Completed ---")
+
+
 if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())

From 79bcf9d05d2347f4a03ba190f526e1c12e192448 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Tue, 3 Jun 2025 21:37:58 -0400
Subject: [PATCH 10/23] Added top wiki links

---
 examples/evals/wikipedia_most_linked.txt | 1000 ++++++++++++++++++++++
 1 file changed, 1000 insertions(+)
 create mode 100644 examples/evals/wikipedia_most_linked.txt

diff --git a/examples/evals/wikipedia_most_linked.txt b/examples/evals/wikipedia_most_linked.txt
new file mode 100644
index 00000000..877909d2
--- /dev/null
+++ b/examples/evals/wikipedia_most_linked.txt
@@ -0,0 +1,1000 @@
+ISBN (identifier)
+United States
+Main Page
+Tilde
+Doi (identifier)
+Fair use
+Association football
+Years
+Wayback Machine
+ISSN (identifier)
+India
+Wikimedia Foundation
+Wikidata
+Animal
+Taxonomy (biology)
+Australia
+France
+Eukaryote
+IP address
+U.S. state
+Time zone
+City
+Copyright
+Canada
+Town
+ASCII
+Greek alphabet
+Typographic ligature
+Diacritical mark
+Wikipedia
+Germany
+Human settlement
+Open Tree of Life
+IMDb (identifier)
+United Kingdom
+Catalogue of Life
+Insect
+Russia
+Japan
+Italy
+Arthropod
+Television show
+Public domain
+INaturalist
+Poland
+England
+PMID (identifier)
+Daylight saving time
+S2CID (identifier)
+China
+Encyclopedia of Life
+Spain
+OCLC (identifier)
+Plant
+Flickr
+Wikispecies
+Africa
+Song
+Record label
+Lepidoptera
+Iran
+English language
+Music genre
+News aggregator
+Web feed
+Proxy server
+X-Forwarded-For
+College football
+World War II
+Brazil
+Sweden
+Politics
+Olympics
+Netherlands
+Record producer
+California
+New York City
+Surname
+The New York Times
+London
+New Zealand
+PMC (identifier)
+Logo
+Synonym (taxonomy)
+Switzerland
+Turkey
+Sport
+Video game
+Architecture
+Norway
+Bibcode (identifier)
+Mexico
+Botany
+JSTOR (identifier)
+Rail transport
+Field hockey
+Ireland
+Scotland
+Belgium
+South Africa
+Common name
+Professional sports
+Sport governing body
+Sport industry
+Olympic games
+Election
+Austria
+Ukraine
+Anthroponymy
+Pakistan
+Baseball
+Denmark
+Christianity
+Philippines
+Woman
+Romania
+Czech Republic
+Album
+Godzilla Minus One
+Single (music)
+Electoral reform
+Nofollow
+Basketball
+New York (state)
+Argentina
+Finland
+Soviet Union
+Greece
+Russian language
+Historic site
+Free content
+YouTube
+Catholic Church
+Hungary
+Kingdom Hearts
+Beetle
+Company
+Tetris
+Portugal
+BioShock
+Abandonware
+Deus Ex (video game)
+4A Engine
+Yoshi's New Island
+Kaboom! (video game)
+Rain World
+Juno (Overwatch)
+Crash Team Rumble
+Vault 101
+Tales of Commons
+NHL Hockey
+Clutch Gaming
+Haseo
+Allin Kempthorne
+Ilyas El Maliki
+Ratalaika Games
+3D mousepad
+HaptX
+Walid Sultan Midani
+Rustler (video game)
+Look Outside
+Ducks Ahoy!
+Fusion Engine
+Cricket
+Geography
+Chordate
+The Guardian
+Israel
+Billboard (magazine)
+Ice hockey
+Given name
+Chicago
+World War I
+Pennsylvania
+Indonesia
+Alma mater
+Vascular plant
+Amorphea
+Wikimedia Commons
+Novel
+Village
+Visual arts
+Film poster
+Flowering plant
+Opisthokont
+Obazoa
+County seat
+Short story
+First-class cricket
+Law
+Europe
+University
+Croatia
+Sport of athletics
+Holozoa
+Choanozoa
+Filozoa
+German language
+Tennis
+Eumetazoa
+Serbia
+ParaHoxozoa
+Thailand
+History
+Midfielder
+Bilateria
+Unincorporated area
+French language
+AllMusic
+Astronomy
+Nephrozoa
+Novella
+Ship
+Twitter
+Character (arts)
+College
+Malaysia
+Conflict of interest
+Higher education
+IUCN Red List
+Rock music
+Gastropoda
+Creative Commons
+Wales
+Bulgaria
+UTC+2
+Paris
+Species
+Illinois
+HTML element
+South Korea
+BBC
+Persian language
+Moth
+Conservation status
+Pop music
+Colombia
+Wicket
+American football
+Jazz
+World Flora Online
+Los Angeles
+Songwriter
+Hong Kong
+Hdl (identifier)
+Genus
+Spanish language
+Egypt
+Not out
+Slovenia
+Chile
+Korea
+Tropicos
+Slovakia
+Bishop
+Family (biology)
+Rugby union
+Women's history
+Nigeria
+College basketball
+Sports Reference
+Washington, D.C.
+GFDL
+Afghanistan
+Sri Lanka
+Newspapers.com
+UTC+1
+Eudicots
+Estonia
+Los Angeles Times
+Olympedia
+Bangladesh
+Peru
+Singapore
+Typographical error
+UTC
+Virginia
+Taiwan
+Fast bowling
+COVID-19 pandemic
+Food
+Fish
+River
+Republic of Ireland
+Beer
+Caribbean
+Michigan
+Drink
+Chinese language
+Business
+Leg break
+Women's Test cricket
+Women's cricket
+Innings
+New Jersey
+Protostome
+Spin bowling
+Sugar
+Underarm bowling
+Roger Federer
+Googly
+Apple
+Comics
+Cricket Australia XI
+Fair and unfair play
+Anime
+Rafael Nadal
+Leander Paes
+Kazakhstan
+Capital city
+Blessed Virgin Mary
+Venezuela
+Case sensitivity
+Arabic language
+North America
+Texas
+Burger King
+The Plant List
+Justine Henin
+Sushi
+Angelus
+Beef
+Sanctification
+Cuthbert Tunstall
+Bread
+Saint Mungo
+Incumbent
+Americanism (heresy)
+Curry
+Ensoulment
+Associated Press
+Adolph John Paschang
+French cuisine
+Altar Society
+UTC-5
+Philadelphia
+Bill Mallon
+Yogurt
+Soy sauce
+Open Era (tennis)
+Belarus
+Manga
+English Wikipedia
+Islam
+Trademark
+ISO 4
+Wisconsin
+Lithuania
+The Washington Post
+Agaricus bisporus
+Reptile
+Sociology
+Organizations
+Death
+Ham and eggs
+Asia
+Swimming (sport)
+South America
+Northern Ireland
+Observation.org
+European Union
+Astronomical object
+Georgia (U.S. state)
+Gmina
+Provinces of Iran
+Computing
+Counties of Iran
+Discogs
+Mathematics
+Powiat
+Missouri
+Bachelor of Arts
+Iran Standard Time
+Florida
+Bakhsh
+Minnesota
+Oregon
+Nepal
+Variety (magazine)
+Japanese language
+Journalism
+Rome
+Computer
+Ohio
+Ontario
+Internet Archive
+Latvia
+Comedy
+Azerbaijan
+BBC News
+Morocco
+Ecdysozoa
+Print-on-demand
+Bengali language
+A5 paper
+Pedia Press
+Education
+Mollusca
+American Civil War
+Berlin
+Taxon
+Maryland
+Panarthropoda
+Hebrew language
+Toronto
+Tactopoda
+Episode
+Cuba
+Country music
+Religion
+Rotten Tomatoes
+Georgia (country)
+Classical music
+Month
+Puerto Rico
+GEOnet Names Server
+Sydney
+The Times
+Iraq
+Polyphaga
+Derivative work
+Lisbon
+Syria
+Ecuador
+Uzbekistan
+Greek language
+Latin
+United Nations
+Literature
+Animation
+Physics
+Amphibian
+Romanize
+List of countries
+Moscow
+Politician
+Philosophy
+Metacritic
+Mammal
+Pinyin
+Open access
+New South Wales
+Theatre
+Allmusic
+Syntax
+Women in music
+Fly
+Colorado
+Academic journal
+LGBTQ
+Seal (emblem)
+Rolling Stone
+Saudi Arabia
+Science fiction
+Tweet (social media)
+Heavy metal music
+Boston
+Vietnam
+Molecular biology
+Facebook
+Iceland
+Albania
+Cycling
+Tennessee
+Armenia
+Massachusetts
+Mandibulata
+United States Navy
+Communes of France
+Census
+Algeria
+United States Army
+Wikilink
+Pancrustacea
+Alternative rock
+American English
+Radio stations
+History of Romania
+Endemism
+San Francisco
+Award
+Ghana
+Judaism
+Alabama
+Blog
+The Independent
+Melbourne
+Cantons of France
+Lebanon
+West Germany
+Quotation mark
+Regions of France
+Chernivtsi Oblast
+Tokyo
+Italian language
+Connecticut
+Country
+Screenshot
+Ghost town
+Iran Daylight Time
+NatureServe
+Mongolia
+Cyprus
+Northern Bukovina
+Rugby league
+Northern Bessarabia
+State highway
+Harvard University
+Yorkshire
+Pterygota
+Slash (punctuation)
+Prize
+Science
+Asian Games
+Eastern Time Zone
+Myanmar
+Nazi Germany
+Ottoman Empire
+Quebec
+Billboard Hot 100
+United Arab Emirates
+Neoptera
+Hexapoda
+Least Concern
+Type species
+EPPO Code
+Wikisource
+Kyrgyzstan
+Allotriocarida
+Volleyball
+Geology
+Second World War
+British Columbia
+Socialism
+Zoology
+The Daily Telegraph
+Paleontology
+Vienna
+Dicondylia
+BugGuide
+United States Senate
+Hermit crab
+Paraphrase
+CNN
+Royal Navy
+Indian Standard Time
+Billboard 200
+Kenya
+DVD
+Sipuncula
+Tajikistan
+National park
+Economics
+Heterocyathus
+Uruguay
+Heteropsammia
+Road
+Spanish name
+Luxembourg
+Korean language
+UK Singles Chart
+Queensland
+Montreal
+New York Times
+Bolivia
+CP/M
+Timestamp
+Electronic music
+INSEE code
+ArXiv (identifier)
+PubMed
+SVG
+USA Today
+Omnivore
+Tunisia
+Psychology
+ESPN
+UEFA
+Hawaii
+Gastropod
+Aliyah
+North Carolina
+Russian Empire
+Tibet
+Fungi
+Oklahoma
+Fauna Europaea
+Turkmenistan
+British English
+The London Gazette
+Civil township
+Boxing
+Barack Obama
+Animal Diversity Web
+Reuters
+Eumetabola
+Voter turnout
+Transport
+False positive
+Donald Trump
+Kansas
+Antarctica
+Lake
+Ethiopia
+Time (magazine)
+Marriage
+NBC
+Beijing
+Vertebrate
+Czechoslovakia
+Protected area
+Energy
+Poetry
+Archaeology
+Columbia University
+Poverty line
+Alaska
+Computing platform
+British Empire
+University of Oxford
+Costa Rica
+Dublin
+A-side and B-side
+ZIP code
+Actinopterygii
+UTC-6
+Photoperiodism
+Mayor
+Sphaeriidae
+Animal suicide
+Atka mackerel
+Starling
+Arizona
+Entertainment Weekly
+Sphaerium beckmani
+Junqueira cow
+Zaniolepis frenata
+Campocraspedon
+Zimbabwe
+Motorsport
+Bird flight
+Cnemophilidae
+Hinduism
+Phalarope
+Indiana
+Museums
+Holometabola
+Pytilia
+North Macedonia
+Malta
+Cathartiformes
+Darter
+Saker falcon
+Cathartes
+Avian malaria
+Coal tit
+Magpie duck
+Video game developer
+Bird bath
+Vesper sparrow
+Gouldian finch
+Debeaking
+Vector graphics
+Semiplumbeous hawk
+Scottish crossbill
+Bullfinch
+Fregata
+Nidicolous
+Plushcap
+Pallid scops owl
+Hip-hop
+Blyth's frogmouth
+Sunda scops owl
+Argus (bird)
+Operation Migration
+Nik Borrow
+Per capita income
+Guy Oseary
+Madrid
+Buddhism
+Drainage basin
+Sephardic Haredim
+Rami Kleinstein
+Guy Bavli
+David Bar-Hayim
+Levin Kipnis
+Edna Arbel
+Prisoner of Zion
+Ayala Procaccia
+Nachum Heiman
+Zman Tel Aviv
+CBS
+ARIA Charts
+Cucujiformia
+Away colours
+Regex
+2019 African Games
+1962 Asian Games
+1958 Asian Games
+Chemistry
+Olympic Games
+The Middle Ages
+Central Asia
+Bengalis
+Southeast Asia
+Find a Grave
+Microsoft Windows
+Swing (politics)
+White (U.S. Census)
+Roman Catholic
+Maine
+The Times of India
+Season (sports)
+Jamaica
+Video game genre
+Munich
+Asterids
+Rosids
+Golf
+Language
+Hangul
+Atlanta
+Glasgow
+UTC+3
+Library of Congress
+Deuterostome
+COVID-19
+Video game publisher
+Montenegro
+ESPNcricinfo
+Brand
+UTC-4
+IGN
+Stockholm
+Istanbul
+NASA
+Gnathostomata
+Ukrainian language
+Human rights
+Chicago Tribune
+ProQuest
+IMDb
+River mouth
+Hip hop music
+Gene
+Netflix
+Moldova
+Barcelona
+Paraguay
+Olfactores
+Labour Party (UK)
+United States dollar
+Qatar
+Photography
+Guatemala
+Summit
+Cold War
+Running
+First World War
+Precipitation
+Edinburgh
+Amsterdam
+Lima
+New Eskaton
+Computer program
+Xinjiang
+Women in science
+Manhattan
+Warsaw
+Magazine
+Horror film
+Deadline Hollywood
+Jordan
+Aparaglossata
+Agriculture
+Internet
+Prague
+The Hindu
+Cretaceous
+Latino (U.S. Census)
+Vietnam War
+Music download
+Encyclopedia
+Chemical compounds
+Pittsburgh
+Soap opera
+Budapest
+George W. Bush
+Seattle
+Extended play
+Washington (state)
+Listed building
+Palestine
+LCCN (identifier)
+Portland, Oregon
+Panama
+Plagiarism
+Brooklyn
+Teleostomi
+Manchester
+Bird
+Mollusk
+Automobile
+Historic England
+Linguistics
+Dependent territory
+Athens
+Civil engineering
+Sea snail
+Population density
+Finance
+Disaster management
+Tanzania
+Jurassic
+Districts of Russia
+Western Australia
+Louisiana
+Portuguese language
+Anatomy
+The Beatles
+Tamil language
+Milan
+Uganda
+Natural environment
+FIFA
+Cameroon
+Blu-ray
+Mexico City
+Chemical formula
+Jimmy Wales
+Papua New Guinea
+Diaphoretickes
+UNESCO
+Forbes
+Technology
+Buenos Aires
+Vancouver
+Dominican Republic
+2007
+Species description
+East Germany
+Folk music
+Kentucky
+Multimedia
+Monocotyledon
+Rio de Janeiro
+Automated
+Hindi
+Houston
+Google
+Devonian
+Member of Parliament
+Bible
+Mumbai
+FishBase
+African diaspora
+Carboniferous
+Cambrian
+Triassic
+Montana
+Handball
+Ordovician
+San Diego
+Archive.today
+Stanford University
+British Army
+Middle Ages
+Frequency
+Ultratop
+Permian
+Detroit
+Earth
+Precambrian
+Hamburg
+Alberta
+Tamil Nadu
+Madagascar
+Lancashire
+Guitar
+Trade union
+Instagram
+Engineering
+2006
+Silurian
+NPR
+Railway station
+CAS Registry Number
+Yemen
+Noctuoidea
+Fiji
+Haiti
+Rowing (sport)
+New Orleans
+NME
+Alternative media
+North Korea
+Microsoft
+Jerusalem
+Paleogene
+Audery Mill Creek
+Horse racing
+Post town
+Piano
+Bavaria
+Polish language
+Horror fiction
+Neogene
+Kerala
+Copenhagen
+Google Books
+Central Time Zone
+Island
+Birmingham
+Anglicanism
+Software
+Mountain range
+Investment
+Brussels
+Muhammad Ali
+Asian (U.S. Census)
+Video game culture
+Brisbane
+Church of England
+Kosovo
+Bachelor of Science
+Molar mass
+Arachnid
+Own goal
+Yale University
+Caenogastropoda
+Auckland
+World Athletics
+Trinidad and Tobago
+Hanyu Pinyin
+Sound bite
+Time
+El Salvador
+Microbiology
+Columbia Records
+Seoul
+Cerambycidae
+Maharashtra
+Chelicerata
+Fungus
+Media influence
+South Carolina
+Radio
+Telenovela
+FA Cup
+Senegal
+Internet trolling
+Nashville, Tennessee
+Demonym
+Standard Chinese
+Sculpture
+Liverpool
+Thesis
+Bass guitar
+Chess
+Women artists
+Icon (computing)
+PubChem
+UK Albums Chart
+Head coach
+Roman Empire
+Grand Slam (tennis)
+JSmol
+Formula One
+Biology
+Kent
+Ancient Rome
+Inner Carniola
+Oslo
+Dutch language
+Wingspan
+Archaeplastida
+MTV
+Edvard Ravnikar
+ITunes
+Feminism
+German Empire
+Pacific Ocean
+Atlantic Ocean
+Pharmacology
+Track gauge
+ChemSpider
+Doctor of Philosophy
+Regions of England
+Districts of England
+Christmas
+Pavel Golia
+Predjama Castle
+Overtime (sports)
+Forum
+Swiss Hitparade
+Stumped
+Majority
+Male
+Shanghai
+Siddharta (band)
\ No newline at end of file

From 58a453dc496f2f70396016948aded91ac166ca5f Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 09:05:20 -0400
Subject: [PATCH 11/23] App-usage stability fixes

---
 .../computer_server/diorama/diorama.py        |  28 ++++-
 libs/computer/computer/diorama_computer.py    |   2 +-
 libs/computer/computer/ui/gradio/app.py       | 111 +++++++++++++-----
 3 files changed, 107 insertions(+), 34 deletions(-)

diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py
index bf30a018..e781395c 100644
--- a/libs/computer-server/computer_server/diorama/diorama.py
+++ b/libs/computer-server/computer_server/diorama/diorama.py
@@ -36,11 +36,21 @@ class Diorama:
         cls._ensure_scheduler()
         return cls(args).computer
 
+    # Dictionary to store cursor positions for each unique app_list hash
+    _cursor_positions = {}
+    
     def __init__(self, app_list):
         self.app_list = app_list
         self.interface = self.Interface(self)
         self.computer = DioramaComputer(self)
         self.focus_context = None
+        
+        # Create a hash for this app_list to use as a key
+        self.app_list_hash = hash(tuple(sorted(app_list)))
+        
+        # Initialize cursor position for this app_list if it doesn't exist
+        if self.app_list_hash not in Diorama._cursor_positions:
+            Diorama._cursor_positions[self.app_list_hash] = (0, 0)
 
     @classmethod
     def _ensure_scheduler(cls):
@@ -67,10 +77,11 @@ class Diorama:
             frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist)
             focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger)
             
+            app_list_hash = hash(tuple(sorted(app_whitelist)))
+            
             with focus_context:
                 try:
                     if action == "screenshot":
-                        app_whitelist = list(args["app_list"])
                         logger.info(f"Taking screenshot for apps: {app_whitelist}")
                         result, img = capture_all_apps(
                             app_whitelist=app_whitelist,
@@ -82,8 +93,15 @@ class Diorama:
                             future.set_result((result, img))
                     # Mouse actions
                     elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]:
-                        x = args.get("x")
-                        y = args.get("y")
+                        # Get last cursor position for this app_list hash
+                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+                        
+                        x = args.get("x", last_pos[0])
+                        y = args.get("y", last_pos[1])
+                        
+                        # Update the cursor position for this app_list hash
+                        Diorama._cursor_positions[app_list_hash] = (x, y)
+                        
                         duration = args.get("duration", 0.5)
                         if action == "left_click":
                             await automation_handler.left_click(x, y)
@@ -98,6 +116,10 @@ class Diorama:
                         if future:
                             future.set_result(None)
                     elif action in ["scroll_up", "scroll_down"]:
+                        # Move cursor to last known position for this app_list hash
+                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+                        await automation_handler.move_cursor(*last_pos)
+                        
                         clicks = args.get("clicks", 1)
                         if action == "scroll_up":
                             await automation_handler.scroll_up(clicks)
diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py
index 608e6721..5cad0006 100644
--- a/libs/computer/computer/diorama_computer.py
+++ b/libs/computer/computer/diorama_computer.py
@@ -37,7 +37,7 @@ class DioramaComputerInterface:
             raise RuntimeError("Computer interface not initialized. Call run() first.")
         result = await iface.diorama_cmd(action, arguments)
         if not result.get("success"):
-            raise RuntimeError(f"Diorama command failed: {result.get('error')}")
+            raise RuntimeError(f"Diorama command failed: {result.get('error')}\n{result.get('trace')}")
         return result.get("result")
 
     async def screenshot(self, as_bytes=True):
diff --git a/libs/computer/computer/ui/gradio/app.py b/libs/computer/computer/ui/gradio/app.py
index 1a93b27d..b1d131d9 100644
--- a/libs/computer/computer/ui/gradio/app.py
+++ b/libs/computer/computer/ui/gradio/app.py
@@ -463,7 +463,7 @@ async def execute(name, action, arguments):
         elif action == "left_click":
             if "x" in arguments and "y" in arguments:
                 await computer.interface.move_cursor(arguments["x"], arguments["y"])
-            await computer.interface.left_click()
+            await computer.interface.left_click(arguments.get("x"), arguments.get("y"))  # x/y are optional here, so avoid a KeyError
             await asyncio.sleep(0.5)
         elif action == "right_click":
             if "x" in arguments and "y" in arguments:
@@ -528,43 +528,75 @@ async def execute(name, action, arguments):
     
     return results
 
-async def handle_init_computer(os_choice: str):
-    """Initialize the computer instance and tools for macOS or Ubuntu"""
+async def handle_init_computer(os_choice: str, app_list=None, provider="lume"):
+    """Initialize the computer instance and tools for macOS or Ubuntu
+    
+    Args:
+        os_choice: The OS to use ("macOS" or "Ubuntu")
+        app_list: Optional list of apps to focus on using the app-use experiment
+        provider: The provider to use ("lume" or "self")
+    """
     global computer, tool_call_logs, tools
-
+    
+    # Check if we should enable app-use experiment
+    use_app_experiment = app_list and len(app_list) > 0
+    experiments = ["app-use"] if use_app_experiment else None
+    
+    # Determine if we should use host computer server
+    use_host_computer_server = provider == "self"
+    
     if os_choice == "Ubuntu":
-        computer = Computer(
-            image="ubuntu-noble-vanilla:latest",
-            os_type="linux",
-            provider_type=VMProviderType.LUME,
-            display="1024x768",
-            memory="8GB",
-            cpu="4"
-        )
         os_type_str = "linux"
         image_str = "ubuntu-noble-vanilla:latest"
     else:
+        os_type_str = "macos"
+        image_str = "macos-sequoia-cua:latest"
+    
+    # Create computer instance with appropriate configuration
+    if use_host_computer_server:
         computer = Computer(
-            image="macos-sequoia-cua:latest",
-            os_type="macos",
+            os_type=os_type_str,
+            use_host_computer_server=True,
+            experiments=experiments
+        )
+    else:
+        computer = Computer(
+            image=image_str,
+            os_type=os_type_str,
             provider_type=VMProviderType.LUME,
             display="1024x768",
             memory="8GB",
-            cpu="4"
+            cpu="4",
+            experiments=experiments
         )
-        os_type_str = "macos"
-        image_str = "macos-sequoia-cua:latest"
 
     await computer.run()
+    
+    # If app list is provided, create desktop from apps
+    if use_app_experiment:
+        computer = computer.create_desktop_from_apps(app_list)
 
     # Log computer initialization as a tool call
-    result = await execute("computer", "initialize", {
+    init_params = {
         "os": os_type_str,
-        "image": image_str,
-        "display": "1024x768",
-        "memory": "8GB",
-        "cpu": "4"
-    })
+        "provider": provider
+    }
+    
+    # Add VM-specific parameters if not using host computer server
+    if not use_host_computer_server:
+        init_params.update({
+            "image": image_str,
+            "display": "1024x768",
+            "memory": "8GB",
+            "cpu": "4"
+        })
+    
+    # Add app list to the log if provided
+    if use_app_experiment:
+        init_params["apps"] = app_list
+        init_params["experiments"] = ["app-use"]
+    
+    result = await execute("computer", "initialize", init_params)
 
     return result["screenshot"], json.dumps(tool_call_logs, indent=2)
 
@@ -1029,12 +1061,31 @@ def create_gradio_ui():
                     setup_status = gr.Textbox(label="Setup Status", value="")
                 
                 with gr.Group():
-                    os_choice = gr.Radio(
-                        label="OS",
-                        choices=["macOS", "Ubuntu"],
-                        value="macOS",
-                        interactive=False # disable until the ubuntu image is ready
-                    )
+                    with gr.Accordion("Computer Configuration", open=False):
+                        with gr.Row():
+                            os_choice = gr.Radio(
+                                label="OS",
+                                choices=["macOS", "Ubuntu"],
+                                value="macOS",
+                                interactive=False # disable until the ubuntu image is ready
+                            )
+                            
+                            # Provider selection radio
+                            provider_choice = gr.Radio(
+                                label="Provider",
+                                choices=["lume", "self"],
+                                value="lume",
+                                info="'lume' uses a VM, 'self' uses the host computer server"
+                            )
+                        
+                        # App filtering dropdown for app-use experiment
+                        app_filter = gr.Dropdown(
+                            label="Filter by apps (App-Use)",
+                            multiselect=True,
+                            allow_custom_value=True,
+                            info="When apps are selected, the computer will focus on those apps using the app-use experiment"
+                        )
+                    
                     start_btn = gr.Button("Initialize Computer")
                 
                 with gr.Group():
@@ -1199,7 +1250,7 @@ def create_gradio_ui():
         )
                 
         img.select(handle_click, inputs=[img, click_type], outputs=[img, action_log])
-        start_btn.click(handle_init_computer, inputs=[os_choice], outputs=[img, action_log])
+        start_btn.click(handle_init_computer, inputs=[os_choice, app_filter, provider_choice], outputs=[img, action_log])
         wait_btn.click(handle_wait, outputs=[img, action_log])
         
         # DONE and FAIL buttons just do a placeholder action

From c2302eb6c607f074916051f411b3eeaec54aa38f Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 09:46:09 -0400
Subject: [PATCH 12/23] Added results table

---
 examples/eval_examples.py | 74 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index 1978a897..b3d163ca 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -11,13 +11,16 @@ import urllib.request
 import datetime
 from urllib.parse import quote
 
+# Global variable to track all results
+all_results = []
+
 # Wikirace prompt template
 WIKIRACE_PROMPT_TEMPLATE = """
 You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" 
 by clicking only on Wikipedia links within articles.
 
 Rules:
-1. Only click on links within Wikipedia articles (blue underlined text)
+1. Only click on links within Wikipedia articles (blue text)
 2. No using search, back button, or typing URLs
 3. You MAY use cmd+f (or ctrl+f) to find text on the current page
 4. Do NOT click any search icon or type into any search box unless it's a browser command
@@ -26,6 +29,7 @@ Rules:
 7. Do not maximize the window or use any other application
 8. Avoid wasting actions by scrolling
 9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps
+10. Stay on the English Wikipedia
 
 Look at the current page and click on a link that might lead you closer to {target_page}.
 """
@@ -36,6 +40,7 @@ _print = print
 # Define log file path
 project_root = Path(__file__).parent.parent
 log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"
+results_file = project_root / "examples" / "evals" / "eval_appuse_results.md"
 
 # Custom print function that also logs to file
 def print(*args, **kwargs):
@@ -160,6 +165,36 @@ def get_article_pair(depth=5):
         target_article = wikipedia_random_walk(start_article, depth)[-1]
     return start_article, target_article
 
+
+def save_results_to_markdown():
+    """Append a markdown table of every result in ``all_results`` to ``results_file``.
+
+    Each call appends a new "Results Update" section holding the whole table; when
+    nothing has been recorded it only prints a notice and returns.
+    """
+    if not all_results:
+        print("No results to save")
+        return
+
+    def cell(value):
+        # A literal "|" (e.g. in an article title) would otherwise split the cell.
+        return str(value).replace("|", "\\|")
+    # Results carry no timestamp of their own, so all rows share the save time.
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    keys = ("scenario", "app_use", "browser", "config", "start", "target", "steps", "success")
+    lines = [
+        "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |",
+        "|---|---|---|---|---|---|---|---|---|---|",
+    ]
+    for result in all_results:
+        cells = [timestamp, *(cell(result[key]) for key in keys), f"{result['duration']:.2f}"]
+        lines.append("| " + " | ".join(cells) + " |")
+
+    # Explicit UTF-8: article titles are not guaranteed to fit the locale's default encoding.
+    with open(results_file, "a", encoding="utf-8") as f:
+        f.write(f"\n\n## Results Update - {timestamp}\n\n" + "\n".join(lines))
+    print(f"Results saved to {results_file}")
+
 async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
     """Run a specific evaluation scenario"""
     
@@ -254,6 +289,7 @@ async def open_wiki(page, app_name="Safari"):
 
 
 async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
+    global all_results
     """Run the messy desktop scenario with a single agent"""
     # Get popular wiki articles
     global articles
@@ -292,7 +328,8 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
         agent = ComputerAgent(
             computer=agent_computer,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_appuse"
         )
         
         # Run the wikirace
@@ -387,6 +424,7 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
 
 
 async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
+    global all_results
     
     """Run two agents in parallel, one using Safari and one using Firefox"""
     # Get popular wiki articles
@@ -436,13 +474,15 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
         safari_agent = ComputerAgent(
             computer=safari_desktop,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_parallel_safari"
         )
         
         firefox_agent = ComputerAgent(
             computer=firefox_desktop,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_parallel_firefox"
         )
         
         # Create prompts using the template
@@ -525,6 +565,24 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
                         print(f"{browser} current page: {current_page}")
                         print(f"{browser} target: {target_page}") 
                         
+                        # Add result to global tracking
+                        global all_results
+                        current_result = {
+                            'scenario': 'parallel_agents',
+                            'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No',
+                            'browser': browser,
+                            'config': config_name,
+                            'start': start_page,
+                            'target': target_page,
+                            'steps': results['steps'],
+                            'success': results['success'],
+                            'duration': time.time() - results['start_time']
+                        }
+                        all_results.append(current_result)
+                        
+                        # Save results after each step
+                        save_results_to_markdown()
+                        
                         # Check if we reached the target
                         if current_page and target_page.lower() in current_page.lower():
                             results["success"] = True
@@ -575,9 +633,9 @@ async def main():
         
         # Define agent configurations to test
         agent_configs = [
-            ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
-            ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
-            # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
+            # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
+            # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
+            ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
         ]
         
         # # Run the test scenario without any agents
@@ -585,7 +643,7 @@ async def main():
         # await run_test_scenario()
         
         # Set maximum steps for each agent run
-        max_steps = 50
+        max_steps = 15
         runs = 5
 
         # run all scenarios

From 4693f2f0eb69dfb78af560abd7e20675db7dde2d Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 09:58:13 -0400
Subject: [PATCH 13/23] Fixed 'max() iterable argument is empty'

---
 libs/computer-server/computer_server/diorama/draw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/computer-server/computer_server/diorama/draw.py b/libs/computer-server/computer_server/diorama/draw.py
index ac90106e..9fce809f 100644
--- a/libs/computer-server/computer_server/diorama/draw.py
+++ b/libs/computer-server/computer_server/diorama/draw.py
@@ -377,7 +377,7 @@ def draw_desktop_screenshot(app_whitelist: List[str] = None, all_windows: List[D
         
         dock_orientation = "side" if dock_bounds["width"] < dock_bounds["height"] else "bottom"
         
-        menubar_length = max(item["bounds"]["x"] + item["bounds"]["width"] for item in menubar_items)
+        menubar_length = max(item["bounds"]["x"] + item["bounds"]["width"] for item in menubar_items) if menubar_items else 0
                 
         # Calculate bounds of app windows
         app_bounds = {

From aa4dc71b9c6b9b7621b8614ad8d56f346f776a37 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 11:06:43 -0400
Subject: [PATCH 14/23] Mouse bug fixes

---
 .../computer_server/diorama/diorama.py        | 69 +++++++++++++++----
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py
index e781395c..20f5b1fb 100644
--- a/libs/computer-server/computer_server/diorama/diorama.py
+++ b/libs/computer-server/computer_server/diorama/diorama.py
@@ -77,8 +77,6 @@ class Diorama:
             frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist)
             focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger)
             
-            app_list_hash = hash(tuple(sorted(app_whitelist)))
-            
             with focus_context:
                 try:
                     if action == "screenshot":
@@ -93,11 +91,8 @@ class Diorama:
                             future.set_result((result, img))
                     # Mouse actions
                     elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]:
-                        # Get last cursor position for this app_list hash
-                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
-                        
-                        x = args.get("x", last_pos[0])
-                        y = args.get("y", last_pos[1])
+                        x = args.get("x")
+                        y = args.get("y")
                         
                         # Update the cursor position for this app_list hash
                         Diorama._cursor_positions[app_list_hash] = (x, y)
@@ -116,9 +111,10 @@ class Diorama:
                         if future:
                             future.set_result(None)
                     elif action in ["scroll_up", "scroll_down"]:
-                        # Move cursor to last known position for this app_list hash
-                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
-                        await automation_handler.move_cursor(*last_pos)
+                        x = args.get("x")
+                        y = args.get("y")
+                        if x is not None and y is not None:
+                            await automation_handler.move_cursor(x, y)
                         
                         clicks = args.get("clicks", 1)
                         if action == "scroll_up":
@@ -197,22 +193,57 @@ class Diorama:
                 return img
 
         async def left_click(self, x, y):
+            # Fall back to the last position stored for this app list only when x or y is None (0 is a valid coordinate)
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = (last_pos[0] if x is None else x), (last_pos[1] if y is None else y)
+            # Store the resolved position under this app list's hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+
             sx, sy = await self.to_screen_coordinates(x, y)
             await self._send_cmd("left_click", {"x": sx, "y": sy})
 
         async def right_click(self, x, y):
+            # Fall back to the last position stored for this app list only when x or y is None (0 is a valid coordinate)
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = (last_pos[0] if x is None else x), (last_pos[1] if y is None else y)
+            # Store the resolved position under this app list's hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
             sx, sy = await self.to_screen_coordinates(x, y)
             await self._send_cmd("right_click", {"x": sx, "y": sy})
 
         async def double_click(self, x, y):
+            # Fall back to the last position stored for this app list only when x or y is None (0 is a valid coordinate)
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = (last_pos[0] if x is None else x), (last_pos[1] if y is None else y)
+            # Store the resolved position under this app list's hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
             sx, sy = await self.to_screen_coordinates(x, y)
             await self._send_cmd("double_click", {"x": sx, "y": sy})
 
         async def move_cursor(self, x, y):
+            # Fall back to the last position stored for this app list only when x or y is None (0 is a valid coordinate)
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = (last_pos[0] if x is None else x), (last_pos[1] if y is None else y)
+            # Store the resolved position under this app list's hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
             sx, sy = await self.to_screen_coordinates(x, y)
             await self._send_cmd("move_cursor", {"x": sx, "y": sy})
 
         async def drag_to(self, x, y, duration=0.5):
+            # Fall back to the last position stored for this app list only when x or y is None (0 is a valid coordinate)
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = (last_pos[0] if x is None else x), (last_pos[1] if y is None else y)
+            # Store the resolved position under this app list's hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
             sx, sy = await self.to_screen_coordinates(x, y)
             await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration})
 
@@ -229,10 +260,24 @@ class Diorama:
             await self._send_cmd("hotkey", {"keys": list(keys)})
 
         async def scroll_up(self, clicks: int = 1):
-            await self._send_cmd("scroll_up", {"clicks": clicks})
+            # Get last cursor position for this app_list hash
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = last_pos[0], last_pos[1]
+            # Update cursor position for this app_list hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
+            await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y})
 
         async def scroll_down(self, clicks: int = 1):
-            await self._send_cmd("scroll_down", {"clicks": clicks})
+            # Get last cursor position for this app_list hash
+            app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
+            last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+            x, y = last_pos[0], last_pos[1]
+            # Update cursor position for this app_list hash
+            Diorama._cursor_positions[app_list_hash] = (x, y)
+            
+            await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y})
 
         async def get_screen_size(self) -> dict[str, int]:
             if not self._scene_size:

From 92ca6d2923e9dc426444d9175542a1ed9f2e4576 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 12:03:06 -0400
Subject: [PATCH 15/23] Remove leftover cursor-position updates in Diorama

---
 libs/computer-server/computer_server/diorama/diorama.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py
index 20f5b1fb..fc426a7c 100644
--- a/libs/computer-server/computer_server/diorama/diorama.py
+++ b/libs/computer-server/computer_server/diorama/diorama.py
@@ -94,9 +94,6 @@ class Diorama:
                         x = args.get("x")
                         y = args.get("y")
                         
-                        # Update the cursor position for this app_list hash
-                        Diorama._cursor_positions[app_list_hash] = (x, y)
-                        
                         duration = args.get("duration", 0.5)
                         if action == "left_click":
                             await automation_handler.left_click(x, y)
@@ -264,8 +261,6 @@ class Diorama:
             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
             x, y = last_pos[0], last_pos[1]
-            # Update cursor position for this app_list hash
-            Diorama._cursor_positions[app_list_hash] = (x, y)
             
             await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y})
 
@@ -274,8 +269,6 @@ class Diorama:
             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
             x, y = last_pos[0], last_pos[1]
-            # Update cursor position for this app_list hash
-            Diorama._cursor_positions[app_list_hash] = (x, y)
             
             await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y})
 

From 08ce9c67c17724b75e238cc2d8808caa00592087 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 14:07:24 -0400
Subject: [PATCH 16/23] Replaced pyautogui with pynput on macOS (#125, #165)

---
 .../computer_server/handlers/macos.py         | 85 ++++++++++++-------
 libs/computer-server/pyproject.toml           |  1 +
 2 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py
index 1e5c5ceb..8afc5cc1 100644
--- a/libs/computer-server/computer_server/handlers/macos.py
+++ b/libs/computer-server/computer_server/handlers/macos.py
@@ -1,4 +1,7 @@
 import pyautogui
+from pynput.mouse import Button, Controller as MouseController
+from pynput.keyboard import Key, Controller as KeyboardController
+import time
 import base64
 from io import BytesIO
 from typing import Optional, Dict, Any, List, Tuple
@@ -336,7 +339,6 @@ class UIElement:
             "position": position,
             "size": size,
             "enabled": self.enabled,
-            "focused": self.focused,
             "bbox": self.bbox,
             "visible_bbox": self.visible_bbox,
             "children": children_to_dict(self.children),
@@ -527,11 +529,14 @@ class MacOSAccessibilityHandler(BaseAccessibilityHandler):
 
 class MacOSAutomationHandler(BaseAutomationHandler):
     # Mouse Actions
+    mouse = MouseController()
+    keyboard = KeyboardController()
+
     async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
         try:
             if x is not None and y is not None:
-                pyautogui.moveTo(x, y)
-            pyautogui.click()
+                self.mouse.position = (x, y)
+            self.mouse.click(Button.left, 1)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
@@ -539,8 +544,8 @@ class MacOSAutomationHandler(BaseAutomationHandler):
     async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]:
         try:
             if x is not None and y is not None:
-                pyautogui.moveTo(x, y)
-            pyautogui.rightClick()
+                self.mouse.position = (x, y)
+            self.mouse.click(Button.right, 1)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
@@ -550,15 +555,15 @@ class MacOSAutomationHandler(BaseAutomationHandler):
     ) -> Dict[str, Any]:
         try:
             if x is not None and y is not None:
-                pyautogui.moveTo(x, y)
-            pyautogui.doubleClick(interval=0.1)
+                self.mouse.position = (x, y)
+            self.mouse.click(Button.left, 2)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
     async def move_cursor(self, x: int, y: int) -> Dict[str, Any]:
         try:
-            pyautogui.moveTo(x, y)
+            self.mouse.position = (x, y)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
@@ -567,9 +572,26 @@ class MacOSAutomationHandler(BaseAutomationHandler):
         self, x: int, y: int, button: str = "left", duration: float = 0.5
     ) -> Dict[str, Any]:
         try:
-            pyautogui.dragTo(x, y, button=button, duration=duration)
+            btn = Button.left if button == "left" else Button.right
+            # Press
+            self.mouse.press(btn)
+            # Move with sleep to simulate drag duration
+            start = self.mouse.position
+            steps = 20
+            start_x, start_y = start
+            dx = (x - start_x) / steps
+            dy = (y - start_y) / steps
+            for i in range(steps):
+                self.mouse.position = (int(start_x + dx * (i + 1)), int(start_y + dy * (i + 1)))
+                time.sleep(duration / steps)
+            # Release
+            self.mouse.release(btn)
             return {"success": True}
         except Exception as e:
+            try:
+                self.mouse.release(btn)
+            except:
+                pass
             return {"success": False, "error": str(e)}
 
     async def drag(
@@ -578,29 +600,19 @@ class MacOSAutomationHandler(BaseAutomationHandler):
         try:
             if not path or len(path) < 2:
                 return {"success": False, "error": "Path must contain at least 2 points"}
-            
+            btn = Button.left if button == "left" else Button.right
             # Move to the first point
-            start_x, start_y = path[0]
-            pyautogui.moveTo(start_x, start_y)
-            
-            # Press the mouse button
-            pyautogui.mouseDown(button=button)
-            
-            # Calculate time between points to distribute duration evenly
+            self.mouse.position = path[0]
+            self.mouse.press(btn)
             step_duration = duration / (len(path) - 1) if len(path) > 1 else duration
-            
-            # Move through each subsequent point
             for x, y in path[1:]:
-                pyautogui.moveTo(x, y, duration=step_duration)
-            
-            # Release the mouse button
-            pyautogui.mouseUp(button=button)
-            
+                self.mouse.position = (x, y)
+                time.sleep(step_duration)
+            self.mouse.release(btn)
             return {"success": True}
         except Exception as e:
-            # Make sure to release the mouse button if an error occurs
             try:
-                pyautogui.mouseUp(button=button)
+                self.mouse.release(btn)
             except:
                 pass
             return {"success": False, "error": str(e)}
@@ -608,21 +620,28 @@ class MacOSAutomationHandler(BaseAutomationHandler):
     # Keyboard Actions
     async def type_text(self, text: str) -> Dict[str, Any]:
         try:
-            pyautogui.write(text)
+            self.keyboard.type(text)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
     async def press_key(self, key: str) -> Dict[str, Any]:
         try:
-            pyautogui.press(key)
+            # Try to map string to Key else use as char
+            k = getattr(Key, key, key)
+            self.keyboard.press(k)
+            self.keyboard.release(k)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
     async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
         try:
-            pyautogui.hotkey(*keys)
+            key_objs = [getattr(Key, k, k) for k in keys]
+            for k in key_objs:
+                self.keyboard.press(k)
+            for k in reversed(key_objs):
+                self.keyboard.release(k)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
@@ -630,14 +649,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
     # Scrolling Actions
     async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]:
         try:
-            pyautogui.scroll(-clicks)
+            self.mouse.scroll(0, -clicks)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
     async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]:
         try:
-            pyautogui.scroll(clicks)
+            self.mouse.scroll(0, clicks)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
@@ -668,8 +687,8 @@ class MacOSAutomationHandler(BaseAutomationHandler):
 
     async def get_cursor_position(self) -> Dict[str, Any]:
         try:
-            pos = pyautogui.position()
-            return {"success": True, "position": {"x": pos.x, "y": pos.y}}
+            x, y = self.mouse.position
+            return {"success": True, "position": {"x": x, "y": y}}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
diff --git a/libs/computer-server/pyproject.toml b/libs/computer-server/pyproject.toml
index b5480f0f..cbf9821a 100644
--- a/libs/computer-server/pyproject.toml
+++ b/libs/computer-server/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
     "uvicorn[standard]>=0.27.0",
     "pydantic>=2.0.0",
     "pyautogui>=0.9.54",
+    "pynput>=1.8.1",
     "pillow>=10.2.0",
     "aiohttp>=3.9.1"
 ]

From dd717764e3be295c01599f7fe5ff7aaa23a65ec1 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 14:26:42 -0400
Subject: [PATCH 17/23] Revert presskey and hotkey to pyautogui

---
 .../computer-server/computer_server/handlers/macos.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py
index 8afc5cc1..713ac371 100644
--- a/libs/computer-server/computer_server/handlers/macos.py
+++ b/libs/computer-server/computer_server/handlers/macos.py
@@ -627,21 +627,14 @@ class MacOSAutomationHandler(BaseAutomationHandler):
 
     async def press_key(self, key: str) -> Dict[str, Any]:
         try:
-            # Try to map string to Key else use as char
-            k = getattr(Key, key, key)
-            self.keyboard.press(k)
-            self.keyboard.release(k)
+            pyautogui.press(key)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}
 
     async def hotkey(self, keys: List[str]) -> Dict[str, Any]:
         try:
-            key_objs = [getattr(Key, k, k) for k in keys]
-            for k in key_objs:
-                self.keyboard.press(k)
-            for k in reversed(key_objs):
-                self.keyboard.release(k)
+            pyautogui.hotkey(*keys)
             return {"success": True}
         except Exception as e:
             return {"success": False, "error": str(e)}

From 8f5f72ab213ef48cde32ea7461d923072a2521f1 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 14:47:46 -0400
Subject: [PATCH 18/23] Replace eval_examples.py with sandboxed_functions_examples.py

---
 examples/eval_examples.py                | 724 -----------------------
 examples/sandboxed_functions_examples.py |  54 ++
 2 files changed, 54 insertions(+), 724 deletions(-)
 delete mode 100644 examples/eval_examples.py
 create mode 100644 examples/sandboxed_functions_examples.py

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
deleted file mode 100644
index b3d163ca..00000000
--- a/examples/eval_examples.py
+++ /dev/null
@@ -1,724 +0,0 @@
-import os
-import asyncio
-import json
-import random
-from pathlib import Path
-import sys
-import traceback
-import time
-from functools import wraps
-import urllib.request
-import datetime
-from urllib.parse import quote
-
-# Global variable to track all results
-all_results = []
-
-# Wikirace prompt template
-WIKIRACE_PROMPT_TEMPLATE = """
-You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" 
-by clicking only on Wikipedia links within articles.
-
-Rules:
-1. Only click on links within Wikipedia articles (blue text)
-2. No using search, back button, or typing URLs
-3. You MAY use cmd+f (or ctrl+f) to find text on the current page
-4. Do NOT click any search icon or type into any search box unless it's a browser command
-5. Try to find the shortest path possible
-6. Current target: {target_page}
-7. Do not maximize the window or use any other application
-8. Avoid wasting actions by scrolling
-9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps
-10. Stay on the English Wikipedia
-
-Look at the current page and click on a link that might lead you closer to {target_page}.
-"""
-
-# Store original print function
-_print = print
-
-# Define log file path
-project_root = Path(__file__).parent.parent
-log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"
-results_file = project_root / "examples" / "evals" / "eval_appuse_results.md"
-
-# Custom print function that also logs to file
-def print(*args, **kwargs):
-    # Call the original print function
-    _print(*args, **kwargs)
-    
-    # Format the output as a string
-    output = " ".join(str(arg) for arg in args)
-    if kwargs.get("end") is not None:
-        output += kwargs["end"]
-    else:
-        output += "\n"
-    
-    # Add timestamp
-    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    log_entry = f"[{timestamp}] {output}"
-    
-    # Append to log file
-    with open(log_file, "a") as f:
-        f.write(log_entry)
-
-# Load environment variables from .env file
-env_file = project_root / ".env"
-print(f"Loading environment from: {env_file}")
-from dotenv import load_dotenv
-
-load_dotenv(env_file)
-
-# Add paths to sys.path if needed
-pythonpath = os.environ.get("PYTHONPATH", "")
-for path in pythonpath.split(":"):
-    if path and path not in sys.path:
-        sys.path.insert(0, path)  # Insert at beginning to prioritize
-        print(f"Added to sys.path: {path}")
-
-from computer.computer import Computer
-from computer.providers.base import VMProviderType
-from computer.logger import LogLevel
-from computer.helpers import sandboxed
-
-# Assuming these exist based on your request
-from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
-
-articles = []
-
-# Load from file
-articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt"
-with open(articles_file, "r") as f:
-    articles = [line.strip() for line in f]
-
-
-def get_article_links(article_title):
-    """Get all links from a Wikipedia article's content"""
-    try:
-        # Get the article content
-        url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json"
-        
-        with urllib.request.urlopen(url) as response:
-            data = json.loads(response.read().decode())
-            
-        pages = data.get('query', {}).get('pages', {})
-        if not pages:
-            return []
-        
-        # Get the first (and only) page
-        page = next(iter(pages.values()))
-        links = page.get('links', [])
-        
-        # Filter links to keep only main namespace articles (no special pages, files, etc.)
-        article_links = []
-        for link in links:
-            title = link.get('title', '')
-            # Skip if title contains colons (indicates special pages, files, categories, etc.)
-            if ':' not in title and title.isascii() and len(title) < 50:
-                article_links.append(title)
-        
-        return article_links
-    
-    except Exception as e:
-        print(f"Error fetching links for {article_title}: {e}")
-        return []
-
-def wikipedia_random_walk(start_article, depth=5):
-    """
-    Perform a random walk through Wikipedia articles
-    
-    Args:
-        start_article (str): The article title to start from
-        depth (int): How many steps to take in the random walk
-    
-    Returns:
-        list: Path of article titles visited during the walk
-    """
-    path = [start_article]
-    current_article = start_article
-    
-    for step in range(depth):
-        print(f"Step {step + 1}: Currently at '{current_article}'")
-        
-        # Get links from current article
-        links = get_article_links(current_article)
-        
-        if not links:
-            print(f"No valid links found in '{current_article}'. Ending walk.")
-            break
-        
-        # Randomly select next article
-        next_article = random.choice(links)
-        path.append(next_article)
-        current_article = next_article
-        
-        print(f"  -> Moving to '{next_article}'")
-    
-    return path
-
-def get_article_pair(depth=5):
-    global articles
-    start_article = random.choice(articles)
-    target_article = wikipedia_random_walk(start_article, depth)[-1]
-    while target_article == start_article:
-        start_article = random.choice(articles)
-        target_article = wikipedia_random_walk(start_article, depth)[-1]
-    return start_article, target_article
-
-
-def save_results_to_markdown():
-    """Save all results to a markdown table"""
-    global all_results
-    
-    if not all_results:
-        print("No results to save")
-        return
-    
-    # Create header for the markdown table
-    header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |"
-    separator = "|---|---|---|---|---|---|---|---|---|---|"
-    
-    # Create rows for each result
-    rows = []
-    for result in all_results:
-        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        row = f"| {timestamp} | {result['scenario']} | {result['app_use']} | {result['browser']} | {result['config']} | {result['start']} | {result['target']} | {result['steps']} | {result['success']} | {result['duration']:.2f} |"
-        rows.append(row)
-    
-    # Combine header, separator, and rows
-    table = "\n".join([header, separator] + rows)
-    
-    # Write to file (append mode)
-    with open(results_file, "a") as f:
-        f.write(f"\n\n## Results Update - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-        f.write(table)
-    
-    print(f"Results saved to {results_file}")
-
-async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
-    """Run a specific evaluation scenario"""
-    
-    print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===")
-    
-    # Create computer instance with or without app-use experiment
-    experiments = ["app-use"] if use_app_use else []
-    computer = Computer(experiments=experiments)
-    
-    try:
-        # Run the computer
-        await computer.run()
-        
-        # Install required packages
-        await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
-        
-        # Run the specific scenario
-        if scenario_name == "messy_desktop":
-            await run_messy_desktop_scenario(computer, agent_configs, max_steps)
-        elif scenario_name == "parallel_agents":
-            await run_parallel_agents_scenario(computer, agent_configs, max_steps)
-        else:
-            print(f"Unknown scenario: {scenario_name}")
-    
-    except Exception as e:
-        print(f"Error in scenario {scenario_name}: {e}")
-        traceback.print_exc()
-    finally:
-        # Important to clean up resources
-        # await computer.stop()
-        pass
-
-
-@sandboxed("eval_env")
-def close_all_windows():
-    """Close all open windows"""
-    import pywinctl
-    windows = pywinctl.getAllWindows()
-    for window in windows:
-        try:
-            window.close()
-        except:
-            # Some windows might not be closeable or may have already closed
-            pass
-
-
-@sandboxed("eval_env")
-def get_current_wiki_page(app_name=None):
-    """Get the title of the current Wikipedia page
-    
-    Args:
-        app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox')
-    """
-    import pywinctl
-    windows = pywinctl.getAllWindows()
-    
-    # Filter windows by app name if provided
-    if app_name:
-        windows = [w for w in windows if w.getAppName() and app_name.lower() in w.getAppName().lower()]
-    
-    # Get titles from filtered windows
-    titles = [w.title for w in windows if w.title]
-    wiki_titles = [title for title in titles if "Wikipedia" in title]
-    
-    if wiki_titles:
-        return wiki_titles[0].split(" - Wikipedia")[0]
-    return None
-
-
-@sandboxed("eval_env")
-def get_open_app_names():
-    """Get names of all open applications"""
-    import pywinctl
-    windows = pywinctl.getAllWindows()
-    return [window.getAppName() for window in windows if window.getAppName()]
-
-def _computer():
-    """Get the default computer instance"""
-    from computer.helpers import _default_computer
-    return _default_computer
-
-async def open_app(app_name):
-    """Open a specific application"""
-    await _computer().interface.run_command(f"open -a '{app_name}'")
-    await asyncio.sleep(2)  # Wait for app to open
-
-
-async def open_wiki(page, app_name="Safari"):
-    """Open a specific Wikipedia page"""
-    await _computer().interface.run_command(f"open -a {app_name} https://en.wikipedia.org/wiki/{page.replace(' ', '_')}")
-    await asyncio.sleep(2)  # Wait for page to load
-
-
-async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
-    global all_results
-    """Run the messy desktop scenario with a single agent"""
-    # Get popular wiki articles
-    global articles
-    start_page, target_page = get_article_pair(depth=1)
-    
-    print(f"Wiki race: {start_page} → {target_page}")
-    
-    # Close all windows first
-    await close_all_windows()
-    
-    # Open starting Wikipedia page
-    await open_wiki(start_page)
-    
-    # Open 3 random apps to create a messy desktop
-    apps_to_open = ["Notes", "Terminal", "System Settings"]
-    for app in apps_to_open:
-        await open_app(app)
-    
-    # Verify apps are open
-    open_apps = await get_open_app_names()
-    print(f"Open applications: {open_apps}")
-    
-    # Create the agent's computer interface
-    # If app-use is enabled, create a desktop limited to Safari/Firefox
-    if "app-use" in (computer.experiments or []):
-        browser_desktop = computer.create_desktop_from_apps(["Safari"])
-        agent_computer = browser_desktop
-    else:
-        agent_computer = computer
-    
-    # Run each agent configuration
-    for config_name, loop_provider, model_provider in agent_configs:
-        print(f"\n--- Testing Agent: {config_name} ---")
-        
-        # Create agent with the specified configuration
-        agent = ComputerAgent(
-            computer=agent_computer,
-            loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
-            trajectory_dir="examples/evals/trajectories/eval_appuse"
-        )
-        
-        # Run the wikirace
-        steps = 0
-        success = False
-        start_time = time.time()
-        
-        # Use the template with formatting for this scenario
-        prompt = WIKIRACE_PROMPT_TEMPLATE.format(
-            browser="Safari",
-            start_page=start_page,
-            target_page=target_page
-        )
-        
-        try:
-            while steps < max_steps and not success: 
-                async for result in agent.run(prompt):    
-                    steps += 1
-                    print(f"Step {steps}")
-                    
-                    def process_result():
-                        if result.get("content"):
-                            print(f"Agent: {result.get('content', '')}")
-
-                        else:
-                            outputs = result.get("output", [])
-                            for output in outputs:
-                                if output.get("type") == "message":
-                                    content = output.get("content", [])
-                                    for content_part in content:
-                                        if content_part.get("text"):
-                                            print(f"Agent: {content_part.get('text', '')}")
-
-                                elif output.get("type") == "reasoning":
-                                    # if it's openAI, we only have access to a summary of the reasoning
-                                    summary_content = output.get("summary", [])
-                                    if summary_content:
-                                        for summary_part in summary_content:
-                                            if summary_part.get("type") == "summary_text":
-                                                print(f"Agent: {summary_part.get('text', '')}")
-
-                                    else:
-                                        summary_content = output.get("text", "")
-                                        if summary_content:
-                                            print(f"Agent: {summary_content}")
-
-                                elif output.get("type") == "computer_call":
-                                    action = output.get("action", {})
-                                    action_type = action.get("type", "")
-                                    if action_type:
-                                        action_title = f"🛠️ Performing {action_type}"
-                                        if action.get("x") and action.get("y"):
-                                            action_title += f" at ({action['x']}, {action['y']})"
-                                        print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
-
-                    
-                    # Process and print the result
-                    process_result()
-                    
-                    # Check current page
-                    current_page = await get_current_wiki_page("Safari")
-                    print(f"Current page: {current_page}")
-                    print(f"Target: {target_page}")
-                    
-                    # Check if we reached the target
-                    if current_page and target_page.lower() in current_page.lower():
-                        success = True
-                        print(f"🎉 SUCCESS! Reached {target_page} in {steps} steps!")
-                        await agent._loop.cancel()
-                        break
-                    
-                    # Safety check
-                    if steps >= max_steps:
-                        print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
-                        await agent._loop.cancel()
-                        break
-        except asyncio.CancelledError:
-            print("Agent stopped")
-                        
-        end_time = time.time()
-        duration = end_time - start_time
-        await asyncio.sleep(2)  # Wait for agent to finish
-        
-        # Results
-        print(f"\n=== WIKIRACE RESULTS: {config_name} ===")
-        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
-        print(f"Start: {start_page}")
-        print(f"Target: {target_page}")
-        print(f"Steps taken: {steps}")
-        print(f"Success: {success}")
-        print(f"Duration: {duration:.2f} seconds")
-
-
-async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
-    global all_results
-    
-    """Run two agents in parallel, one using Safari and one using Firefox"""
-    # Get popular wiki articles
-    global articles
-    safari_start, safari_target = get_article_pair(depth=1)
-    firefox_start, firefox_target = get_article_pair(depth=1)
-    
-    print(f"Safari Wiki race: {safari_start} → {safari_target}")
-    print(f"Firefox Wiki race: {firefox_start} → {firefox_target}")
-    
-    # Close all windows first
-    await close_all_windows()
-    
-    # Open Safari with starting page
-    await open_wiki(safari_start, "Safari")
-    await asyncio.sleep(2)
-    
-    # Open Firefox with starting page
-    await open_wiki(firefox_start, "Firefox")
-    await asyncio.sleep(2)
-    
-    # Create agent configurations
-    for config_name, loop_provider, model_provider in agent_configs:
-        print(f"\n--- Testing Parallel Agents: {config_name} ---")
-        
-        # Create the agent interfaces
-        if "app-use" in (computer.experiments or []):
-            safari_desktop = computer.create_desktop_from_apps(["Safari"])
-            firefox_desktop = computer.create_desktop_from_apps(["Firefox"])
-        else:
-            safari_desktop = computer
-            firefox_desktop = computer
-        
-        # Save screenshots
-        screenshot_dir = project_root / "examples" / "evals" / "screenshots"
-        screenshot_dir.mkdir(exist_ok=True)
-        safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png"
-        firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png"
-        screenshot_bytes = await safari_desktop.interface.screenshot()
-        with open(safari_screenshot_path, "wb") as f:
-            f.write(screenshot_bytes)
-        screenshot_bytes = await firefox_desktop.interface.screenshot()
-        with open(firefox_screenshot_path, "wb") as f:
-            f.write(screenshot_bytes)
-        
-        # Create agents
-        safari_agent = ComputerAgent(
-            computer=safari_desktop,
-            loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
-            trajectory_dir="examples/evals/trajectories/eval_parallel_safari"
-        )
-        
-        firefox_agent = ComputerAgent(
-            computer=firefox_desktop,
-            loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
-            trajectory_dir="examples/evals/trajectories/eval_parallel_firefox"
-        )
-        
-        # Create prompts using the template
-        safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
-            browser="Safari",
-            start_page=safari_start,
-            target_page=safari_target
-        )
-        
-        firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
-            browser="Firefox",
-            start_page=firefox_start,
-            target_page=firefox_target
-        )
-        
-        # Track results
-        safari_results = {
-            "steps": 0,
-            "success": False,
-            "start_time": time.time(),
-            "end_time": None
-        }
-        
-        firefox_results = {
-            "steps": 0,
-            "success": False,
-            "start_time": time.time(),
-            "end_time": None
-        }
-        
-        # Function to run a single agent
-        async def run_agent(agent, prompt, browser, start_page, target_page, results):
-            try:
-                while results["steps"] < max_steps and not results["success"]:
-                    async for result in agent.run(prompt):
-                        results["steps"] += 1
-                        print(f"{browser} Step {results['steps']}")
-                        
-                        def process_result():
-                            if result.get("content"):
-                                print(f"{browser} Agent: {result.get('content', '')}")
-
-                            else:
-                                outputs = result.get("output", [])
-                                for output in outputs:
-                                    if output.get("type") == "message":
-                                        content = output.get("content", [])
-                                        for content_part in content:
-                                            if content_part.get("text"):
-                                                print(f"{browser} Agent: {content_part.get('text', '')}")
-
-                                    elif output.get("type") == "reasoning":
-                                        # if it's openAI, we only have access to a summary of the reasoning
-                                        summary_content = output.get("summary", [])
-                                        if summary_content:
-                                            for summary_part in summary_content:
-                                                if summary_part.get("type") == "summary_text":
-                                                    print(f"{browser} Agent: {summary_part.get('text', '')}")
-
-                                        else:
-                                            summary_content = output.get("text", "")
-                                            if summary_content:
-                                                print(f"{browser} Agent: {summary_content}")
-
-                                    elif output.get("type") == "computer_call":
-                                        action = output.get("action", {})
-                                        action_type = action.get("type", "")
-                                        if action_type:
-                                            action_title = f"🛠️ Performing {action_type}"
-                                            if action.get("x") and action.get("y"):
-                                                action_title += f" at ({action['x']}, {action['y']})"
-                                            print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
-
-                        
-                        # Process and print the result
-                        process_result()
-                        
-                        # Check current page
-                        current_page = await get_current_wiki_page(browser)
-                        print(f"{browser} current page: {current_page}")
-                        print(f"{browser} target: {target_page}") 
-                        
-                        # Add result to global tracking
-                        global all_results
-                        current_result = {
-                            'scenario': 'parallel_agents',
-                            'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No',
-                            'browser': browser,
-                            'config': config_name,
-                            'start': start_page,
-                            'target': target_page,
-                            'steps': results['steps'],
-                            'success': results['success'],
-                            'duration': time.time() - results['start_time']
-                        }
-                        all_results.append(current_result)
-                        
-                        # Save results after each step
-                        save_results_to_markdown()
-                        
-                        # Check if we reached the target
-                        if current_page and target_page.lower() in current_page.lower():
-                            results["success"] = True
-                            print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!")
-                            await agent._loop.cancel()
-                            break
-                        
-                        # Check if we reached the maximum steps
-                        if results["steps"] >= max_steps:
-                            print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})")
-                            await agent._loop.cancel()
-                            break
-            except asyncio.CancelledError:
-                print(f"{browser} agent stopped")
-            finally:
-                results["end_time"] = time.time()
-        
-        # Run both agents in parallel
-        await asyncio.gather(
-            run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results),
-            run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results)
-        )
-        
-        # Wait for agents to finish
-        await asyncio.sleep(2)
-        
-        # Print results
-        print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===")
-        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
-        
-        print(f"\nSafari Results:")
-        print(f"Start: {safari_start}")
-        print(f"Target: {safari_target}")
-        print(f"Steps taken: {safari_results['steps']}")
-        print(f"Success: {safari_results['success']}")
-        print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds")
-        
-        print(f"\nFirefox Results:")
-        print(f"Start: {firefox_start}")
-        print(f"Target: {firefox_target}")
-        print(f"Steps taken: {firefox_results['steps']}")
-        print(f"Success: {firefox_results['success']}")
-        print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds")
-
-
-async def main():
-    try:
-        
-        # Define agent configurations to test
-        agent_configs = [
-            # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
-            # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
-            ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
-        ]
-        
-        # # Run the test scenario without any agents
-        # print("Running test scenario for sandboxed functions")
-        # await run_test_scenario()
-        
-        # Set maximum steps for each agent run
-        max_steps = 15
-        runs = 5
-
-        # run all scenarios
-        for _ in range(runs):
-            # Scenario 1: Messy desktop without App-Use
-            await run_scenario("messy_desktop", False, agent_configs, max_steps)
-            
-            # Scenario 1: Messy desktop with App-Use
-            await run_scenario("messy_desktop", True, agent_configs, max_steps)
-            
-            # Scenario 2: Parallel agents without App-Use
-            await run_scenario("parallel_agents", False, agent_configs, max_steps)
-            
-            # Scenario 2: Parallel agents with App-Use
-            await run_scenario("parallel_agents", True, agent_configs, max_steps)
-            
-    except Exception as e:
-        print(f"Error in main: {e}")
-        traceback.print_exc()
-
-
-async def run_test_scenario(max_iterations=5):
-    """Test sandboxed functions by opening the same pages in Safari and Firefox and checking if they match
-    
-    This function opens the same Wikipedia pages in both browsers and verifies that
-    the get_current_wiki_page function returns the same result for both browsers.
-    It does this for the specified number of iterations.
-    """
-    
-    # Create computer instance
-    computer = Computer()
-    await computer.run()
-    
-    # Get popular wiki articles
-    global articles
-    selected_articles = random.sample(articles, max_iterations)
-    
-    print(f"\n--- Running Test Scenario for {max_iterations} iterations ---")
-    
-    # Close all windows first
-    await close_all_windows()
-    
-    # Open both browsers
-    await open_app("Safari")
-    await open_app("Firefox")
-    
-    # Verify browsers are open
-    open_apps = await get_open_app_names()
-    print(f"Open applications: {open_apps}")
-    
-    # Run test iterations
-    for i, article in enumerate(selected_articles):
-        print(f"\nIteration {i+1}/{max_iterations}: Testing with article '{article}'")
-        
-        # Open the same Wikipedia page in both browsers
-        await open_wiki(article, "Safari")
-        await open_wiki(article, "Firefox")
-        await asyncio.sleep(3)  # Give a bit more time for both pages to load
-        
-        # Check if both browsers show the same page
-        safari_page = await get_current_wiki_page("Safari")
-        firefox_page = await get_current_wiki_page("Firefox")
-        
-        print(f"Safari page: {safari_page}")
-        print(f"Firefox page: {firefox_page}")
-        
-        if safari_page == firefox_page:
-            print(f"✅ MATCH: Both browsers show '{safari_page}'")
-        else:
-            print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'")
-        
-        await asyncio.sleep(1)  # Brief pause between iterations
-    
-    print("\n--- Test Scenario Completed ---")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/examples/sandboxed_functions_examples.py b/examples/sandboxed_functions_examples.py
new file mode 100644
index 00000000..caa733b9
--- /dev/null
+++ b/examples/sandboxed_functions_examples.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+import os
+import sys
+
+# Load environment variables from .env file
+project_root = Path(__file__).parent.parent
+env_file = project_root / ".env"
+print(f"Loading environment from: {env_file}")
+from dotenv import load_dotenv
+
+load_dotenv(env_file)
+
+# Add paths to sys.path if needed
+pythonpath = os.environ.get("PYTHONPATH", "")
+for path in pythonpath.split(":"):
+    if path and path not in sys.path:
+        sys.path.insert(0, path)  # Insert at beginning to prioritize
+        print(f"Added to sys.path: {path}")
+
+import asyncio
+from computer.computer import Computer
+from computer.helpers import sandboxed
+
+async def main():
+    # Initialize the computer in a C/ua Container
+    computer = Computer()
+    await computer.run()
+    
+    # Install packages in a virtual environment in the container
+    await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
+
+    # Open Safari
+    await computer.interface.run_command("open -a Safari")
+    await asyncio.sleep(2)
+
+    # Define a sandboxed function
+    # This function will run inside the C/ua Container
+    @sandboxed("demo_venv")
+    def greet_and_print(name):
+        # Get the HTML source of the current Safari tab
+        import PyXA
+        safari = PyXA.Application("Safari")
+        current_doc = safari.current_document
+        html = current_doc.source()
+        print(f"Hello from inside the container, {name}!")
+        print("Safari HTML length:", len(html))
+        return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]}
+
+    # Call the sandboxed function (one positional argument) and await its result
+    result = await greet_and_print("C/ua")
+    print("Result from sandboxed function:", result)
+
+if __name__ == "__main__":
+    asyncio.run(main())

From a4250f57fa2bf93b6fe8dc555ac473d0dbfeafaf Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 14:53:54 -0400
Subject: [PATCH 19/23] Added to README.md

---
 README.md     | 32 ++++++++++++++++++++++++++++++--
 tests/venv.py |  2 +-
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5f67d69f..29041fba 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,7 @@ pip install "cua-computer[all]" "cua-agent[all]"
 ### Step 4: Use in Your Code
 
 ```python
+from computer.helpers import sandboxed
 from computer import Computer
 from agent import ComputerAgent, LLM
 
@@ -163,9 +164,31 @@ async def main():
       loop="uitars",
       model=LLM(provider="mlxvlm", name="mlx-community/UI-TARS-1.5-7B-6bit")
     )
-    await agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide")
+    async for result in agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide"):
+        print(result)
 
-main()
+    # Example: Use sandboxed functions to execute code in a C/ua Container
+    # 1. Install a package in a Python virtual environment
+    await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
+    
+    # 2. Define a sandboxed function
+    @sandboxed("demo_venv")
+    def greet_and_print(name):
+        # get .html of the current Safari tab
+        import PyXA
+        safari = PyXA.Application("Safari")
+        current_doc = safari.current_document
+        html = current_doc.source()
+        print(f"Hello from inside the container, {name}!")
+        print("Safari HTML length:", len(html))
+        return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]}
+    
+    # 3. Run the function in the container in the agent's environment
+    result = await greet_and_print("C/ua")
+    print("Result from sandboxed function:", result)
+
+if __name__ == "__main__":
+    asyncio.run(main())
 ```
 
 For ready-to-use examples, check out our [Notebooks](./notebooks/) collection.
@@ -273,6 +296,11 @@ await computer.interface.run_command(cmd)       # Run shell command
 
 # Accessibility
 await computer.interface.get_accessibility_tree() # Get accessibility tree
+
+# Python Virtual Environment Operations
+await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment
+await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests.get(`https://httpbin.org/ip`).json())'") # Run a shell command in a virtual environment
+await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception
 ```
 
 ## ComputerAgent Reference
diff --git a/tests/venv.py b/tests/venv.py
index 8b78a78f..8463fa4d 100644
--- a/tests/venv.py
+++ b/tests/venv.py
@@ -31,7 +31,7 @@ for path in pythonpath.split(":"):
 
 from computer.computer import Computer
 from computer.providers.base import VMProviderType
-from computer.helpers import remote, set_default_computer
+from computer.helpers import sandboxed, set_default_computer
 
 
 @pytest.fixture(scope="session")

From 3e6cb3465e3a7c246babd939b20a624960e57efc Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 15:20:17 -0400
Subject: [PATCH 20/23] Moved @sandboxed example

---
 README.md | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 29041fba..f3965a4a 100644
--- a/README.md
+++ b/README.md
@@ -167,26 +167,6 @@ async def main():
     async for result in agent.run("Find the trycua/cua repository on GitHub and follow the quick start guide"):
         print(result)
 
-    # Example: Use sandboxed functions to execute code in a C/ua Container
-    # 1. Install a package in a Python virtual environment
-    await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
-    
-    # 2. Define a sandboxed function
-    @sandboxed("demo_venv")
-    def greet_and_print(name):
-        # get .html of the current Safari tab
-        import PyXA
-        safari = PyXA.Application("Safari")
-        current_doc = safari.current_document
-        html = current_doc.source()
-        print(f"Hello from inside the container, {name}!")
-        print("Safari HTML length:", len(html))
-        return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:200]}
-    
-    # 3. Run the function in the container in the agent's environment
-    result = await greet_and_print("C/ua")
-    print("Result from sandboxed function:", result)
-
 if __name__ == "__main__":
     asyncio.run(main())
 ```
@@ -301,6 +281,25 @@ await computer.interface.get_accessibility_tree() # Get accessibility tree
 await computer.venv_install("demo_venv", ["requests", "macos-pyxa"]) # Install packages in a virtual environment
 await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests.get(`https://httpbin.org/ip`).json())'") # Run a shell command in a virtual environment
 await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception
+
+# Example: Use sandboxed functions to execute code in a C/ua Container
+# 1. Install a package in a Python virtual environment
+await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
+
+# 2. Define a sandboxed function
+@sandboxed("demo_venv")
+def greet_and_print(name, html_snippet_length=200):
+    # get .html of the current Safari tab
+    import PyXA
+    safari = PyXA.Application("Safari")
+    html = safari.current_document.source()
+    print(f"Hello from inside the container, {name}!")
+    print("Safari HTML length:", len(html))
+    return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:html_snippet_length]}
+
+# 3. Run the function in the container in the agent's environment
+result = await greet_and_print("C/ua", html_snippet_length=100)
+print("Result from sandboxed function:", result)
 ```
 
 ## ComputerAgent Reference

From 3ab5b65a257d62d0c6bb2cc5eb59f1584f95c680 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 15:50:02 -0400
Subject: [PATCH 21/23] Fixes for hotkey and claude scrolling

---
 .../agent/providers/anthropic/tools/computer.py      | 10 ++--------
 libs/computer/computer/diorama_computer.py           | 12 +++++++++++-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/libs/agent/agent/providers/anthropic/tools/computer.py b/libs/agent/agent/providers/anthropic/tools/computer.py
index ecf232bd..2bb944ea 100644
--- a/libs/agent/agent/providers/anthropic/tools/computer.py
+++ b/libs/agent/agent/providers/anthropic/tools/computer.py
@@ -478,17 +478,11 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool):
                 if direction == "down":
                     # Scroll down (Page Down on macOS)
                     self.logger.info(f"Scrolling down, amount: {amount}")
-                    # Use fn+down for page down on macOS
-                    for _ in range(amount):
-                        await self.computer.interface.hotkey("fn", "down")
-                        await asyncio.sleep(0.1)
+                    await self.computer.interface.scroll_down(amount)
                 else:
                     # Scroll up (Page Up on macOS)
                     self.logger.info(f"Scrolling up, amount: {amount}")
-                    # Use fn+up for page up on macOS
-                    for _ in range(amount):
-                        await self.computer.interface.hotkey("fn", "up")
-                        await asyncio.sleep(0.1)
+                    await self.computer.interface.scroll_up(amount)
 
                 # Wait briefly for UI changes
                 await asyncio.sleep(0.5)
diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py
index 5cad0006..dfb541b9 100644
--- a/libs/computer/computer/diorama_computer.py
+++ b/libs/computer/computer/diorama_computer.py
@@ -87,7 +87,17 @@ class DioramaComputerInterface:
         await self._send_cmd("press_key", {"key": key})
 
     async def hotkey(self, *keys):
-        await self._send_cmd("hotkey", {"keys": list(keys)})
+        actual_keys = []
+        for key in keys:
+            if isinstance(key, Key):
+                actual_keys.append(key.value)
+            elif isinstance(key, str):
+                # Try to convert to enum if it matches a known key
+                key_or_enum = Key.from_string(key)
+                actual_keys.append(key_or_enum.value if isinstance(key_or_enum, Key) else key_or_enum)
+            else:
+                raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.")
+        await self._send_cmd("hotkey", {"keys": actual_keys})
 
     async def to_screen_coordinates(self, x, y):
         return await self._send_cmd("to_screen_coordinates", {"x": x, "y": y})

From 8599da0d43f3f5ea534e82972e05eefa74c5f0c3 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 15:53:16 -0400
Subject: [PATCH 22/23] Add missing imports

---
 libs/computer/computer/diorama_computer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py
index dfb541b9..2eee77f0 100644
--- a/libs/computer/computer/diorama_computer.py
+++ b/libs/computer/computer/diorama_computer.py
@@ -1,4 +1,5 @@
 import asyncio
+from .interface.models import KeyType, Key
 
 class DioramaComputer:
     """

From e63c5fd81fce768f36f4b14b1f3faba6b0ad3969 Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 15:54:02 -0400
Subject: [PATCH 23/23] Finalized README

---
 README.md | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f3965a4a..f10bae77 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,6 @@ pip install "cua-computer[all]" "cua-agent[all]"
 ### Step 4: Use in Your Code
 
 ```python
-from computer.helpers import sandboxed
 from computer import Computer
 from agent import ComputerAgent, LLM
 
@@ -283,10 +282,7 @@ await computer.venv_cmd("demo_venv", "python -c 'import requests; print(requests
 await computer.venv_exec("demo_venv", python_function_or_code, *args, **kwargs) # Run a Python function in a virtual environment and return the result / raise an exception
 
 # Example: Use sandboxed functions to execute code in a C/ua Container
-# 1. Install a package in a Python virtual environment
-await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
-
-# 2. Define a sandboxed function
+from computer.helpers import sandboxed
 @sandboxed("demo_venv")
 def greet_and_print(name, html_snippet_length=200):
     # get .html of the current Safari tab
@@ -296,9 +292,7 @@ def greet_and_print(name, html_snippet_length=200):
     print(f"Hello from inside the container, {name}!")
     print("Safari HTML length:", len(html))
     return {"greeted": name, "safari_html_length": len(html), "safari_html_snippet": html[:html_snippet_length]}
-
-# 3. Run the function in the container in the agent's environment
-result = await greet_and_print("C/ua", html_snippet_length=100)
+result = await greet_and_print("C/ua", html_snippet_length=100) # Executes in the container
 print("Result from sandboxed function:", result)
 ```