App-usage stability fixes

2026-01-02 03:20:22 -06:00 · 2025-06-04 09:05:20 -04:00
parent 79bcf9d05d
commit 58a453dc49
3 changed files with 107 additions and 34 deletions
--- a/libs/computer-server/computer_server/diorama/diorama.py
+++ b/libs/computer-server/computer_server/diorama/diorama.py
@@ -36,11 +36,21 @@ class Diorama:
        cls._ensure_scheduler()
        return cls(args).computer

+    # Class-level dict, shared by all Diorama instances: app_list hash -> last (x, y) cursor position
+    _cursor_positions = {}
+    
    def __init__(self, app_list):
        self.app_list = app_list
        self.interface = self.Interface(self)
        self.computer = DioramaComputer(self)
        self.focus_context = None
+        
+        # Order-insensitive hash of app_list (sorted first), used as the key into _cursor_positions
+        self.app_list_hash = hash(tuple(sorted(app_list)))
+        
+        # Initialize cursor position for this app_list if it doesn't exist
+        if self.app_list_hash not in Diorama._cursor_positions:
+            Diorama._cursor_positions[self.app_list_hash] = (0, 0)

    @classmethod
    def _ensure_scheduler(cls):
@@ -67,10 +77,11 @@ class Diorama:
            frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist)
            focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger)
            
+            app_list_hash = hash(tuple(sorted(app_whitelist)))
+            
            with focus_context:
                try:
                    if action == "screenshot":
-                        app_whitelist = list(args["app_list"])
                        logger.info(f"Taking screenshot for apps: {app_whitelist}")
                        result, img = capture_all_apps(
                            app_whitelist=app_whitelist,
@@ -82,8 +93,15 @@ class Diorama:
                            future.set_result((result, img))
                    # Mouse actions
                    elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]:
-                        x = args.get("x")
-                        y = args.get("y")
+                        # Get last cursor position for this app_list hash
+                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+                        
+                        x = args.get("x", last_pos[0])
+                        y = args.get("y", last_pos[1])
+                        
+                        # Update the cursor position for this app_list hash
+                        Diorama._cursor_positions[app_list_hash] = (x, y)
+                        
                        duration = args.get("duration", 0.5)
                        if action == "left_click":
                            await automation_handler.left_click(x, y)
@@ -98,6 +116,10 @@ class Diorama:
                        if future:
                            future.set_result(None)
                    elif action in ["scroll_up", "scroll_down"]:
+                        # Move cursor to last known position for this app_list hash
+                        last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
+                        await automation_handler.move_cursor(*last_pos)
+                        
                        clicks = args.get("clicks", 1)
                        if action == "scroll_up":
                            await automation_handler.scroll_up(clicks)
--- a/libs/computer/computer/diorama_computer.py
+++ b/libs/computer/computer/diorama_computer.py
@@ -37,7 +37,7 @@ class DioramaComputerInterface:
            raise RuntimeError("Computer interface not initialized. Call run() first.")
        result = await iface.diorama_cmd(action, arguments)
        if not result.get("success"):
-            raise RuntimeError(f"Diorama command failed: {result.get('error')}")
+            raise RuntimeError(f"Diorama command failed: {result.get('error')}\n{result.get('trace')}")
        return result.get("result")

    async def screenshot(self, as_bytes=True):
--- a/libs/computer/computer/ui/gradio/app.py
+++ b/libs/computer/computer/ui/gradio/app.py
@@ -463,7 +463,7 @@ async def execute(name, action, arguments):
        elif action == "left_click":
            if "x" in arguments and "y" in arguments:
                await computer.interface.move_cursor(arguments["x"], arguments["y"])
-            await computer.interface.left_click()
+            await computer.interface.left_click(arguments["x"], arguments["y"])
            await asyncio.sleep(0.5)
        elif action == "right_click":
            if "x" in arguments and "y" in arguments:
@@ -528,43 +528,75 @@ async def execute(name, action, arguments):
    
    return results

-async def handle_init_computer(os_choice: str):
-    """Initialize the computer instance and tools for macOS or Ubuntu"""
+async def handle_init_computer(os_choice: str, app_list=None, provider="lume"):
+    """Initialize the computer instance and tools for macOS or Ubuntu
+    
+    Args:
+        os_choice: The OS to use ("macOS" or "Ubuntu")
+        app_list: Optional list of apps to focus on using the app-use experiment
+        provider: The provider to use ("lume" or "self")
+    """
    global computer, tool_call_logs, tools
-
+    
+    # Check if we should enable app-use experiment
+    use_app_experiment = app_list and len(app_list) > 0
+    experiments = ["app-use"] if use_app_experiment else None
+    
+    # Determine if we should use host computer server
+    use_host_computer_server = provider == "self"
+    
    if os_choice == "Ubuntu":
-        computer = Computer(
-            image="ubuntu-noble-vanilla:latest",
-            os_type="linux",
-            provider_type=VMProviderType.LUME,
-            display="1024x768",
-            memory="8GB",
-            cpu="4"
-        )
        os_type_str = "linux"
        image_str = "ubuntu-noble-vanilla:latest"
    else:
+        os_type_str = "macos"
+        image_str = "macos-sequoia-cua:latest"
+    
+    # Create computer instance with appropriate configuration
+    if use_host_computer_server:
        computer = Computer(
-            image="macos-sequoia-cua:latest",
-            os_type="macos",
+            os_type=os_type_str,
+            use_host_computer_server=True,
+            experiments=experiments
+        )
+    else:
+        computer = Computer(
+            image=image_str,
+            os_type=os_type_str,
            provider_type=VMProviderType.LUME,
            display="1024x768",
            memory="8GB",
-            cpu="4"
+            cpu="4",
+            experiments=experiments
        )
-        os_type_str = "macos"
-        image_str = "macos-sequoia-cua:latest"

    await computer.run()
+    
+    # If app list is provided, create desktop from apps
+    if use_app_experiment:
+        computer = computer.create_desktop_from_apps(app_list)

    # Log computer initialization as a tool call
-    result = await execute("computer", "initialize", {
+    init_params = {
        "os": os_type_str,
-        "image": image_str,
-        "display": "1024x768",
-        "memory": "8GB",
-        "cpu": "4"
-    })
+        "provider": provider
+    }
+    
+    # Add VM-specific parameters if not using host computer server
+    if not use_host_computer_server:
+        init_params.update({
+            "image": image_str,
+            "display": "1024x768",
+            "memory": "8GB",
+            "cpu": "4"
+        })
+    
+    # Add app list to the log if provided
+    if use_app_experiment:
+        init_params["apps"] = app_list
+        init_params["experiments"] = ["app-use"]
+    
+    result = await execute("computer", "initialize", init_params)

    return result["screenshot"], json.dumps(tool_call_logs, indent=2)

@@ -1029,12 +1061,31 @@ def create_gradio_ui():
                    setup_status = gr.Textbox(label="Setup Status", value="")
                
                with gr.Group():
-                    os_choice = gr.Radio(
-                        label="OS",
-                        choices=["macOS", "Ubuntu"],
-                        value="macOS",
-                        interactive=False # disable until the ubuntu image is ready
-                    )
+                    with gr.Accordion("Computer Configuration", open=False):
+                        with gr.Row():
+                            os_choice = gr.Radio(
+                                label="OS",
+                                choices=["macOS", "Ubuntu"],
+                                value="macOS",
+                                interactive=False # disable until the ubuntu image is ready
+                            )
+                            
+                            # Provider selection radio
+                            provider_choice = gr.Radio(
+                                label="Provider",
+                                choices=["lume", "self"],
+                                value="lume",
+                                info="'lume' uses a VM, 'self' uses the host computer server"
+                            )
+                        
+                        # App filtering dropdown for app-use experiment
+                        app_filter = gr.Dropdown(
+                            label="Filter by apps (App-Use)",
+                            multiselect=True,
+                            allow_custom_value=True,
+                            info="When apps are selected, the computer will focus on those apps using the app-use experiment"
+                        )
+                    
                    start_btn = gr.Button("Initialize Computer")
                
                with gr.Group():
@@ -1199,7 +1250,7 @@ def create_gradio_ui():
        )
                
        img.select(handle_click, inputs=[img, click_type], outputs=[img, action_log])
-        start_btn.click(handle_init_computer, inputs=[os_choice], outputs=[img, action_log])
+        start_btn.click(handle_init_computer, inputs=[os_choice, app_filter, provider_choice], outputs=[img, action_log])
        wait_btn.click(handle_wait, outputs=[img, action_log])
        
        # DONE and FAIL buttons just do a placeholder action