diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py index bf30a018..e781395c 100644 --- a/libs/computer-server/computer_server/diorama/diorama.py +++ b/libs/computer-server/computer_server/diorama/diorama.py @@ -36,11 +36,21 @@ class Diorama: cls._ensure_scheduler() return cls(args).computer + # Dictionary to store cursor positions for each unique app_list hash + _cursor_positions = {} + def __init__(self, app_list): self.app_list = app_list self.interface = self.Interface(self) self.computer = DioramaComputer(self) self.focus_context = None + + # Create a hash for this app_list to use as a key + self.app_list_hash = hash(tuple(sorted(app_list))) + + # Initialize cursor position for this app_list if it doesn't exist + if self.app_list_hash not in Diorama._cursor_positions: + Diorama._cursor_positions[self.app_list_hash] = (0, 0) @classmethod def _ensure_scheduler(cls): @@ -67,10 +77,11 @@ class Diorama: frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(all_windows, running_apps, app_whitelist) focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger) + app_list_hash = hash(tuple(sorted(app_whitelist))) + with focus_context: try: if action == "screenshot": - app_whitelist = list(args["app_list"]) logger.info(f"Taking screenshot for apps: {app_whitelist}") result, img = capture_all_apps( app_whitelist=app_whitelist, @@ -82,8 +93,15 @@ class Diorama: future.set_result((result, img)) # Mouse actions elif action in ["left_click", "right_click", "double_click", "move_cursor", "drag_to"]: - x = args.get("x") - y = args.get("y") + # Get last cursor position for this app_list hash + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + + x = args.get("x", last_pos[0]) + y = args.get("y", last_pos[1]) + + # Update the cursor position for this app_list hash + Diorama._cursor_positions[app_list_hash] = (x, y) + duration = args.get("duration", 0.5) if action == "left_click": await automation_handler.left_click(x, y) @@ -98,6 +116,10 @@ class Diorama: if future: future.set_result(None) elif action in ["scroll_up", "scroll_down"]: + # Move cursor to last known position for this app_list hash + last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0)) + await automation_handler.move_cursor(*last_pos) + clicks = args.get("clicks", 1) if action == "scroll_up": await automation_handler.scroll_up(clicks) diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py index 608e6721..5cad0006 100644 --- a/libs/computer/computer/diorama_computer.py +++ b/libs/computer/computer/diorama_computer.py @@ -37,7 +37,7 @@ class DioramaComputerInterface: raise RuntimeError("Computer interface not initialized. Call run() first.") result = await iface.diorama_cmd(action, arguments) if not result.get("success"): - raise RuntimeError(f"Diorama command failed: {result.get('error')}") + raise RuntimeError(f"Diorama command failed: {result.get('error')}\n{result.get('trace')}") return result.get("result") async def screenshot(self, as_bytes=True): diff --git a/libs/computer/computer/ui/gradio/app.py b/libs/computer/computer/ui/gradio/app.py index 1a93b27d..b1d131d9 100644 --- a/libs/computer/computer/ui/gradio/app.py +++ b/libs/computer/computer/ui/gradio/app.py @@ -463,7 +463,7 @@ async def execute(name, action, arguments): elif action == "left_click": if "x" in arguments and "y" in arguments: await computer.interface.move_cursor(arguments["x"], arguments["y"]) - await computer.interface.left_click() + await computer.interface.left_click(arguments["x"], arguments["y"]) await asyncio.sleep(0.5) elif action == "right_click": if "x" in arguments and "y" in arguments: @@ -528,43 +528,75 @@ async def execute(name, action, arguments): return results -async def handle_init_computer(os_choice: str): - """Initialize the computer instance and tools for macOS or Ubuntu""" +async def handle_init_computer(os_choice: str, app_list=None, provider="lume"): + """Initialize the computer instance and tools for macOS or Ubuntu + + Args: + os_choice: The OS to use ("macOS" or "Ubuntu") + app_list: Optional list of apps to focus on using the app-use experiment + provider: The provider to use ("lume" or "self") + """ global computer, tool_call_logs, tools - + + # Check if we should enable app-use experiment + use_app_experiment = app_list and len(app_list) > 0 + experiments = ["app-use"] if use_app_experiment else None + + # Determine if we should use host computer server + use_host_computer_server = provider == "self" + if os_choice == "Ubuntu": - computer = Computer( - image="ubuntu-noble-vanilla:latest", - os_type="linux", - provider_type=VMProviderType.LUME, - display="1024x768", - memory="8GB", - cpu="4" - ) os_type_str = "linux" image_str = "ubuntu-noble-vanilla:latest" else: + os_type_str = "macos" + image_str = "macos-sequoia-cua:latest" + + # Create computer instance with appropriate configuration + if use_host_computer_server: computer = Computer( - image="macos-sequoia-cua:latest", - os_type="macos", + os_type=os_type_str, + use_host_computer_server=True, + experiments=experiments + ) + else: + computer = Computer( + image=image_str, + os_type=os_type_str, provider_type=VMProviderType.LUME, display="1024x768", memory="8GB", - cpu="4" + cpu="4", + experiments=experiments ) - os_type_str = "macos" - image_str = "macos-sequoia-cua:latest" await computer.run() + + # If app list is provided, create desktop from apps + if use_app_experiment: + computer = computer.create_desktop_from_apps(app_list) # Log computer initialization as a tool call - result = await execute("computer", "initialize", { + init_params = { "os": os_type_str, - "image": image_str, - "display": "1024x768", - "memory": "8GB", - "cpu": "4" - }) + "provider": provider + } + + # Add VM-specific parameters if not using host computer server + if not use_host_computer_server: + init_params.update({ + "image": image_str, + "display": "1024x768", + "memory": "8GB", + "cpu": "4" + }) + + # Add app list to the log if provided + if use_app_experiment: + init_params["apps"] = app_list + init_params["experiments"] = ["app-use"] + + result = await execute("computer", "initialize", init_params) return result["screenshot"], json.dumps(tool_call_logs, indent=2) @@ -1029,12 +1061,31 @@ def create_gradio_ui(): setup_status = gr.Textbox(label="Setup Status", value="") with gr.Group(): - os_choice = gr.Radio( - label="OS", - choices=["macOS", "Ubuntu"], - value="macOS", - interactive=False # disable until the ubuntu image is ready - ) + with gr.Accordion("Computer Configuration", open=False): + with gr.Row(): + os_choice = gr.Radio( + label="OS", + choices=["macOS", "Ubuntu"], + value="macOS", + interactive=False # disable until the ubuntu image is ready + ) + + # Provider selection radio + provider_choice = gr.Radio( + label="Provider", + choices=["lume", "self"], + value="lume", + info="'lume' uses a VM, 'self' uses the host computer server" + ) + + # App filtering dropdown for app-use experiment + app_filter = gr.Dropdown( + label="Filter by apps (App-Use)", + multiselect=True, + allow_custom_value=True, + info="When apps are selected, the computer will focus on those apps using the app-use experiment" + ) + start_btn = gr.Button("Initialize Computer") with gr.Group(): @@ -1199,7 +1250,7 @@ def create_gradio_ui(): ) img.select(handle_click, inputs=[img, click_type], outputs=[img, action_log]) - start_btn.click(handle_init_computer, inputs=[os_choice], outputs=[img, action_log]) + start_btn.click(handle_init_computer, inputs=[os_choice, app_filter, provider_choice], outputs=[img, action_log]) wait_btn.click(handle_wait, outputs=[img, action_log]) # DONE and FAIL buttons just do a placeholder action