Merge pull request #116 from trycua/feature/computer/ui

[Computer] Add Gradio UI and fix interaction bugs
2026-01-22 13:30:51 -06:00 · 2025-04-18 23:33:58 +02:00
parent 611d629eff 03980eacd8
commit f342b568fb
10 changed files with 1624 additions and 12 deletions
--- a/README.md
+++ b/README.md
@@ -107,7 +107,9 @@ If you want to use AI agents with virtualized environments:
   app.launch(share=False)
   ```

-7. For Developers only (contribute and use latest features):
+### Option 3: Build from Source (Nightly)
+If you want to contribute to the project or need the latest nightly features:
+
   ```bash
   # Clone the repository
   git clone https://github.com/trycua/cua.git
--- a/examples/computer_ui_examples.py
+++ b/examples/computer_ui_examples.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+"""
+Simple example script for the Computer Interface Gradio UI.
+
+This script launches the advanced Gradio UI for the Computer Interface
+with full model selection and configuration options.
+It can be run directly from the command line.
+"""
+
+
+from utils import load_dotenv_files
+
+load_dotenv_files()
+
+# Import the create_gradio_ui function
+from computer.ui.gradio.app import create_gradio_ui
+
+if __name__ == "__main__":
+    print("Launching Computer Interface Gradio UI with advanced features...")
+    app = create_gradio_ui()
+    app.launch(share=False)
+    
+    # Optional: Using the saved dataset
+    # import datasets
+    # from computer.ui.utils import convert_to_unsloth
+    # ds = datasets.load_dataset("ddupont/highquality-cua-demonstrations")
+    # ds = convert_to_unsloth(ds)
--- a/libs/agent/agent/providers/openai/tools/computer.py
+++ b/libs/agent/agent/providers/openai/tools/computer.py
@@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
                y = kwargs.get("y")
                if x is None or y is None:
                    raise ToolError("x and y coordinates are required for scroll action")
-                scroll_x = kwargs.get("scroll_x", 0)
-                scroll_y = kwargs.get("scroll_y", 0)
+                scroll_x = kwargs.get("scroll_x", 0) // 20
+                scroll_y = kwargs.get("scroll_y", 0) // 20
                return await self.handle_scroll(x, y, scroll_x, scroll_y)
            elif type == "screenshot":
                return await self.screenshot()
--- a/libs/computer-server/computer_server/handlers/macos.py
+++ b/libs/computer-server/computer_server/handlers/macos.py
@@ -542,7 +542,7 @@ class MacOSAutomationHandler(BaseAutomationHandler):
        try:
            if x is not None and y is not None:
                pyautogui.moveTo(x, y)
-            pyautogui.doubleClick()
+            pyautogui.doubleClick(interval=0.1)
            return {"success": True}
        except Exception as e:
            return {"success": False, "error": str(e)}
--- a/libs/computer/README.md
+++ b/libs/computer/README.md
@@ -63,4 +63,34 @@ The `cua-computer` PyPi package pulls automatically the latest executable versio

 Refer to this notebook for a step-by-step guide on how to use the Computer-Use Interface (CUI):

- [Computer-Use Interface (CUI)](../../notebooks/computer_nb.ipynb)
+- [Computer-Use Interface (CUI)](../../notebooks/computer_nb.ipynb)
+
+## Using the Gradio Computer UI
+
+The computer module includes a Gradio UI for creating and sharing demonstration data. The UI provides built-in integration with HuggingFace Datasets for sharing demonstrations and incorporating them into CUA ML pipelines.
+
+```bash
+# Install with UI support
+pip install "cua-computer[ui]"
+```
+
+
+<details open>
+<summary>View demonstration video</summary>
+<video src="https://github.com/user-attachments/assets/7c683b58-f04d-4e8c-b63f-6ef36e9637d5" controls width="600"></video>
+</details>
+
+> **Note:** For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI.
+
+
+### Launch the UI
+
+```python
+# launch_ui.py
+from computer.ui.gradio.app import create_gradio_ui
+
+app = create_gradio_ui()
+app.launch(share=False)
+```
+
+For examples, see [Computer UI Examples](../../examples/computer_ui_examples.py)
--- a/libs/computer/computer/interface/macos.py
+++ b/libs/computer/computer/interface/macos.py
@@ -377,17 +377,47 @@ class MacOSComputerInterface(BaseComputerInterface):
        """
        await self.press(key)

-    async def hotkey(self, *keys: str) -> None:
-        await self._send_command("hotkey", {"keys": list(keys)})
+    async def hotkey(self, *keys: "KeyType") -> None:
+        """Press multiple keys simultaneously.
+
+        Args:
+            *keys: Multiple keys to press simultaneously. Each key can be any of:
+                - A Key enum value (recommended), e.g. Key.COMMAND
+                - A direct key value string, e.g. 'command'
+                - A single character string, e.g. 'a'
+
+        Examples:
+            ```python
+            # Using enums (recommended)
+            await interface.hotkey(Key.COMMAND, Key.C)  # Copy
+            await interface.hotkey(Key.COMMAND, Key.V)  # Paste
+
+            # Using mixed formats
+            await interface.hotkey(Key.COMMAND, 'a')  # Select all
+            ```
+
+        Raises:
+            ValueError: If any key type is invalid or not recognized
+        """
+        actual_keys = []
+        for key in keys:
+            if isinstance(key, Key):
+                actual_keys.append(key.value)
+            elif isinstance(key, str):
+                # Try to convert to enum if it matches a known key
+                key_or_enum = Key.from_string(key)
+                actual_keys.append(key_or_enum.value if isinstance(key_or_enum, Key) else key_or_enum)
+            else:
+                raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.")
+        
+        await self._send_command("hotkey", {"keys": actual_keys})

    # Scrolling Actions
    async def scroll_down(self, clicks: int = 1) -> None:
-        for _ in range(clicks):
-            await self.hotkey("pagedown")
-
+        await self._send_command("scroll_down", {"clicks": clicks})
+        
    async def scroll_up(self, clicks: int = 1) -> None:
-        for _ in range(clicks):
-            await self.hotkey("pageup")
+        await self._send_command("scroll_up", {"clicks": clicks})

    # Screen Actions
    async def screenshot(
--- a/libs/computer/computer/ui/__init__.py
+++ b/libs/computer/computer/ui/__init__.py
@@ -0,0 +1 @@
+"""UI modules for the Computer Interface."""
--- a/libs/computer/computer/ui/gradio/__init__.py
+++ b/libs/computer/computer/ui/gradio/__init__.py
@@ -0,0 +1,6 @@
+"""Gradio UI for Computer UI."""
+
+import gradio as gr
+from typing import Optional
+
+from .app import create_gradio_ui
--- a/libs/computer/computer/ui/gradio/app.py
+++ b/libs/computer/computer/ui/gradio/app.py
--- a/libs/computer/pyproject.toml
+++ b/libs/computer/pyproject.toml
@@ -21,6 +21,12 @@ dependencies = [
 ]
 requires-python = ">=3.10"

+[project.optional-dependencies]
+ui = [
+    "gradio>=5.23.3,<6.0.0",
+    "python-dotenv>=1.0.1,<2.0.0",
+]
+
 [tool.pdm]
 distribution = true
				`@@ -0,0 +1 @@`
				`"""UI modules for the Computer Interface."""`