Merge pull request #116 from trycua/feature/computer/ui

[Computer] Add Gradio UI and fix interaction bugs
This commit is contained in:
f-trycua
2025-04-18 23:33:58 +02:00
committed by GitHub
10 changed files with 1624 additions and 12 deletions

View File

@@ -107,7 +107,9 @@ If you want to use AI agents with virtualized environments:
app.launch(share=False)
```
7. For Developers only (contribute and use latest features):
### Option 3: Build from Source (Nightly)
If you want to contribute to the project or need the latest nightly features:
```bash
# Clone the repository
git clone https://github.com/trycua/cua.git

View File

@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""
Simple example script for the Computer Interface Gradio UI.
This script launches the advanced Gradio UI for the Computer Interface
with full model selection and configuration options.
It can be run directly from the command line.
"""
from utils import load_dotenv_files
load_dotenv_files()
# Import the create_gradio_ui function
from computer.ui.gradio.app import create_gradio_ui
def main():
    """Announce startup, build the Gradio UI and launch it with share=False."""
    print("Launching Computer Interface Gradio UI with advanced features...")
    ui = create_gradio_ui()
    ui.launch(share=False)


if __name__ == "__main__":
    main()
# Optional: Using the saved dataset
# import datasets
# from computer.ui.utils import convert_to_unsloth
# ds = datasets.load_dataset("ddupont/highquality-cua-demonstrations")
# ds = convert_to_unsloth(ds)

View File

@@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool):
y = kwargs.get("y")
if x is None or y is None:
raise ToolError("x and y coordinates are required for scroll action")
scroll_x = kwargs.get("scroll_x", 0)
scroll_y = kwargs.get("scroll_y", 0)
scroll_x = kwargs.get("scroll_x", 0) // 20
scroll_y = kwargs.get("scroll_y", 0) // 20
return await self.handle_scroll(x, y, scroll_x, scroll_y)
elif type == "screenshot":
return await self.screenshot()

View File

@@ -542,7 +542,7 @@ class MacOSAutomationHandler(BaseAutomationHandler):
try:
if x is not None and y is not None:
pyautogui.moveTo(x, y)
pyautogui.doubleClick()
pyautogui.doubleClick(interval=0.1)
return {"success": True}
except Exception as e:
return {"success": False, "error": str(e)}

View File

@@ -63,4 +63,34 @@ The `cua-computer` PyPi package pulls automatically the latest executable versio
Refer to this notebook for a step-by-step guide on how to use the Computer-Use Interface (CUI):
- [Computer-Use Interface (CUI)](../../notebooks/computer_nb.ipynb)
- [Computer-Use Interface (CUI)](../../notebooks/computer_nb.ipynb)
## Using the Gradio Computer UI
The computer module includes a Gradio UI for creating and sharing demonstration data. The UI provides built-in integration with Hugging Face Datasets for sharing demonstrations and incorporating them into CUA ML pipelines.
```bash
# Install with UI support
pip install "cua-computer[ui]"
```
<details open>
<summary>View demonstration video</summary>
<video src="https://github.com/user-attachments/assets/7c683b58-f04d-4e8c-b63f-6ef36e9637d5" controls width="600"></video>
</details>
> **Note:** For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI.
### Launch the UI
```python
# launch_ui.py
from computer.ui.gradio.app import create_gradio_ui
app = create_gradio_ui()
app.launch(share=False)
```
For examples, see [Computer UI Examples](../../examples/computer_ui_examples.py)

View File

@@ -377,17 +377,47 @@ class MacOSComputerInterface(BaseComputerInterface):
"""
await self.press(key)
async def hotkey(self, *keys: str) -> None:
await self._send_command("hotkey", {"keys": list(keys)})
async def hotkey(self, *keys: "KeyType") -> None:
    """Send a key combination to the remote machine as one "hotkey" command.

    Each positional argument is normalised before sending:

    - a ``Key`` enum member contributes its ``.value``;
    - a string is passed through ``Key.from_string``; if that yields a
      ``Key`` member its ``.value`` is used, otherwise the returned object
      is forwarded unchanged.

    Args:
        *keys: Keys to press together, given as ``Key`` members and/or
            strings (e.g. ``Key.COMMAND``, ``'command'``, ``'a'``).

    Examples:
        ```python
        await interface.hotkey(Key.COMMAND, Key.C)  # enum members
        await interface.hotkey(Key.COMMAND, 'a')    # enum mixed with a string
        ```

    Raises:
        ValueError: If an argument is neither a ``Key`` member nor a string.
    """
    resolved = []
    for item in keys:
        # Enum members are checked first, exactly as before, so a Key that
        # also happens to be a str subclass still takes this branch.
        if isinstance(item, Key):
            resolved.append(item.value)
            continue

        if not isinstance(item, str):
            raise ValueError(f"Invalid key type: {type(item)}. Must be Key enum or string.")

        # Let the enum try to recognise the string; fall back to whatever
        # it hands back when there is no matching member.
        candidate = Key.from_string(item)
        if isinstance(candidate, Key):
            resolved.append(candidate.value)
        else:
            resolved.append(candidate)

    await self._send_command("hotkey", {"keys": resolved})
# Scrolling Actions
async def scroll_down(self, clicks: int = 1) -> None:
for _ in range(clicks):
await self.hotkey("pagedown")
await self._send_command("scroll_down", {"clicks": clicks})
async def scroll_up(self, clicks: int = 1) -> None:
for _ in range(clicks):
await self.hotkey("pageup")
await self._send_command("scroll_up", {"clicks": clicks})
# Screen Actions
async def screenshot(

View File

@@ -0,0 +1 @@
"""UI modules for the Computer Interface."""

View File

@@ -0,0 +1,6 @@
"""Gradio UI for Computer UI."""
import gradio as gr
from typing import Optional
from .app import create_gradio_ui

File diff suppressed because it is too large Load Diff

View File

@@ -21,6 +21,12 @@ dependencies = [
]
requires-python = ">=3.10"
[project.optional-dependencies]
ui = [
"gradio>=5.23.3,<6.0.0",
"python-dotenv>=1.0.1,<2.0.0",
]
[tool.pdm]
distribution = true