diff --git a/docs/content/docs/get-started/quickstart.mdx b/docs/content/docs/get-started/quickstart.mdx index cea2b335..894856a2 100644 --- a/docs/content/docs/get-started/quickstart.mdx +++ b/docs/content/docs/get-started/quickstart.mdx @@ -332,253 +332,151 @@ Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available m ## CLI Quickstart +Get started quickly with the CUA CLI - the easiest way to manage cloud VMs and run AI agents. + -### Install Cua +### Install the CUA CLI - - - - -#### Install uv - - - - -```bash -# Use curl to download the script and execute it with sh: -curl -LsSf https://astral.sh/uv/install.sh | sh - -# If your system doesn't have curl, you can use wget: -# wget -qO- https://astral.sh/uv/install.sh | sh -``` - - - - -```powershell -# Use irm to download the script and execute it with iex: -powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" -``` - - + + + ```bash + curl -LsSf https://cua.ai/cli/install.sh | sh + ``` + + + ```powershell + powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex" + ``` + + + ```bash + npm install -g @trycua/cli + ``` + + + ```bash + # Install Bun (macOS/Linux) + curl -fsSL https://bun.sh/install | bash + + # Install Bun (Windows) + # powershell -c "irm bun.sh/install.ps1|iex" + + # Clone the repo + git clone https://github.com/trycua/cua + cd cua/libs/typescript/cua-cli + + # Install the CLI + bun install + bun link + bun link cua-cli + ``` + -#### Install Python 3.12 - -```bash -uv python install 3.12 -# uv will install Cua dependencies automatically when you use --with "cua-agent[cli]" -``` - - - - - -#### Install conda - - - - -```bash -mkdir -p ~/miniconda3 -curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh -o ~/miniconda3/miniconda.sh -bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 -rm ~/miniconda3/miniconda.sh -source ~/miniconda3/bin/activate -``` - - - - -```bash -mkdir -p ~/miniconda3 -wget 
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh -bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 -rm ~/miniconda3/miniconda.sh -source ~/miniconda3/bin/activate -``` - - - - -```powershell -wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe" -outfile ".\miniconda.exe" -Start-Process -FilePath ".\miniconda.exe" -ArgumentList "/S" -Wait -del .\miniconda.exe -``` - - - - -#### Create and activate Python 3.12 environment - -```bash -conda create -n cua python=3.12 -conda activate cua -``` - -#### Install Cua - -```bash -pip install "cua-agent[cli]" cua-computer -``` - - - - - -#### Install Cua - -```bash -pip install "cua-agent[cli]" cua-computer -``` - - - - - -### Run Cua CLI +### Authenticate with CUA -Choose your preferred AI model: - -#### OpenAI Computer Use Preview - - - +Login to your CUA account: ```bash -uv run --with "cua-agent[cli]" -m agent.cli openai/computer-use-preview +# Interactive browser login (recommended) +cua auth login + +# Or provide your API key directly +cua auth login --api-key sk-your-api-key-here ``` - - +If you don't have a CUA account yet, sign up at [cua.ai/signin](https://cua.ai/signin). 
+ + + + + +### Create Your First VM + +Create a cloud sandbox where your AI agents will run: ```bash -python -m agent.cli openai/computer-use-preview +# Create a Linux VM (recommended for most use cases) +cua vm create --os linux --configuration small --region north-america + +# Or create a Windows VM +cua vm create --os windows --configuration small --region north-america + +# Or create a macOS VM +cua vm create --os macos --configuration small --region north-america ``` - - +Your VM will be created and you'll see output like: +``` +VM created and ready: my-vm-abc123 +Password: secure-password-here +Host: my-vm-abc123.containers.cloud.trycua.com +``` -#### Anthropic Claude + - - + +### Start Using Your VM + +You can now interact with your VM in multiple ways: + +#### Option 1: Open the AI Playground (Recommended) ```bash -uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-5-20250929 -uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-20250514 -uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-opus-4-1-20250805 -uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-sonnet-4-20250514 -uv run --with "cua-agent[cli]" -m agent.cli anthropic/claude-3-5-sonnet-20241022 +cua vm chat my-vm-abc123 ``` +This opens the full CUA playground in your browser where you can chat with AI agents that control your VM. - - - +#### Option 2: Access VNC Desktop ```bash -python -m agent.cli anthropic/claude-sonnet-4-5-20250929 -python -m agent.cli anthropic/claude-opus-4-1-20250805 -python -m agent.cli anthropic/claude-opus-4-20250514 -python -m agent.cli anthropic/claude-sonnet-4-20250514 -python -m agent.cli anthropic/claude-3-5-sonnet-20241022 +cua vm vnc my-vm-abc123 ``` +This opens a remote desktop connection to your VM. 
- - - -#### Omniparser + LLMs - - - - +#### Option 3: List and Manage VMs ```bash -uv run --with "cua-agent[cli]" -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022 -uv run --with "cua-agent[cli]" -m agent.cli omniparser+openai/gpt-4o -uv run --with "cua-agent[cli]" -m agent.cli omniparser+vertex_ai/gemini-pro +# List all your VMs +cua vm list + +# Start/stop VMs as needed +cua vm stop my-vm-abc123 +cua vm start my-vm-abc123 + +# Delete VMs when done +cua vm delete my-vm-abc123 ``` - - + -```bash -python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022 -python -m agent.cli omniparser+openai/gpt-4o -python -m agent.cli omniparser+vertex_ai/gemini-pro -``` + - - +### Try Some AI Tasks -#### Local Models - - - - -```bash -# Hugging Face models (local) -uv run --with "cua-agent[cli]" -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B - -# MLX models (Apple Silicon) -uv run --with "cua-agent[cli]" -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit - -# Ollama models -uv run --with "cua-agent[cli]" -m agent.cli omniparser+ollama_chat/llama3.2:latest -``` - - - - -```bash -# Hugging Face models (local) -python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B - -# MLX models (Apple Silicon) -python -m agent.cli mlx/mlx-community/UI-TARS-1.5-7B-6bit - -# Ollama models -python -m agent.cli omniparser+ollama_chat/llama3.2:latest -``` - - - - -#### Interactive Setup - -If you haven't set up environment variables, the CLI will guide you through the setup: - -1. **Sandbox Name**: Enter your Cua sandbox name (or get one at [cua.ai](https://cua.ai/)) -2. **CUA API Key**: Enter your Cua API key -3. **Provider API Key**: Enter your AI provider API key (OpenAI, Anthropic, etc.) - -#### Start Chatting - -Once connected, you'll see: - -``` -💻 Connected to your-container-name (model, agent_loop) -Type 'exit' to quit. 
- -> -``` - -You can ask your agent to perform actions like: +Once you have the playground open (`cua vm chat`), try asking the AI to: - "Take a screenshot and tell me what's on the screen" -- "Open Firefox and go to github.com" -- "Type 'Hello world' into the terminal" -- "Close the current window" -- "Click on the search button" +- "Open Firefox and navigate to github.com" +- "Create a new text file and write 'Hello World' in it" +- "Install Python and run a simple script" +- "Take a screenshot of the desktop" + +The AI agent will automatically control your VM to complete these tasks! +### What's Next? + +- **Explore more commands**: Check out the [complete CLI reference](/libraries/cua-cli/commands) +- **Learn about programming**: Try the [Developer Quickstart](#developer-quickstart) to build custom automations +- **Join the community**: Get help in our [Discord community](https://discord.com/invite/mVnXXpdE85) + --- For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models). diff --git a/docs/content/docs/libraries/cua-cli/commands.mdx b/docs/content/docs/libraries/cua-cli/commands.mdx new file mode 100644 index 00000000..c60d5a12 --- /dev/null +++ b/docs/content/docs/libraries/cua-cli/commands.mdx @@ -0,0 +1,320 @@ +--- +title: Commands +description: Complete reference for all CUA CLI commands +--- + +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { Callout } from 'fumadocs-ui/components/callout'; + +## Overview + +The CUA CLI provides two main command groups: + +- **`cua auth`** - Authentication and API key management +- **`cua vm`** - Virtual machine lifecycle management + +## Authentication Commands + +### `cua auth login` + +Authenticate with your CUA account using browser-based OAuth flow. 
+ +```bash +# Interactive browser login +cua auth login + +# Direct API key login +cua auth login --api-key sk-your-api-key-here +``` + +**Options:** +- `--api-key ` - Provide API key directly instead of browser flow + +**Example:** +```bash +$ cua auth login +Opening browser for CLI auth... +API key saved +``` + +### `cua auth pull` + +Create or update a `.env` file in the current directory with your CUA API key. + +```bash +cua auth pull +``` + +**Example:** +```bash +$ cua auth pull +Wrote /path/to/your/project/.env +``` + +The generated `.env` file will contain: +``` +CUA_API_KEY=sk-your-api-key-here +``` + +### `cua auth logout` + +Remove the stored API key from your system. + +```bash +cua auth logout +``` + +**Example:** +```bash +$ cua auth logout +Logged out +``` + +## Virtual Machine Commands + +### `cua vm list` + +List all your virtual machines with their current status. + +```bash +cua vm list +``` + +**Example Output:** +``` +┌─────────────────┬──────────┬─────────┬─────────────────┬──────────────────────────────────────────┐ +│ Name            │ Status   │ OS      │ Configuration   │ Host                                     │ +├─────────────────┼──────────┼─────────┼─────────────────┼──────────────────────────────────────────┤ +│ my-dev-vm       │ running  │ linux   │ small           │ my-dev-vm.containers.cloud.trycua.com    │ +│ test-windows    │ stopped  │ windows │ medium          │ test-windows.containers.cloud.trycua.com │ 
+└─────────────────┴──────────┴─────────┴─────────────────┴──────────────────────────────────────────┘ +``` + +### `cua vm create` + +Create a new virtual machine. + +```bash +cua vm create --os --configuration --region +``` + +**Required Options:** +- `--os` - Operating system: `linux`, `windows`, `macos` +- `--configuration` - VM size: `small`, `medium`, `large` +- `--region` - Region: `north-america`, `europe`, `asia-pacific`, `south-america` + +**Examples:** +```bash +# Create a small Linux VM in North America +cua vm create --os linux --configuration small --region north-america + +# Create a medium Windows VM in Europe +cua vm create --os windows --configuration medium --region europe + +# Create a large macOS VM in Asia Pacific +cua vm create --os macos --configuration large --region asia-pacific +``` + +**Response Types:** + +**Immediate (Status 200):** +```bash +VM created and ready: my-new-vm-abc123 +Password: secure-password-here +Host: my-new-vm-abc123.containers.cloud.trycua.com +``` + +**Provisioning (Status 202):** +```bash +VM provisioning started: my-new-vm-abc123 +Job ID: job-xyz789 +Use 'cua vm list' to monitor provisioning progress +``` + +### `cua vm start` + +Start a stopped virtual machine. + +```bash +cua vm start +``` + +**Example:** +```bash +$ cua vm start my-dev-vm +Start accepted +``` + +### `cua vm stop` + +Stop a running virtual machine. + +```bash +cua vm stop +``` + +**Example:** +```bash +$ cua vm stop my-dev-vm +stopping +``` + +### `cua vm restart` + +Restart a virtual machine. + +```bash +cua vm restart +``` + +**Example:** +```bash +$ cua vm restart my-dev-vm +restarting +``` + +### `cua vm delete` + +Delete a virtual machine permanently. 
+ +```bash +cua vm delete +``` + +**Example:** +```bash +$ cua vm delete old-test-vm +VM deletion initiated: deleting +``` + + + This action is irreversible. All data on the VM will be permanently lost. + + +### `cua vm vnc` + +Open the VNC interface for a VM in your browser. + +```bash +cua vm vnc +``` + +**Example:** +```bash +$ cua vm vnc my-dev-vm +Opening NoVNC: https://my-dev-vm.containers.cloud.trycua.com/vnc.html?autoconnect=true&password=... +``` + +This command automatically opens your default browser to the VNC interface with the correct password pre-filled. + +### `cua vm chat` + +Open the CUA Dashboard Playground for a VM in your browser. + +```bash +cua vm chat +``` + +**Example:** +```bash +$ cua vm chat my-dev-vm +Opening Playground: https://cua.ai/dashboard/playground?host=... +``` + +This opens the full CUA playground interface where you can interact with your VM using AI agents. + +## Global Options + +### Help + +Get help for any command: + +```bash +cua --help +cua auth --help +cua vm --help +cua vm create --help +``` + +### Environment Variables + +You can override default endpoints using environment variables: + +```bash +# Use staging environment +export CUA_API_BASE=https://api.staging.cua.ai +export CUA_WEBSITE_URL=https://staging.cua.ai + +cua vm list # Uses staging API +``` + +**Available Variables:** +- `CUA_API_BASE` - API endpoint (default: `https://api.cua.ai`) +- `CUA_WEBSITE_URL` - Website URL (default: `https://cua.ai`) + +## Error Handling + +The CLI provides clear error messages for common issues: + +### Authentication Errors +```bash +$ cua vm list +Unauthorized. Try 'cua auth login' again. +``` + +### VM Not Found +```bash +$ cua vm start nonexistent-vm +VM not found +``` + +### Invalid Configuration +```bash +$ cua vm create --os invalid --configuration small --region north-america +Invalid request or unsupported configuration +``` + +## Tips and Best Practices + +### 1. 
Use Descriptive VM Names +```bash +# Good +cua vm create --os linux --configuration small --region north-america +# Then rename or use meaningful names in the dashboard + +# Better workflow +cua vm list # Check the generated name +# Use that name consistently +``` + +### 2. Environment Management +```bash +# Set up your project with API key +cd my-project +cua auth pull +# Now your project has CUA_API_KEY in .env +``` + +### 3. Quick VM Access +```bash +# Create aliases for frequently used VMs +alias dev-vm="cua vm chat my-development-vm" +alias prod-vm="cua vm vnc my-production-vm" +``` + +### 4. Monitoring Provisioning +```bash +# For VMs that need provisioning time +cua vm create --os windows --configuration large --region europe +# VM provisioning started: my-vm-abc123 +# Job ID: job-xyz789 + +# Check status periodically +watch -n 5 cua vm list +``` + +## Next Steps + +- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart) +- [Learn about CUA computers](/computer-sdk/computers) +- [Explore agent automation](/agent-sdk/agent-loops) diff --git a/docs/content/docs/libraries/cua-cli/index.mdx b/docs/content/docs/libraries/cua-cli/index.mdx new file mode 100644 index 00000000..3fe90c0f --- /dev/null +++ b/docs/content/docs/libraries/cua-cli/index.mdx @@ -0,0 +1,58 @@ +--- +title: Cua CLI +description: Command-line interface for managing Cua cloud VMs and authentication +--- + +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; + +The Cua CLI is a command-line tool that provides an intuitive interface for managing your Cua cloud virtual machines and authentication. It offers a streamlined workflow for creating, managing, and connecting to cloud sandboxes. 
+ +## Key Features + +- **Authentication Management**: Secure login with browser-based OAuth flow +- **VM Lifecycle**: Create, start, stop, restart, and delete cloud VMs +- **Quick Access**: Direct links to VNC and playground interfaces +- **Cross-Platform**: Works on macOS, Linux, and Windows +- **Environment Integration**: Automatic `.env` file generation + +## Quick Example + +```bash +# Install the CLI (installs Bun + CUA CLI) +curl -LsSf https://cua.ai/cli/install.sh | sh + +# Login to your CUA account +cua auth login + +# Create a new Linux VM +cua vm create --os linux --configuration small --region north-america + +# List your VMs +cua vm list + +# Open the playground for your VM +cua vm chat my-vm-name +``` + +## Use Cases + +### Development Workflow +- Quickly spin up cloud sandboxes for testing +- Manage multiple VMs across different regions +- Integrate with CI/CD pipelines + +### Team Collaboration +- Share VM configurations and access +- Standardize development environments +- Quick onboarding for new team members + +### Automation +- Script VM provisioning and management +- Integrate with deployment workflows +- Automate environment setup + +## Next Steps + +- [Install the CLI](/libraries/cua-cli/installation) +- [Learn about available commands](/libraries/cua-cli/commands) +- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart) diff --git a/docs/content/docs/libraries/cua-cli/installation.mdx b/docs/content/docs/libraries/cua-cli/installation.mdx new file mode 100644 index 00000000..5a2fd49b --- /dev/null +++ b/docs/content/docs/libraries/cua-cli/installation.mdx @@ -0,0 +1,152 @@ +--- +title: Installation +description: Install the CUA CLI on your system +--- + +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { Callout } from 'fumadocs-ui/components/callout'; + +## Quick Install + +The fastest way to install the CUA CLI is using our installation scripts: + + + + ```bash + curl -LsSf 
https://cua.ai/cli/install.sh | sh + ``` + + + ```powershell + powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex" + ``` + + + +These scripts will automatically: +1. Install [Bun](https://bun.sh) (a fast JavaScript runtime) +2. Install the CUA CLI via `bun add -g @trycua/cli` + + + The installation scripts will automatically detect your system and install the appropriate binary to your PATH. + + +## Alternative: npm Install + +You can also install the CLI via npm if you prefer: + +```bash +npm install -g @trycua/cli +``` + + + The npm package requires Node.js 18+ to be installed on your system. + + +## Verify Installation + +After installation, verify the CLI is working: + +```bash +cua --help +``` + +You should see the CLI help output with available commands. + +## First Time Setup + +After installation, you'll need to authenticate with your CUA account: + +```bash +# Login with browser-based OAuth flow +cua auth login + +# Or provide your API key directly +cua auth login --api-key sk-your-api-key-here +``` + +## Updating + +To update to the latest version: + + + + Re-run the installation script: + ```bash + # macOS/Linux + curl -LsSf https://cua.ai/cli/install.sh | sh + + # Windows + powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex" + ``` + + + ```bash + npm update -g @trycua/cli + ``` + + + +## Uninstalling + + + + Remove the binary from your PATH: + ```bash + # macOS/Linux + rm $(which cua) + + # Windows + # Remove from your PATH or delete the executable + ``` + + + ```bash + npm uninstall -g @trycua/cli + ``` + + + +## Troubleshooting + +### Command Not Found + +If you get a "command not found" error after installation: + +1. **Check your PATH**: Make sure the installation directory is in your PATH +2. **Restart your terminal**: Close and reopen your terminal/command prompt +3. 
**Manual PATH setup**: Add the installation directory to your PATH manually + +### Permission Issues + +If you encounter permission issues during installation: + + + + Try running with sudo (not recommended for the curl method): + ```bash + # If using npm + sudo npm install -g @trycua/cli + ``` + + + Run PowerShell as Administrator: + ```powershell + # Right-click PowerShell and "Run as Administrator" + powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex" + ``` + + + +### Network Issues + +If the installation script fails due to network issues: + +1. **Check your internet connection** +2. **Try the npm installation method instead** +3. **Check if your firewall is blocking the download** + +## Next Steps + +- [Learn about CLI commands](/libraries/cua-cli/commands) +- [Follow the quickstart guide](/get-started/quickstart#cli-quickstart) diff --git a/docs/content/docs/libraries/cua-cli/meta.json b/docs/content/docs/libraries/cua-cli/meta.json new file mode 100644 index 00000000..8c40c41d --- /dev/null +++ b/docs/content/docs/libraries/cua-cli/meta.json @@ -0,0 +1,9 @@ +{ + "title": "CLI", + "description": "Command-line interface for CUA", + "pages": [ + "index", + "installation", + "commands" + ] +} \ No newline at end of file diff --git a/libs/python/agent/.bumpversion.cfg b/libs/python/agent/.bumpversion.cfg index ab6acb97..ef4bfda4 100644 --- a/libs/python/agent/.bumpversion.cfg +++ b/libs/python/agent/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.37 +current_version = 0.4.38 commit = True tag = True tag_name = agent-v{new_version} diff --git a/libs/python/agent/agent/loops/__init__.py b/libs/python/agent/agent/loops/__init__.py index ab23ac27..1fdb2c22 100644 --- a/libs/python/agent/agent/loops/__init__.py +++ b/libs/python/agent/agent/loops/__init__.py @@ -1,36 +1,40 @@ -""" -Agent loops for agent -""" - -# Import the loops to register them -from . 
# Matches "(x, y)" where x and y are optionally-signed integers or decimals.
# Whitespace around the numbers is tolerated because model output is free text.
_POINT_PATTERN = re.compile(r"\(\s*(-?\d*\.?\d+)\s*,\s*(-?\d*\.?\d+)\s*\)")


def _find_point(raw_string: Any) -> Optional[Tuple[float, float]]:
    """Return the first ``(x, y)`` pair in *raw_string* as floats, or ``None``.

    ``None`` is returned when the input is not a string or holds no pair, so a
    caller can tell "no prediction" apart from a genuine ``(0, 0)`` prediction.
    """
    if not isinstance(raw_string, str):
        return None
    match = _POINT_PATTERN.search(raw_string)
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))


def extract_coordinates(raw_string):
    """
    Extract the first coordinate pair from the raw string.

    Args:
        raw_string: str (e.g. "(100, 200)" or "(100.5, 200.25)")
    Returns:
        x: float (e.g. 100.0)
        y: float (e.g. 200.0)

        ``(0, 0)`` is returned when no pair can be found (including when the
        input is not a string), preserving the historical fallback.
    """
    # The pattern accepts decimals, so the values are converted with float();
    # converting with int() would reject every decimal the pattern matched.
    point = _find_point(raw_string)
    if point is None:
        return 0, 0
    return point


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils.

    Scales ``height`` x ``width`` so the pixel count lies within
    ``[min_pixels, max_pixels]`` (aspect ratio preserved up to rounding), then
    rounds each side down to a multiple of ``factor``.

    Returns:
        ``(new_height, new_width)``; each side is at least ``factor``.
    """
    total_pixels = height * width

    # Rescale only when the pixel budget is violated.
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    elif total_pixels < min_pixels:
        scale = (min_pixels / total_pixels) ** 0.5
    else:
        scale = None

    if scale is not None:
        height = int(height * scale)
        width = int(width * scale)

    # Round down to a multiple of factor. The clamp applies on every path: a
    # side shorter than factor would otherwise become 0, which breaks both the
    # resize and the later scale computation.
    new_height = max((height // factor) * factor, factor)
    new_width = max((width // factor) * factor, factor)

    return new_height, new_width


@register_agent(models=r".*Gelato.*")
class GelatoConfig(AsyncAgentConfig):
    """Gelato agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Not supported: this configuration only implements ``predict_click``."""
        raise NotImplementedError("GelatoConfig only supports predict_click")

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using the Gelato model via litellm.acompletion.

        Args:
            model: The Gelato model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Extra arguments forwarded to ``litellm.acompletion``;
                they override the defaults set here.

        Returns:
            Tuple of (x, y) coordinates in the original image's pixel space,
            or None if no coordinate pair can be parsed from the response
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors that map a point on the resized image back to the original.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_PROMPT.strip()}],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs; caller-supplied kwargs come last so they win.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Report an unparseable (or missing) response as None instead of
        # silently clicking at the (0, 0) fallback of extract_coordinates.
        point = _find_point(output_text)
        if point is None:
            return None

        # Rescale from resized-image space to original-image space
        pred_x, pred_y = point
        return (math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
a/libs/python/agent/agent/loops/uiins.py b/libs/python/agent/agent/loops/uiins.py new file mode 100644 index 00000000..10956948 --- /dev/null +++ b/libs/python/agent/agent/loops/uiins.py @@ -0,0 +1,175 @@ +""" +UI-Ins agent loop implementation for click prediction using litellm.acompletion +Paper: https://arxiv.org/pdf/2510.20286 +Code: https://github.com/alibaba/UI-Ins +""" + +import asyncio +import base64 +import json +import math +import re +import uuid +from io import BytesIO +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union + +import litellm +from PIL import Image + +from ..decorators import register_agent +from ..loops.base import AsyncAgentConfig +from ..types import AgentCapability, AgentResponse, Messages, Tools + +SYSTEM_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.\n\n## Output Format\nReturn a json object with a reasoning process in tags, a function name and arguments within XML tags:\n```\n\n...\n\n\n{"name": "grounding", "arguments": }\n\n```\n represents the following item of the action space:\n## Action Space{"action": "click", "coordinate": [x, y]}\nYour task is to accurately locate a UI element based on the instruction. 
def parse_coordinates(raw_string: str) -> tuple[int, int]:
    """Return the first ``[x, y]`` integer pair found in *raw_string*.

    Returns ``(-1, -1)`` when no pair is present. The pattern only matches
    unsigned integers, so the sentinel can never collide with a parsed result.
    """
    match = re.search(r"\[(\d+),\s*(\d+)\]", raw_string)
    if match is None:
        return -1, -1
    return int(match.group(1)), int(match.group(2))


def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 3136,
    max_pixels: int = 8847360,
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils.

    Scales ``height`` x ``width`` so the pixel count lies within
    ``[min_pixels, max_pixels]`` (aspect ratio preserved up to rounding), then
    rounds each side down to a multiple of ``factor``.

    Returns:
        ``(new_height, new_width)``; each side is at least ``factor``.
    """
    total_pixels = height * width

    # Rescale only when the pixel budget is violated.
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    elif total_pixels < min_pixels:
        scale = (min_pixels / total_pixels) ** 0.5
    else:
        scale = None

    if scale is not None:
        height = int(height * scale)
        width = int(width * scale)

    # Round down to a multiple of factor. The clamp applies on every path: a
    # side shorter than factor would otherwise become 0, which breaks both the
    # resize and the later scale computation.
    new_height = max((height // factor) * factor, factor)
    new_width = max((width // factor) * factor, factor)

    return new_height, new_width


@register_agent(models=r".*UI-Ins.*")
class UIInsConfig(AsyncAgentConfig):
    """UI-Ins agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Not supported: this configuration only implements ``predict_click``."""
        raise NotImplementedError("UIInsConfig only supports predict_click")

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using UI-Ins model via litellm.acompletion.

        Args:
            model: The UI-Ins model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click
            **kwargs: Extra arguments forwarded to ``litellm.acompletion``;
                they override the defaults set here.

        Returns:
            Tuple of (x, y) coordinates in the original image's pixel space,
            or None if no coordinate pair can be parsed from the response
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        # Factors that map a point on the resized image back to the original.
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful assistant."},
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs; caller-supplied kwargs come last so they win.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # A response without text content cannot contain a prediction.
        if not isinstance(output_text, str):
            return None

        pred_x, pred_y = parse_coordinates(output_text)
        # parse_coordinates signals "no match" with (-1, -1); report that as
        # None rather than scaling the sentinel into a negative click point.
        if pred_x < 0 or pred_y < 0:
            return None

        # Rescale from resized-image space to original-image space
        return (math.floor(pred_x * scale_x), math.floor(pred_y * scale_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]