diff --git a/docs/content/docs/home/faq.mdx b/docs/content/docs/home/faq.mdx index 99327b8f..ef3b1098 100644 --- a/docs/content/docs/home/faq.mdx +++ b/docs/content/docs/home/faq.mdx @@ -1,6 +1,6 @@ --- title: FAQ -description: C/ua's frequently asked questions. Find answers to the most common issues or questions when using C/ua tools. +description: Find answers to the most common issues or questions when using C/ua tools. icon: CircleQuestionMark --- diff --git a/docs/content/docs/home/guides/meta.json b/docs/content/docs/home/guides/meta.json index 65720bbf..6a477702 100644 --- a/docs/content/docs/home/guides/meta.json +++ b/docs/content/docs/home/guides/meta.json @@ -1,5 +1,5 @@ { - "title": "C/ua", - "description": "C/ua", - "icon": "House" + "title": "Guides", + "description": "Guides", + "icon": "BookCopy" } \ No newline at end of file diff --git a/docs/content/docs/home/index.mdx b/docs/content/docs/home/index.mdx index 7a756852..8d06745d 100644 --- a/docs/content/docs/home/index.mdx +++ b/docs/content/docs/home/index.mdx @@ -11,20 +11,16 @@ C/ua is a collection of cross-platform libraries and tools for building Computer Read our guide on getting started with a Computer-Use Agent. - + Get started using C/ua services on your machine. - + Set up a development environment with the Dev Container. diff --git a/docs/content/docs/home/libraries/agent.mdx b/docs/content/docs/home/libraries/agent.mdx deleted file mode 100644 index cc9ecf4e..00000000 --- a/docs/content/docs/home/libraries/agent.mdx +++ /dev/null @@ -1,266 +0,0 @@ ---- -title: Agent ---- - -
-    [badges: Python, macOS, Discord, PyPI]
-</div>
-
-<div align="center">
-    [badge: Reference]
-</div>
- -**cua-agent** is a general Computer-Use framework for running multi-app agentic workflows targeting macOS and Linux sandbox created with C/ua, supporting local (Ollama) and cloud model providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). - -### Get started with Agent - -
- -
- -## Install - -```bash -pip install "cua-agent[all]" - -# or install specific loop providers -pip install "cua-agent[openai]" # OpenAI Cua Loop -pip install "cua-agent[anthropic]" # Anthropic Cua Loop -pip install "cua-agent[uitars]" # UI-Tars support -pip install "cua-agent[omni]" # Cua Loop based on OmniParser (includes Ollama for local models) -pip install "cua-agent[ui]" # Gradio UI for the agent - -# For local UI-TARS with MLX support, you need to manually install mlx-vlm: -pip install "cua-agent[uitars-mlx]" -pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id # PR: https://github.com/Blaizzy/mlx-vlm/pull/349 -``` - -## Run - -```bash -async with Computer() as macos_computer: - # Create agent with loop and provider - agent = ComputerAgent( - computer=macos_computer, - loop=AgentLoop.OPENAI, - model=LLM(provider=LLMProvider.OPENAI) - # or - # loop=AgentLoop.ANTHROPIC, - # model=LLM(provider=LLMProvider.ANTHROPIC) - # or - # loop=AgentLoop.OMNI, - # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") - # or - # loop=AgentLoop.UITARS, - # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") - ) - - tasks = [ - "Look for a repository named trycua/cua on GitHub.", - "Check the open issues, open the most recent one and read it.", - "Clone the repository in users/lume/projects if it doesn't exist yet.", - "Open the repository with an app named Cursor (on the dock, black background and white cube icon).", - "From Cursor, open Composer if not already open.", - "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.", - ] - - for i, task in enumerate(tasks): - print(f"\nExecuting task {i}/{len(tasks)}: {task}") - async for result in agent.run(task): - print(result) - - print(f"\n✅ Task {i+1}/{len(tasks)} completed: {task}") -``` - -Refer to these notebooks for step-by-step guides on how to use the Computer-Use Agent (CUA): - -- [Agent Notebook](https://github.com/trycua/cua/tree/main/notebooks/agent_nb.ipynb) - Complete examples and workflows - -## Using the Gradio UI - -The agent includes a Gradio-based user interface for easier interaction. - -
- -
- -To use it: - -```bash -# Install with Gradio support -pip install "cua-agent[ui]" -``` - -### Create a simple launcher script - -```python -# launch_ui.py -from agent.ui.gradio.app import create_gradio_ui - -app = create_gradio_ui() -app.launch(share=False) -``` - -### Setting up API Keys - -For the Gradio UI to show available models, you need to set API keys as environment variables: - -```bash -# For OpenAI models -export OPENAI_API_KEY=your_openai_key_here - -# For Anthropic models -export ANTHROPIC_API_KEY=your_anthropic_key_here - -# Launch with both keys set -OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py -``` - -Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider. - -### Using Local Models - -You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio. - -If you're using a different local model server: - -- vLLM: `http://localhost:8000/v1` -- LocalAI: `http://localhost:8080/v1` -- Ollama with OpenAI compat API: `http://localhost:11434/v1` - -The Gradio UI provides: - -- Selection of different agent loops (OpenAI, Anthropic, OMNI) -- Model selection for each provider -- Configuration of agent parameters -- Chat interface for interacting with the agent - -### Using UI-TARS - -The UI-TARS models are available in two forms: - -1. **MLX UI-TARS models** (Default): These models run locally using MLXVLM provider - - - `mlx-community/UI-TARS-1.5-7B-4bit` (default) - 4-bit quantized version - - `mlx-community/UI-TARS-1.5-7B-6bit` - 6-bit quantized version for higher quality - - ```python - agent = ComputerAgent( - computer=macos_computer, - loop=AgentLoop.UITARS, - model=LLM(provider=LLMProvider.MLXVLM, name="mlx-community/UI-TARS-1.5-7B-4bit") - ) - ``` - -2. **OpenAI-compatible UI-TARS**: For using the original ByteDance model - - - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) - - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI: - - ```python - agent = ComputerAgent( - computer=macos_computer, - loop=AgentLoop.UITARS, - model=LLM(provider=LLMProvider.OAICOMPAT, name="tgi", - provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") - ) - ``` - -## Agent Loops - -The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques: - -| Agent Loop | Supported Models | Description | Set-Of-Marks | -| :-------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------- | :----------- | -| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required | -| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required | -| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)
• `mlx-community/UI-TARS-1.5-7B-6bit`
• `ByteDance-Seed/UI-TARS-1.5-7B` (via openAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required | -| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219`
• `gpt-4.5-preview`
• `gpt-4o`
• `gpt-4`
• `phi4`
• `phi4-mini`
• `gemma3`
• `...`
• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | - -## AgentResponse - -The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops. - -```python -async for result in agent.run(task): - print("Response ID: ", result.get("id")) - - # Print detailed usage information - usage = result.get("usage") - if usage: - print("\nUsage Details:") - print(f" Input Tokens: {usage.get('input_tokens')}") - if "input_tokens_details" in usage: - print(f" Input Tokens Details: {usage.get('input_tokens_details')}") - print(f" Output Tokens: {usage.get('output_tokens')}") - if "output_tokens_details" in usage: - print(f" Output Tokens Details: {usage.get('output_tokens_details')}") - print(f" Total Tokens: {usage.get('total_tokens')}") - - print("Response Text: ", result.get("text")) - - # Print tools information - tools = result.get("tools") - if tools: - print("\nTools:") - print(tools) - - # Print reasoning and tool call outputs - outputs = result.get("output", []) - for output in outputs: - output_type = output.get("type") - if output_type == "reasoning": - print("\nReasoning Output:") - print(output) - elif output_type == "computer_call": - print("\nTool Call Output:") - print(output) -``` - -**Note on Settings Persistence:** - -- The Gradio UI automatically saves your configuration (Agent Loop, Model Choice, Custom Base URL, Save Trajectory state, Recent Images count) to a file named `.gradio_settings.json` in the project's root directory when you successfully run a task. -- This allows your preferences to persist between sessions. -- API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file. -- It's recommended to add `.gradio_settings.json` to your `.gitignore` file. diff --git a/docs/content/docs/home/libraries/agent/agent-gradio-ui.mdx b/docs/content/docs/home/libraries/agent/agent-gradio-ui.mdx new file mode 100644 index 00000000..3ff6e2f4 --- /dev/null +++ b/docs/content/docs/home/libraries/agent/agent-gradio-ui.mdx @@ -0,0 +1,158 @@ +--- +title: Gradio UI with the Python Agent +description: The agent module includes a Gradio-based user interface for easier interaction with Computer-Use Agent workflows. +--- + +The agent includes a Gradio-based user interface for easier interaction. + +
+ +
+ +## Install + +```bash +# Install with Gradio support +pip install "cua-agent[ui]" +``` + +## Create a simple launcher script + +```python +# launch_ui.py +from agent.ui.gradio.app import create_gradio_ui + +app = create_gradio_ui() +app.launch(share=False) +``` + +### Run the launcher + +```bash +python launch_ui.py +``` + +This will start the Gradio interface on `http://localhost:7860`. + +## Features + +The Gradio UI provides: + +- **Model Selection**: Choose between different AI models and providers +- **Task Input**: Enter tasks for the agent to execute +- **Real-time Output**: View the agent's actions and results as they happen +- **Screenshot Display**: See visual feedback from the computer screen +- **Settings Management**: Configure and save your preferred settings + +## Supported Providers + +1. **OpenAI**: GPT-4 and GPT-4 Vision models +2. **Anthropic**: Claude models +3. **Ollama**: Local models like Gemma3 +4. **UI-TARS**: Specialized UI understanding models + +### Using UI-TARS + +UI-TARS is a specialized model for UI understanding tasks. You have two options: + +1. **Local MLX UI-TARS**: For running the model locally on Apple Silicon + + ```bash + # Install MLX support + pip install "cua-agent[uitars-mlx]" + pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id + ``` + + Then select "UI-TARS (MLX)" in the Gradio interface. + +2. **OpenAI-compatible UI-TARS**: For using the original ByteDance model + + - If you want to use the original ByteDance UI-TARS model via an OpenAI-compatible API, follow the [deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) + - This will give you a provider URL like `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1` which you can use in the code or Gradio UI: + + ```python + agent = ComputerAgent( + computer=macos_computer, + loop=AgentLoop.UITARS, + model=LLM( + provider=LLMProvider.OAICOMPAT, + name="ByteDance-Seed/UI-TARS-1.5-7B", + provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1" + ) + ) + ``` + + Or in the Gradio UI, select "OpenAI Compatible" and enter: + - Model Name: `ByteDance-Seed/UI-TARS-1.5-7B` + - Base URL: Your deployment URL + - API Key: Your API key (if required) + +## Advanced Configuration + +### Custom Provider Settings + +You can configure custom providers in the UI: + +1. Select "OpenAI Compatible" from the provider dropdown +2. Enter your custom model name, base URL, and API key +3. The settings will be saved for future sessions + +## Environment Variables + +Set API keys as environment variables for security: + +```bash +export OPENAI_API_KEY="your-openai-key" +export ANTHROPIC_API_KEY="your-anthropic-key" +export GROQ_API_KEY="your-groq-key" +export DEEPSEEK_API_KEY="your-deepseek-key" +export QWEN_API_KEY="your-qwen-key" +``` + +Or use a `.env` file: + +```bash +# .env +OPENAI_API_KEY=your-openai-key +ANTHROPIC_API_KEY=your-anthropic-key +# ... other keys +``` + +## Settings Persistence + +The Gradio UI automatically saves your settings to `.gradio_settings.json` in your working directory. This includes: + +- Selected provider and model +- Custom provider configurations (URLs and model names) +- Other UI preferences + +**Note**: API keys entered into the custom provider field are **not** saved in this file for security reasons. Manage API keys using environment variables (e.g., `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`) or a `.env` file. 
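If you want to check what has been persisted, you can read the file directly. Below is a minimal sketch; it assumes the file sits in the current working directory, and the key names mentioned in the comments are illustrative guesses rather than a documented schema.

```python
# inspect_settings.py - peek at the saved Gradio UI settings
import json
from pathlib import Path

settings_path = Path(".gradio_settings.json")
if settings_path.exists():
    # Typical entries cover the selected agent loop, model choice, and custom
    # base URL (exact key names may vary between versions).
    settings = json.loads(settings_path.read_text())
    for key, value in settings.items():
        print(f"{key}: {value}")
else:
    print("No saved settings yet - run a task in the UI first.")
```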
+ +It's recommended to add `.gradio_settings.json` to your `.gitignore` file. + +## Example Usage + +Here's a complete example of using the Gradio UI with different providers: + +```python +# launch_ui_with_env.py +from agent.ui.gradio.app import create_gradio_ui +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Create and launch the UI +app = create_gradio_ui() +app.launch(share=False, server_port=7860) +``` + +Once launched, you can: + +1. Select your preferred AI provider and model +2. Enter a task like "Open a web browser and search for Python tutorials" +3. Click "Run" to execute the task +4. Watch the agent perform the actions in real-time +5. View screenshots and logs of the execution + +The UI makes it easy to experiment with different models and tasks without writing code for each interaction. diff --git a/docs/content/docs/home/libraries/agent/index.mdx b/docs/content/docs/home/libraries/agent/index.mdx new file mode 100644 index 00000000..b6f71909 --- /dev/null +++ b/docs/content/docs/home/libraries/agent/index.mdx @@ -0,0 +1,257 @@ +--- +title: Agent +description: The Computer-Use framework for running multi-app agentic workflows targeting macOS, Linux, and Windows sandboxes. +pypi: cua-computer +macos: true +windows: true +linux: true +--- + +import { buttonVariants } from 'fumadocs-ui/components/ui/button'; +import { cn } from 'fumadocs-ui/utils/cn'; +import { ChevronRight } from 'lucide-react'; + +**Agent** is a powerful Computer-Use framework that enables AI agents to interact with desktop applications and perform complex multi-step workflows across macOS, Linux, and Windows environments. Built on the C/ua platform, it supports both local models (via Ollama) and cloud providers (OpenAI, Anthropic, Groq, DeepSeek, Qwen). + +## Installation + +Install CUA Agent with pip. Choose the installation that matches your needs: + +### All Providers (Recommended) + +```bash +# Install everything you need +pip install "cua-agent[all]" +``` + +### Selective Installation + +```bash +# OpenAI models (GPT-4, Computer Use Preview) +pip install "cua-agent[openai]" + +# Anthropic models (Claude 3.5 Sonnet) +pip install "cua-agent[anthropic]" + +# Local UI-TARS models +pip install "cua-agent[uitars]" + +# OmniParser + Ollama for local models +pip install "cua-agent[omni]" + +# Gradio web interface +pip install "cua-agent[ui]" +``` + +### Advanced: Local UI-TARS with MLX + +```bash +pip install "cua-agent[uitars-mlx]" +pip install git+https://github.com/ddupont808/mlx-vlm.git@stable/fix/qwen2-position-id +``` + +### Requirements + +- Python 3.8+ +- macOS, Linux, or Windows +- For cloud providers: API keys (OpenAI, Anthropic, etc.) 
+- For local models: Sufficient RAM and compute resources + +## Getting Started + +### Basic Usage + +Here's a simple example to get you started with CUA Agent: + +```python +from cua_agent import ComputerAgent, AgentLoop, LLM, LLMProvider +from cua_computer import Computer + +# Set your API key +import os +os.environ["OPENAI_API_KEY"] = "your-api-key-here" + +async with Computer() as computer: + # Create agent with OpenAI + agent = ComputerAgent( + computer=computer, + loop=AgentLoop.OPENAI, + model=LLM(provider=LLMProvider.OPENAI) + ) + + # Run a simple task + async for result in agent.run("Open a text editor and write 'Hello, World!'"): + print(result.get("text")) +``` + +### Multi-Step Workflow + +```python +async with Computer() as computer: + # Create agent with your preferred provider + agent = ComputerAgent( + computer=computer, + loop=AgentLoop.OPENAI, # or ANTHROPIC, OMNI, UITARS + model=LLM(provider=LLMProvider.OPENAI) + ) + + # Define complex workflow + tasks = [ + "Look for a repository named trycua/cua on GitHub.", + "Check the open issues, open the most recent one and read it.", + "Clone the repository in users/lume/projects if it doesn't exist yet.", + "Open the repository with an app named Cursor.", + "From Cursor, open Composer and write a task to help resolve the GitHub issue.", + ] + + # Execute tasks sequentially + for i, task in enumerate(tasks): + print(f"\nExecuting task {i+1}/{len(tasks)}: {task}") + async for result in agent.run(task): + print(result.get("text")) + print(f"✅ Task {i+1} completed") +``` + +### Alternative Model Providers + +```python +# Anthropic Claude +agent = ComputerAgent( + computer=computer, + loop=AgentLoop.ANTHROPIC, + model=LLM(provider=LLMProvider.ANTHROPIC) +) + +# Local Ollama model +agent = ComputerAgent( + computer=computer, + loop=AgentLoop.OMNI, + model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") +) + +# UI-TARS model +agent = ComputerAgent( + computer=computer, + loop=AgentLoop.UITARS, + model=LLM( + provider=LLMProvider.OAICOMPAT, + name="ByteDance-Seed/UI-TARS-1.5-7B", + provider_base_url="https://your-endpoint.com/v1" + ) +) +``` + +## Agent Loops + +The `cua-agent` package provides three agent loops variations, based on different CUA models providers and techniques: + +| Agent Loop | Supported Models | Description | Set-Of-Marks | +| :-------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------- | :----------- | +| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required | +| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`
• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required | +| `AgentLoop.UITARS` | • `mlx-community/UI-TARS-1.5-7B-4bit` (default)
• `mlx-community/UI-TARS-1.5-7B-6bit`
• `ByteDance-Seed/UI-TARS-1.5-7B` (via OpenAI-compatible endpoint) | Uses UI-TARS models with MLXVLM (default) or OAICOMPAT providers | Not Required | +| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>
• `claude-3-7-sonnet-20250219`
• `gpt-4.5-preview`
• `gpt-4o`
• `gpt-4`
• `phi4`
• `phi4-mini`
• `gemma3`
• `...`
• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser | + +## Agent Response + +The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops. + +```typescript +interface AgentResponse { + id: string; + text: string; + usage?: { + input_tokens: number; + input_tokens_details?: { + text_tokens: number; + image_tokens: number; + }; + output_tokens: number; + output_tokens_details?: { + text_tokens: number; + reasoning_tokens: number; + }; + total_tokens: number; + }; + tools?: Array<{ + name: string; + description: string; + }>; + output?: Array<{ + type: 'reasoning' | 'computer_call'; + content?: string; // for reasoning type + tool_name?: string; // for computer_call type + parameters?: Record; // for computer_call type + result?: string; // for computer_call type + }>; +} +``` + +### Example Usage + +```python +async for result in agent.run(task): + print("Response ID: ", result.get("id")) + + # Print detailed usage information + usage = result.get("usage") + if usage: + print("\nUsage Details:") + print(f" Input Tokens: {usage.get('input_tokens')}") + if "input_tokens_details" in usage: + print(f" Input Tokens Details: {usage.get('input_tokens_details')}") + print(f" Output Tokens: {usage.get('output_tokens')}") + if "output_tokens_details" in usage: + print(f" Output Tokens Details: {usage.get('output_tokens_details')}") + print(f" Total Tokens: {usage.get('total_tokens')}") + + print("Response Text: ", result.get("text")) + + # Print tools information + tools = result.get("tools") + if tools: + print("\nTools:") + print(tools) + + # Print reasoning and tool call outputs + outputs = result.get("output", []) + for output in outputs: + output_type = output.get("type") + if output_type == "reasoning": + print("\nReasoning Output:") + print(output) + elif output_type == "computer_call": + print("\nTool Call Output:") + print(output) +``` + +## Examples & Guides + + + + Step-by-step instructions on using the Computer-Use Agent (CUA) + + + Use the Agent library with a Python Gradio UI + + + +--- + + + **Need detailed API documentation?** Explore the complete API reference with + detailed class documentation, and method signatures. + + View API Reference + + + diff --git a/docs/content/docs/home/libraries/computer.mdx b/docs/content/docs/home/libraries/computer.mdx deleted file mode 100644 index fffde4c5..00000000 --- a/docs/content/docs/home/libraries/computer.mdx +++ /dev/null @@ -1,275 +0,0 @@ ---- -title: Computer ---- - -import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; - -
-    [badges: Python, PyPI, TypeScript, NPM Version]
-</div>
-
-<div align="center">
-    [badge: Reference]
-</div>
- -**cua-computer** is a Computer-Use Interface (CUI) framework powering Cua for interacting with local macOS and Linux sandboxes. It's PyAutoGUI-compatible and pluggable with any AI agent systems (Cua, Langchain, CrewAI, AutoGen). Computer relies on [Lume](./lume.mdx) for creating and managing sandbox environments. - -
- -
- -## Key Features - -- Create and manage virtual machine sandboxes -- Take screenshots of the virtual machine -- Control mouse movements and clicks -- Simulate keyboard input -- Manage clipboard content -- Interact with the operating system interface -- Support for macOS and Linux environments - -## Installation - - - - - To install the Computer-Use Interface (CUI) for Python: - - ```bash - pip install "cua-computer[all]" - ``` - - The `cua-computer` PyPi package pulls automatically the latest executable version of Lume through [pylume](https://github.com/trycua/pylume). - - - - To install the Computer-Use Interface (CUI) for TypeScript: - ```bash - npm install @cua/computer - ``` - - - -## Quick Start - - - - ```python - from computer import Computer - - computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4") - try: - await computer.run() - - screenshot = await computer.interface.screenshot() - with open("screenshot.png", "wb") as f: - f.write(screenshot) - - await computer.interface.move_cursor(100, 100) - await computer.interface.left_click() - await computer.interface.right_click(300, 300) - await computer.interface.double_click(400, 400) - - await computer.interface.type("Hello, World!") - await computer.interface.press_key("enter") - - await computer.interface.set_clipboard("Test clipboard") - content = await computer.interface.copy_to_clipboard() - print(f"Clipboard content: {content}") - finally: - await computer.stop() - ``` - - - - ```typescript - import { Computer, OSType } from '@cua/computer'; - - const main = async () => { - // Create a cloud-based computer - const computer = new Computer({ - name: 'cloud-vm', - osType: OSType.Linux, - apiKey: 'your-api-key', - }); - - // Access the interface - const interface = computer.interface; - - // Screenshot operations - const screenshot = await interface.screenshot(); - - // Mouse operations - await interface.moveCursor(100, 100); - await interface.leftClick(); - await interface.rightClick(300, 300); - await interface.doubleClick(400, 400); - await interface.dragTo(500, 500, 'left', 1000); // Drag with left button for 1 second - - // Keyboard operations - await interface.typeText('Hello from TypeScript!'); - await interface.pressKey('enter'); - await interface.hotkey('command', 'a'); // Select all - - // Clipboard operations - await interface.setClipboard('Clipboard content'); - const content = await interface.copyToClipboard(); - - // File operations - await interface.writeText('/tmp/test.txt', 'Hello world'); - const fileContent = await interface.readText('/tmp/test.txt'); - - // Run a command in the VM - const [stdout, stderr] = await interface.runCommand('ls -la'); - - // Disconnect from the cloud VM - await computer.disconnect(); - }; - - main().catch(console.error); - ``` - - - - -## Getting Started - - - - Refer to this notebook for a step-by-step guide on how to use the Computer-Use Interface (CUI): - - - [Computer-Use Interface (CUI)](https://github.com/trycua/cua/tree/main/notebooks/samples/computer_nb.ipynb) - - - - Check out the examples in the repository for more detailed usage patterns and advanced scenarios. - - - -## Using the Gradio Computer UI (Python Only) - -The computer module includes a Gradio UI for creating and sharing demonstration data. We make it easy for people to build community datasets for better computer use models with an upload to Huggingface feature. 
- -```bash -# Install with UI support -pip install "cua/computer[ui]" -``` - -> **Note:** For precise control of the computer, we recommend using VNC or Screen Sharing instead of the Computer Gradio UI. - -### Building and Sharing Demonstrations with Huggingface - -Follow these steps to contribute your own demonstrations: - -#### 1. Set up Huggingface Access - -Set your HF_TOKEN in a .env file or in your environment variables: - -```bash -# In .env file -HF_TOKEN=your_huggingface_token -``` - -#### 2. Launch the Computer UI - -```python -# launch_ui.py -from computer.ui.gradio.app import create_gradio_ui -from dotenv import load_dotenv -load_dotenv('.env') - -app = create_gradio_ui() -app.launch(share=False) -``` - -For examples, see [Computer UI Examples](https://github.com/trycua/cua/tree/main/examples/computer_ui_examples.py) - -#### 3. Record Your Tasks - -
- View demonstration video - -
- -Record yourself performing various computer tasks using the UI. - -#### 4. Save Your Demonstrations - -
- View demonstration video - -
- -Save each task by picking a descriptive name and adding relevant tags (e.g., "office", "web-browsing", "coding"). - -#### 5. Record Additional Demonstrations - -Repeat steps 3 and 4 until you have a good amount of demonstrations covering different tasks and scenarios. - -#### 6. Upload to Huggingface - -
- View demonstration video - -
- -Upload your dataset to Huggingface by: - -- Naming it as `{your_username}/{dataset_name}` -- Choosing public or private visibility -- Optionally selecting specific tags to upload only tasks with certain tags - -#### Examples and Resources - -- Example Dataset: [ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset) -- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua) diff --git a/docs/content/docs/home/libraries/computer/computer-use-gradio-ui.mdx b/docs/content/docs/home/libraries/computer/computer-use-gradio-ui.mdx new file mode 100644 index 00000000..cff1f9fc --- /dev/null +++ b/docs/content/docs/home/libraries/computer/computer-use-gradio-ui.mdx @@ -0,0 +1,90 @@ +--- +title: Gradio UI with the Python Computer Interface +description: The computer module includes a Gradio UI for creating and sharing demonstration data. This guide makes it easy for people to build community datasets for better computer use models with an upload to Huggingface feature. +--- + + + For precise control of the computer, we recommend using VNC or Screen Sharing + instead of Gradio UI. + + +```bash +# Install with UI support +pip install "cua/computer[ui]" +``` + +## Building and Sharing Demonstrations with Huggingface + +Follow these steps to contribute your own demonstrations: + +### 1. Set up Huggingface Access + +Set your HF_TOKEN in a .env file or in your environment variables: + +```bash +# In .env file +HF_TOKEN=your_huggingface_token +``` + +### 2. Launch the Computer UI + +```python +# launch_ui.py +from computer.ui.gradio.app import create_gradio_ui +from dotenv import load_dotenv +load_dotenv('.env') + +app = create_gradio_ui() +app.launch(share=False) +``` + +For examples, see [Computer UI Examples](https://github.com/trycua/cua/tree/main/examples/computer_ui_examples.py) + +### 3. Record Your Tasks + +
+ View demonstration video + +
+ +Record yourself performing various computer tasks using the UI. + +### 4. Save Your Demonstrations + +
+ View demonstration video + +
+ +Save each task by picking a descriptive name and adding relevant tags (e.g., "office", "web-browsing", "coding"). + +### 5. Record Additional Demonstrations + +Repeat steps 3 and 4 until you have a good amount of demonstrations covering different tasks and scenarios. + +### 6. Upload to Huggingface + +
+ View demonstration video + +
+ +Upload your dataset to Huggingface by: + +- Naming it as `{your_username}/{dataset_name}` +- Choosing public or private visibility +- Optionally selecting specific tags to upload only tasks with certain tags + +### Examples and Resources + +- Example Dataset: [ddupont/test-dataset](https://huggingface.co/datasets/ddupont/test-dataset) +- Find Community Datasets: 🔍 [Browse CUA Datasets on Huggingface](https://huggingface.co/datasets?other=cua) diff --git a/docs/content/docs/home/libraries/computer/index.mdx b/docs/content/docs/home/libraries/computer/index.mdx new file mode 100644 index 00000000..a3603817 --- /dev/null +++ b/docs/content/docs/home/libraries/computer/index.mdx @@ -0,0 +1,177 @@ +--- +title: Computer +description: The Computer-Use Interface (CUI) framework for interacting with local macOS, Linux, and Windows sandboxes. +macos: true +windows: true +linux: true +pypi: cua-computer +npm: '@trycua/computer' +--- + +import { Tabs, Tab } from 'fumadocs-ui/components/tabs'; +import { buttonVariants } from 'fumadocs-ui/components/ui/button'; +import { cn } from 'fumadocs-ui/utils/cn'; +import { ChevronRight } from 'lucide-react'; + +Computer powers Cua systems and is PyAutoGUI-compatible and pluggable with any AI agent system (Cua, Langchain, CrewAI, AutoGen). It relies on [Lume](./lume.mdx) for creating and managing sandbox environments. + +## Installation + + + + ```bash + pip install "cua-computer[all]" + ``` + The `cua-computer` PyPi package automatically pulls the latest executable version of Lume through [pylume](https://github.com/trycua/pylume). + + + + ```bash + npm install @trycua/computer + ``` + + + +## Key Features + +- Create and manage virtual machine sandboxes +- Take screenshots of the virtual machine +- Control mouse movements and clicks +- Simulate keyboard input +- Manage clipboard content +- Interact with the operating system interface +- Support for macOS and Linux environments + +## Simple Example + + + + ```python + from computer import Computer + + computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4") + try: + # Start a new local vm instance using Lume + await computer.run() + + # Interface with the instance + screenshot = await computer.interface.screenshot() + with open("screenshot.png", "wb") as f: + f.write(screenshot) + + await computer.interface.move_cursor(100, 100) + await computer.interface.left_click() + await computer.interface.right_click(300, 300) + await computer.interface.double_click(400, 400) + + await computer.interface.type("Hello, World!") + await computer.interface.press_key("enter") + + await computer.interface.set_clipboard("Test clipboard") + content = await computer.interface.copy_to_clipboard() + print(f"Clipboard content: {content}") + finally: + # Stop the vm instance + await computer.stop() + ``` + + + + ```typescript + import { Computer, OSType } from '@trycua/computer'; + + // This creates and interfaces with a cloud-based c/ua container. 
+ const main = async () => { + // Create a cloud-based computer + const computer = new Computer({ + name: 'cloud-vm', + osType: OSType.Linux, + apiKey: 'your-api-key', + }); + + // Access the interface + const interface = computer.interface; + + // Screenshot operations + const screenshot = await interface.screenshot(); + + // Mouse operations + await interface.moveCursor(100, 100); + await interface.leftClick(); + await interface.rightClick(300, 300); + await interface.doubleClick(400, 400); + await interface.dragTo(500, 500, 'left', 1000); // Drag with left button for 1 second + + // Keyboard operations + await interface.typeText('Hello from TypeScript!'); + await interface.pressKey('enter'); + await interface.hotkey('command', 'a'); // Select all + + // Clipboard operations + await interface.setClipboard('Clipboard content'); + const content = await interface.copyToClipboard(); + + // File operations + await interface.writeText('/tmp/test.txt', 'Hello world'); + const fileContent = await interface.readText('/tmp/test.txt'); + + // Run a command in the VM + const [stdout, stderr] = await interface.runCommand('ls -la'); + + // Disconnect from the cloud VM + await computer.disconnect(); + }; + + main().catch(console.error); + ``` + + + + +## Examples & Guides + + + + + + Step-by-step guide on using the Computer-Use Interface (CUI) + + + + Use the Computer library with a Python Gradio UI + + + + + + + + Use C/ua Cloud Containers with OpenAI's API to execute tasks in a sandbox + + + + + +--- + + + **Need detailed API documentation?** Explore the complete API reference with + detailed class documentation, and method signatures. + + View API Reference + + + diff --git a/docs/content/docs/home/meta.json b/docs/content/docs/home/meta.json index 1ec6bb7c..21ab7f82 100644 --- a/docs/content/docs/home/meta.json +++ b/docs/content/docs/home/meta.json @@ -9,7 +9,12 @@ "faq", "telemetry", "---[BookCopy]Guides---", - "...guides", + "guides/cua-usage-guide", + "guides/developer-guide", + "guides/dev-container-setup", + "guides/computer-use-agent-quickstart", + "guides/agent-gradio-ui", + "guides/computer-use-gradio-ui", "---[Library]Libraries---", "...libraries" ] diff --git a/docs/package.json b/docs/package.json index 765b79f3..859c29d4 100644 --- a/docs/package.json +++ b/docs/package.json @@ -15,7 +15,9 @@ "lucide-react": "^0.525.0", "next": "15.3.3", "react": "^19.1.0", - "react-dom": "^19.1.0" + "react-dom": "^19.1.0", + "tailwind-merge": "^3.3.1", + "zod": "^3.25.76" }, "devDependencies": { "@biomejs/biome": "1.9.4", diff --git a/docs/pnpm-lock.yaml b/docs/pnpm-lock.yaml index 1da75851..b41d3be9 100644 --- a/docs/pnpm-lock.yaml +++ b/docs/pnpm-lock.yaml @@ -29,6 +29,12 @@ importers: react-dom: specifier: ^19.1.0 version: 19.1.0(react@19.1.0) + tailwind-merge: + specifier: ^3.3.1 + version: 3.3.1 + zod: + specifier: ^3.25.76 + version: 3.25.76 devDependencies: '@biomejs/biome': specifier: 1.9.4 @@ -1894,8 +1900,8 @@ packages: resolution: {integrity: sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==} engines: {node: '>=18'} - zod@3.25.67: - resolution: {integrity: sha512-idA2YXwpCdqUSKRCACDE6ItZD9TZzy3OZMtpfLoh6oPR47lipysRrJfjzMqFxQ3uJuUPyUeWe1r9vLH33xO/Qw==} + zod@3.25.76: + resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==} zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} @@ -2959,7 +2965,7 @@ 
snapshots: tinyexec: 1.0.1 tinyglobby: 0.2.14 unist-util-visit: 5.0.0 - zod: 3.25.67 + zod: 3.25.76 transitivePeerDependencies: - acorn - supports-color @@ -4010,6 +4016,6 @@ snapshots: yallist@5.0.0: {} - zod@3.25.67: {} + zod@3.25.76: {} zwitch@2.0.4: {} diff --git a/docs/source.config.ts b/docs/source.config.ts index 866a32c7..142825fa 100644 --- a/docs/source.config.ts +++ b/docs/source.config.ts @@ -4,12 +4,19 @@ import { frontmatterSchema, metaSchema, } from 'fumadocs-mdx/config'; +import { z } from 'zod'; // You can customise Zod schemas for frontmatter and `meta.json` here // see https://fumadocs.vercel.app/docs/mdx/collections#define-docs export const docs = defineDocs({ docs: { - schema: frontmatterSchema, + schema: frontmatterSchema.extend({ + pypi: z.string().optional(), + npm: z.string().optional(), + macos: z.boolean().default(false), + windows: z.boolean().default(false), + linux: z.boolean().default(false), + }), }, meta: { schema: metaSchema, diff --git a/docs/src/app/(home)/[[...slug]]/page.tsx b/docs/src/app/(home)/[[...slug]]/page.tsx index f5d1cfa1..424ad197 100644 --- a/docs/src/app/(home)/[[...slug]]/page.tsx +++ b/docs/src/app/(home)/[[...slug]]/page.tsx @@ -16,7 +16,7 @@ import { import { cn } from 'fumadocs-ui/utils/cn'; import { ChevronDown } from 'lucide-react'; import Link from 'next/link'; -import { notFound } from 'next/navigation'; +import { notFound, redirect } from 'next/navigation'; export default async function Page(props: { params: Promise<{ slug?: string[] }>; @@ -24,9 +24,9 @@ export default async function Page(props: { const params = await props.params; const slug = params.slug || []; const page = source.getPage(slug); - if (!page) notFound(); + if (!page) redirect('/home'); - // Detect if this is an API reference page: /docs/api/[section] or /docs/api/[section]/[version] + // Detect if this is an API reference page: /api/[section] or /api/[section]/[version] let apiSection: string | null = null; let apiVersionSlug: string[] = []; if (slug[0] === 'api' && slug.length >= 2) { @@ -41,72 +41,161 @@ export default async function Page(props: { versionItems = await getApiVersions(apiSection); } + const macos = page.data.macos; + const windows = page.data.windows; + const linux = page.data.linux; + const pypi = page.data.pypi; + const npm = page.data.npm; + const MDXContent = page.data.body; + // Platform icons component + const PlatformIcons = () => { + const hasAnyPlatform = macos || windows || linux; + if (!hasAnyPlatform && !pypi) return null; + + return ( +
+ {hasAnyPlatform && ( +
+ {windows && ( + + Windows + + + )} + {macos && ( + + macOS + + + )} + {linux && ( + + Linux + + + )} +
+ )} + {pypi && ( + + PyPI + + )} + {npm && ( + + NPM + + )} + {slug.includes('libraries') && ( + + Reference + + )} +
+ ); + }; + return ( -
- {page.data.title} -
- {apiSection && versionItems.length > 1 && ( - - - {(() => { - // Find the current version label - let currentLabel = 'Current'; - if (apiVersionSlug.length > 0) { - const found = versionItems.find( - (item) => - item.label !== 'Current' && - apiVersionSlug[0] === item.label - ); - if (found) currentLabel = found.label; - } - return ( - <> - API Version: {currentLabel} - - - ); - })()} - - - {versionItems.map((item) => { - // Build the href for each version - const href = - item.label === 'Current' - ? `/api/${apiSection}` - : `/api/${apiSection}/${item.label}`; - // Highlight current version - const isCurrent = - (item.label === 'Current' && apiVersionSlug.length === 0) || - (item.label !== 'Current' && - apiVersionSlug[0] === item.label); - return ( - - API version: {item.label} - - ); - })} - - - )} +
+
+
+ {page.data.title} +
+ {apiSection && versionItems.length > 1 && ( + + + {(() => { + // Find the current version label + let currentLabel = 'Current'; + if (apiVersionSlug.length > 0) { + const found = versionItems.find( + (item) => + item.label !== 'Current' && + apiVersionSlug[0] === item.label + ); + if (found) currentLabel = found.label; + } + return ( + <> + API Version: {currentLabel} + + + ); + })()} + + + {versionItems.map((item) => { + // Build the href for each version + const href = + item.label === 'Current' + ? `/api/${apiSection}` + : `/api/${apiSection}/${item.label}`; + // Highlight current version + const isCurrent = + (item.label === 'Current' && + apiVersionSlug.length === 0) || + (item.label !== 'Current' && + apiVersionSlug[0] === item.label); + return ( + + API version: {item.label} + + ); + })} + + + )} +
+
+ + {page.data.description} +
+
- {page.data.description}