Format codebase with `uv run pre-commit run --all-files`

James Murdza
2025-10-22 11:11:02 -07:00
parent 759ff4703e
commit ddc5a5de91
234 changed files with 10127 additions and 8467 deletions
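
For reference, a formatting pass like this one can be reproduced locally as follows (a minimal sketch, assuming `pre-commit` is declared as a dev dependency so that `uv run` can resolve it):

```bash
# Run every configured hook (isort, black, ruff, and the rest) across the whole repo,
# mirroring the command in the commit title
uv run pre-commit run --all-files

# Optionally install the git hook so the same checks run automatically on every commit
uv run pre-commit install
```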

View File

@@ -11,6 +11,7 @@ Exit codes:
"""
import sys
try:
import tomllib
except ImportError:
@@ -20,7 +21,10 @@ except ImportError:
def main():
if len(sys.argv) != 3:
print("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", file=sys.stderr)
print(
"Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
file=sys.stderr,
)
sys.exit(1)
pyproject_path = sys.argv[1]
@@ -28,7 +32,7 @@ def main():
# tomllib requires binary mode
try:
with open(pyproject_path, 'rb') as f:
with open(pyproject_path, "rb") as f:
data = tomllib.load(f)
except FileNotFoundError:
print(f"❌ ERROR: File not found: {pyproject_path}", file=sys.stderr)
@@ -37,6 +41,7 @@ def main():
# Fallback to toml if using the old library or handle other errors
try:
import toml
data = toml.load(pyproject_path)
except FileNotFoundError:
print(f"❌ ERROR: File not found: {pyproject_path}", file=sys.stderr)
@@ -45,7 +50,7 @@ def main():
print(f"❌ ERROR: Failed to parse TOML file: {e}", file=sys.stderr)
sys.exit(1)
actual_version = data.get('project', {}).get('version')
actual_version = data.get("project", {}).get("version")
if not actual_version:
print("❌ ERROR: No version found in pyproject.toml", file=sys.stderr)
@@ -56,13 +61,18 @@ def main():
print(f" pyproject.toml version: {actual_version}", file=sys.stderr)
print(f" Expected version: {expected_version}", file=sys.stderr)
print("", file=sys.stderr)
print("The version in pyproject.toml must match the version being published.", file=sys.stderr)
print(f"Please update pyproject.toml to version {expected_version} or use the correct tag.", file=sys.stderr)
print(
"The version in pyproject.toml must match the version being published.", file=sys.stderr
)
print(
f"Please update pyproject.toml to version {expected_version} or use the correct tag.",
file=sys.stderr,
)
sys.exit(1)
print(f"✅ Version consistency check passed: {actual_version}")
sys.exit(0)
if __name__ == '__main__':
if __name__ == "__main__":
main()
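
As the usage message above shows, the script takes a path to `pyproject.toml` and an expected version, prints a check result, and exits non-zero on any mismatch or parse error. A minimal usage sketch (the `.github/scripts/` location is inferred from the test suite below, not shown in this hunk):

```bash
# Exits 0 and prints a success message when [project].version matches the expected version
python .github/scripts/get_pyproject_version.py pyproject.toml "1.2.3"

# Exits 1 and prints a mismatch report to stderr when the versions differ
python .github/scripts/get_pyproject_version.py pyproject.toml "9.9.9"
```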

View File

@@ -7,6 +7,7 @@ This directory contains comprehensive tests for the GitHub workflow scripts usin
**No external dependencies required!**
This test suite uses:
- `unittest` - Python's built-in testing framework
- `tomllib` - Python 3.11+ built-in TOML parser
@@ -15,27 +16,32 @@ For Python < 3.11, the `toml` package is used as a fallback.
## Running Tests
### Run all tests
```bash
cd .github/scripts/tests
python3 -m unittest discover -v
```
### Run a specific test file
```bash
python3 -m unittest test_get_pyproject_version -v
```
### Run a specific test class
```bash
python3 -m unittest test_get_pyproject_version.TestGetPyprojectVersion -v
```
### Run a specific test method
```bash
python3 -m unittest test_get_pyproject_version.TestGetPyprojectVersion.test_matching_versions -v
```
### Run tests directly from the test file
```bash
python3 test_get_pyproject_version.py
```

View File

@@ -10,10 +10,10 @@ This test suite covers:
"""
import sys
import unittest
import tempfile
from pathlib import Path
import unittest
from io import StringIO
from pathlib import Path
from unittest.mock import patch
# Add parent directory to path to import the module
@@ -36,46 +36,54 @@ class TestGetPyprojectVersion(unittest.TestCase):
def create_pyproject_toml(self, version: str) -> Path:
"""Helper to create a temporary pyproject.toml file with a given version."""
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
temp_file.write(f"""
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
temp_file.write(
f"""
[project]
name = "test-project"
version = "{version}"
description = "A test project"
""")
"""
)
temp_file.close()
return Path(temp_file.name)
def create_pyproject_toml_no_version(self) -> Path:
"""Helper to create a pyproject.toml without a version field."""
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
temp_file.write("""
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
temp_file.write(
"""
[project]
name = "test-project"
description = "A test project without version"
""")
"""
)
temp_file.close()
return Path(temp_file.name)
def create_pyproject_toml_no_project(self) -> Path:
"""Helper to create a pyproject.toml without a project section."""
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
temp_file.write("""
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
temp_file.write(
"""
[tool.poetry]
name = "test-project"
version = "1.0.0"
""")
"""
)
temp_file.close()
return Path(temp_file.name)
def create_malformed_toml(self) -> Path:
"""Helper to create a malformed TOML file."""
temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
temp_file.write("""
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
temp_file.write(
"""
[project
name = "test-project
version = "1.0.0"
""")
"""
)
temp_file.close()
return Path(temp_file.name)
@@ -85,11 +93,11 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml("1.2.3")
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3"]
# Capture stdout
captured_output = StringIO()
with patch('sys.stdout', captured_output):
with patch("sys.stdout", captured_output):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -104,11 +112,11 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml("1.2.3")
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.4']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.4"]
# Capture stderr
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -127,10 +135,10 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml_no_version()
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -145,10 +153,10 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml_no_project()
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -160,7 +168,7 @@ version = "1.0.0"
# Test: File not found
def test_file_not_found(self):
"""Test handling of non-existent pyproject.toml file."""
sys.argv = ['get_pyproject_version.py', '/nonexistent/pyproject.toml', '1.0.0']
sys.argv = ["get_pyproject_version.py", "/nonexistent/pyproject.toml", "1.0.0"]
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -173,7 +181,7 @@ version = "1.0.0"
pyproject_file = self.create_malformed_toml()
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -185,44 +193,50 @@ version = "1.0.0"
# Test: Incorrect number of arguments - too few
def test_too_few_arguments(self):
"""Test that providing too few arguments results in usage error."""
sys.argv = ['get_pyproject_version.py', 'pyproject.toml']
sys.argv = ["get_pyproject_version.py", "pyproject.toml"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 1)
self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue())
self.assertIn(
"Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue(),
)
# Test: Incorrect number of arguments - too many
def test_too_many_arguments(self):
"""Test that providing too many arguments results in usage error."""
sys.argv = ['get_pyproject_version.py', 'pyproject.toml', '1.0.0', 'extra']
sys.argv = ["get_pyproject_version.py", "pyproject.toml", "1.0.0", "extra"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 1)
self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue())
self.assertIn(
"Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue(),
)
# Test: No arguments
def test_no_arguments(self):
"""Test that providing no arguments results in usage error."""
sys.argv = ['get_pyproject_version.py']
sys.argv = ["get_pyproject_version.py"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 1)
self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue())
self.assertIn(
"Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
captured_error.getvalue(),
)
# Test: Version with pre-release tags
def test_version_with_prerelease_tags(self):
@@ -230,15 +244,17 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml("1.2.3-rc.1")
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3-rc.1']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3-rc.1"]
captured_output = StringIO()
with patch('sys.stdout', captured_output):
with patch("sys.stdout", captured_output):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 0)
self.assertIn("✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue())
self.assertIn(
"✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue()
)
finally:
pyproject_file.unlink()
@@ -248,15 +264,17 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml("1.2.3+build.123")
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3+build.123']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3+build.123"]
captured_output = StringIO()
with patch('sys.stdout', captured_output):
with patch("sys.stdout", captured_output):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 0)
self.assertIn("✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue())
self.assertIn(
"✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue()
)
finally:
pyproject_file.unlink()
@@ -290,15 +308,17 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml(version)
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), version]
sys.argv = ["get_pyproject_version.py", str(pyproject_file), version]
captured_output = StringIO()
with patch('sys.stdout', captured_output):
with patch("sys.stdout", captured_output):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
self.assertEqual(cm.exception.code, 0)
self.assertIn(f"✅ Version consistency check passed: {version}", captured_output.getvalue())
self.assertIn(
f"✅ Version consistency check passed: {version}", captured_output.getvalue()
)
finally:
pyproject_file.unlink()
@@ -308,10 +328,10 @@ version = "1.0.0"
pyproject_file = self.create_pyproject_toml("")
try:
sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
captured_error = StringIO()
with patch('sys.stderr', captured_error):
with patch("sys.stderr", captured_error):
with self.assertRaises(SystemExit) as cm:
get_pyproject_version.main()
@@ -327,14 +347,14 @@ class TestSuiteInfo(unittest.TestCase):
def test_suite_info(self):
"""Display test suite information."""
print("\n" + "="*70)
print("\n" + "=" * 70)
print("Test Suite: get_pyproject_version.py")
print("Framework: unittest (Python built-in)")
print("TOML Library: tomllib (Python 3.11+ built-in)")
print("="*70)
print("=" * 70)
self.assertTrue(True)
if __name__ == '__main__':
if __name__ == "__main__":
# Run tests with verbose output
unittest.main(verbosity=2)

View File

@@ -3,14 +3,14 @@ name: Test valididation script
on:
pull_request:
paths:
- '.github/scripts/**'
- '.github/workflows/test-scripts.yml'
- ".github/scripts/**"
- ".github/workflows/test-scripts.yml"
push:
branches:
- main
paths:
- '.github/scripts/**'
- '.github/workflows/test-scripts.yml'
- ".github/scripts/**"
- ".github/workflows/test-scripts.yml"
jobs:
test:
@@ -23,7 +23,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: "3.11"
- name: Install dependencies
run: |
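
The hunks above end mid-step; a hedged sketch of how the remaining job steps plausibly fit together (the checkout action and the contents of the `Install dependencies` and test steps are assumptions, not shown in this diff):

```yaml
jobs:
  test:
    runs-on: ubuntu-latest # assumed; the runner is not shown in the hunk
    steps:
      - uses: actions/checkout@v4 # assumed
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip # assumed; exact dependencies not shown
      - name: Run tests
        run: |
          cd .github/scripts/tests
          python3 -m unittest discover -v # command taken from the tests README above
```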

View File

@@ -20,7 +20,7 @@ repos:
hooks:
- id: isort
name: isort code formatter
args: ['--profile', 'black']
args: ["--profile", "black"]
files: \.(py)$
- repo: https://github.com/psf/black
@@ -35,7 +35,7 @@ repos:
hooks:
- id: ruff
name: ruff linter
args: ['--fix']
args: ["--fix"]
files: \.(py)$
# Temporarily disabled due to untyped codebase
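
For reference, a minimal sketch of how the hooks touched above typically sit together in `.pre-commit-config.yaml` (the isort and ruff repository URLs and all `rev` values are assumptions; only the psf/black repo URL appears in this diff):

```yaml
repos:
  - repo: https://github.com/pycqa/isort # assumed URL
    rev: 5.13.2 # assumed rev
    hooks:
      - id: isort
        name: isort code formatter
        args: ["--profile", "black"]
        files: \.(py)$
  - repo: https://github.com/psf/black
    rev: 24.8.0 # assumed rev
    hooks:
      - id: black
  - repo: https://github.com/astral-sh/ruff-pre-commit # assumed URL
    rev: v0.6.9 # assumed rev
    hooks:
      - id: ruff
        name: ruff linter
        args: ["--fix"]
        files: \.(py)$
```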

View File

@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SOFTWARE.

View File

@@ -5,18 +5,21 @@
<img alt="Cua logo" height="150" src="img/logo_black.png">
</picture>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![Swift](https://img.shields.io/badge/Swift-F05138?logo=swift&logoColor=white)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
<br>
<a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![Swift](https://img.shields.io/badge/Swift-F05138?logo=swift&logoColor=white)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
<br>
<a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
> **The Computer-Use Agents SOTA Challenge** we hosted at [Hack the North](https://hackthenorth.com) and online has concluded!
>> **Track A (On-site @ UWaterloo)**: 🏆 ~~Prize: **YC interview guaranteed**.~~ **Concluded**
>> **Track B (Remote)**: 🏆 ~~Prize: **Cash award**.~~ **Concluded - Winners will be announced soon**
>>> ~~👉 Sign up here: [trycua.com/hackathon](https://www.trycua.com/hackathon)~~
> **The Computer-Use Agents SOTA Challenge** we hosted at [Hack the North](https://hackthenorth.com) and online has concluded!
>
> > **Track A (On-site @ UWaterloo)**: 🏆 ~~Prize: **YC interview guaranteed**.~~ **Concluded**
> > **Track B (Remote)**: 🏆 ~~Prize: **Cash award**.~~ **Concluded - Winners will be announced soon**
> >
> > > ~~👉 Sign up here: [trycua.com/hackathon](https://www.trycua.com/hackathon)~~
**Cua** ("koo-ah") is Docker for [Computer-Use Agents](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse) - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud.
@@ -25,10 +28,12 @@
</div>
With the Computer SDK, you can:
- automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://docs.trycua.com/docs/libraries/computer#interface-actions)
- create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.trycua.com/)
With the Agent SDK, you can:
- run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format)
- benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
- combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
@@ -38,16 +43,16 @@ With the Agent SDK, you can:
### CUA Model Zoo 🐨
| [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) |
|---|---|---|
| `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-haiku-4-5-20251001` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA |
| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix ) |
| `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | |
| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | |
| `moondream3+{ui planning}` (supports text-only models) | |
| `omniparser+{ui planning}` | | |
| `{ui grounding}+{ui planning}` | | |
| ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
| `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-haiku-4-5-20251001` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA |
| `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
| `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix ) |
| `gemini-2.5-computer-use-preview-10-2025` | any-all-in-one CUA | |
| `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
| `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | |
| `moondream3+{ui planning}` (supports text-only models) | |
| `omniparser+{ui planning}` | | |
| `{ui grounding}+{ui planning}` | | |
- `human/human` → [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)
@@ -55,7 +60,7 @@ Missing a model? [Raise a feature request](https://github.com/trycua/cua/issues/
<br/>
# Quick Start
# Quick Start
- [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template) (⭐️ Recommended!)
- [Get started with the Computer-Use Agent CLI](https://docs.trycua.com/docs/quickstart-cli)
@@ -68,6 +73,7 @@ Missing a model? [Raise a feature request](https://github.com/trycua/cua/issues/
```bash
pip install cua-agent[all]
```
```python
from agent import ComputerAgent
@@ -86,8 +92,9 @@ async for result in agent.run(messages):
```
### Output format (OpenAI Agent Responses Format):
```json
{
{
"output": [
# user input
{
@@ -133,7 +140,7 @@ async for result in agent.run(messages):
}
]
}
],
],
"usage": {
"prompt_tokens": 150,
"completion_tokens": 75,
@@ -148,6 +155,7 @@ async for result in agent.run(messages):
```bash
pip install cua-computer[all]
```
```python
from computer import Computer
@@ -174,18 +182,18 @@ async with Computer(
## Modules
| Module | Description | Installation |
|--------|-------------|---------------|
| [**Lume**](./libs/lume/README.md) | VM management for macOS/Linux using Apple's Virtualization.Framework | `curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh \| bash` |
| [**Lumier**](./libs/lumier/README.md) | Docker interface for macOS and Linux VMs | `docker pull trycua/lumier:latest` |
| [**Computer (Python)**](./libs/python/computer/README.md) | Python Interface for controlling virtual machines | `pip install "cua-computer[all]"` |
| [**Computer (Typescript)**](./libs/typescript/computer/README.md) | Typescript Interface for controlling virtual machines | `npm install @trycua/computer` |
| [**Agent**](./libs/python/agent/README.md) | AI agent framework for automating tasks | `pip install "cua-agent[all]"` |
| [**MCP Server**](./libs/python/mcp-server/README.md) | MCP server for using CUA with Claude Desktop | `pip install cua-mcp-server` |
| [**SOM**](./libs/python/som/README.md) | Self-of-Mark library for Agent | `pip install cua-som` |
| [**Computer Server**](./libs/python/computer-server/README.md) | Server component for Computer | `pip install cua-computer-server` |
| [**Core (Python)**](./libs/python/core/README.md) | Python Core utilities | `pip install cua-core` |
| [**Core (Typescript)**](./libs/typescript/core/README.md) | Typescript Core utilities | `npm install @trycua/core` |
| Module | Description | Installation |
| ----------------------------------------------------------------- | -------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------- |
| [**Lume**](./libs/lume/README.md) | VM management for macOS/Linux using Apple's Virtualization.Framework | `curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh \| bash` |
| [**Lumier**](./libs/lumier/README.md) | Docker interface for macOS and Linux VMs | `docker pull trycua/lumier:latest` |
| [**Computer (Python)**](./libs/python/computer/README.md) | Python Interface for controlling virtual machines | `pip install "cua-computer[all]"` |
| [**Computer (Typescript)**](./libs/typescript/computer/README.md) | Typescript Interface for controlling virtual machines | `npm install @trycua/computer` |
| [**Agent**](./libs/python/agent/README.md) | AI agent framework for automating tasks | `pip install "cua-agent[all]"` |
| [**MCP Server**](./libs/python/mcp-server/README.md) | MCP server for using CUA with Claude Desktop | `pip install cua-mcp-server` |
| [**SOM**](./libs/python/som/README.md) | Self-of-Mark library for Agent | `pip install cua-som` |
| [**Computer Server**](./libs/python/computer-server/README.md) | Server component for Computer | `pip install cua-computer-server` |
| [**Core (Python)**](./libs/python/core/README.md) | Python Core utilities | `pip install cua-core` |
| [**Core (Typescript)**](./libs/typescript/core/README.md) | Typescript Core utilities | `npm install @trycua/core` |
## Community
@@ -193,7 +201,7 @@ Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss i
## License
Cua is open-sourced under the MIT License - see the [LICENSE](LICENSE.md) file for details.
Cua is open-sourced under the MIT License - see the [LICENSE](LICENSE.md) file for details.
Portions of this project, specifically components adapted from Kasm Technologies Inc., are also licensed under the MIT License. See [libs/kasm/LICENSE](libs/kasm/LICENSE) for details.
@@ -211,16 +219,16 @@ When you choose to install and use such optional extras, your use, modification,
Cua uses `bump2version` to manage package versions across all Python modules. A Makefile is provided to simplify the release process.
### Prerequisites
### Prerequisites
#### install `bump2version`
using brew
```
brew install bumpversion
```
### View Current Versions
```bash
@@ -282,7 +290,7 @@ We welcome contributions to Cua! Please refer to our [Contributing Guidelines](C
Apple, macOS, and Apple Silicon are trademarks of Apple Inc.
Ubuntu and Canonical are registered trademarks of Canonical Ltd.
Microsoft is a registered trademark of Microsoft Corporation.
Microsoft is a registered trademark of Microsoft Corporation.
This project is not affiliated with, endorsed by, or sponsored by Apple Inc., Canonical Ltd., Microsoft Corporation, or Kasm Technologies.

View File

@@ -1,6 +1,6 @@
# App-Use: Control Individual Applications with Cua Agents
*Published on May 31, 2025 by The Cua Team*
_Published on May 31, 2025 by The Cua Team_
Today, we are excited to introduce a new experimental feature landing in the [Cua GitHub repository](https://github.com/trycua/cua): **App-Use**. App-Use lets you create lightweight virtual desktops that limit agent access to specific applications, improving the precision of your agent's trajectory. It is perfect for parallel workflows and focused task execution.
@@ -33,9 +33,11 @@ agent = ComputerAgent(
## Key Benefits
### 1. Lightweight and Fast
App-Use creates visual filters, not new processes. Your apps continue running normally - we just control what the agent can see and click on. The virtual desktops are composited views that require no additional compute resources beyond the existing window manager operations.
### 2. Run Multiple Agents in Parallel
Deploy a team of specialized agents, each focused on their own apps:
```python
@@ -46,7 +48,7 @@ computer = Computer(experiments=["app-use"])
research_desktop = computer.create_desktop_from_apps(["Safari"])
research_agent = ComputerAgent(tools=[research_desktop], ...)
# Writing agent focuses on documents
# Writing agent focuses on documents
writing_desktop = computer.create_desktop_from_apps(["Pages", "Notes"])
writing_agent = ComputerAgent(tools=[writing_desktop], ...)
@@ -66,6 +68,7 @@ await asyncio.gather(
### Requirements
To get started with App-Use, you'll need:
- Python 3.11+
- macOS Sequoia (15.0) or later
@@ -85,21 +88,21 @@ from agent import ComputerAgent
async def main():
computer = Computer()
await computer.run()
# Create app-specific desktop sessions
desktop = computer.create_desktop_from_apps(["Notes"])
# Initialize an agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
tools=[desktop]
)
# Take a screenshot (returns bytes by default)
screenshot = await desktop.interface.screenshot()
with open("app_screenshot.png", "wb") as f:
f.write(screenshot)
# Run an agent task
async for result in agent.run("Create a new note titled 'Meeting Notes' and add today's agenda items"):
print(f"Agent: {result.get('text', '')}")
@@ -113,6 +116,7 @@ if __name__ == "__main__":
### ⚠️ Important Warning
Computer-use agents are powerful tools that can interact with your devices. This guide involves using your own macOS and iPhone instead of a VM. **Proceed at your own risk.** Always:
- Review agent actions before running
- Start with non-critical tasks
- Monitor agent behavior closely
@@ -150,20 +154,20 @@ async def automate_iphone():
# Connect to your local computer server
my_mac = Computer(use_host_computer_server=True, os_type="macos", experiments=["app-use"])
await my_mac.run()
# Create a desktop focused on iPhone Mirroring
my_iphone = my_mac.create_desktop_from_apps(["iPhone Mirroring"])
# Initialize an agent for iPhone automation
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
tools=[my_iphone]
)
# Example: Send a message
async for result in agent.run("Open Messages and send 'Hello from Cua!' to John"):
print(f"Agent: {result.get('text', '')}")
# Example: Set a reminder
async for result in agent.run("Create a reminder to call mom at 5 PM today"):
print(f"Agent: {result.get('text', '')}")
@@ -175,6 +179,7 @@ if __name__ == "__main__":
### iPhone Automation Use Cases
With Cua's iPhone automation, you can:
- **Automate messaging**: Send texts, respond to messages, manage conversations
- **Control apps**: Navigate any iPhone app using natural language
- **Manage settings**: Adjust iPhone settings programmatically
@@ -191,6 +196,7 @@ With Cua's iPhone automation, you can:
## When to Use What: App-Use vs Multiple Cua Containers
### Use App-Use within the same macOS Cua Container:
- ✅ You need lightweight, fast agent focusing (macOS only)
- ✅ You want to run multiple agents on one desktop
- ✅ You're automating personal devices like iPhones
@@ -198,6 +204,7 @@ With Cua's iPhone automation, you can:
- ✅ You want low computational overhead
### Use Multiple Cua Containers:
- ✅ You need maximum isolation between agents
- ✅ You require cross-platform support (Mac/Linux/Windows)
- ✅ You need guaranteed resource allocation
@@ -215,6 +222,7 @@ With Cua's iPhone automation, you can:
### How It Works
When you create a desktop session with `create_desktop_from_apps()`, App Use:
- Filters the visual output to show only specified application windows
- Routes input events only to those applications
- Maintains window layout isolation between different sessions
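
Putting the pieces from this post together, a minimal sketch of the parallel pattern described above (the model choice and task strings are illustrative, not taken from the post):

```python
import asyncio

from agent import ComputerAgent
from computer import Computer


async def main():
    # One Cua computer with the experimental app-use feature enabled
    computer = Computer(experiments=["app-use"])
    await computer.run()

    # Each desktop is a composited view over a subset of apps (see "How It Works")
    research_desktop = computer.create_desktop_from_apps(["Safari"])
    writing_desktop = computer.create_desktop_from_apps(["Pages", "Notes"])

    research_agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022", tools=[research_desktop]
    )
    writing_agent = ComputerAgent(
        model="anthropic/claude-3-5-sonnet-20241022", tools=[writing_desktop]
    )

    async def run_task(agent, task):
        async for result in agent.run(task):
            print(f"Agent: {result.get('text', '')}")

    # Both agents share the same macOS desktop but only see their own apps
    await asyncio.gather(
        run_task(research_agent, "Find recent articles about computer-use agents"),
        run_task(writing_agent, "Draft a short summary in Notes"),
    )


if __name__ == "__main__":
    asyncio.run(main())
```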

View File

@@ -1,6 +1,6 @@
# Bringing Computer-Use to the Web
*Published on August 5, 2025 by Morgan Dean*
_Published on August 5, 2025 by Morgan Dean_
In one of our original posts, we explored building Computer-Use Operators on macOS - first with a [manual implementation](build-your-own-operator-on-macos-1.md) using OpenAI's `computer-use-preview` model, then with our [cua-agent framework](build-your-own-operator-on-macos-2.md) for Python developers. While these tutorials have been incredibly popular, we've received consistent feedback from our community: **"Can we use Cua with JavaScript and TypeScript?"**
@@ -96,7 +96,7 @@ const res = await openai.responses.create({
],
},
],
truncation: 'auto'
truncation: 'auto',
});
```
@@ -144,30 +144,30 @@ Each response contains:
### Provision a Cua Cloud Container
1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard).
2. Create an API key from the dashboard — be sure to save it in a secure location before continuing.
3. Start the cloud container from the dashboard.
1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard).
2. Create an API key from the dashboard — be sure to save it in a secure location before continuing.
3. Start the cloud container from the dashboard.
### Environment Setup
1. Install required packages with your preferred package manager:
1. Install required packages with your preferred package manager:
```bash
npm install --save @trycua/computer # or yarn, pnpm, bun
npm install --save openai # or yarn, pnpm, bun
```
```bash
npm install --save @trycua/computer # or yarn, pnpm, bun
npm install --save openai # or yarn, pnpm, bun
```
Works with any JavaScript/TypeScript project setup - whether you're using Create React App, Next.js, Vue, Angular, or plain JavaScript.
Works with any JavaScript/TypeScript project setup - whether you're using Create React App, Next.js, Vue, Angular, or plain JavaScript.
2. Save your OpenAI API key, Cua API key, and container name to a `.env` file:
2. Save your OpenAI API key, Cua API key, and container name to a `.env` file:
```bash
OPENAI_API_KEY=openai-api-key
CUA_API_KEY=cua-api-key
CUA_CONTAINER_NAME=cua-cloud-container-name
```
```bash
OPENAI_API_KEY=openai-api-key
CUA_API_KEY=cua-api-key
CUA_CONTAINER_NAME=cua-cloud-container-name
```
These environment variables work the same whether you're using vanilla JavaScript, TypeScript, or any web framework.
These environment variables work the same whether you're using vanilla JavaScript, TypeScript, or any web framework.
## Building the Agent

View File

@@ -1,6 +1,6 @@
# Build Your Own Operator on macOS - Part 1
*Published on March 31, 2025 by Francesco Bonacci*
_Published on March 31, 2025 by Francesco Bonacci_
In this first blogpost, we'll learn how to build our own Computer-Use Operator using OpenAI's `computer-use-preview` model. But first, let's understand what some common terms mean:
@@ -19,6 +19,7 @@ Check out what it looks like to use your own Operator from a Gradio app:
## What You'll Learn
By the end of this tutorial, you'll be able to:
- Set up a macOS virtual machine for AI automation
- Connect OpenAI's computer-use model to your VM
- Create a basic loop for the AI to interact with your VM
@@ -26,6 +27,7 @@ By the end of this tutorial, you'll be able to:
- Implement safety checks and error handling
**Prerequisites:**
- macOS Sonoma (14.0) or later
- 8GB RAM minimum (16GB recommended)
- OpenAI API access (Tier 3+)
@@ -41,15 +43,17 @@ Last March OpenAI released a fine-tuned version of GPT-4o, namely [CUA](https://
Professor Ethan Mollick provides an excellent explanation of computer-use agents in this article: [When you give a Claude a mouse](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse).
### ChatGPT Operator
OpenAI's computer-use model powers [ChatGPT Operator](https://openai.com/index/introducing-operator), a Chromium-based interface exclusively available to ChatGPT Pro subscribers. Users leverage this functionality to automate web-based tasks such as online shopping, expense report submission, and booking reservations by interacting with websites in a human-like manner.
## Benefits of Custom Operators
### Why Build Your Own?
While OpenAI's Operator uses a controlled Chromium VM instance, there are scenarios where you may want to use your own VM with full desktop capabilities. Here are some examples:
- Automating native macOS apps like Finder, Xcode
- Managing files, changing settings, and running terminal commands
- Managing files, changing settings, and running terminal commands
- Testing desktop software and applications
- Creating workflows that combine web and desktop tasks
- Automating media editing in apps like Final Cut Pro and Blender
@@ -59,7 +63,9 @@ This gives you more control and flexibility to automate tasks beyond just web br
## Access Requirements
### Model Availability
As we speak, the **computer-use-preview** model has limited availability:
- Only accessible to OpenAI tier 3+ users
- Additional application process may be required even for eligible users
- Cannot be used in the OpenAI Playground
@@ -68,15 +74,18 @@ As we speak, the **computer-use-preview** model has limited availability:
## Understanding the OpenAI API
### Responses API Overview
Let's start with the basics. In our case, we'll use OpenAI's Responses API to communicate with their computer-use model.
Think of it like this:
1. We send the model a screenshot of our VM and tell it what we want it to do
2. The model looks at the screenshot and decides what actions to take
3. It sends back instructions (like "click here" or "type this")
4. We execute those instructions in our VM
The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI's newest way to interact with their AI models. It comes with several built-in tools:
- **Web search**: Let the AI search the internet
- **File search**: Help the AI find documents
- **Computer use**: Allow the AI to control a computer (what we'll be using)
@@ -84,9 +93,11 @@ The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI
As we speak, the computer-use model is only available through the Responses API.
### Responses API Examples
Let's look at some simple examples. We'll start with the traditional way of using OpenAI's API with Chat Completions, then show the new Responses API primitive.
Chat Completions:
```python
# The old way required managing conversation history manually
messages = [{"role": "user", "content": "Hello"}]
@@ -98,13 +109,14 @@ messages.append(response.choices[0].message) # Manual message tracking
```
Responses API:
```python
# Example 1: Simple web search
# The API handles all the complexity for us
response = client.responses.create(
model="gpt-4",
input=[{
"role": "user",
"role": "user",
"content": "What's the latest news about AI?"
}],
tools=[{
@@ -118,7 +130,7 @@ response = client.responses.create(
response = client.responses.create(
model="gpt-4",
input=[{
"role": "user",
"role": "user",
"content": "Find documents about project X"
}],
tools=[{
@@ -130,6 +142,7 @@ response = client.responses.create(
```
### Computer-Use Model Setup
For our operator, we'll use the computer-use model. Here's how we set it up:
```python
@@ -144,7 +157,7 @@ response = client.responses.create(
}],
input=[
{
"role": "user",
"role": "user",
"content": [
# What we want the AI to do
{"type": "input_text", "text": "Open Safari and go to google.com"},
@@ -158,6 +171,7 @@ response = client.responses.create(
```
### Understanding the Response
When we send a request, the API sends back a response that looks like this:
```json
@@ -189,6 +203,7 @@ When we send a request, the API sends back a response that looks like this:
```
Each response contains:
1. **Reasoning**: The AI's explanation of what it's doing
2. **Action**: The specific computer action to perform
3. **Safety Checks**: Any potential risks to review
@@ -197,15 +212,18 @@ Each response contains:
## CUA-Computer Interface
### Architecture Overview
Let's break down the main components of our system and how they work together:
1. **The Virtual Machine (VM)**
- Think of this as a safe playground for our AI
- It's a complete macOS system running inside your computer
- Anything the AI does stays inside this VM, keeping your main system safe
- We use `lume` to create and manage this VM
2. **The Computer Interface (CUI)**
- This is how we control the VM
- It can move the mouse, type text, and take screenshots
- Works like a remote control for the VM
@@ -238,7 +256,7 @@ sequenceDiagram
VM-->>CUI: Return current screen
CUI->>AI: Send screenshot + instructions
AI-->>CUI: Return next action
Note over CUI,VM: Execute the action
alt Mouse Click
CUI->>VM: Move and click mouse
@@ -259,6 +277,7 @@ sequenceDiagram
```
The diagram above shows how information flows through our system:
1. You start the operator
2. The Computer Interface creates a virtual macOS
3. Then it enters a loop:
@@ -284,23 +303,26 @@ This design keeps everything organized and safe. The AI can only interact with t
```
**Important Storage Notes:**
- Initial download requires 80GB of free space
- After first run, space usage reduces to ~30GB due to macOS's sparse file system
- VMs are stored in `~/.lume`
- Cached images are stored in `~/.lume/cache`
You can check your downloaded VM images anytime:
```bash
lume ls
```
Example output:
| name | os | cpu | memory | disk | display | status | ip | vnc |
|--------------------------|---------|-------|---------|----------------|-----------|-----------|----------------|---------------------------------------------------|
| macos-sequoia-cua:latest | macOS | 12 | 16.00G | 64.5GB/80.0GB | 1024x768 | running | 192.168.64.78 | vnc://:kind-forest-zulu-island@127.0.0.1:56085 |
| name | os | cpu | memory | disk | display | status | ip | vnc |
| ------------------------ | ----- | --- | ------ | ------------- | -------- | ------- | ------------- | ---------------------------------------------- |
| macos-sequoia-cua:latest | macOS | 12 | 16.00G | 64.5GB/80.0GB | 1024x768 | running | 192.168.64.78 | vnc://:kind-forest-zulu-island@127.0.0.1:56085 |
After checking your available images, you can run the VM to ensure everything is working correctly:
```bash
lume run macos-sequoia-cua:latest
```
@@ -309,12 +331,14 @@ This design keeps everything organized and safe. The AI can only interact with t
**Note**: The `cua-computer` package requires Python 3.10 or later. We recommend creating a dedicated Python environment:
**Using venv:**
```bash
python -m venv cua-env
source cua-env/bin/activate
```
**Using conda:**
```bash
conda create -n cua-env python=3.10
conda activate cua-env
@@ -332,6 +356,7 @@ This design keeps everything organized and safe. The AI can only interact with t
### Building the Operator
#### Importing Required Modules
With the prerequisites installed and configured, we're ready to build our first operator.
The following example uses asynchronous Python (async/await). You can run it either in a VS Code Notebook or as a standalone Python script.
@@ -344,12 +369,13 @@ from computer import Computer
```
#### Mapping API Actions to CUA Methods
The following helper function converts a `computer_call` action from the OpenAI Responses API into corresponding commands on the CUI interface. For example, if the API instructs a `click` action, we move the cursor and perform a left click on the lume VM Sandbox. We will use the computer interface to execute the actions.
```python
async def execute_action(computer, action):
action_type = action.type
if action_type == "click":
x = action.x
y = action.y
@@ -360,12 +386,12 @@ async def execute_action(computer, action):
await computer.interface.right_click()
else:
await computer.interface.left_click()
elif action_type == "type":
text = action.text
print(f"Typing text: {text}")
await computer.interface.type_text(text)
elif action_type == "scroll":
x = action.x
y = action.y
@@ -374,7 +400,7 @@ async def execute_action(computer, action):
print(f"Scrolling at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})")
await computer.interface.move_cursor(x, y)
await computer.interface.scroll(scroll_y) # Using vertical scroll only
elif action_type == "keypress":
keys = action.keys
for key in keys:
@@ -386,23 +412,24 @@ async def execute_action(computer, action):
await computer.interface.press_key("space")
else:
await computer.interface.press_key(key)
elif action_type == "wait":
wait_time = action.time
print(f"Waiting for {wait_time} seconds")
await asyncio.sleep(wait_time)
elif action_type == "screenshot":
print("Taking screenshot")
# This is handled automatically in the main loop, but we can take an extra one if requested
screenshot = await computer.interface.screenshot()
return screenshot
else:
print(f"Unrecognized action: {action_type}")
```
#### Implementing the Computer-Use Loop
This section defines a loop that:
1. Initializes the cua-computer instance (connecting to a macOS sandbox).
@@ -423,7 +450,7 @@ async def cua_openai_loop():
os_type="macos"
) as computer:
await computer.run() # Start the lume VM
# Capture the initial screenshot
screenshot = await computer.interface.screenshot()
screenshot_base64 = base64.b64encode(screenshot).decode('utf-8')
@@ -438,8 +465,8 @@ async def cua_openai_loop():
"environment": "mac"
}],
input=[
{
"role": "user",
{
"role": "user",
"content": [
{"type": "input_text", "text": "Open Safari, download and install Cursor."},
{"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"}
@@ -488,7 +515,7 @@ async def cua_openai_loop():
"display_height": 768,
"environment": "mac"
}],
input=[{
input=[{
"type": "computer_call_output",
"call_id": last_call_id,
"acknowledged_safety_checks": acknowledged_checks,
@@ -511,12 +538,15 @@ if __name__ == "__main__":
You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-1.ipynb).
#### Request Handling Differences
The first request to the OpenAI Responses API is special in that it includes the initial screenshot and prompt. Subsequent requests are handled differently, using the `computer_call_output` type to provide feedback on the executed action.
##### Initial Request Format
- We use `role: "user"` with `content` that contains both `input_text` (the prompt) and `input_image` (the screenshot)
##### Subsequent Request Format
- We use `type: "computer_call_output"` instead of the user role
- We include the `call_id` to link the output to the specific previous action that was executed
- We provide any `acknowledged_safety_checks` that were approved
@@ -529,6 +559,7 @@ This structured approach allows the API to maintain context and continuity throu
## Conclusion
### Summary
This blogpost demonstrates a single iteration of an OpenAI Computer-Use loop where:
- A macOS sandbox is controlled using the CUA interface.
@@ -538,9 +569,11 @@ This blogpost demonstrates a single iteration of a OpenAI Computer-Use loop wher
In a production setting, you would wrap the action-response cycle in a loop, handling multiple actions and safety checks as needed.
### Next Steps
In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/agent) package!
### Resources
- [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use)
- [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer)
- [lume](https://github.com/trycua/cua/tree/main/libs/lume)

View File

@@ -1,6 +1,6 @@
# Build Your Own Operator on macOS - Part 2
*Published on April 27, 2025 by Francesco Bonacci*
_Published on April 27, 2025 by Francesco Bonacci_
In our [previous post](build-your-own-operator-on-macos-1.md), we built a basic Computer-Use Operator from scratch using OpenAI's `computer-use-preview` model and our [cua-computer](https://pypi.org/project/cua-computer) package. While educational, implementing the control loop manually can be tedious and error-prone.
@@ -13,12 +13,14 @@ In this follow-up, we'll explore our [cua-agent](https://pypi.org/project/cua-ag
## What You'll Learn
By the end of this tutorial, you'll be able to:
- Set up the `cua-agent` framework with various agent loop types and model providers
- Understand the different agent loop types and their capabilities
- Work with local models for cost-effective workflows
- Use a simple UI for your operator
**Prerequisites:**
- Completed setup from Part 1 ([lume CLI installed](https://github.com/trycua/cua?tab=readme-ov-file#option-2-full-computer-use-agent-capabilities), macOS CUA image already pulled)
- Python 3.10+. We recommend using Conda (or Anaconda) to create an ad hoc Python environment.
- API keys for OpenAI and/or Anthropic (optional for local models)
@@ -58,6 +60,7 @@ pip install "cua-agent[ui]" # Gradio UI
Before running any code examples, let's set up a proper environment:
1. **Create a new directory** for your project:
```bash
mkdir cua-agent-tutorial
cd cua-agent-tutorial
@@ -66,13 +69,15 @@ Before running any code examples, let's set up a proper environment:
2. **Set up a Python environment** using one of these methods:
**Option A: Using conda command line**
```bash
# Using conda
conda create -n cua-agent python=3.10
conda activate cua-agent
```
**Option B: Using Anaconda Navigator UI**
- Open Anaconda Navigator
- Click on "Environments" in the left sidebar
- Click the "Create" button at the bottom
@@ -80,36 +85,41 @@ Before running any code examples, let's set up a proper environment:
- Select Python 3.10
- Click "Create"
- Once created, select the environment and click "Open Terminal" to activate it
**Option C: Using venv**
```bash
python -m venv cua-env
source cua-env/bin/activate # On macOS/Linux
```
3. **Install the cua-agent package**:
```bash
pip install "cua-agent[all]"
```
4. **Set up your API keys as environment variables**:
```bash
# For OpenAI models
export OPENAI_API_KEY=your_openai_key_here
# For Anthropic models (if needed)
export ANTHROPIC_API_KEY=your_anthropic_key_here
```
5. **Create a Python file or notebook**:
**Option A: Create a Python script**
```bash
# For a Python script
touch cua_agent_example.py
```
**Option B: Use VS Code notebooks**
- Open VS Code
- Install the Python extension if you haven't already
- Create a new file with a `.ipynb` extension (e.g., `cua_agent_tutorial.ipynb`)
@@ -120,9 +130,10 @@ Now you're ready to run the code examples!
## Understanding Agent Loops
If you recall from Part 1, we had to implement a custom interaction loop to interact with the computer-use-preview model.
If you recall from Part 1, we had to implement a custom interaction loop to interact with the computer-use-preview model.
In the `cua-agent` framework, an **Agent Loop** is the core abstraction that implements the continuous interaction cycle between an AI model and the computer environment. It manages the flow of:
1. Capturing screenshots of the computer's state
2. Processing these screenshots (with or without UI element detection)
3. Sending this visual context to an AI model along with the task instructions
@@ -141,6 +152,7 @@ While the core concept remains the same across all agent loops, different AI mod
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
Each loop handles the same basic pattern we implemented manually in Part 1:
1. Take a screenshot of the VM
2. Send the screenshot and task to the AI model
3. Receive an action to perform
@@ -169,13 +181,13 @@ Choosing the right agent loop depends not only on your API access and technical
The performance of different Computer-Use models varies significantly across tasks. These benchmark evaluations measure an agent's ability to follow instructions and complete real-world tasks in different computing environments.
| Benchmark type | Benchmark | UI-TARS-1.5 | OpenAI CUA | Claude 3.7 | Previous SOTA | Human |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-------------|-------------|----------------------|-------------|
| **Computer Use** | [OSworld](https://arxiv.org/abs/2404.07972) (100 steps) | **42.5** | 36.4 | 28 | 38.1 (200 step) | 72.4 |
| | [Windows Agent Arena](https://arxiv.org/abs/2409.08264) (50 steps) | **42.1** | - | - | 29.8 | - |
| **Browser Use** | [WebVoyager](https://arxiv.org/abs/2401.13919) | 84.8 | **87** | 84.1 | 87 | - |
| | [Online-Mind2web](https://arxiv.org/abs/2504.01382) | **75.8** | 71 | 62.9 | 71 | - |
| **Phone Use** | [Android World](https://arxiv.org/abs/2405.14573) | **64.2** | - | - | 59.5 | - |
| Benchmark type | Benchmark | UI-TARS-1.5 | OpenAI CUA | Claude 3.7 | Previous SOTA | Human |
| ---------------- | ------------------------------------------------------------------ | ----------- | ---------- | ---------- | --------------- | ----- |
| **Computer Use** | [OSworld](https://arxiv.org/abs/2404.07972) (100 steps) | **42.5** | 36.4 | 28 | 38.1 (200 step) | 72.4 |
| | [Windows Agent Arena](https://arxiv.org/abs/2409.08264) (50 steps) | **42.1** | - | - | 29.8 | - |
| **Browser Use** | [WebVoyager](https://arxiv.org/abs/2401.13919) | 84.8 | **87** | 84.1 | 87 | - |
| | [Online-Mind2web](https://arxiv.org/abs/2504.01382) | **75.8** | 71 | 62.9 | 71 | - |
| **Phone Use** | [Android World](https://arxiv.org/abs/2405.14573) | **64.2** | - | - | 59.5 | - |
### When to Use Each Loop
@@ -210,10 +222,10 @@ async def run_simple_task():
model="openai/computer-use-preview",
tools=[macos_computer]
)
# Define a simple task
task = "Open Safari and search for 'Python tutorials'"
# Run the task and process responses
async for result in agent.run(task):
print(f"Action: {result.get('text')}")
@@ -225,6 +237,7 @@ if __name__ == "__main__":
3. Save the file
4. Open a terminal, navigate to your project directory, and run:
```bash
python simple_task.py
```
@@ -232,6 +245,7 @@ if __name__ == "__main__":
5. The code will initialize the macOS virtual machine, create an agent, and execute the task of opening Safari and searching for Python tutorials.
You can also run this in a VS Code notebook:
1. Create a new notebook in VS Code (.ipynb file)
2. Copy the code into a cell (without the `if __name__ == "__main__":` part)
3. Run the cell to execute the code
@@ -259,7 +273,7 @@ async def run_multi_task_workflow():
model="anthropic/claude-3-5-sonnet-20241022",
tools=[macos_computer]
)
tasks = [
"Open Safari and go to github.com",
"Search for 'trycua/cua'",
@@ -267,7 +281,7 @@ async def run_multi_task_workflow():
"Click on the 'Issues' tab",
"Read the first open issue"
]
for i, task in enumerate(tasks):
print(f"\nTask {i+1}/{len(tasks)}: {task}")
async for result in agent.run(task):
@@ -301,13 +315,13 @@ async for result in agent.run(task):
# Basic information
print(f"Response ID: {result.get('id')}")
print(f"Response Text: {result.get('text')}")
# Detailed token usage statistics
usage = result.get('usage')
if usage:
print(f"Input Tokens: {usage.get('input_tokens')}")
print(f"Output Tokens: {usage.get('output_tokens')}")
# Reasoning and actions
for output in result.get('output', []):
if output.get('type') == 'reasoning':
@@ -318,6 +332,7 @@ async for result in agent.run(task):
```
This structured format allows you to:
- Log detailed information about agent actions
- Provide real-time feedback to users
- Track token usage for cost monitoring
@@ -330,6 +345,7 @@ One of the most powerful features of the framework is the ability to use local m
**How to run this example:**
1. First, you'll need to install Ollama for running local models:
- Visit [ollama.com](https://ollama.com) and download the installer for your OS
- Follow the installation instructions
- Pull the Gemma 3 model:
@@ -350,9 +366,9 @@ async def run_with_local_model():
model="omniparser+ollama_chat/gemma3",
tools=[macos_computer]
)
task = "Open the Calculator app and perform a simple calculation"
async for result in agent.run(task):
print(f"Action: {result.get('text')}")
@@ -379,12 +395,14 @@ agent = ComputerAgent(
```
Common local endpoints include:
- LM Studio: `http://localhost:1234/v1`
- vLLM: `http://localhost:8000/v1`
- LocalAI: `http://localhost:8080/v1`
- Ollama with OpenAI compat: `http://localhost:11434/v1`
This approach is perfect for:
- Development and testing without incurring API costs
- Offline or air-gapped environments where API access isn't possible
- Privacy-sensitive applications where data can't leave your network
@@ -406,8 +424,8 @@ UI-TARS is ByteDance's Computer-Use model designed for navigating OS-level inter
```python
agent = ComputerAgent(
model=LLM(
provider=LLMProvider.OAICOMPAT,
name="tgi",
provider=LLMProvider.OAICOMPAT,
name="tgi",
provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1"
),
tools=[macos_computer]
@@ -475,11 +493,13 @@ if __name__ == "__main__":
```
2. Install the UI dependencies if you haven't already:
```bash
pip install "cua-agent[ui]"
```
3. Run the script:
```bash
python launch_ui.py
```
@@ -498,12 +518,14 @@ if __name__ == "__main__":
```
When you run this, Gradio will display both a local URL and a public URL like:
```
Running on local URL: http://127.0.0.1:7860
Running on public URL: https://abcd1234.gradio.live
```
**Security Note:** Be cautious when sharing your Gradio UI publicly:
- The public URL gives anyone with the link full access to your agent
- Consider using basic authentication for additional protection:
```python
@@ -513,6 +535,7 @@ Running on public URL: https://abcd1234.gradio.live
- The temporary link expires when you stop the Gradio application
This provides:
- Model provider selection
- Agent loop selection
- Task input field
@@ -566,7 +589,7 @@ async def github_workflow():
verbosity=logging.INFO,
tools=[macos_computer]
)
tasks = [
"Look for a repository named trycua/cua on GitHub.",
"Check the open issues, open the most recent one and read it.",
@@ -575,7 +598,7 @@ async def github_workflow():
"From Cursor, open Composer if not already open.",
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
]
for i, task in enumerate(tasks):
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
async for result in agent.run(task):
@@ -587,11 +610,13 @@ if __name__ == "__main__":
```
2. Make sure your OpenAI API key is set:
```bash
export OPENAI_API_KEY=your_openai_key_here
```
3. Run the script:
```bash
python github_workflow.py
```
@@ -604,6 +629,7 @@ if __name__ == "__main__":
- Use Cursor's AI features to work on a solution
This example:
1. Searches GitHub for a repository
2. Reads an issue
3. Clones the repository
@@ -615,6 +641,7 @@ This example:
Let's compare our manual implementation from Part 1 with the framework approach:
### Manual Implementation (Part 1)
- Required writing custom code for the interaction loop
- Needed explicit handling of different action types
- Required direct management of the OpenAI API calls
@@ -622,6 +649,7 @@ Let's compare our manual implementation from Part 1 with the framework approach:
- Limited to OpenAI's computer-use model
### Framework Implementation (Part 2)
- Abstracts the interaction loop
- Handles all action types automatically
- Manages API calls internally
@@ -634,17 +662,21 @@ Let's compare our manual implementation from Part 1 with the framework approach:
The `cua-agent` framework transforms what was a complex implementation task into a simple, high-level interface for building Computer-Use Agents. By abstracting away the technical details, it lets you focus on defining the tasks rather than the machinery.
### When to Use Each Approach
- **Manual Implementation (Part 1)**: When you need complete control over the interaction loop or are implementing a custom solution
- **Framework (Part 2)**: For most applications where you want to quickly build and deploy Computer-Use Agents
### Next Steps
With the basics covered, you might want to explore:
- Customizing the agent's behavior with additional parameters
- Building more complex workflows spanning multiple applications
- Integrating your agent into other applications
- Contributing to the open-source project on GitHub
### Resources
- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent)
- [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb)
- [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses)

View File

@@ -1,6 +1,6 @@
# Announcing Cua Agent framework 0.4 and Composite Agents
*Published on August 26, 2025 by Dillon DuPont*
_Published on August 26, 2025 by Dillon DuPont_
<img src="./assets/composite-agents.png" alt="Composite Agents">
@@ -12,7 +12,7 @@ This is the kind of problem that makes you wonder if we're building the future o
## What we fixed
Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.
Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.
Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
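As a minimal sketch (the import path is an assumption; use whatever import you already have for `ComputerAgent`), switching providers is just a string change:

```python
from agent import ComputerAgent  # import path assumed

# `computer` is a Computer() instance, as in the other examples in this post.
agent = ComputerAgent(model="anthropic/claude-3-5-sonnet-20241022", tools=[computer])

# Same code, different backend: only the model string changes.
# agent = ComputerAgent(model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer])
```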
@@ -42,7 +42,7 @@ agent = ComputerAgent(
This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository.
You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:
You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:
```python
agent = ComputerAgent(
@@ -63,12 +63,11 @@ We're building integration with HUD evals, allowing us to curate and benchmark m
If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects.
---
## Links
* **Composite Agent Docs:** [https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
* **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
- **Composite Agent Docs:** [https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.

View File

@@ -1,6 +1,6 @@
# Computer-Use Agents SOTA Challenge: Hack the North + Global Online
*Published on August 25, 2025 by Francesco Bonacci*
_Published on August 25, 2025 by Francesco Bonacci_
We're bringing something new to [Hack the North](https://hackthenorth.com), Canada's largest hackathon, this year: a head-to-head competition for **Computer-Use Agents** - on-site at Waterloo and a **Global online challenge**. From September 12–14, 2025, teams build on the **Cua Agent Framework** and are scored in **HUD's OSWorld-Verified** environment to push past today's SOTA on [OS-World](https://os-world.github.io).
@@ -14,7 +14,8 @@ Theres one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Age
**Cua** and [**Ollama**](https://ollama.com) organize a global hackathon to find the **most creative uses of local and hybrid computer-use agents**. There are no geographic restrictions on who can join — this is a worldwide competition focused on **originality, impact, and inventive applications** that showcase what's possible with local and hybrid inference.
**Prizes:**
**Prizes:**
- 1st **MacBook Air M4 (or equivalent value)** + features in Cua & Ollama channels
- 2nd **$500 CAD + swag**
- 3rd **swag + public feature**
@@ -26,36 +27,42 @@ Theres one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Age
Two different tracks, two different processes:
### On-site (Track A)
Build during the weekend and submit a repo with a one-line start command. **HUD** executes your command in a clean environment and runs **OSWorld-Verified**. Scores come from official benchmark results; ties break by median, then wall-clock time, then earliest submission. Any model setup is allowed (cloud or local).
**HUD** runs official evaluations immediately after submission. Winners are announced at the **closing ceremony**.
### Rules
- Fork and star the [Cua repo](https://github.com/trycua/cua).
- Add your agent and instructions in `samples/community/hack-the-north/<YOUR_TEAM_NAME>`.
- Include a README with details on the approach and any required notes.
- Submit a PR.
- Include a README with details on the approach and any required notes.
- Submit a PR.
**Deadline: Sept 15, 8:00 AM EDT**
### Global Online (Track B)
Open to anyone, anywhere. Build on your own timeline and submit through the **Cua Discord form** by the deadline.
**Project Requirements:**
- Your agent must integrate **Cua and Ollama** in some way
- Your agent must be **easily runnable by judges**
Judged by **Cua** and **Ollama** teams on:
- **Creativity (30%)** originality, usefulness, surprise factor
- **Technical Depth (30%)** quality of engineering and agent design
- **Use of Ollama (30%)** effective integration of local/hybrid inference
- **Polish (10%)** presentation, clarity, demo readiness
Judged by **Cua** and **Ollama** teams on:
- **Creativity (30%)** originality, usefulness, surprise factor
- **Technical Depth (30%)** quality of engineering and agent design
- **Use of Ollama (30%)** effective integration of local/hybrid inference
- **Polish (10%)** presentation, clarity, demo readiness
### Submission Process
Submissions will be collected via a **form link provided in the Cua Discord**. Your submission must contain:
- **GitHub repo** containing the agent source code and a clear README with instructions on how to use the agent
- **Explanation** of the models and tools used, and what's local or hybrid about your design
- **Explanation** of the models and tools used, and what's local or hybrid about your design
- **Short demo video** (up to two minutes)
A **commit freeze** will be used to ensure that no changes are made after the deadline. Winners will be announced after judging is complete.
@@ -68,12 +75,13 @@ A **commit freeze** will be used to ensure that no changes are made after the de
Bring a team, pick a model stack, and push what agents can do on real computers. We can't wait to see what you build at **Hack the North 2025**.
**Discord channels**
**Discord channels**
- Join the Discord first: https://discord.gg/cua-ai
- **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342
- **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226
- **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342
- **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226
**Contact**
Questions on Hack the North? Email **hackthenorth@trycua.com**.
*P.S. If you're planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at docs.trycua.com; we'll share office-hour times in both Discord channels.*
_P.S. If you're planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at docs.trycua.com; we'll share office-hour times in both Discord channels._

View File

@@ -1,6 +1,6 @@
# What happens when hackathon judging is a public benchmark (Hack the North edition)
*Written by Francesco Bonacci — Reviewed by Parth Patel (HUD W25) — Sept 25, 2025*
_Written by Francesco Bonacci — Reviewed by Parth Patel (HUD W25) — Sept 25, 2025_
## Prologue
@@ -16,7 +16,7 @@ The rest, as they say, was a 36h story worth telling—and a playbook worth shar
## The sign-up problem we had to invent
We joined as a sponsor at the last minute, thanks to a push from our friend @Michael Chiang at Ollama—Waterloo alum, naturally. It's kind of an open secret that UWaterloo turns out some of the sharpest hackers around (*no pun intended, HackMIT*). It was a bit of a scramble, but also great timing—our Agent framework had just finished a major refactor, with support for **100+ VLM configurations** now live. Naturally, we wanted to stress-test it at scale—and see whether teams could come up with SOTA-level setups. *This wasn't a blank-slate, build-whatever-you-want kind of track.*
We joined as a sponsor at the last minute, thanks to a push from our friend @Michael Chiang at Ollama—Waterloo alum, naturally. It's kind of an open secret that UWaterloo turns out some of the sharpest hackers around (_no pun intended, HackMIT_). It was a bit of a scramble, but also great timing—our Agent framework had just finished a major refactor, with support for **100+ VLM configurations** now live. Naturally, we wanted to stress-test it at scale—and see whether teams could come up with SOTA-level setups. _This wasn't a blank-slate, build-whatever-you-want kind of track._
From day one, though, we knew we'd have to fight for sign-ups. This was a niche track, and a guaranteed YC interview alone wouldn't be enough to pull people in.
@@ -24,7 +24,7 @@ Unfortunately, Hack the North (HTN) didnt offer an interest form to help us e
On top of that, we were discouraged from external promotion on [lu.ma](http://lu.ma). So we spun up our own sign-up page at **trycua.com/hackathon** and built ad-hoc Discord channels to share track details. We emphasized—repeatedly—that only students already accepted to Hack the North should register.
*(Moral: the “measure-zero effect”—no matter how many times you say it, some people won't see it. Plenty of invalid sign-ups still slipped through.)*
_(Moral: the “measure-zero effect”—no matter how many times you say it, some people won't see it. Plenty of invalid sign-ups still slipped through.)_
Even so, having your own form is absolutely worth it: it gives you an **early funnel**, surfaces demand signals ahead of time, and—crucially—**lets you require platform sign-up before kickoff**. In our case, Hack the North didn't provide Devpost access until the very end, so our form was the only way to build a working roster.
@@ -45,13 +45,13 @@ Day 0 on campus made the difference. We arrived a couple of hours early to colle
![hack-booth](./assets/hack-booth.png)
*(Our Founding Engineer, Morgan, hangs out with students at the stand, while Adam (runner-up) hacks on the side.)*
_(Our Founding Engineer, Morgan, hangs out with students at the stand, while Adam (runner-up) hacks on the side.)_
## 02:30 a.m. is still prime time at a hackathon
Hack the North gives sponsors a 30-minute API Workshop during the early hours of the event—a perfect moment to shift from talking to building.
Our slot landed at **2:30 a.m.** (*perks of the cheapest sponsor tier*). Thirty students showed up, energy surprisingly high. James, our new Founding DevRel Engineer, led the session and nailed it.
Our slot landed at **2:30 a.m.** (_perks of the cheapest sponsor tier_). Thirty students showed up, energy surprisingly high. James, our new Founding DevRel Engineer, led the session and nailed it.
**Our track rules were simple:**
@@ -67,7 +67,7 @@ Our slot landed at **2:30 a.m.** (*perks of the cheapest sponsor tier*). Thirty
![hack-booth](./assets/hack-workshop.jpeg)
*(Our CUA Workshop at 2:30 AM.)*
_(Our CUA Workshop at 2:30 AM.)_
## Making it possible to focus on the work
@@ -87,7 +87,7 @@ We provided:
**After the workshop buzz.** Morning interest was high, but Docker setup + requiring focus on a single track thinned the crowd. Most sponsor prizes are broad (“use our product and you qualify”), letting students stack tracks. Ours required commitment. Upside: those who stayed shipped sharper, higher-quality submissions.
**The bell curve of submissions.** Most entries used *claude-sonnet-4-20250514*—proof that docs and public leaderboards ([OSWorld](https://os-world.github.io/#benchmark)) guide choices. Results clustered around the safe pick, with fewer pushing boundaries.
**The bell curve of submissions.** Most entries used _claude-sonnet-4-20250514_—proof that docs and public leaderboards ([OSWorld](https://os-world.github.io/#benchmark)) guide choices. Results clustered around the safe pick, with fewer pushing boundaries.
**Who went beyond the baseline.** A few tried multi-agent/tool graphs. One standout—[**cuala**](https://github.com/YeIIcw/cuala)—was a clean reference: deterministic actions, verifiable state changes, callbacks for saving images and trajectories.
@@ -97,7 +97,7 @@ We provided:
We skipped a full end-to-end **Cua × HUD** dry-run. It showed.
- Hackers ran out of inference credits. Desktop tasks are token-heavy. A full OSWorld run (200 max steps) for *computer-use-preview* (OpenAI Operator API) can cost >$600. Serious attempts: ~400k tokens × 14 tasks.
- Hackers ran out of inference credits. Desktop tasks are token-heavy. A full OSWorld run (200 max steps) for _computer-use-preview_ (OpenAI Operator API) can cost >$600. Serious attempts: ~400k tokens × 14 tasks.
- Python version/build mismatches surfaced, requiring debug time across both OSS repos.
- Our Cua framework lacked a **Response Agent** to complete evaluation loops. Some runs stalled until patched.
@@ -112,28 +112,31 @@ We skipped a full end-to-end **Cua × HUD** dry-run. It showed.
![hack-leaderboard](./assets/hack-leaderboard.png)
*(Leaderboard on HUD)*
_(Leaderboard on HUD)_
### Winners
**🥇 Winner — Ram**
- Devpost: https://devpost.com/software/sota-computer-use-agent-challenge
- Code: https://github.com/Ram-Raghav-S/cua/tree/ram
**🥇 Winner — Ram**
- Devpost: https://devpost.com/software/sota-computer-use-agent-challenge
- Code: https://github.com/Ram-Raghav-S/cua/tree/ram
- Score: 68.3%
**🥈 Runner-up — Aryan**
- Devpost: https://devpost.com/software/loopdeloop-computer-use-agent-sota-attempt
- Code: https://github.com/Tumph/cua
**🥈 Runner-up — Aryan**
- Devpost: https://devpost.com/software/loopdeloop-computer-use-agent-sota-attempt
- Code: https://github.com/Tumph/cua
- Score: 55.9%
**🥉 Special Mention — Adam**
- Devpost: https://devpost.com/software/cuala
- Code: https://github.com/YeIIcw/cuala
**🥉 Special Mention — Adam**
- Devpost: https://devpost.com/software/cuala
- Code: https://github.com/YeIIcw/cuala
- Score: 42.1%
![hack-winners](./assets/hack-winners.jpeg)
*(Our finalists before the award ceremony)*
_(Our finalists before the award ceremony)_
## What We'd Keep
@@ -163,4 +166,4 @@ Whether youre a hacker who wants to participate, or a company looking to spon
![hack-closing-ceremony](./assets/hack-closing-ceremony.jpg)
*(HTN Closing Ceremony — Cua Track Winner Announcement)*
_(HTN Closing Ceremony — Cua Track Winner Announcement)_

View File

@@ -1,6 +1,6 @@
# Cua × HUD - Evaluate Any Computer-Use Agent
*Published on August 27, 2025 by Dillon DuPont*
_Published on August 27, 2025 by Dillon DuPont_
You can now benchmark any GUI-capable agent on real computer-use tasks through our new integration with [HUD](https://hud.so), the evaluation platform for computer-use agents.
@@ -70,9 +70,9 @@ Watch your agent work in real-time. Example output:
```md
Starting full dataset run...
╔═════════════════════════════════════════════════════════════════╗
🚀 See your agent live at:
║ 🚀 See your agent live at: ║
╟─────────────────────────────────────────────────────────────────╢
https://app.hud.so/jobs/fe05805d-4da9-4fc6-84b5-5c518528fd3c
║ https://app.hud.so/jobs/fe05805d-4da9-4fc6-84b5-5c518528fd3c ║
╚═════════════════════════════════════════════════════════════════╝
```
@@ -90,4 +90,4 @@ Customize your evaluation with these options:
- Notebook with endtoend examples: https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb
- Docs: https://docs.trycua.com/docs/agent-sdk/integrations/hud
- Live traces: https://app.hud.so
- Live traces: https://app.hud.so

View File

@@ -1,10 +1,10 @@
# When Agents Need Human Wisdom - Introducing Human-In-The-Loop Support
*Published on August 29, 2025 by Francesco Bonacci*
_Published on August 29, 2025 by Francesco Bonacci_
Sometimes the best AI agent is a human. Whether you're creating training demonstrations, evaluating complex scenarios, or need to intervene when automation hits a wall, our new Human-In-The-Loop integration puts you directly in control.
With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could benchmark any agent at scale. Today's update lets you *become* the agent when it matters most—seamlessly switching between automated intelligence and human judgment.
With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could benchmark any agent at scale. Today's update lets you _become_ the agent when it matters most—seamlessly switching between automated intelligence and human judgment.
<div align="center">
<video src="https://github.com/user-attachments/assets/9091b50f-26e7-4981-95ce-40e5d42a1260" width="600" controls></video>
@@ -20,11 +20,12 @@ With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could ben
## Why Human-In-The-Loop?
Even the most sophisticated agents encounter edge cases, ambiguous interfaces, or tasks requiring human judgment. Rather than failing gracefully, they can now fail *intelligently*—by asking for human help.
Even the most sophisticated agents encounter edge cases, ambiguous interfaces, or tasks requiring human judgment. Rather than failing gracefully, they can now fail _intelligently_—by asking for human help.
This approach bridges the gap between fully automated systems and pure manual control, letting you:
- **Demonstrate complex workflows** that agents can learn from
- **Evaluate tricky scenarios** where ground truth requires human assessment
- **Evaluate tricky scenarios** where ground truth requires human assessment
- **Intervene selectively** when automated agents need guidance
- **Test and debug** your tools and environments manually
@@ -64,7 +65,7 @@ Combine model intelligence with human precision—let AI plan, then execute manu
```python
agent = ComputerAgent(
"huggingface-local/HelloKKMe/GTA1-7B+human/human",
"huggingface-local/HelloKKMe/GTA1-7B+human/human",
tools=[computer]
)
@@ -81,7 +82,7 @@ Start automated, escalate to human when needed:
# Primary automated agent
primary_agent = ComputerAgent("openai/computer-use-preview", tools=[computer])
# Human fallback agent
# Human fallback agent
fallback_agent = ComputerAgent("human/human", tools=[computer])
try:
@@ -101,22 +102,26 @@ except Exception:
The human-in-the-loop interface provides a rich, responsive experience:
### **Visual Environment**
- **Screenshot display** with live updates as you work
- **Click handlers** for direct interaction with UI elements
- **Click handlers** for direct interaction with UI elements
- **Zoom and pan** to see details clearly
### **Action Controls**
- **Click actions** - precise cursor positioning and clicking
- **Keyboard input** - type text naturally or send specific key combinations
- **Action history** - see the sequence of actions taken
- **Undo support** - step back when needed
### **Tool Integration**
### **Tool Integration**
- **Full OpenAI compatibility** - standard tool call format
- **Custom tools** - integrate your own tools seamlessly
- **Real-time feedback** - see tool responses immediately
### **Smart Polling**
- **Responsive updates** - UI refreshes when new completions arrive
- **Background processing** - continue working while waiting for tasks
- **Session persistence** - resume interrupted sessions
@@ -124,6 +129,7 @@ The human-in-the-loop interface provides a rich, responsive experience:
## Real-World Use Cases
### **Training Data Generation**
Create perfect demonstrations for fine-tuning:
```python
@@ -132,7 +138,7 @@ demo_agent = ComputerAgent("human/human", tools=[computer])
tasks = [
"Create a budget spreadsheet with income and expense categories",
"Apply conditional formatting to highlight overbudget items",
"Apply conditional formatting to highlight overbudget items",
"Generate a pie chart showing expense distribution"
]
@@ -143,6 +149,7 @@ for task in tasks:
```
### **Evaluation and Ground Truth**
Validate agent performance on complex scenarios:
```python
@@ -154,6 +161,7 @@ async for _ in evaluator.run("Review this completed form and rate accuracy (1-10
```
### **Interactive Debugging**
Step through agent behavior manually:
```python
@@ -165,6 +173,7 @@ async for _ in debug_agent.run("Reproduce the agent's failed login sequence"):
```
### **Edge Case Handling**
Handle scenarios that break automated agents:
```python
@@ -180,26 +189,26 @@ async for _ in edge_case_agent.run("Navigate this CAPTCHA-protected form"):
Customize the human agent experience:
- **UI refresh rate**: Adjust polling frequency for your workflow
- **Image quality**: Balance detail vs. performance for screenshots
- **Image quality**: Balance detail vs. performance for screenshots
- **Action logging**: Save detailed traces for analysis and training
- **Session timeout**: Configure idle timeouts for security
- **Tool permissions**: Restrict which tools humans can access
## When to Use Human-In-The-Loop
| **Scenario** | **Why Human Control** |
|--------------|----------------------|
| **Creating training data** | Perfect demonstrations for model fine-tuning |
| **Evaluating complex tasks** | Human judgment for subjective or nuanced assessment |
| **Handling edge cases** | CAPTCHAs, unusual UIs, context-dependent decisions |
| **Debugging workflows** | Step through failures to identify breaking points |
| **High-stakes operations** | Critical tasks requiring human oversight and approval |
| **Testing new environments** | Validate tools and environments work as expected |
| **Scenario** | **Why Human Control** |
| ---------------------------- | ----------------------------------------------------- |
| **Creating training data** | Perfect demonstrations for model fine-tuning |
| **Evaluating complex tasks** | Human judgment for subjective or nuanced assessment |
| **Handling edge cases** | CAPTCHAs, unusual UIs, context-dependent decisions |
| **Debugging workflows** | Step through failures to identify breaking points |
| **High-stakes operations** | Critical tasks requiring human oversight and approval |
| **Testing new environments** | Validate tools and environments work as expected |
## Learn More
- **Interactive examples**: Try human-in-the-loop control with sample tasks
- **Training data pipelines**: Learn how to convert human demonstrations into model training data
- **Training data pipelines**: Learn how to convert human demonstrations into model training data
- **Evaluation frameworks**: Build human-validated test suites for your agents
- **API documentation**: Full reference for human agent configuration
@@ -207,4 +216,4 @@ Ready to put humans back in the loop? The most sophisticated AI system knows whe
---
*Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop).*
_Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)._

View File

@@ -1,6 +1,6 @@
# Introducing Cua Cloud Sandbox: Computer-Use Agents in the Cloud
*Published on May 28, 2025 by Francesco Bonacci*
_Published on May 28, 2025 by Francesco Bonacci_
Welcome to the next chapter in our Computer-Use Agent journey! In [Part 1](./build-your-own-operator-on-macos-1), we showed you how to build your own Operator on macOS. In [Part 2](./build-your-own-operator-on-macos-2), we explored the cua-agent framework. Today, we're excited to introduce **Cua Cloud Sandbox**: the easiest way to deploy Computer-Use Agents at scale.
@@ -14,9 +14,9 @@ Think of Cua Cloud as **Docker for Computer-Use Agents**. Instead of managing VM
## Why Cua Cloud Sandbox?
Four months ago, we launched [**Lume**](https://github.com/trycua/cua/tree/main/libs/lume) and [**Cua**](https://github.com/trycua/cua) with the goal of bringing sandboxed VMs and Computer-Use Agents to Apple Silicon. The developer community's response was incredible 🎉
Four months ago, we launched [**Lume**](https://github.com/trycua/cua/tree/main/libs/lume) and [**Cua**](https://github.com/trycua/cua) with the goal of bringing sandboxed VMs and Computer-Use Agents to Apple Silicon. The developer community's response was incredible 🎉
Going from prototype to production revealed a problem though: **local macOS VMs don't scale**, nor are they easily portable.
Going from prototype to production revealed a problem though: **local macOS VMs don't scale**, nor are they easily portable.
Our Discord community, YC peers, and early pilot customers kept hitting the same issues. Storage constraints meant **20-40GB per VM** filled laptops fast. Different hardware architectures (Apple Silicon ARM vs Intel x86) prevented portability of local workflows. Every new user lost a day to setup and configuration.
@@ -55,7 +55,7 @@ async def run_cloud_agent():
name=os.getenv("CUA_CONTAINER_NAME"),
provider_type=VMProviderType.CLOUD,
)
# Create an agent with your preferred loop
agent = ComputerAgent(
model="openai/gpt-4o",
@@ -63,7 +63,7 @@ async def run_cloud_agent():
verbosity=logging.INFO,
tools=[computer]
)
# Run a task
async for result in agent.run("Open Chrome and search for AI news"):
print(f"Response: {result.get('text')}")
@@ -102,14 +102,14 @@ async def github_automation():
name="github-automation",
provider_type=VMProviderType.CLOUD,
)
agent = ComputerAgent(
model="openai/gpt-4o",
save_trajectory=True,
verbosity=logging.INFO,
tools=[computer]
)
tasks = [
"Look for a repository named trycua/cua on GitHub.",
"Check the open issues, open the most recent one and read it.",
@@ -119,17 +119,17 @@ async def github_automation():
"Commit the changes with a descriptive message.",
"Create a pull request."
]
for i, task in enumerate(tasks):
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
async for result in agent.run(task):
print(f"Response: {result.get('text')}")
# Check if any tools were used
tools = result.get('tools')
if tools:
print(f"Tools used: {tools}")
print(f"Task {i+1} completed")
# Run the automation
@@ -153,13 +153,13 @@ async def scrape_website(site_name, url):
name=f"scraper-{site_name}",
provider_type=VMProviderType.CLOUD,
)
agent = ComputerAgent(
model="openai/gpt-4o",
save_trajectory=True,
tools=[computer]
)
results = []
tasks = [
f"Navigate to {url}",
@@ -167,7 +167,7 @@ async def scrape_website(site_name, url):
"Take a screenshot of the page",
"Save the extracted data to a file"
]
for task in tasks:
async for result in agent.run(task):
results.append({
@@ -175,7 +175,7 @@ async def scrape_website(site_name, url):
'task': task,
'response': result.get('text')
})
return results
async def parallel_scraping():
@@ -185,11 +185,11 @@ async def parallel_scraping():
("HackerNews", "https://news.ycombinator.com"),
("TechCrunch", "https://techcrunch.com")
]
# Run all scraping tasks in parallel
tasks = [scrape_website(name, url) for name, url in sites]
results = await asyncio.gather(*tasks)
# Process results
for site_results in results:
print(f"\nResults from {site_results[0]['site']}:")

View File

@@ -1,6 +1,6 @@
# From Lume to Containerization: Our Journey Meets Apple's Vision
*Published on June 10, 2025 by Francesco Bonacci*
_Published on June 10, 2025 by Francesco Bonacci_
Yesterday, Apple announced their new [Containerization framework](https://github.com/apple/containerization) at WWDC. Since then, our Discord and X users have been asking what this means for Cua virtualization capabilities on Apple Silicon. We've been working in this space for months - from [Lume](https://github.com/trycua/cua/tree/main/libs/lume) to [Lumier](https://github.com/trycua/cua/tree/main/libs/lumier) to [Cua Cloud Sandbox](./introducing-cua-cloud-containers). Here's our take on Apple's announcement.
@@ -40,6 +40,7 @@ How Apple's Framework Works:
```
Why is this better?
- **Better security**: Each container is completely separate
- **Better performance**: Each container gets its own resources
- **Real isolation**: If one container has problems, others aren't affected
@@ -71,6 +72,7 @@ While Apple's new framework focuses on containers, we've been building VM manage
[Lume](https://github.com/trycua/cua/tree/main/libs/lume) is our command-line tool for creating and managing VMs on Apple Silicon. We built it because setting up VMs on macOS was too complicated.
What Lume does:
- **Direct control**: Works directly with Apple's Virtualization framework
- **Ready-to-use images**: Start a macOS or Linux VM with one command
- **API server**: Control VMs from other programs (runs on port 7777)
@@ -91,6 +93,7 @@ lume run macos-sequoia-vanilla:latest
[Lumier](https://github.com/trycua/lumier) works differently. It lets you use Docker commands to manage VMs. But here's the key: **Docker is just for packaging, not for isolation**.
What makes Lumier useful:
- **Familiar commands**: If you know Docker, you know Lumier
- **Web access**: Connect to your VM through a browser
- **Save your work**: VMs remember their state
@@ -127,6 +130,7 @@ Docker → Lume → Full VM → Mac Hardware
### When to Use What
**Apple's Containerization**
- ✅ Perfect for: Running containers with maximum security
- ✅ Starts in under a second
- ✅ Uses less memory and CPU
@@ -134,6 +138,7 @@ Docker → Lume → Full VM → Mac Hardware
- ❌ Only for containers, not full VMs
**Lume**
- ✅ Perfect for: Development and testing
- ✅ Full control over macOS/Linux VMs
- ✅ Works on current macOS versions
@@ -141,6 +146,7 @@ Docker → Lume → Full VM → Mac Hardware
- ❌ Uses more resources than containers
**Lumier**
- ✅ Perfect for: Teams already using Docker
- ✅ Easy to share and deploy
- ✅ Access through your browser
@@ -173,4 +179,4 @@ Apple's announcement confirms we're on the right path. Here's what we're looking
---
*Questions about virtualization on Apple Silicon? Come chat with us on Discord!*
_Questions about virtualization on Apple Silicon? Come chat with us on Discord!_

View File

@@ -1,6 +1,6 @@
# Sandboxed Python Execution: Run Code Safely in Cua Containers
*Published on June 23, 2025 by Dillon DuPont*
_Published on June 23, 2025 by Dillon DuPont_
Cua's computer-use capabilities, which we touched on in [Building your own Operator on macOS - Part 2](build-your-own-operator-on-macos-2.md), let your AI agents click, scroll, type, and interact with any desktop application. But what if your agent needs to do more than just UI automation? What if it needs to process data, make API calls, analyze images, or run complex logic alongside those UI interactions, within the same virtual environment?
@@ -49,15 +49,19 @@ What's happening here? When you call `greet_and_print()`, Cua extracts the funct
Cua's sandboxed execution system employs several key architectural components:
### 1. Source Code Extraction
Cua uses Python's `inspect.getsource()` to extract your function's source code and reconstruct the function definition in the remote environment.
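For intuition, here is what that extraction step looks like using nothing but the standard library (how Cua ships and re-executes the text is simplified away here):

```python
import inspect

def greet(name: str) -> str:
    return f"Hello, {name}!"

# Recover the full textual definition of the function on the host...
source_text = inspect.getsource(greet)
print(source_text)
# ...which can then be sent to the container and executed there to recreate
# `greet` before calling it with the JSON-decoded arguments.
```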
### 2. Virtual Environment Isolation
Each sandboxed function runs in a named virtual environment within the container. This provides complete dependency isolation between different functions and their respective environments.
### 3. Data Serialization and Transport
Arguments and return values are serialized as JSON and transported between the host and container. This ensures compatibility across different Python versions and execution environments.
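A simplified illustration of that round trip (the actual wire format Cua uses is not shown in this post):

```python
import json

# Host side: package the call's arguments for transport.
request = json.dumps({"args": [21, 2], "kwargs": {}})

# Container side: decode, run the function, and send back the result the same way.
decoded = json.loads(request)
result = decoded["args"][0] * decoded["args"][1]
print(json.dumps({"result": result}))  # {"result": 42}
```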
### 4. Comprehensive Error Handling
The system captures both successful results and exceptions, preserving stack traces and error information for debugging purposes.
## Getting your sandbox ready
@@ -73,10 +77,10 @@ async def main():
# Fire up the computer
computer = Computer()
await computer.run()
# Make it the default for all sandboxed functions
set_default_computer(computer)
# Install some packages in a virtual environment
await computer.venv_install("demo_venv", ["requests", "beautifulsoup4"])
```
@@ -104,7 +108,7 @@ def automate_browser_with_playwright():
import time
import base64
from datetime import datetime
try:
with sync_playwright() as p:
# Launch browser (visible, because why not?)
@@ -112,68 +116,68 @@ def automate_browser_with_playwright():
headless=False,
args=['--no-sandbox', '--disable-dev-shm-usage']
)
page = browser.new_page()
page.set_viewport_size({"width": 1280, "height": 720})
actions = []
screenshots = {}
# Let's visit example.com and poke around
page.goto("https://example.com")
actions.append("Navigated to example.com")
# Grab a screenshot because screenshots are cool
screenshot_bytes = page.screenshot(full_page=True)
screenshots["initial"] = base64.b64encode(screenshot_bytes).decode()
# Get some basic info
title = page.title()
actions.append(f"Page title: {title}")
# Find links and headings
try:
links = page.locator("a").all()
link_texts = [link.text_content() for link in links[:5]]
actions.append(f"Found {len(links)} links: {link_texts}")
headings = page.locator("h1, h2, h3").all()
heading_texts = [h.text_content() for h in headings[:3]]
actions.append(f"Found headings: {heading_texts}")
except Exception as e:
actions.append(f"Element interaction error: {str(e)}")
# Let's try a form for good measure
try:
page.goto("https://httpbin.org/forms/post")
actions.append("Navigated to form page")
# Fill out the form
page.fill('input[name="custname"]', "Test User from Sandboxed Environment")
page.fill('input[name="custtel"]', "555-0123")
page.fill('input[name="custemail"]', "test@example.com")
page.select_option('select[name="size"]', "large")
actions.append("Filled out form fields")
# Submit and see what happens
page.click('input[type="submit"]')
page.wait_for_load_state("networkidle")
actions.append("Submitted form")
except Exception as e:
actions.append(f"Form interaction error: {str(e)}")
browser.close()
return {
"actions_performed": actions,
"screenshots": screenshots,
"success": True
}
except Exception as e:
return {"error": f"Browser automation failed: {str(e)}"}
@@ -196,9 +200,9 @@ def security_audit_tool(code_snippet):
"""Analyze code for potential security issues"""
import ast
import re
issues = []
# Check for the usual suspects
dangerous_patterns = [
(r'eval\s*\(', "Use of eval() function"),
@@ -207,11 +211,11 @@ def security_audit_tool(code_snippet):
(r'subprocess\.', "Subprocess usage"),
(r'os\.system\s*\(', "OS system call"),
]
for pattern, description in dangerous_patterns:
if re.search(pattern, code_snippet):
issues.append(description)
# Get fancy with AST analysis
try:
tree = ast.parse(code_snippet)
@@ -222,7 +226,7 @@ def security_audit_tool(code_snippet):
issues.append(f"Dangerous function call: {node.func.id}")
except SyntaxError:
issues.append("Syntax error in code")
return {
"security_issues": issues,
"risk_level": "HIGH" if len(issues) > 2 else "MEDIUM" if issues else "LOW"
@@ -245,34 +249,34 @@ def take_screenshot_and_analyze():
import base64
from PIL import ImageGrab
from datetime import datetime
try:
# Grab the screen
screenshot = ImageGrab.grab()
# Convert to base64 for easy transport
buffer = io.BytesIO()
screenshot.save(buffer, format='PNG')
screenshot_data = base64.b64encode(buffer.getvalue()).decode()
# Get some basic info
screen_info = {
"size": screenshot.size,
"mode": screenshot.mode,
"timestamp": datetime.now().isoformat()
}
# Analyze the colors (because why not?)
colors = screenshot.getcolors(maxcolors=256*256*256)
dominant_color = max(colors, key=lambda x: x[0])[1] if colors else None
return {
"screenshot_base64": screenshot_data,
"screen_info": screen_info,
"dominant_color": dominant_color,
"unique_colors": len(colors) if colors else 0
}
except Exception as e:
return {"error": f"Screenshot failed: {str(e)}"}
@@ -287,6 +291,7 @@ print("Desktop analysis complete!")
## Pro tips for sandboxed success
### Keep it self-contained
Always put your imports inside the function. Trust us on this one:
```python
@@ -294,12 +299,13 @@ Always put your imports inside the function. Trust us on this one:
def good_function():
import os # Import inside the function
import json
# Your code here
return {"result": "success"}
```
### Install dependencies first
Don't forget to install packages before using them:
```python
@@ -314,13 +320,14 @@ def data_analysis():
```
### Use descriptive environment names
Future you will thank you:
```python
@sandboxed("data_processing_env")
def process_data(): pass
@sandboxed("web_scraping_env")
@sandboxed("web_scraping_env")
def scrape_site(): pass
@sandboxed("ml_training_env")
@@ -328,6 +335,7 @@ def train_model(): pass
```
### Always handle errors gracefully
Things break. Plan for it:
```python
@@ -345,6 +353,7 @@ def robust_function(data):
Let's be honest: there's some overhead here. Code needs to be serialized, sent over the network, and executed remotely. But for most use cases, the benefits far outweigh the costs.
If you're building something performance-critical, consider:
- Batching multiple operations into a single sandboxed function (sketched below)
- Minimizing data transfer between host and container
- Using persistent virtual environments
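For example, a batched call might look like the sketch below. It assumes the `@sandboxed` decorator shown earlier; the import path is an assumption, and `requests`/`beautifulsoup4` must already be installed into the named environment with `venv_install`:

```python
from computer.helpers import sandboxed  # import path assumed

@sandboxed("report_env")
def summarize_pages(urls):
    # One round trip instead of one per URL: fetch and summarize inside the
    # container, returning only a small JSON-serializable summary to the host.
    import requests
    from bs4 import BeautifulSoup

    summaries = []
    for url in urls:
        html = requests.get(url, timeout=30).text
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.string if soup.title and soup.title.string else "(no title)"
        summaries.append({"url": url, "title": title, "chars": len(html)})
    return summaries
```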
@@ -369,4 +378,4 @@ Happy coding (safely)!
---
*Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!*
_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!_

View File

@@ -1,6 +1,6 @@
# Training Computer-Use Models: Creating Human Trajectories with Cua
*Published on May 1, 2025 by Dillon DuPont*
_Published on May 1, 2025 by Dillon DuPont_
In our previous posts, we covered [building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [using the Agent framework](build-your-own-operator-on-macos-2) to simplify development. Today, we'll focus on a critical aspect of improving computer-use agents and models: gathering high-quality demonstration data using Cua's Computer-Use Interface (CUI) and its Gradio UI to create and share human-generated trajectories.
@@ -8,10 +8,10 @@ Why is this important? Underlying models used by Computer-use agents need exampl
<video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video>
## What You'll Learn
By the end of this tutorial, you'll be able to:
- Set up the Computer-Use Interface (CUI) with Gradio UI support
- Record your own computer interaction trajectories
- Organize and tag your demonstrations
@@ -19,6 +19,7 @@ By the end of this tutorial, you'll be able to:
- Contribute to improving computer-use AI for everyone
**Prerequisites:**
- macOS Sonoma (14.0) or later
- Python 3.10+
- Basic familiarity with Python and terminal commands
@@ -38,6 +39,7 @@ Human trajectories, in the context of Computer-use AI Agents, are recordings of
- Time spent on different elements
These trajectories serve as examples for AI models to learn from, helping them understand the relationship between (a hypothetical record is sketched after this list):
1. The visual state of the screen
2. The user's goal or task
3. The most appropriate action to take
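Conceptually, a single step of a trajectory pairs those three pieces of information. The field names below are purely illustrative and are not Cua's actual storage format:

```python
# Hypothetical single-step record, for illustration only.
trajectory_step = {
    "observation": "screenshots/step_004.png",            # what the screen looked like
    "goal": "Create a new note titled 'Groceries'",        # the user's task
    "action": {"type": "left_click", "x": 412, "y": 118},  # the action the human took
}
```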
@@ -59,17 +61,19 @@ By contributing high-quality demonstrations, you're helping to create more capab
The Computer-Use Interface includes an optional Gradio UI specifically designed to make recording and sharing demonstrations easy. Let's set it up:
1. **Create a Python environment** (optional but recommended):
```bash
# Using conda
conda create -n cua-trajectories python=3.10
conda activate cua-trajectories
# Using venv
python -m venv cua-trajectories
source cua-trajectories/bin/activate # On macOS/Linux
```
2. **Install the CUI package with UI support**:
```bash
pip install "cua-computer[ui]"
```
@@ -145,6 +149,7 @@ Effective tagging and organization make your demonstrations more valuable to res
### Task-Based Tags
Describe what the demonstration accomplishes:
- `web-browsing`
- `document-editing`
- `file-management`
@@ -154,6 +159,7 @@ Describe what the demonstration accomplishes:
### Application Tags
Identify the applications used:
- `finder`
- `safari`
- `notes`
@@ -163,6 +169,7 @@ Identify the applications used:
### Complexity Tags
Indicate the difficulty level:
- `beginner`
- `intermediate`
- `advanced`
@@ -171,6 +178,7 @@ Indicate the difficulty level:
### UI Element Tags
Highlight specific UI interactions:
- `drag-and-drop`
- `menu-navigation`
- `form-filling`
@@ -229,11 +237,11 @@ from computer import Computer
computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4")
try:
await computer.run()
screenshot = await computer.interface.screenshot()
with open("screenshot.png", "wb") as f:
f.write(screenshot)
await computer.interface.move_cursor(100, 100)
await computer.interface.left_click()
await computer.interface.right_click(300, 300)
@@ -280,6 +288,7 @@ You can also learn from existing trajectory datasets contributed by the communit
### Summary
In this guide, we've covered how to:
- Set up the Computer-Use Interface with Gradio UI
- Record high-quality human demonstrations
- Organize and tag your trajectories

View File

@@ -1,6 +1,6 @@
# Trajectory Viewer for Cua
*Published on May 13, 2025 by Dillon DuPont*
_Published on May 13, 2025 by Dillon DuPont_
Don't forget to check out [Part 1: Building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [Part 2: Using the Agent framework](build-your-own-operator-on-macos-2) for setting up your Cua environment and basic tips and tricks!
@@ -18,7 +18,7 @@ Think of a trajectory as a detailed video recording of your agents journey:
- **Observations**: What did the agent see (the exact screen content) at each point in time?
- **Actions**: What clicks, keystrokes, or commands did it perform in response?
- **Decisions**: Which options did it choose, and why?
Especially for longer and more complex tasks, your agent will take multiple steps, perform multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong.
Especially for longer and more complex tasks, your agent will take multiple steps, perform multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong.
## So, what's Cua's Trajectory Viewer and why use it?

View File

@@ -1,6 +1,6 @@
# Ubuntu Docker Support in Cua with Kasm
*Published Aug 26, 2025 by Francesco Bonacci*
_Published Aug 26, 2025 by Francesco Bonacci_
Today we're shipping **Ubuntu Docker support** in Cua. You get a full Linux desktop inside a Docker container, viewable right in your browser—no VM spin-up, no extra clients. It behaves the same on macOS, Windows, and Linux.
@@ -16,17 +16,17 @@ We wanted something lightweight, isolated, and identical across machines. So we
Short answer: **portability, startup time, and ops friction.**
* **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works.
* **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration.
* **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser.
* **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI.
* **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config.
- **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works.
- **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration.
- **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser.
- **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI.
- **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config.
**When we *do* reach for QEMU/KVM:**
**When we _do_ reach for QEMU/KVM:**
* You need **true OS isolation** or to run **non-Linux** guests.
* You want **kernel-level features** or **device/GPU passthrough** (VFIO).
* You're optimizing for **hardware realism** over startup speed and density.
- You need **true OS isolation** or to run **non-Linux** guests.
- You want **kernel-level features** or **device/GPU passthrough** (VFIO).
- You're optimizing for **hardware realism** over startup speed and density.
For this release, the goal was a **cross-platform Linux desktop that feels instant and identical** across local dev and CI. Containers + KasmVNC hit that sweet spot.
@@ -174,10 +174,10 @@ await computer.run()
## Links
* **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker)
* **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
* **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
* **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers)
* **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
- **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker)
- **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
- **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
- **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.
Questions or weird edge cases? Ping us on Discord—we're curious to see what you build.

View File

@@ -1,10 +1,10 @@
# Your Windows PC is Already the Perfect Development Environment for Computer-Use Agents
*Published on June 18, 2025 by Dillon DuPont*
_Published on June 18, 2025 by Dillon DuPont_
Over the last few months, our enterprise users kept asking the same type of question: *"When are you adding support for AutoCAD?"* *"What about SAP integration?"* *"Can you automate our MES system?"* - each request was for different enterprise applications we'd never heard of.
Over the last few months, our enterprise users kept asking the same type of question: _"When are you adding support for AutoCAD?"_ _"What about SAP integration?"_ _"Can you automate our MES system?"_ - each request was for different enterprise applications we'd never heard of.
At first, we deflected. We've been building Cua to work across different environments - from [Lume for macOS VMs](./lume-to-containerization) to cloud containers. But these requests kept piling up. AutoCAD automation. SAP integration. Specialized manufacturing systems.
At first, we deflected. We've been building Cua to work across different environments - from [Lume for macOS VMs](./lume-to-containerization) to cloud containers. But these requests kept piling up. AutoCAD automation. SAP integration. Specialized manufacturing systems.
Then it hit us: **they all ran exclusively on Windows**.
@@ -80,6 +80,7 @@ python -m agent.ui
```
**What you get**:
- Visual interface in your browser
- Real-time agent action viewing
- Natural language task instructions
@@ -101,21 +102,21 @@ async def test_windows_agent():
os_type="windows",
memory="4GB",
)
# Start the VM (~35s)
await computer.run()
# Create agent with your preferred model
agent = ComputerAgent(
model="openai/computer-use-preview",
save_trajectory=True,
tools=[computer]
)
# Give it a task
async for result in agent.run("Open Calculator and compute 15% tip on $47.50"):
print(f"Agent action: {result}")
# Shutdown the VM
await computer.stop()
@@ -123,6 +124,7 @@ asyncio.run(test_windows_agent())
```
**What you get**:
- Full programmatic control
- Custom agent workflows
- Integration with your existing code
@@ -141,6 +143,7 @@ asyncio.run(test_windows_agent())
Let's see how different testing approaches stack up:
### Windows Sandbox + Cua
- **Perfect for**: Quick testing and development
- **Cost**: Free (built into Windows)
- **Setup time**: Under 5 minutes
@@ -149,6 +152,7 @@ Let's see how different testing approaches stack up:
- **Requires**: Windows 10/11 with 4GB+ RAM
### Traditional VMs
- **Perfect for**: Complex testing scenarios
- **Full customization**: Any Windows version
- **Heavy resource usage**: Slow to start/stop
@@ -160,6 +164,7 @@ Let's see how different testing approaches stack up:
Here's what our enterprise users are building with Windows Sandbox:
### CAD and Engineering Automation
```python
# Example: AutoCAD drawing automation
task = """
@@ -172,6 +177,7 @@ task = """
```
### Manufacturing and ERP Integration
```python
# Example: SAP workflow automation
task = """
@@ -184,6 +190,7 @@ task = """
```
### Financial Software Automation
```python
# Example: Trading platform automation
task = """
@@ -196,6 +203,7 @@ task = """
```
### Legacy Windows Application Integration
```python
# Example: Custom Windows application automation
task = """
@@ -210,12 +218,14 @@ task = """
## System Requirements and Performance
### What You Need
- **Windows 10/11**: Any edition that supports Windows Sandbox
- **Memory**: 4GB minimum (8GB recommended for CAD/professional software)
- **CPU**: Virtualization support (enabled by default on modern systems)
- **Storage**: A few GB free space
### Performance Tips
- **Close unnecessary applications** before starting Windows Sandbox
- **Allocate appropriate memory** based on your RPA workflow complexity
- **Use SSD storage** for faster sandbox startup
@@ -234,4 +244,4 @@ But for development, prototyping, and learning Windows RPA workflows, **Windows
---
*Ready to see AI agents control your Windows applications? Come share your testing experiences on Discord!*
_Ready to see AI agents control your Windows applications? Come share your testing experiences on Discord!_

View File

@@ -1,9 +1,3 @@
{
"pages": [
"introduction",
"screenspot-v2",
"screenspot-pro",
"interactive",
"osworld-verified"
]
}
"pages": ["introduction", "screenspot-v2", "screenspot-pro", "interactive", "osworld-verified"]
}

View File

@@ -1,11 +1,5 @@
{
"title": "Callbacks",
"description": "Extending agents with callback hooks and built-in handlers",
"pages": [
"agent-lifecycle",
"trajectories",
"logging",
"cost-saving",
"pii-anonymization"
]
"title": "Callbacks",
"description": "Extending agents with callback hooks and built-in handlers",
"pages": ["agent-lifecycle", "trajectories", "logging", "cost-saving", "pii-anonymization"]
}

View File

@@ -1,20 +1,20 @@
{
"title": "Agent SDK",
"description": "Build computer-using agents with the Agent SDK",
"pages": [
"agent-loops",
"supported-agents",
"supported-model-providers",
"chat-history",
"message-format",
"customizing-computeragent",
"callbacks",
"custom-tools",
"custom-computer-handlers",
"prompt-caching",
"usage-tracking",
"benchmarks",
"migration-guide",
"integrations"
]
"title": "Agent SDK",
"description": "Build computer-using agents with the Agent SDK",
"pages": [
"agent-loops",
"supported-agents",
"supported-model-providers",
"chat-history",
"message-format",
"customizing-computeragent",
"callbacks",
"custom-tools",
"custom-computer-handlers",
"prompt-caching",
"usage-tracking",
"benchmarks",
"migration-guide",
"integrations"
]
}

View File

@@ -1,10 +1,5 @@
{
"title": "Supported Agents",
"description": "Models and configurations supported by the Agent SDK",
"pages": [
"computer-use-agents",
"grounding-models",
"composed-agents",
"human-in-the-loop"
]
"title": "Supported Agents",
"description": "Models and configurations supported by the Agent SDK",
"pages": ["computer-use-agents", "grounding-models", "composed-agents", "human-in-the-loop"]
}

View File

@@ -1,11 +1,5 @@
{
"title": "Computer SDK",
"description": "Build computer-using agents with the Computer SDK",
"pages": [
"computers",
"cloud-vm-management",
"commands",
"computer-ui",
"sandboxed-python"
]
"title": "Computer SDK",
"description": "Build computer-using agents with the Computer SDK",
"pages": ["computers", "cloud-vm-management", "commands", "computer-ui", "sandboxed-python"]
}

View File

@@ -1,7 +1,5 @@
{
"title": "Example Use Cases",
"description": "Real-world examples of building with Cua",
"pages": [
"form-filling"
]
"title": "Example Use Cases",
"description": "Real-world examples of building with Cua",
"pages": ["form-filling"]
}

View File

@@ -17,10 +17,12 @@ Lume follows the XDG Base Directory specification for the configuration file:
- Configuration is stored in `$XDG_CONFIG_HOME/lume/config.yaml` (defaults to `~/.config/lume/config.yaml`)
By default, other data is stored in:
- VM data: `~/.lume`
- Cache files: `~/.lume/cache`
The config file contains settings for:
- VM storage locations and the default location
- Cache directory location
- Whether caching is enabled
@@ -88,6 +90,7 @@ lume delete <name>
### How to Install macOS from an IPSW Image
#### Create a new macOS VM using the latest supported IPSW image:
Run the following command to create a new macOS virtual machine using the latest available IPSW image:
```bash
@@ -95,6 +98,7 @@ lume create <name> --os macos --ipsw latest
```
#### Create a new macOS VM using a specific IPSW image:
To create a macOS virtual machine from an older or specific IPSW file, first download the desired IPSW (UniversalMac) from a trusted source.
Then, use the downloaded IPSW path:

View File

@@ -1,9 +1,3 @@
{
"pages": [
"installation",
"prebuilt-images",
"cli-reference",
"http-api",
"faq"
]
"pages": ["installation", "prebuilt-images", "cli-reference", "http-api", "faq"]
}

View File

@@ -1,8 +1,3 @@
{
"pages": [
"installation",
"docker",
"docker-compose",
"building-lumier"
]
"pages": ["installation", "docker", "docker-compose", "building-lumier"]
}

View File

@@ -1,10 +1,10 @@
{
"pages": [
"installation",
"configuration",
"usage",
"tools",
"client-integrations",
"llm-integrations"
]
}
"pages": [
"installation",
"configuration",
"usage",
"tools",
"client-integrations",
"llm-integrations"
]
}

View File

@@ -1,19 +1,19 @@
{
"title": "Home",
"description": "Documentation Home",
"root": true,
"defaultOpen": true,
"pages": [
"index",
"quickstart-devs",
"quickstart-cli",
"telemetry",
"example-usecases",
"---[BookCopy]Computer Playbook---",
"...computer-sdk",
"---[BookCopy]Agent Playbook---",
"...agent-sdk",
"---[CodeXml]API Reference---",
"...libraries"
]
}
"title": "Home",
"description": "Documentation Home",
"root": true,
"defaultOpen": true,
"pages": [
"index",
"quickstart-devs",
"quickstart-cli",
"telemetry",
"example-usecases",
"---[BookCopy]Computer Playbook---",
"...computer-sdk",
"---[BookCopy]Agent Playbook---",
"...agent-sdk",
"---[CodeXml]API Reference---",
"...libraries"
]
}

View File

@@ -43,4 +43,4 @@
"sharp"
]
}
}
}

View File

@@ -1,9 +1,4 @@
import {
defineConfig,
defineDocs,
frontmatterSchema,
metaSchema,
} from 'fumadocs-mdx/config';
import { defineConfig, defineDocs, frontmatterSchema, metaSchema } from 'fumadocs-mdx/config';
import { z } from 'zod';
// You can customise Zod schemas for frontmatter and `meta.json` here

View File

@@ -1,18 +1,9 @@
import { getApiVersions, source } from '@/lib/source';
import { getMDXComponents } from '@/mdx-components';
import { buttonVariants } from 'fumadocs-ui/components/ui/button';
import {
Popover,
PopoverContent,
PopoverTrigger,
} from 'fumadocs-ui/components/ui/popover';
import { Popover, PopoverContent, PopoverTrigger } from 'fumadocs-ui/components/ui/popover';
import { createRelativeLink } from 'fumadocs-ui/mdx';
import {
DocsBody,
DocsDescription,
DocsPage,
DocsTitle,
} from 'fumadocs-ui/page';
import { DocsBody, DocsDescription, DocsPage, DocsTitle } from 'fumadocs-ui/page';
import { cn } from 'fumadocs-ui/utils/cn';
import { ChevronDown, CodeXml, ExternalLink } from 'lucide-react';
import type { Metadata } from 'next';
@@ -20,9 +11,7 @@ import Link from 'next/link';
import { notFound, redirect } from 'next/navigation';
import { PageFeedback } from '@/components/page-feedback';
export default async function Page(props: {
params: Promise<{ slug?: string[] }>;
}) {
export default async function Page(props: { params: Promise<{ slug?: string[] }> }) {
const params = await props.params;
const slug = params.slug || [];
const page = source.getPage(slug);
@@ -66,7 +55,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 448 512">
viewBox="0 0 448 512"
>
<title>Windows</title>
<path d="M0 93.7l183.6-25.3v177.4H0V93.7zm0 324.6l183.6 25.3V268.4H0v149.9zm203.8 28L448 480V268.4H203.8v177.9zm0-380.6v180.1H448V32L203.8 65.7z" />
</svg>
@@ -76,7 +66,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 384 512">
viewBox="0 0 384 512"
>
<title>macOS</title>
<path d="M318.7 268.7c-.2-36.7 16.4-64.4 50-84.8-18.8-26.9-47.2-41.7-84.7-44.6-35.5-2.8-74.3 20.7-88.5 20.7-15 0-49.4-19.7-76.4-19.7C63.3 141.2 4 184.8 4 273.5q0 39.3 14.4 81.2c12.8 36.7 59 126.7 107.2 125.2 25.2-.6 43-17.9 75.8-17.9 31.8 0 48.3 17.9 76.4 17.9 48.6-.7 90.4-82.5 102.6-119.3-65.2-30.7-61.7-90-61.7-91.9zm-56.6-164.2c27.3-32.4 24.8-61.9 24-72.5-24.1 1.4-52 16.4-67.9 34.9-17.5 19.8-27.8 44.3-25.6 71.9 26.1 2 49.9-11.4 69.5-34.3z" />
</svg>
@@ -86,7 +77,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 448 512">
viewBox="0 0 448 512"
>
<title>Linux</title>
<path d="M220.8 123.3c1 .5 1.8 1.7 3 1.7 1.1 0 2.8-.4 2.9-1.5 .2-1.4-1.9-2.3-3.2-2.9-1.7-.7-3.9-1-5.5-.1-.4 .2-.8 .7-.6 1.1 .3 1.3 2.3 1.1 3.4 1.7zm-21.9 1.7c1.2 0 2-1.2 3-1.7 1.1-.6 3.1-.4 3.5-1.6 .2-.4-.2-.9-.6-1.1-1.6-.9-3.8-.6-5.5 .1-1.3 .6-3.4 1.5-3.2 2.9 .1 1 1.8 1.5 2.8 1.4zM420 403.8c-3.6-4-5.3-11.6-7.2-19.7-1.8-8.1-3.9-16.8-10.5-22.4-1.3-1.1-2.6-2.1-4-2.9-1.3-.8-2.7-1.5-4.1-2 9.2-27.3 5.6-54.5-3.7-79.1-11.4-30.1-31.3-56.4-46.5-74.4-17.1-21.5-33.7-41.9-33.4-72C311.1 85.4 315.7 .1 234.8 0 132.4-.2 158 103.4 156.9 135.2c-1.7 23.4-6.4 41.8-22.5 64.7-18.9 22.5-45.5 58.8-58.1 96.7-6 17.9-8.8 36.1-6.2 53.3-6.5 5.8-11.4 14.7-16.6 20.2-4.2 4.3-10.3 5.9-17 8.3s-14 6-18.5 14.5c-2.1 3.9-2.8 8.1-2.8 12.4 0 3.9 .6 7.9 1.2 11.8 1.2 8.1 2.5 15.7 .8 20.8-5.2 14.4-5.9 24.4-2.2 31.7 3.8 7.3 11.4 10.5 20.1 12.3 17.3 3.6 40.8 2.7 59.3 12.5 19.8 10.4 39.9 14.1 55.9 10.4 11.6-2.6 21.1-9.6 25.9-20.2 12.5-.1 26.3-5.4 48.3-6.6 14.9-1.2 33.6 5.3 55.1 4.1 .6 2.3 1.4 4.6 2.5 6.7v.1c8.3 16.7 23.8 24.3 40.3 23 16.6-1.3 34.1-11 48.3-27.9 13.6-16.4 36-23.2 50.9-32.2 7.4-4.5 13.4-10.1 13.9-18.3 .4-8.2-4.4-17.3-15.5-29.7zM223.7 87.3c9.8-22.2 34.2-21.8 44-.4 6.5 14.2 3.6 30.9-4.3 40.4-1.6-.8-5.9-2.6-12.6-4.9 1.1-1.2 3.1-2.7 3.9-4.6 4.8-11.8-.2-27-9.1-27.3-7.3-.5-13.9 10.8-11.8 23-4.1-2-9.4-3.5-13-4.4-1-6.9-.3-14.6 2.9-21.8zM183 75.8c10.1 0 20.8 14.2 19.1 33.5-3.5 1-7.1 2.5-10.2 4.6 1.2-8.9-3.3-20.1-9.6-19.6-8.4 .7-9.8 21.2-1.8 28.1 1 .8 1.9-.2-5.9 5.5-15.6-14.6-10.5-52.1 8.4-52.1zm-13.6 60.7c6.2-4.6 13.6-10 14.1-10.5 4.7-4.4 13.5-14.2 27.9-14.2 7.1 0 15.6 2.3 25.9 8.9 6.3 4.1 11.3 4.4 22.6 9.3 8.4 3.5 13.7 9.7 10.5 18.2-2.6 7.1-11 14.4-22.7 18.1-11.1 3.6-19.8 16-38.2 14.9-3.9-.2-7-1-9.6-2.1-8-3.5-12.2-10.4-20-15-8.6-4.8-13.2-10.4-14.7-15.3-1.4-4.9 0-9 4.2-12.3zm3.3 334c-2.7 35.1-43.9 34.4-75.3 18-29.9-15.8-68.6-6.5-76.5-21.9-2.4-4.7-2.4-12.7 2.6-26.4v-.2c2.4-7.6 .6-16-.6-23.9-1.2-7.8-1.8-15 .9-20 3.5-6.7 8.5-9.1 14.8-11.3 10.3-3.7 11.8-3.4 19.6-9.9 5.5-5.7 9.5-12.9 14.3-18 5.1-5.5 10-8.1 17.7-6.9 8.1 1.2 15.1 6.8 21.9 16l19.6 35.6c9.5 19.9 43.1 48.4 41 68.9zm-1.4-25.9c-4.1-6.6-9.6-13.6-14.4-19.6 7.1 0 14.2-2.2 16.7-8.9 2.3-6.2 0-14.9-7.4-24.9-13.5-18.2-38.3-32.5-38.3-32.5-13.5-8.4-21.1-18.7-24.6-29.9s-3-23.3-.3-35.2c5.2-22.9 18.6-45.2 27.2-59.2 2.3-1.7 .8 3.2-8.7 20.8-8.5 16.1-24.4 53.3-2.6 82.4 .6-20.7 5.5-41.8 13.8-61.5 12-27.4 37.3-74.9 39.3-112.7 1.1 .8 4.6 3.2 6.2 4.1 4.6 2.7 8.1 6.7 12.6 10.3 12.4 10 28.5 9.2 42.4 1.2 6.2-3.5 11.2-7.5 15.9-9 9.9-3.1 17.8-8.6 22.3-15 7.7 30.4 25.7 74.3 37.2 95.7 6.1 11.4 18.3 35.5 23.6 64.6 3.3-.1 7 .4 10.9 1.4 13.8-35.7-11.7-74.2-23.3-84.9-4.7-4.6-4.9-6.6-2.6-6.5 12.6 11.2 29.2 33.7 35.2 59 2.8 11.6 3.3 23.7 .4 35.7 16.4 6.8 35.9 17.9 30.7 34.8-2.2-.1-3.2 0-4.2 0 3.2-10.1-3.9-17.6-22.8-26.1-19.6-8.6-36-8.6-38.3 12.5-12.1 4.2-18.3 14.7-21.4 27.3-2.8 11.2-3.6 24.7-4.4 39.9-.5 7.7-3.6 18-6.8 29-32.1 22.9-76.7 32.9-114.3 7.2zm257.4-11.5c-.9 16.8-41.2 19.9-63.2 46.5-13.2 15.7-29.4 24.4-43.6 25.5s-26.5-4.8-33.7-19.3c-4.7-11.1-2.4-23.1 1.1-36.3 3.7-14.2 9.2-28.8 9.9-40.6 .8-15.2 1.7-28.5 4.2-38.7 2.6-10.3 6.6-17.2 13.7-21.1 .3-.2 .7-.3 1-.5 .8 13.2 7.3 26.6 18.8 29.5 12.6 3.3 30.7-7.5 38.4-16.3 9-.3 15.7-.9 22.6 5.1 9.9 8.5 7.1 30.3 17.1 41.6 10.6 11.6 14 19.5 13.7 24.6zM173.3 148.7c2 1.9 4.7 4.5 8 7.1 6.6 5.2 15.8 10.6 27.3 10.6 11.6 0 22.5-5.9 31.8-10.8 4.9-2.6 10.9-7 14.8-10.4s5.9-6.3 3.1-6.6-2.6 2.6-6 5.1c-4.4 3.2-9.7 7.4-13.9 9.8-7.4 4.2-19.5 10.2-29.9 10.2s-18.7-4.8-24.9-9.7c-3.1-2.5-5.7-5-7.7-6.9-1.5-1.4-1.9-4.6-4.3-4.9-1.4-.1-1.8 3.7 1.7 6.5z" />
</svg>
@@ -96,10 +88,7 @@ export default async function Page(props: {
<div className="flex flex-row gap-2 items-left">
{pypi && (
<a
target="_blank"
href={`https://pypi.org/project/${pypi}/`}
rel="noreferrer">
<a target="_blank" href={`https://pypi.org/project/${pypi}/`} rel="noreferrer">
<img
src={`https://img.shields.io/pypi/v/${pypi}?color=blue`}
className="h-5"
@@ -108,10 +97,7 @@ export default async function Page(props: {
</a>
)}
{npm && (
<a
target="_blank"
href={`https://www.npmjs.com/package/${npm}`}
rel="noreferrer">
<a target="_blank" href={`https://www.npmjs.com/package/${npm}`} rel="noreferrer">
<img
src={`https://img.shields.io/npm/v/${npm}?color=bf4c4b`}
className="h-5"
@@ -138,7 +124,8 @@ export default async function Page(props: {
target="_blank"
className="inline-flex gap-2 w-fit items-center justify-center rounded-md text-sm font-medium transition-colors duration-100 disabled:pointer-events-none disabled:opacity-50 focus-visible:outline-none hover:bg-fd-accent hover:text-fd-accent-foreground p-1.5 [&amp;_svg]:size-5 text-fd-muted-foreground md:[&amp;_svg]:size-4.5"
aria-label="Source"
data-active="false">
data-active="false"
>
<svg role="img" viewBox="0 0 24 24" fill="currentColor">
<path d="M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12"></path>
</svg>
@@ -162,12 +149,13 @@ export default async function Page(props: {
href={link}
rel="noreferrer noopener"
target="_blank"
className="inline-flex gap-2 w-full items-center rounded-md p-2 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground">
className="inline-flex gap-2 w-full items-center rounded-md p-2 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground"
>
{link.includes('python')
? 'Python'
: link.includes('typescript')
? 'TypeScript'
: `Source ${index + 1}`}
? 'TypeScript'
: `Source ${index + 1}`}
<ExternalLink className="w-4 h-4 ml-auto" />
</a>
))}
@@ -190,10 +178,7 @@ export default async function Page(props: {
};
return (
<DocsPage
toc={page.data.toc}
tableOfContent={{ header: tocHeader() }}
full={page.data.full}>
<DocsPage toc={page.data.toc} tableOfContent={{ header: tocHeader() }} full={page.data.full}>
<div className="flex flex-row w-full items-start">
<div className="flex-1">
<div className="flex flex-row w-full">
@@ -209,15 +194,14 @@ export default async function Page(props: {
size: 'sm',
className: 'gap-2',
})
)}>
)}
>
{(() => {
// Find the current version label
let currentLabel = 'Current';
if (apiVersionSlug.length > 0) {
const found = versionItems.find(
(item) =>
item.label !== 'Current' &&
apiVersionSlug[0] === item.label
(item) => item.label !== 'Current' && apiVersionSlug[0] === item.label
);
if (found) currentLabel = found.label;
}
@@ -238,10 +222,8 @@ export default async function Page(props: {
: `/api/${apiSection}/${item.label}`;
// Highlight current version
const isCurrent =
(item.label === 'Current' &&
apiVersionSlug.length === 0) ||
(item.label !== 'Current' &&
apiVersionSlug[0] === item.label);
(item.label === 'Current' && apiVersionSlug.length === 0) ||
(item.label !== 'Current' && apiVersionSlug[0] === item.label);
return (
<Link
key={item.label}
@@ -249,7 +231,8 @@ export default async function Page(props: {
className={cn(
'px-3 py-1 rounded hover:bg-fd-muted',
isCurrent && 'font-bold bg-fd-muted'
)}>
)}
>
API version: {item.label}
</Link>
);
@@ -259,9 +242,7 @@ export default async function Page(props: {
)}
</div>
</div>
<DocsDescription className="text-md mt-1">
{page.data.description}
</DocsDescription>
<DocsDescription className="text-md mt-1">{page.data.description}</DocsDescription>
</div>
</div>
<DocsBody>
@@ -290,8 +271,7 @@ export async function generateMetadata(props: {
let title = `${page.data.title} | Cua Docs`;
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
if (page.url.includes('guide'))
title = ` Guide: ${page.data.title} | Cua Docs`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;
return {
title,

View File

@@ -24,9 +24,7 @@ export default function Layout({ children }: { children: ReactNode }) {
<PostHogPageView />
</Suspense>
<AnalyticsTracker />
<RootProvider search={{ options: { api: '/docs/api/search' } }}>
{children}
</RootProvider>
<RootProvider search={{ options: { api: '/docs/api/search' } }}>{children}</RootProvider>
<Footer />
<CookieConsent />
</PHProvider>

View File

@@ -5,10 +5,7 @@ import { notFound } from 'next/navigation';
export const revalidate = false;
export async function GET(
_req: NextRequest,
{ params }: { params: Promise<{ slug?: string[] }> }
) {
export async function GET(_req: NextRequest, { params }: { params: Promise<{ slug?: string[] }> }) {
const { slug } = await params;
const page = source.getPage(slug);
if (!page) notFound();

View File

@@ -55,14 +55,17 @@ export function EditableCodeBlock({
const [values, setValues] = useState<Record<string, string>>(defaultValues);
const updateValue = (key: string, value: string) => {
setValues(prev => ({ ...prev, [key]: value }));
setValues((prev) => ({ ...prev, [key]: value }));
};
return (
<EditableCodeContext.Provider value={{ values, updateValue }}>
<Base.CodeBlock title={title} className={cn('my-4', className)}>
<Base.Pre className={cn(`language-${lang}`, "px-3")}>
<code className={cn(`language-${lang}`)} style={{ display: 'block', whiteSpace: 'pre-wrap' }}>
<Base.Pre className={cn(`language-${lang}`, 'px-3')}>
<code
className={cn(`language-${lang}`)}
style={{ display: 'block', whiteSpace: 'pre-wrap' }}
>
{children}
</code>
</Base.Pre>
@@ -219,9 +222,7 @@ export function EditableValue({
value={value}
onChange={(e) => updateValue(placeholder, e.target.value)}
placeholder={placeholder}
className={cn(
type === 'password' && value && 'text-security-disc'
)}
className={cn(type === 'password' && value && 'text-security-disc')}
style={{
display: 'inline',
width: inputWidth,

View File

@@ -34,7 +34,7 @@ interface IOUProps {
}
/**
* A React component that visualizes and calculates the Intersection over Union (IOU)
* A React component that visualizes and calculates the Intersection over Union (IOU)
* of two rectangles on a canvas
* @param props - The component props
* @returns The rendered IOU visualization component
@@ -130,12 +130,7 @@ export default function IOU({ title, description, rect1, rect2 }: IOUProps) {
<h3 className="text-sm font-semibold ">{title}</h3>
<div className="flex items-start gap-6">
<div>
<canvas
ref={canvasRef}
width={200}
height={150}
className="border bg-white rounded-md"
/>
<canvas ref={canvasRef} width={200} height={150} className="border bg-white rounded-md" />
<div className="mt-2 text-sm">
<div className="font-mono mb-2">IOU = {actualIOU.toFixed(3)}</div>
<span className="">{description}</span>

View File

@@ -28,10 +28,7 @@ export function Mermaid({ chart }: { chart: string }) {
theme: resolvedTheme === 'dark' ? 'dark' : 'default',
});
const { svg, bindFunctions } = await mermaid.render(
id,
chart.replaceAll('\\n', '\n'),
);
const { svg, bindFunctions } = await mermaid.render(id, chart.replaceAll('\\n', '\n'));
bindFunctions?.(container);
setSvg(svg);
@@ -44,4 +41,4 @@ export function Mermaid({ chart }: { chart: string }) {
}, [chart, id, resolvedTheme]);
return <div ref={containerRef} dangerouslySetInnerHTML={{ __html: svg }} />;
}
}

View File

@@ -45,7 +45,7 @@ export function PageFeedback() {
<p className="text-sm text-fd-muted-foreground text-left">
{feedback === 'helpful'
? 'Thanks for your feedback!'
: 'Thanks for your feedback. We\'ll work on improving this page.'}
: "Thanks for your feedback. We'll work on improving this page."}
</p>
)}
</div>

View File

@@ -34,9 +34,7 @@ export async function getApiVersions(
...versions.filter((v) => v.label === 'Current'),
...versions
.filter((v) => v.label !== 'Current')
.sort((a, b) =>
b.label.localeCompare(a.label, undefined, { numeric: true })
),
.sort((a, b) => b.label.localeCompare(a.label, undefined, { numeric: true })),
];
}

View File

@@ -2,11 +2,7 @@
"compilerOptions": {
"baseUrl": ".",
"target": "ESNext",
"lib": [
"dom",
"dom.iterable",
"esnext"
],
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
@@ -20,12 +16,8 @@
"jsx": "preserve",
"incremental": true,
"paths": {
"@/.source": [
"./.source/index.ts"
],
"@/*": [
"./src/*"
]
"@/.source": ["./.source/index.ts"],
"@/*": ["./src/*"]
},
"plugins": [
{
@@ -33,13 +25,6 @@
}
]
},
"include": [
"next-env.d.ts",
"**/*.ts",
"**/*.tsx",
".next/types/**/*.ts"
],
"exclude": [
"node_modules"
]
}
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"exclude": ["node_modules"]
}

View File

@@ -2,16 +2,15 @@
import asyncio
import logging
import traceback
import signal
from computer import Computer, VMProviderType
import traceback
# Import the unified agent class and types
from agent import ComputerAgent
from computer import Computer, VMProviderType
# Import utility functions
from utils import load_dotenv_files, handle_sigint
from utils import handle_sigint, load_dotenv_files
# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -40,25 +39,20 @@ async def run_agent_example():
# Create ComputerAgent with new API
agent = ComputerAgent(
# Supported models:
# == OpenAI CUA (computer-use-preview) ==
model="openai/computer-use-preview",
# == Anthropic CUA (Claude > 3.5) ==
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-sonnet-4-20250514",
# model="anthropic/claude-3-7-sonnet-20250219",
# model="anthropic/claude-3-5-sonnet-20241022",
# == UI-TARS ==
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
# model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
# model="ollama_chat/0000/ui-tars-1.5-7b",
# == Omniparser + Any LLM ==
# model="omniparser+anthropic/claude-opus-4-20250514",
# model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.DEBUG,
@@ -79,18 +73,18 @@ async def run_agent_example():
# Use message-based conversation history
history = []
for i, task in enumerate(tasks):
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
# Add user message to history
history.append({"role": "user", "content": task})
# Run agent with conversation history
async for result in agent.run(history, stream=False):
# Add agent outputs to history
history += result.get("output", [])
# Print output for debugging
for item in result.get("output", []):
if item.get("type") == "message":
@@ -104,7 +98,7 @@ async def run_agent_example():
print(f"Computer Action: {action_type}({action})")
elif item.get("type") == "computer_call_output":
print("Computer Output: [Screenshot/Result]")
print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")
except Exception as e:

View File

@@ -1,11 +1,13 @@
import asyncio
import os
from utils import load_dotenv_files
load_dotenv_files()
from computer.providers.cloud.provider import CloudProvider
async def main() -> None:
api_key = os.getenv("CUA_API_KEY")
if not api_key:
@@ -13,7 +15,7 @@ async def main() -> None:
api_base = os.getenv("CUA_API_BASE")
if api_base:
print(f"Using API base: {api_base}")
provider = CloudProvider(api_key=api_key, verbose=True)
async with provider:
@@ -23,7 +25,7 @@ async def main() -> None:
for vm in vms:
print(
f"name: {vm['name']}\n",
f"status: {vm['status']}\n", # pending, running, stopped, terminated, failed
f"status: {vm['status']}\n", # pending, running, stopped, terminated, failed
f"api_url: {vm.get('api_url')}\n",
f"vnc_url: {vm.get('vnc_url')}\n",
)
@@ -59,12 +61,13 @@ async def main() -> None:
# # To probe a VM's status via its public hostname (if you know the name):
# name = "m-linux-96lcxd2c2k"
# info = await provider.get_vm(name)
# print("get_vm info:\n",
# print("get_vm info:\n",
# f"name: {info['name']}\n",
# f"status: {info['status']}\n", # running
# f"api_url: {info.get('api_url')}\n",
# f"os_type: {info.get('os_type')}\n",
# )
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -19,6 +19,7 @@ This example demonstrates how to control a Cua Cloud Sandbox using the OpenAI `c
2. **Set up environment variables:**
Create a `.env` file with the following variables:
- `OPENAI_API_KEY` — your OpenAI API key
- `CUA_API_KEY` — your Cua Cloud API key
- `CUA_CONTAINER_NAME` — the name of your provisioned sandbox
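The example project reads these with `dotenv/config`; if you prefer to fail fast when one is missing, a pre-flight check along these lines works (a Python sketch using `python-dotenv`, not part of the example itself):

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads the .env file described above

required = ["OPENAI_API_KEY", "CUA_API_KEY", "CUA_CONTAINER_NAME"]
missing = [name for name in required if not os.getenv(name)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
print("All required environment variables are set.")
```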

View File

@@ -1,63 +1,63 @@
import type { Computer } from "@trycua/computer";
import type OpenAI from "openai";
import type { Computer } from '@trycua/computer';
import type OpenAI from 'openai';
export async function executeAction(
computer: Computer,
action: OpenAI.Responses.ResponseComputerToolCall["action"],
computer: Computer,
action: OpenAI.Responses.ResponseComputerToolCall['action']
) {
switch (action.type) {
case "click": {
const { x, y, button } = action;
console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
await computer.interface.moveCursor(x, y);
if (button === "right") await computer.interface.rightClick();
else await computer.interface.leftClick();
break;
}
case "type":
{
const { text } = action;
console.log(`Typing text: ${text}`);
await computer.interface.typeText(text);
}
break;
case "scroll": {
const { x: locX, y: locY, scroll_x, scroll_y } = action;
console.log(
`Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`,
);
await computer.interface.moveCursor(locX, locY);
await computer.interface.scroll(scroll_x, scroll_y);
break;
}
case "keypress": {
const { keys } = action;
for (const key of keys) {
console.log(`Pressing key: ${key}.`);
// Map common key names to CUA equivalents
if (key.toLowerCase() === "enter") {
await computer.interface.pressKey("return");
} else if (key.toLowerCase() === "space") {
await computer.interface.pressKey("space");
} else {
await computer.interface.pressKey(key);
}
}
break;
}
case "wait": {
console.log(`Waiting for 3 seconds.`);
await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
break;
}
case "screenshot": {
console.log("Taking screenshot.");
// This is handled automatically in the main loop, but we can take an extra one if requested
const screenshot = await computer.interface.screenshot();
return screenshot;
}
default:
console.log(`Unrecognized action: ${action.type}`);
break;
}
switch (action.type) {
case 'click': {
const { x, y, button } = action;
console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
await computer.interface.moveCursor(x, y);
if (button === 'right') await computer.interface.rightClick();
else await computer.interface.leftClick();
break;
}
case 'type':
{
const { text } = action;
console.log(`Typing text: ${text}`);
await computer.interface.typeText(text);
}
break;
case 'scroll': {
const { x: locX, y: locY, scroll_x, scroll_y } = action;
console.log(
`Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
);
await computer.interface.moveCursor(locX, locY);
await computer.interface.scroll(scroll_x, scroll_y);
break;
}
case 'keypress': {
const { keys } = action;
for (const key of keys) {
console.log(`Pressing key: ${key}.`);
// Map common key names to CUA equivalents
if (key.toLowerCase() === 'enter') {
await computer.interface.pressKey('return');
} else if (key.toLowerCase() === 'space') {
await computer.interface.pressKey('space');
} else {
await computer.interface.pressKey(key);
}
}
break;
}
case 'wait': {
console.log(`Waiting for 3 seconds.`);
await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
break;
}
case 'screenshot': {
console.log('Taking screenshot.');
// This is handled automatically in the main loop, but we can take an extra one if requested
const screenshot = await computer.interface.screenshot();
return screenshot;
}
default:
console.log(`Unrecognized action: ${action.type}`);
break;
}
}

View File

@@ -1,104 +1,103 @@
import { Computer, OSType } from "@trycua/computer";
import OpenAI from "openai";
import { executeAction } from "./helpers";
import { Computer, OSType } from '@trycua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';
import "dotenv/config";
import 'dotenv/config';
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const COMPUTER_USE_PROMPT = "Open firefox and go to trycua.com";
const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';
// Initialize the Computer Connection
const computer = new Computer({
apiKey: process.env.CUA_API_KEY!,
name: process.env.CUA_CONTAINER_NAME!,
osType: OSType.LINUX,
apiKey: process.env.CUA_API_KEY!,
name: process.env.CUA_CONTAINER_NAME!,
osType: OSType.LINUX,
});
await computer.run();
// Take the initial screenshot
const screenshot = await computer.interface.screenshot();
const screenshotBase64 = screenshot.toString("base64");
const screenshotBase64 = screenshot.toString('base64');
// Setup openai config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
model: "computer-use-preview",
tools: [
{
type: "computer_use_preview",
display_width: 1024,
display_height: 768,
environment: "linux", // we're using a linux vm
},
],
truncation: "auto",
model: 'computer-use-preview',
tools: [
{
type: 'computer_use_preview',
display_width: 1024,
display_height: 768,
environment: 'linux', // we're using a linux vm
},
],
truncation: 'auto',
};
// Send initial screenshot to the openai computer use model
let res = await openai.responses.create({
...computerUseConfig,
input: [
{
role: "user",
content: [
// what we want the ai to do
{ type: "input_text", text: COMPUTER_USE_PROMPT },
// current screenshot of the vm
{
type: "input_image",
image_url: `data:image/png;base64,${screenshotBase64}`,
detail: "auto",
},
],
},
],
...computerUseConfig,
input: [
{
role: 'user',
content: [
// what we want the ai to do
{ type: 'input_text', text: COMPUTER_USE_PROMPT },
// current screenshot of the vm
{
type: 'input_image',
image_url: `data:image/png;base64,${screenshotBase64}`,
detail: 'auto',
},
],
},
],
});
// Loop until there are no more computer use actions.
while (true) {
const computerCalls = res.output.filter((o) => o.type === "computer_call");
if (computerCalls.length < 1) {
console.log("No more computer calls. Loop complete.");
break;
}
// Get the first call
const call = computerCalls[0];
const action = call.action;
console.log("Received action from OpenAI Responses API:", action);
let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] =
[];
if (call.pending_safety_checks.length > 0) {
console.log("Safety checks pending:", call.pending_safety_checks);
// In a real implementation, you would want to get user confirmation here
ackChecks = call.pending_safety_checks;
}
const computerCalls = res.output.filter((o) => o.type === 'computer_call');
if (computerCalls.length < 1) {
console.log('No more computer calls. Loop complete.');
break;
}
// Get the first call
const call = computerCalls[0];
const action = call.action;
console.log('Received action from OpenAI Responses API:', action);
let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = [];
if (call.pending_safety_checks.length > 0) {
console.log('Safety checks pending:', call.pending_safety_checks);
// In a real implementation, you would want to get user confirmation here
ackChecks = call.pending_safety_checks;
}
// Execute the action in the container
await executeAction(computer, action);
// Wait for changes to process within the container (1sec)
await new Promise((resolve) => setTimeout(resolve, 1000));
// Execute the action in the container
await executeAction(computer, action);
// Wait for changes to process within the container (1sec)
await new Promise((resolve) => setTimeout(resolve, 1000));
// Capture new screenshot
const newScreenshot = await computer.interface.screenshot();
const newScreenshotBase64 = newScreenshot.toString("base64");
// Capture new screenshot
const newScreenshot = await computer.interface.screenshot();
const newScreenshotBase64 = newScreenshot.toString('base64');
// Screenshot back as computer_call_output
// Screenshot back as computer_call_output
res = await openai.responses.create({
...computerUseConfig,
previous_response_id: res.id,
input: [
{
type: "computer_call_output",
call_id: call.call_id,
acknowledged_safety_checks: ackChecks,
output: {
type: "computer_screenshot",
image_url: `data:image/png;base64,${newScreenshotBase64}`,
},
},
],
});
res = await openai.responses.create({
...computerUseConfig,
previous_response_id: res.id,
input: [
{
type: 'computer_call_output',
call_id: call.call_id,
acknowledged_safety_checks: ackChecks,
output: {
type: 'computer_screenshot',
image_url: `data:image/png;base64,${newScreenshotBase64}`,
},
},
],
});
}
process.exit();

View File

@@ -1,17 +1,13 @@
{
"compilerOptions": {
"target": "esnext",
"lib": [
"es2023"
],
"lib": ["es2023"],
"moduleDetection": "force",
"module": "preserve",
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"types": [
"node"
],
"types": ["node"],
"allowSyntheticDefaultImports": true,
"strict": true,
"noUnusedLocals": true,
@@ -21,9 +17,7 @@
"isolatedModules": true,
"verbatimModuleSyntax": true,
"skipLibCheck": true,
"outDir": "build",
"outDir": "build"
},
"include": [
"src"
]
}
"include": ["src"]
}

View File

@@ -1,8 +1,8 @@
import os
import asyncio
from pathlib import Path
import os
import sys
import traceback
from pathlib import Path
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -20,8 +20,9 @@ for path in pythonpath.split(":"):
print(f"Added to sys.path: {path}")
from computer.computer import Computer
from computer.providers.base import VMProviderType
from computer.logger import LogLevel
from computer.providers.base import VMProviderType
async def main():
try:
@@ -29,17 +30,15 @@ async def main():
# Create a local macOS computer
computer = Computer(
display="1024x768",
memory="8GB",
cpu="4",
display="1024x768",
memory="8GB",
cpu="4",
os_type="macos",
name="macos",
verbosity=LogLevel.VERBOSE,
provider_type=VMProviderType.LUME,
storage="/Users/<USER>/repos/trycua/computer/examples/storage",
shared_directories=[
"/Users/<USER>/repos/trycua/computer/examples/shared"
],
shared_directories=["/Users/<USER>/repos/trycua/computer/examples/shared"],
ephemeral=False,
)
@@ -50,22 +49,22 @@ async def main():
# name=os.getenv("CONTAINER_NAME"),
# provider_type=VMProviderType.CLOUD,
# )
try:
# Run the computer with default parameters
await computer.run()
screenshot = await computer.interface.screenshot()
# Create output directory if it doesn't exist
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)
screenshot_path = output_dir / "screenshot.png"
with open(screenshot_path, "wb") as f:
f.write(screenshot)
print(f"Screenshot saved to: {screenshot_path.absolute()}")
# await computer.interface.hotkey("command", "space")
# res = await computer.interface.run_command("touch ./Downloads/empty_file")

View File

@@ -1,8 +1,8 @@
import os
import asyncio
from pathlib import Path
import os
import sys
import traceback
from pathlib import Path
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -21,12 +21,13 @@ for path in pythonpath.split(":"):
print(f"Added to sys.path: {path}")
from computer.computer import Computer
from computer.providers.base import VMProviderType
from computer.logger import LogLevel
from computer.providers.base import VMProviderType
# ANSI color codes
RED = '\033[91m'
RESET = '\033[0m'
RED = "\033[91m"
RESET = "\033[0m"
async def main():
try:
@@ -39,15 +40,15 @@ async def main():
name=os.getenv("CONTAINER_NAME") or "",
provider_type=VMProviderType.CLOUD,
)
try:
# Run the computer with default parameters
await computer.run()
# Create output directory if it doesn't exist
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)
# Keyboard Actions Examples
print("\n=== Keyboard Actions ===")
await computer.interface.type_text("Hello, World!")
@@ -65,8 +66,10 @@ async def main():
@sandboxed("demo_venv")
def greet_and_print(name):
from mss import mss
import os
from mss import mss
# get username
username = os.getlogin()
print(f"Hello from inside the container, {name}!")
@@ -75,9 +78,9 @@ async def main():
# take a screenshot
with mss() as sct:
filename = sct.shot(mon=-1, output='C:/Users/azureuser/Desktop/fullscreen.png')
filename = sct.shot(mon=-1, output="C:/Users/azureuser/Desktop/fullscreen.png")
print(filename)
return {"greeted": name, "username": username}
# Call with args and kwargs
@@ -94,33 +97,32 @@ async def main():
with open(screenshot_path, "wb") as f:
f.write(screenshot)
print(f"Screenshot saved to: {screenshot_path.absolute()}")
# Clipboard Actions Examples
print("\n=== Clipboard Actions ===")
await computer.interface.set_clipboard("Test clipboard")
content = await computer.interface.copy_to_clipboard()
print(f"Clipboard content: {content}")
# Simple REPL Loop
print("\n=== Command REPL ===")
print("Enter commands to run on the remote computer.")
print("Type 'exit' or 'quit' to leave the REPL.\n")
while True:
try:
# Get command from user
command = input("command> ").strip()
# Check for exit commands
if command.lower() in ['exit', 'quit', '']:
if command.lower() in ['exit', 'quit']:
if command.lower() in ["exit", "quit", ""]:
if command.lower() in ["exit", "quit"]:
print("Exiting REPL...")
break
# Run the command
result = await computer.interface.run_command(command)
print(result.stdout)
if result.stderr:
print(f"{RED}{result.stderr}{RESET}")
@@ -130,7 +132,6 @@ async def main():
except Exception as e:
print(f"{RED}Error running command: {e}{RESET}")
finally:
# Important to clean up resources
# await computer.stop()

View File

@@ -23,9 +23,9 @@ if __name__ == "__main__":
server_name="0.0.0.0",
server_port=7860,
)
# Optional: Using the saved dataset
# import datasets
# from computer.ui.utils import convert_to_unsloth
# ds = datasets.load_dataset("ddupont/highquality-cua-demonstrations")
# ds = convert_to_unsloth(ds)
# ds = convert_to_unsloth(ds)

View File

@@ -1,8 +1,10 @@
import asyncio
from computer.providers.factory import VMProviderFactory
from computer import Computer, VMProviderType
import os
from computer import Computer, VMProviderType
from computer.providers.factory import VMProviderFactory
async def main():
# # Create docker provider
# provider = VMProviderFactory.create_provider(
@@ -39,5 +41,6 @@ async def main():
with open("screenshot_docker.png", "wb") as f:
f.write(screenshot)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -6,7 +6,7 @@ hud_eval_examples.py — minimal HUD evaluation runner
- No Docker/local computer usage
"""
#imports
# imports
import asyncio
import logging
import os
@@ -14,13 +14,15 @@ import uuid
from pathlib import Path
from pprint import pprint
from dotenv import load_dotenv, find_dotenv
from agent import ComputerAgent
from agent.integrations.hud import run_full_dataset
from dotenv import find_dotenv, load_dotenv
"""
Loading env
"""
def load_env_or_fail() -> None:
# Walk up from CWD / file dir to find nearest .env
env_path = find_dotenv(usecwd=False)
@@ -32,17 +34,19 @@ def load_env_or_fail() -> None:
if not os.getenv("HUD_API_KEY"):
raise EnvironmentError("❌ HUD_API_KEY is missing in the loaded environment")
"""
Build Agent Config
- customize agent behavior, tool integration, callbacks, resource management, and more
- https://docs.trycua.com/docs/agent-sdk/agent-loops#parameters
- https://docs.trycua.com/docs/agent-sdk/supported-model-providers
"""
def build_agent_config() -> dict:
instruction = "You are a computer-using agent graded by deterministic checkers."
return {
"model": "openai/computer-use-preview",
"trajectory_dir": str(Path("trajectories")),
@@ -51,21 +55,25 @@ def build_agent_config() -> dict:
"instruction": instruction,
}
"""
Hud Eval
"""
async def run_hud_eval() -> None:
#load env and agent config
# load env and agent config
load_env_or_fail()
agent_config = build_agent_config()
# Initialize to ensure config is valid (tools, verbosity, etc.)
_ = ComputerAgent(**agent_config)
job_name = f"osworld-test-{str(uuid.uuid4())[:4]}" #job name (each run of your task is a job on hud)
job_name = (
f"osworld-test-{str(uuid.uuid4())[:4]}" # job name (each run of your task is a job on hud)
)
print(f"🚀 Running HUD eval: {job_name}")
"""
Customize your hud eval below, check the doc for additional params
- https://docs.trycua.com/docs/agent-sdk/integrations/hud#parameters-1

View File

@@ -1,5 +1,6 @@
import asyncio
from pylume import PyLume, ImageRef, VMRunOpts, SharedDirectory, VMConfig, VMUpdateOpts
from pylume import ImageRef, PyLume, SharedDirectory, VMConfig, VMRunOpts, VMUpdateOpts
async def main():

View File

@@ -1,6 +1,6 @@
from pathlib import Path
import os
import sys
from pathlib import Path
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -18,14 +18,16 @@ for path in pythonpath.split(":"):
print(f"Added to sys.path: {path}")
import asyncio
from computer.computer import Computer
from computer.helpers import sandboxed
async def main():
# Initialize the computer in a Cua Container
computer = Computer()
await computer.run()
# Install a package in a virtual environment in the container
await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])
@@ -39,6 +41,7 @@ async def main():
def greet_and_print(name):
# get .html of the current Safari tab
import PyXA
safari = PyXA.Application("Safari")
current_doc = safari.current_document
html = current_doc.source()
@@ -50,5 +53,6 @@ async def main():
result = await greet_and_print("Cua")
print("Result from sandboxed function:", result)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -9,17 +9,18 @@ This script shows how to:
"""
import argparse
import logging
import sys
from pathlib import Path
import time
from PIL import Image
from typing import Dict, Any, List, Optional
import numpy as np
import io
import base64
import glob
import io
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
import numpy as np
from PIL import Image
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -42,8 +43,8 @@ if str(libs_path) not in sys.path:
sys.path.append(str(libs_path))
print(f"Added to sys.path: {libs_path}")
from som import OmniParser, ParseResult, IconElement, TextElement
from som.models import UIElement, ParserMetadata, BoundingBox
from som import IconElement, OmniParser, ParseResult, TextElement
from som.models import BoundingBox, ParserMetadata, UIElement
# Configure logging
logging.basicConfig(
@@ -361,7 +362,7 @@ def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False):
# Update timing totals
total_time += t.elapsed_time
# Write summary for this combination
avg_time = total_time / len(image_files)
f.write(

View File

@@ -1,8 +1,8 @@
"""Utility functions for example scripts."""
import os
import sys
import signal
import sys
from pathlib import Path
from typing import Optional

View File

@@ -4,11 +4,13 @@ Learn more at: https://learn.microsoft.com/en-us/windows/security/application-se
"""
import asyncio
from computer import Computer
async def main():
"""Test the Windows Sandbox provider."""
# Create a computer instance using Windows Sandbox
computer = Computer(
provider_type="winsandbox",
@@ -16,19 +18,19 @@ async def main():
memory="4GB",
# ephemeral=True, # Always true for Windows Sandbox
)
try:
print("Starting Windows Sandbox...")
await computer.run()
print("Windows Sandbox is ready!")
print(f"IP Address: {await computer.get_ip()}")
# Test basic functionality
print("Testing basic functionality...")
screenshot = await computer.interface.screenshot()
print(f"Screenshot taken: {len(screenshot)} bytes")
# Test running a command
print("Testing command execution...")
result = await computer.interface.run_command("echo Hello from Windows Sandbox!")
@@ -36,16 +38,18 @@ async def main():
print("Press any key to continue...")
input()
except Exception as e:
print(f"Error: {e}")
import traceback
traceback.print_exc()
finally:
print("Stopping Windows Sandbox...")
await computer.stop()
print("Windows Sandbox stopped.")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -70,14 +70,17 @@ async with provider:
## Container Configuration
### Ports
- **6901**: VNC web interface (noVNC)
- **8080**: Computer-server API endpoint
### Environment Variables
- `VNC_PW`: VNC password (default: "password")
- `DISPLAY`: X11 display (set to ":0")
### Volumes
- `/home/kasm-user/storage`: Persistent storage mount point
- `/home/kasm-user/shared`: Shared folder mount point
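Given those port mappings, a container's two endpoints can be derived from whatever host address it is published on. The sketch below is illustrative only; the host value and URL schemes are assumptions, not something this section specifies.

```python
def container_endpoints(host: str) -> dict:
    """Build the endpoint URLs for a running container from its host address."""
    return {
        # noVNC web interface on 6901 (password set via VNC_PW); the scheme
        # depends on the image's TLS settings.
        "vnc_web": f"https://{host}:6901",
        # computer-server API endpoint on 8080.
        "api": f"http://{host}:8080",
    }


print(container_endpoints("localhost"))
```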

View File

@@ -29,6 +29,7 @@ We're always looking for suggestions to make lume better. If you have an idea:
## Documentation
Documentation improvements are always welcome. You can:
- Fix typos or unclear explanations
- Add examples and use cases
- Improve API documentation
@@ -36,4 +37,4 @@ Documentation improvements are always welcome. You can:
For detailed instructions on setting up your development environment and submitting code contributions, please see our [Development.md](docs/Development.md) guide.
Feel free to join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get help with your contributions.
Feel free to join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get help with your contributions.

View File

@@ -5,6 +5,7 @@ This guide will help you set up your development environment and understand the
## Environment Setup
Lume development requires:
- Swift 6 or higher
- Xcode 15 or higher
- macOS Sequoia 15.2 or higher
@@ -16,12 +17,13 @@ If you're working on Lume in the context of the Cua monorepo, we recommend using
# Open VS Code workspace from the root of the monorepo
code .vscode/lume.code-workspace
```
This workspace is preconfigured with Swift language support, build tasks, and debug configurations.
## Setting Up the Repository Locally
1. **Fork the Repository**: Create your own fork of lume
2. **Clone the Repository**:
2. **Clone the Repository**:
```bash
git clone https://github.com/trycua/lume.git
cd lume

View File

@@ -8,13 +8,13 @@
</picture>
</div>
[![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
[![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
</h1>
</div>
**lume** is a lightweight Command Line Interface and local API server to create, run and manage macOS and Linux virtual machines (VMs) with near-native performance on Apple Silicon, using Apple's `Virtualization.Framework`.
### Run prebuilt macOS images in just 1 step
@@ -43,6 +43,7 @@ All prebuilt images use the default password `lume`. Change this immediately aft
</Callout>
**System Requirements**:
- Apple Silicon Mac (M1, M2, M3, etc.)
- macOS 13.0 or later
- At least 8GB of RAM (16GB recommended)

View File

@@ -8,9 +8,10 @@
</picture>
</div>
[![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
[![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
</h1>
</div>
@@ -21,6 +22,7 @@ macOS and Linux virtual machines in a Docker container.
</div>
## What is Lumier?
**Lumier** is an interface for running macOS virtual machines with minimal setup. It uses Docker as a packaging system to deliver a pre-configured environment that connects to the `lume` virtualization service running on your host machine. With Lumier, you get:
- A ready-to-use macOS or Linux virtual machine in minutes
@@ -35,6 +37,7 @@ Before using Lumier, make sure you have:
1. **Docker for Apple Silicon** - download it [here](https://desktop.docker.com/mac/main/arm64/Docker.dmg) and follow the installation instructions.
2. **Lume** - This is the virtualization CLI that powers Lumier. Install it with this command:
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
@@ -69,6 +72,7 @@ After running the command above, you can access your macOS VM through a web brow
This project was inspired by [dockur/windows](https://github.com/dockur/windows) and [dockur/macos](https://github.com/dockur/macos), which pioneered the approach of running Windows and macOS VMs in Docker containers.
Main differences with dockur/macos:
- Lumier is specifically designed for macOS virtualization
- Lumier supports Apple Silicon (M1/M2/M3/M4) while dockur/macos only supports Intel
- Lumier uses the Apple Virtualization Framework (Vz) through the `lume` CLI to create true virtual machines, while dockur relies on KVM.

View File

@@ -8,10 +8,11 @@
</picture>
</div>
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
[![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
[![Python](https://img.shields.io/badge/Python-333333?logo=python&logoColor=white&labelColor=333333)](#)
[![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
[![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
[![PyPI](https://img.shields.io/pypi/v/cua-computer?color=333333)](https://pypi.org/project/cua-computer/)
</h1>
</div>
@@ -47,7 +48,7 @@ async def main():
name=os.getenv("CUA_CONTAINER_NAME"),
api_key=os.getenv("CUA_API_KEY")
) as computer:
# Create agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -56,10 +57,10 @@ async def main():
trajectory_dir="trajectories",
max_trajectory_budget=5.0 # $5 budget limit
)
# Run agent
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
@@ -84,4 +85,4 @@ if __name__ == "__main__":
## License
MIT License - see LICENSE file for details.
MIT License - see LICENSE file for details.

View File

@@ -5,19 +5,13 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
import logging
import sys
from .decorators import register_agent
from .agent import ComputerAgent
from .types import Messages, AgentResponse
# Import loops to register them
from . import loops
from .agent import ComputerAgent
from .decorators import register_agent
from .types import AgentResponse, Messages
__all__ = [
"register_agent",
"ComputerAgent",
"Messages",
"AgentResponse"
]
__all__ = ["register_agent", "ComputerAgent", "Messages", "AgentResponse"]
__version__ = "0.4.0"

View File

@@ -5,8 +5,9 @@ Usage:
python -m agent.cli <model_string>
"""
import sys
import asyncio
import sys
from .cli import main
if __name__ == "__main__":

View File

@@ -2,27 +2,30 @@ import asyncio
import functools
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Iterator, AsyncIterator, Dict, List, Any, Optional
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse
# Try to import HuggingFace dependencies
try:
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False
from .models import load_model as load_model_handler
class HuggingFaceLocalAdapter(CustomLLM):
"""HuggingFace Local Adapter for running vision-language models locally."""
def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
"""Initialize the adapter.
Args:
device: Device to load model on ("auto", "cuda", "cpu", etc.)
trust_remote_code: Whether to trust remote code
@@ -34,129 +37,120 @@ class HuggingFaceLocalAdapter(CustomLLM):
# Cache for model handlers keyed by model_name
self._handlers: Dict[str, Any] = {}
self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool
def _get_handler(self, model_name: str):
"""Get or create a model handler for the given model name."""
if model_name not in self._handlers:
self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
self._handlers[model_name] = load_model_handler(
model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
)
return self._handlers[model_name]
def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert OpenAI format messages to HuggingFace format.
Args:
messages: Messages in OpenAI format
Returns:
Messages in HuggingFace format
"""
converted_messages = []
for message in messages:
converted_message = {
"role": message["role"],
"content": []
}
converted_message = {"role": message["role"], "content": []}
content = message.get("content", [])
if isinstance(content, str):
# Simple text content
converted_message["content"].append({
"type": "text",
"text": content
})
converted_message["content"].append({"type": "text", "text": content})
elif isinstance(content, list):
# Multi-modal content
for item in content:
if item.get("type") == "text":
converted_message["content"].append({
"type": "text",
"text": item.get("text", "")
})
converted_message["content"].append(
{"type": "text", "text": item.get("text", "")}
)
elif item.get("type") == "image_url":
# Convert image_url format to image format
image_url = item.get("image_url", {}).get("url", "")
converted_message["content"].append({
"type": "image",
"image": image_url
})
converted_message["content"].append({"type": "image", "image": image_url})
converted_messages.append(converted_message)
return converted_messages
def _generate(self, **kwargs) -> str:
"""Generate response using the local HuggingFace model.
Args:
**kwargs: Keyword arguments containing messages and model info
Returns:
Generated text response
"""
if not HF_AVAILABLE:
raise ImportError(
"HuggingFace transformers dependencies not found. "
"Please install with: pip install \"cua-agent[uitars-hf]\""
'Please install with: pip install "cua-agent[uitars-hf]"'
)
# Extract messages and model from kwargs
messages = kwargs.get('messages', [])
model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B')
max_new_tokens = kwargs.get('max_tokens', 128)
messages = kwargs.get("messages", [])
model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
max_new_tokens = kwargs.get("max_tokens", 128)
# Warn about ignored kwargs
ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
if ignored_kwargs:
warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
# Convert messages to HuggingFace format
hf_messages = self._convert_messages(messages)
# Delegate to model handler
handler = self._get_handler(model_name)
generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
return generated_text
def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.
Returns:
ModelResponse with generated text
"""
generated_text = self._generate(**kwargs)
return completion(
model=f"huggingface-local/{kwargs['model']}",
mock_response=generated_text,
)
async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.
Returns:
ModelResponse with generated text
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)
return await acompletion(
model=f"huggingface-local/{kwargs['model']}",
mock_response=generated_text,
)
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.
Returns:
Iterator of GenericStreamingChunk
"""
generated_text = self._generate(**kwargs)
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -165,22 +159,21 @@ class HuggingFaceLocalAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}
yield generic_streaming_chunk
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.
Returns:
AsyncIterator of GenericStreamingChunk
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -189,5 +182,5 @@ class HuggingFaceLocalAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}
yield generic_streaming_chunk
yield generic_streaming_chunk

View File

@@ -1,22 +1,23 @@
import os
import asyncio
import os
from typing import Any, AsyncIterator, Dict, Iterator, List
import requests
from typing import List, Dict, Any, Iterator, AsyncIterator
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse
class HumanAdapter(CustomLLM):
"""Human Adapter for human-in-the-loop completions.
This adapter sends completion requests to a human completion server
where humans can review and respond to AI requests.
"""
def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs):
"""Initialize the human adapter.
Args:
base_url: Base URL for the human completion server.
Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002
@@ -24,60 +25,58 @@ class HumanAdapter(CustomLLM):
**kwargs: Additional arguments
"""
super().__init__()
self.base_url = base_url or os.getenv('HUMAN_BASE_URL', 'http://localhost:8002')
self.base_url = base_url or os.getenv("HUMAN_BASE_URL", "http://localhost:8002")
self.timeout = timeout
# Ensure base_url doesn't end with slash
self.base_url = self.base_url.rstrip('/')
self.base_url = self.base_url.rstrip("/")
def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
"""Queue a completion request and return the call ID.
Args:
messages: Messages in OpenAI format
model: Model name
Returns:
Call ID for tracking the request
Raises:
Exception: If queueing fails
"""
try:
response = requests.post(
f"{self.base_url}/queue",
json={"messages": messages, "model": model},
timeout=10
f"{self.base_url}/queue", json={"messages": messages, "model": model}, timeout=10
)
response.raise_for_status()
return response.json()["id"]
except requests.RequestException as e:
raise Exception(f"Failed to queue completion request: {e}")
def _wait_for_completion(self, call_id: str) -> Dict[str, Any]:
"""Wait for human to complete the call.
Args:
call_id: ID of the queued completion call
Returns:
Dict containing response and/or tool_calls
Raises:
TimeoutError: If timeout is exceeded
Exception: If completion fails
"""
import time
start_time = time.time()
while True:
try:
# Check status
status_response = requests.get(f"{self.base_url}/status/{call_id}")
status_response.raise_for_status()
status_data = status_response.json()
if status_data["status"] == "completed":
result = {}
if "response" in status_data and status_data["response"]:
@@ -88,38 +87,41 @@ class HumanAdapter(CustomLLM):
elif status_data["status"] == "failed":
error_msg = status_data.get("error", "Unknown error")
raise Exception(f"Completion failed: {error_msg}")
# Check timeout
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
raise TimeoutError(
f"Timeout waiting for human response after {self.timeout} seconds"
)
# Wait before checking again
time.sleep(1.0)
except requests.RequestException as e:
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response: {e}")
# Continue trying if we haven't timed out
time.sleep(1.0)
async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]:
"""Async version of wait_for_completion.
Args:
call_id: ID of the queued completion call
Returns:
Dict containing response and/or tool_calls
Raises:
TimeoutError: If timeout is exceeded
Exception: If completion fails
"""
import aiohttp
import time
import aiohttp
start_time = time.time()
async with aiohttp.ClientSession() as session:
while True:
try:
@@ -127,7 +129,7 @@ class HumanAdapter(CustomLLM):
async with session.get(f"{self.base_url}/status/{call_id}") as response:
response.raise_for_status()
status_data = await response.json()
if status_data["status"] == "completed":
result = {}
if "response" in status_data and status_data["response"]:
@@ -138,166 +140,158 @@ class HumanAdapter(CustomLLM):
elif status_data["status"] == "failed":
error_msg = status_data.get("error", "Unknown error")
raise Exception(f"Completion failed: {error_msg}")
# Check timeout
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")
raise TimeoutError(
f"Timeout waiting for human response after {self.timeout} seconds"
)
# Wait before checking again
await asyncio.sleep(1.0)
except Exception as e:
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response: {e}")
# Continue trying if we haven't timed out
await asyncio.sleep(1.0)
def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
"""Generate a human response for the given messages.
Args:
messages: Messages in OpenAI format
model: Model name
Returns:
Dict containing response and/or tool_calls
"""
# Queue the completion request
call_id = self._queue_completion(messages, model)
# Wait for human response
response = self._wait_for_completion(call_id)
return response
async def _async_generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
async def _async_generate_response(
self, messages: List[Dict[str, Any]], model: str
) -> Dict[str, Any]:
"""Async version of _generate_response.
Args:
messages: Messages in OpenAI format
model: Model name
Returns:
Dict containing response and/or tool_calls
"""
# Queue the completion request (sync operation)
call_id = self._queue_completion(messages, model)
# Wait for human response (async)
response = await self._async_wait_for_completion(call_id)
return response
def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.
Returns:
ModelResponse with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')
messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")
# Generate human response
human_response_data = self._generate_response(messages, model)
# Create ModelResponse with proper structure
from litellm.types.utils import ModelResponse, Choices, Message
import uuid
import time
import uuid
from litellm.types.utils import Choices, Message, ModelResponse
# Create message content based on response type
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Tool calls response
message = Message(
role="assistant",
content=human_response_data.get("response", ""),
tool_calls=human_response_data["tool_calls"]
tool_calls=human_response_data["tool_calls"],
)
else:
# Text response
message = Message(
role="assistant",
content=human_response_data.get("response", "")
)
choice = Choices(
finish_reason="stop",
index=0,
message=message
)
message = Message(role="assistant", content=human_response_data.get("response", ""))
choice = Choices(finish_reason="stop", index=0, message=message)
result = ModelResponse(
id=f"human-{uuid.uuid4()}",
choices=[choice],
created=int(time.time()),
model=f"human/{model}",
object="chat.completion"
object="chat.completion",
)
return result
async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.
Returns:
ModelResponse with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')
messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")
# Generate human response
human_response_data = await self._async_generate_response(messages, model)
# Create ModelResponse with proper structure
from litellm.types.utils import ModelResponse, Choices, Message
import uuid
import time
import uuid
from litellm.types.utils import Choices, Message, ModelResponse
# Create message content based on response type
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Tool calls response
message = Message(
role="assistant",
content=human_response_data.get("response", ""),
tool_calls=human_response_data["tool_calls"]
tool_calls=human_response_data["tool_calls"],
)
else:
# Text response
message = Message(
role="assistant",
content=human_response_data.get("response", "")
)
choice = Choices(
finish_reason="stop",
index=0,
message=message
)
message = Message(role="assistant", content=human_response_data.get("response", ""))
choice = Choices(finish_reason="stop", index=0, message=message)
result = ModelResponse(
id=f"human-{uuid.uuid4()}",
choices=[choice],
created=int(time.time()),
model=f"human/{model}",
object="chat.completion"
object="chat.completion",
)
return result
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.
Yields:
Streaming chunks with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')
messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")
# Generate human response
human_response_data = self._generate_response(messages, model)
import time
# Handle tool calls vs text response
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Stream tool calls as a single chunk
@@ -319,22 +313,26 @@ class HumanAdapter(CustomLLM):
"is_finished": True,
"text": response_text,
"tool_use": None,
"usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())},
"usage": {
"completion_tokens": len(response_text.split()),
"prompt_tokens": 0,
"total_tokens": len(response_text.split()),
},
}
yield generic_chunk
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.
Yields:
Streaming chunks with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')
messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")
# Generate human response
human_response = await self._async_generate_response(messages, model)
# Return as single streaming chunk
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
@@ -342,7 +340,11 @@ class HumanAdapter(CustomLLM):
"is_finished": True,
"text": human_response,
"tool_use": None,
"usage": {"completion_tokens": len(human_response.split()), "prompt_tokens": 0, "total_tokens": len(human_response.split())},
"usage": {
"completion_tokens": len(human_response.split()),
"prompt_tokens": 0,
"total_tokens": len(human_response.split()),
},
}
yield generic_streaming_chunk
yield generic_streaming_chunk

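A rough usage sketch (illustrative, not from this commit) of `HumanAdapter` registered as a litellm custom provider, mirroring how `ComputerAgent` wires it up later in this diff. The import path is a placeholder, and it assumes a human completion server is reachable at the default `http://localhost:8002`.

```python
import litellm

from agent.adapters import HumanAdapter  # illustrative import path

# Register the adapter so model names prefixed with "human/" route to it.
human_adapter = HumanAdapter()  # defaults to HUMAN_BASE_URL or http://localhost:8002
litellm.custom_provider_map = [
    {"provider": "human", "custom_handler": human_adapter},
]

# Blocks until a human answers on the completion server (or the timeout hits).
response = litellm.completion(
    model="human/human",
    messages=[{"role": "user", "content": "Reply with the word 'pong'."}],
)
print(response.choices[0].message.content)
```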
View File

@@ -1,24 +1,26 @@
import asyncio
import functools
import warnings
import io
import base64
import functools
import io
import math
import re
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Iterator, AsyncIterator, Dict, List, Any, Optional, Tuple, cast
from PIL import Image
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple, cast
from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from PIL import Image
# Try to import MLX dependencies
try:
import mlx.core as mx
from mlx_vlm import load, generate
from mlx_vlm import generate, load
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
from transformers.tokenization_utils import PreTrainedTokenizer
MLX_AVAILABLE = True
except ImportError:
MLX_AVAILABLE = False
@@ -29,20 +31,28 @@ MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
def round_by_factor(number: float, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor
def ceil_by_factor(number: float, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: float, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor
def smart_resize(
height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
height: int,
width: int,
factor: int = IMAGE_FACTOR,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
"""
Rescales the image so that the following conditions are met:
@@ -70,61 +80,62 @@ def smart_resize(
class MLXVLMAdapter(CustomLLM):
"""MLX VLM Adapter for running vision-language models locally using MLX."""
def __init__(self, **kwargs):
"""Initialize the adapter.
Args:
**kwargs: Additional arguments
"""
super().__init__()
self.models = {} # Cache for loaded models
self.processors = {} # Cache for loaded processors
self.configs = {} # Cache for loaded configs
self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool
def _load_model_and_processor(self, model_name: str):
"""Load model and processor if not already cached.
Args:
model_name: Name of the model to load
Returns:
Tuple of (model, processor, config)
"""
if not MLX_AVAILABLE:
raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")
if model_name not in self.models:
# Load model and processor
model_obj, processor = load(
model_name,
processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
model_name, processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
)
config = load_config(model_name)
# Cache them
self.models[model_name] = model_obj
self.processors[model_name] = processor
self.configs[model_name] = config
return self.models[model_name], self.processors[model_name], self.configs[model_name]
def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:
def _process_coordinates(
self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]
) -> str:
"""Process coordinates in box tokens based on image resizing using smart_resize approach.
Args:
text: Text containing box tokens
original_size: Original image size (width, height)
model_size: Model processed image size (width, height)
Returns:
Text with processed coordinates
"""
# Find all box tokens
box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"
def process_coords(match):
model_x, model_y = int(match.group(1)), int(match.group(2))
# Scale coordinates from model space to original image space
@@ -132,15 +143,20 @@ class MLXVLMAdapter(CustomLLM):
new_x = int(model_x * original_size[0] / model_size[0]) # Width
new_y = int(model_y * original_size[1] / model_size[1]) # Height
return f"<|box_start|>({new_x},{new_y})<|box_end|>"
return re.sub(box_pattern, process_coords, text)
def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Image.Image], Dict[int, Tuple[int, int]], Dict[int, Tuple[int, int]]]:
def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
List[Dict[str, Any]],
List[Image.Image],
Dict[int, Tuple[int, int]],
Dict[int, Tuple[int, int]],
]:
"""Convert OpenAI format messages to MLX VLM format and extract images.
Args:
messages: Messages in OpenAI format
Returns:
Tuple of (processed_messages, images, original_sizes, model_sizes)
"""
@@ -149,13 +165,10 @@ class MLXVLMAdapter(CustomLLM):
original_sizes = {} # Track original sizes of images for coordinate mapping
model_sizes = {} # Track model processed sizes
image_index = 0
for message in messages:
processed_message = {
"role": message["role"],
"content": []
}
processed_message = {"role": message["role"], "content": []}
content = message.get("content", [])
if isinstance(content, str):
# Simple text content
@@ -165,164 +178,163 @@ class MLXVLMAdapter(CustomLLM):
processed_content = []
for item in content:
if item.get("type") == "text":
processed_content.append({
"type": "text",
"text": item.get("text", "")
})
processed_content.append({"type": "text", "text": item.get("text", "")})
elif item.get("type") == "image_url":
image_url = item.get("image_url", {}).get("url", "")
pil_image = None
if image_url.startswith("data:image/"):
# Extract base64 data
base64_data = image_url.split(',')[1]
base64_data = image_url.split(",")[1]
# Convert base64 to PIL Image
image_data = base64.b64decode(base64_data)
pil_image = Image.open(io.BytesIO(image_data))
else:
# Handle file path or URL
pil_image = Image.open(image_url)
# Store original image size for coordinate mapping
original_size = pil_image.size
original_sizes[image_index] = original_size
# Use smart_resize to determine model size
# Note: smart_resize expects (height, width) but PIL gives (width, height)
height, width = original_size[1], original_size[0]
new_height, new_width = smart_resize(height, width)
# Store model size in (width, height) format for consistent coordinate processing
model_sizes[image_index] = (new_width, new_height)
# Resize the image using the calculated dimensions from smart_resize
resized_image = pil_image.resize((new_width, new_height))
images.append(resized_image)
# Add image placeholder to content
processed_content.append({
"type": "image"
})
processed_content.append({"type": "image"})
image_index += 1
processed_message["content"] = processed_content
processed_messages.append(processed_message)
return processed_messages, images, original_sizes, model_sizes
def _generate(self, **kwargs) -> str:
"""Generate response using the local MLX VLM model.
Args:
**kwargs: Keyword arguments containing messages and model info
Returns:
Generated text response
"""
messages = kwargs.get('messages', [])
model_name = kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')
max_tokens = kwargs.get('max_tokens', 128)
messages = kwargs.get("messages", [])
model_name = kwargs.get("model", "mlx-community/UI-TARS-1.5-7B-4bit")
max_tokens = kwargs.get("max_tokens", 128)
# Warn about ignored kwargs
ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
if ignored_kwargs:
warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")
# Load model and processor
model, processor, config = self._load_model_and_processor(model_name)
# Convert messages and extract images
processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)
# Process user text input with box coordinates after image processing
# Swap original_size and model_size arguments for inverse transformation
for msg_idx, msg in enumerate(processed_messages):
if msg.get("role") == "user" and isinstance(msg.get("content"), str):
content = msg.get("content", "")
if "<|box_start|>" in content and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
if (
"<|box_start|>" in content
and original_sizes
and model_sizes
and 0 in original_sizes
and 0 in model_sizes
):
orig_size = original_sizes[0]
model_size = model_sizes[0]
# Swap arguments to perform inverse transformation for user input
processed_messages[msg_idx]["content"] = self._process_coordinates(content, model_size, orig_size)
processed_messages[msg_idx]["content"] = self._process_coordinates(
content, model_size, orig_size
)
try:
# Format prompt according to model requirements using the processor directly
prompt = processor.apply_chat_template(
processed_messages,
tokenize=False,
add_generation_prompt=True,
return_tensors='pt'
processed_messages, tokenize=False, add_generation_prompt=True, return_tensors="pt"
)
tokenizer = cast(PreTrainedTokenizer, processor)
# Generate response
text_content, usage = generate(
model,
tokenizer,
str(prompt),
images, # type: ignore
model,
tokenizer,
str(prompt),
images, # type: ignore
verbose=False,
max_tokens=max_tokens
max_tokens=max_tokens,
)
except Exception as e:
raise RuntimeError(f"Error generating response: {str(e)}") from e
# Process coordinates in the response back to original image space
if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
# Get original image size and model size (using the first image)
orig_size = original_sizes[0]
model_size = model_sizes[0]
# Check if output contains box tokens that need processing
if "<|box_start|>" in text_content:
# Process coordinates from model space back to original image space
text_content = self._process_coordinates(text_content, orig_size, model_size)
return text_content
def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.
Returns:
ModelResponse with generated text
"""
generated_text = self._generate(**kwargs)
result = completion(
model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
mock_response=generated_text,
)
return cast(ModelResponse, result)
async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.
Returns:
ModelResponse with generated text
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)
result = await acompletion(
model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
mock_response=generated_text,
)
return cast(ModelResponse, result)
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.
Returns:
Iterator of GenericStreamingChunk
"""
generated_text = self._generate(**kwargs)
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -331,22 +343,21 @@ class MLXVLMAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}
yield generic_streaming_chunk
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.
Returns:
AsyncIterator of GenericStreamingChunk
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -355,5 +366,5 @@ class MLXVLMAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}
yield generic_streaming_chunk
yield generic_streaming_chunk

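A worked example (with made-up sizes, not from this commit) of the coordinate mapping that `_process_coordinates` applies: predictions are made in the resized "model space" image and scaled back into the original screenshot's pixel space. The exact `smart_resize` output depends on its implementation, which this hunk only shows in part.

```python
original_size = (1920, 1080)  # (width, height) of the original screenshot
model_size = (1288, 728)      # hypothetical smart_resize output (multiples of 28)

model_x, model_y = 644, 364   # a point predicted by the model in model space

orig_x = int(model_x * original_size[0] / model_size[0])
orig_y = int(model_y * original_size[1] / model_size[1])
print(orig_x, orig_y)  # 960 540 -- the center of the original image
```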
View File

@@ -2,32 +2,40 @@ from typing import Optional
try:
from transformers import AutoConfig
HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False
from .generic import GenericHFModel
from .internvl import InternVLModel
from .opencua import OpenCUAModel
from .qwen2_5_vl import Qwen2_5_VLModel
from .internvl import InternVLModel
def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
"""Factory function to load and return the right model handler instance.
    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
    - If it matches Qwen2_5_VL, return Qwen2_5_VLModel
    - If it matches InternVL, return InternVLModel
    - Otherwise, return GenericHFModel
"""
if not HF_AVAILABLE:
raise ImportError(
"HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
)
cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
cls = cfg.__class__.__name__
print(f"cls: {cls}")
if "OpenCUA" in cls:
return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
return OpenCUAModel(
model_name=model_name, device=device, trust_remote_code=trust_remote_code
)
elif "Qwen2_5_VL" in cls:
return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
return Qwen2_5_VLModel(
model_name=model_name, device=device, trust_remote_code=trust_remote_code
)
elif "InternVL" in cls:
return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
return InternVLModel(
model_name=model_name, device=device, trust_remote_code=trust_remote_code
)
return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)

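A hypothetical call to the `load_model` factory above (illustrative, not from this commit). The model id and import path are placeholders, and it assumes the returned handler exposes a `generate(messages, max_new_tokens)` method, as the OpenCUA handler shown later does.

```python
from agent.loops.uitars_hf import load_model  # illustrative import path

# Dispatch is driven by the transformers config class name:
# OpenCUA* -> OpenCUAModel, Qwen2_5_VL* -> Qwen2_5_VLModel,
# InternVL* -> InternVLModel, anything else -> GenericHFModel.
handler = load_model(
    "Qwen/Qwen2.5-VL-7B-Instruct",  # placeholder model id
    device="auto",
    trust_remote_code=True,
)

text = handler.generate(
    messages=[{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
    max_new_tokens=64,
)
print(text)
```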
View File

@@ -1,9 +1,10 @@
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional
# Hugging Face imports are local to avoid hard dependency at module import
try:
import torch # type: ignore
from transformers import AutoModel, AutoProcessor # type: ignore
HF_AVAILABLE = True
except Exception:
HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class GenericHFModel:
    Loads an AutoModel and AutoProcessor and generates text.
"""
def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
def __init__(
self, model_name: str, device: str = "auto", trust_remote_code: bool = False
) -> None:
if not HF_AVAILABLE:
raise ImportError(
"HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
)
self.model_name = model_name
self.device = device
@@ -64,7 +67,7 @@ class GenericHFModel:
generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
# Trim prompt tokens from output
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Decode
output_text = self.processor.batch_decode(

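A toy illustration (not from this commit) of the prompt-trimming slice reformatted above: `model.generate` returns the prompt tokens followed by the new tokens, so each output row is cut past the length of its corresponding input row. Plain lists stand in for tensors here.

```python
input_ids = [[101, 7592, 102]]                       # prompt tokens
generated_ids = [[101, 7592, 102, 2023, 2003, 102]]  # prompt + completion

trimmed = [out[len(inp):] for inp, out in zip(input_ids, generated_ids)]
print(trimmed)  # [[2023, 2003, 102]] -- only the newly generated tokens
```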
View File

@@ -1,19 +1,22 @@
from __future__ import annotations
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional
# Hugging Face imports are local to avoid hard dependency at module import
try:
import torch # type: ignore
from transformers import AutoModel, AutoTokenizer # type: ignore
# Attempt to import InternVL's model dependencies
import einops as _ # type: ignore
import timm as _ # type: ignore
from PIL import Image # type: ignore
import torchvision.transforms as T # type: ignore
from torchvision.transforms.functional import InterpolationMode # type: ignore
import base64 # type: ignore
from io import BytesIO # type: ignore
# Attempt to import InternVL's model dependencies
import einops as _ # type: ignore
import requests # type: ignore
import timm as _ # type: ignore
import torch # type: ignore
import torchvision.transforms as T # type: ignore
from PIL import Image # type: ignore
from torchvision.transforms.functional import InterpolationMode # type: ignore
from transformers import AutoModel, AutoTokenizer # type: ignore
HF_AVAILABLE = True
except Exception:
HF_AVAILABLE = False
@@ -25,10 +28,12 @@ class InternVLModel:
Provides preprocessing to support multi-turn conversations with multiple images.
"""
def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
def __init__(
self, model_name: str, device: str = "auto", trust_remote_code: bool = False
) -> None:
if not HF_AVAILABLE:
raise ImportError(
"InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
)
self.model_name = model_name
self.device = device
@@ -60,16 +65,25 @@ class InternVLModel:
def _build_transform(self, input_size: int) -> T.Compose:
MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
transform = T.Compose(
[
T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
best_ratio_diff = float('inf')
def _find_closest_aspect_ratio(
self,
aspect_ratio: float,
target_ratios: List[tuple],
width: int,
height: int,
image_size: int,
):
best_ratio_diff = float("inf")
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
@@ -83,17 +97,29 @@ class InternVLModel:
best_ratio = ratio
return best_ratio
def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
def _dynamic_preprocess(
self,
image: Image.Image,
min_num: int = 1,
max_num: int = 12,
image_size: int = 448,
use_thumbnail: bool = True,
) -> List[Image.Image]:
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
target_aspect_ratio = self._find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
@@ -106,7 +132,7 @@ class InternVLModel:
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
((i // (target_width // image_size)) + 1) * image_size,
)
split_img = resized_img.crop(box)
processed_images.append(split_img)
@@ -122,20 +148,24 @@ class InternVLModel:
# data URL base64
header, b64data = src.split(",", 1)
img_bytes = base64.b64decode(b64data)
return Image.open(BytesIO(img_bytes)).convert('RGB')
return Image.open(BytesIO(img_bytes)).convert("RGB")
if src.startswith("http://") or src.startswith("https://"):
resp = requests.get(src, timeout=10)
resp.raise_for_status()
return Image.open(BytesIO(resp.content)).convert('RGB')
return Image.open(BytesIO(resp.content)).convert("RGB")
# Assume local file path
return Image.open(src).convert('RGB')
return Image.open(src).convert("RGB")
def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
def _images_to_pixel_values(
self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
):
transform = self._build_transform(input_size=input_size)
pixel_values_list = []
num_patches_list: List[int] = []
for img in images:
tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
tiles = self._dynamic_preprocess(
img, image_size=input_size, use_thumbnail=True, max_num=max_num
)
pv = [transform(tile) for tile in tiles]
pv = torch.stack(pv)
num_patches_list.append(pv.shape[0])
@@ -191,7 +221,9 @@ class InternVLModel:
last_user_text_parts = parts_text or last_user_text_parts
elif role == "assistant":
# Only keep text content for history
parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
parts_text = [
item.get("text", "") for item in content_items if item.get("type") == "text"
]
text = "\n".join(parts_text).strip()
if text:
context_lines.append(f"Assistant: {text}")
@@ -200,7 +232,9 @@ class InternVLModel:
pixel_values = None
num_patches_list: List[int] = []
if all_images:
pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
pixel_values, num_patches_list = self._images_to_pixel_values(
all_images, input_size=448, max_num=12
)
if pixel_values is not None:
# Convert dtype/device as in docs
pixel_values = pixel_values.to(torch.bfloat16)
@@ -246,7 +280,9 @@ class InternVLModel:
num_patches_list=num_patches_list,
)
else:
response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
response = self.model.chat(
self.tokenizer, pixel_values, question, generation_config
)
except Exception as e:
# Fallback: return empty string to avoid crashing the adapter
return ""

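A small check (not from this commit) of the `target_ratios` candidate generation in `_dynamic_preprocess` with its default `min_num=1, max_num=12`. Only the candidate-generation step is reproduced; the closest-ratio selection in `_find_closest_aspect_ratio` is only partially visible in this hunk.

```python
min_num, max_num = 1, 12

# Every (columns, rows) tiling whose tile count stays within [min_num, max_num].
target_ratios = set(
    (i, j)
    for n in range(min_num, max_num + 1)
    for i in range(1, n + 1)
    for j in range(1, n + 1)
    if min_num <= i * j <= max_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

print(len(target_ratios))       # 35 candidate tilings
print((3, 2) in target_ratios)  # True  -- a 3x2 grid (6 tiles) is allowed
print((4, 4) in target_ratios)  # False -- 16 tiles would exceed max_num
```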
View File

@@ -1,13 +1,18 @@
from typing import List, Dict, Any
import re
import base64
import re
from io import BytesIO
from typing import Any, Dict, List
try:
import blobfile as _ # assert blobfile is installed
import torch # type: ignore
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor # type: ignore
from PIL import Image # type: ignore
import blobfile as _ # assert blobfile is installed
from transformers import ( # type: ignore
AutoImageProcessor,
AutoModel,
AutoTokenizer,
)
OPENCUA_AVAILABLE = True
except Exception:
OPENCUA_AVAILABLE = False
@@ -16,10 +21,12 @@ except Exception:
class OpenCUAModel:
"""OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""
def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
def __init__(
self, model_name: str, device: str = "auto", trust_remote_code: bool = False
) -> None:
if not OPENCUA_AVAILABLE:
raise ImportError(
"OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
)
self.model_name = model_name
self.device = device
@@ -56,7 +63,11 @@ class OpenCUAModel:
return ""
def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
assert (
self.model is not None
and self.tokenizer is not None
and self.image_processor is not None
)
# Tokenize text side using chat template
input_ids = self.tokenizer.apply_chat_template(
@@ -74,7 +85,11 @@ class OpenCUAModel:
pixel_values = torch.tensor(image_info["pixel_values"]).to(
dtype=torch.bfloat16, device=self.model.device
)
grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
grid_thws = (
torch.tensor(image_info["image_grid_thw"])
if "image_grid_thw" in image_info
else None
)
gen_kwargs: Dict[str, Any] = {
"max_new_tokens": max_new_tokens,

View File

@@ -1,9 +1,10 @@
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional
# Hugging Face imports are local to avoid hard dependency at module import
try:
import torch # type: ignore
from transformers import AutoModelForImageTextToText, AutoProcessor # type: ignore
HF_AVAILABLE = True
except Exception:
HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class Qwen2_5_VLModel:
Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
"""
def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
def __init__(
self, model_name: str, device: str = "auto", trust_remote_code: bool = False
) -> None:
if not HF_AVAILABLE:
raise ImportError(
"HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
)
self.model_name = model_name
self.device = device
@@ -64,7 +67,7 @@ class Qwen2_5_VLModel:
generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
# Trim prompt tokens from output
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Decode
output_text = self.processor.batch_decode(

View File

@@ -3,76 +3,83 @@ ComputerAgent - Main agent class that selects and runs agent loops
"""
import asyncio
from pathlib import Path
from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
from litellm.responses.utils import Usage
from .types import (
Messages,
AgentCapability,
ToolError,
IllegalArgumentError
)
from .responses import make_tool_error_item, replace_failed_computer_calls_with_function_calls
from .decorators import find_agent_config
import inspect
import json
from pathlib import Path
from typing import (
Any,
AsyncGenerator,
Callable,
Dict,
List,
Optional,
Set,
Tuple,
Union,
cast,
)
import litellm
import litellm.utils
import inspect
from litellm.responses.utils import Usage
from .adapters import (
HuggingFaceLocalAdapter,
HumanAdapter,
MLXVLMAdapter,
)
from .callbacks import (
ImageRetentionCallback,
LoggingCallback,
TrajectorySaverCallback,
BudgetManagerCallback,
TelemetryCallback,
ImageRetentionCallback,
LoggingCallback,
OperatorNormalizerCallback,
PromptInstructionsCallback,
TelemetryCallback,
TrajectorySaverCallback,
)
from .computers import (
AsyncComputerHandler,
is_agent_computer,
make_computer_handler
from .computers import AsyncComputerHandler, is_agent_computer, make_computer_handler
from .decorators import find_agent_config
from .responses import (
make_tool_error_item,
replace_failed_computer_calls_with_function_calls,
)
from .types import AgentCapability, IllegalArgumentError, Messages, ToolError
def assert_callable_with(f, *args, **kwargs):
"""Check if function can be called with given arguments."""
try:
inspect.signature(f).bind(*args, **kwargs)
return True
except TypeError as e:
sig = inspect.signature(f)
raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
"""Check if function can be called with given arguments."""
try:
inspect.signature(f).bind(*args, **kwargs)
return True
except TypeError as e:
sig = inspect.signature(f)
raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
def get_json(obj: Any, max_depth: int = 10) -> Any:
def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
if seen is None:
seen = set()
# Use model_dump() if available
if hasattr(o, 'model_dump'):
if hasattr(o, "model_dump"):
return o.model_dump()
# Check depth limit
if depth > max_depth:
return f"<max_depth_exceeded:{max_depth}>"
# Check for circular references using object id
obj_id = id(o)
if obj_id in seen:
return f"<circular_reference:{type(o).__name__}>"
# Handle Computer objects
if hasattr(o, '__class__') and 'computer' in getattr(o, '__class__').__name__.lower():
if hasattr(o, "__class__") and "computer" in o.__class__.__name__.lower():
return f"<computer:{o.__class__.__name__}>"
# Handle objects with __dict__
if hasattr(o, '__dict__'):
if hasattr(o, "__dict__"):
seen.add(obj_id)
try:
result = {}
@@ -84,7 +91,7 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
return result
finally:
seen.discard(obj_id)
# Handle common types that might contain nested objects
elif isinstance(o, dict):
seen.add(obj_id)
@@ -96,7 +103,7 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
}
finally:
seen.discard(obj_id)
elif isinstance(o, (list, tuple, set)):
seen.add(obj_id)
try:
@@ -107,32 +114,33 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
]
finally:
seen.discard(obj_id)
# For basic types that json.dumps can handle
elif isinstance(o, (str, int, float, bool)) or o is None:
return o
# Fallback to string representation
else:
return str(o)
def remove_nones(obj: Any) -> Any:
if isinstance(obj, dict):
return {k: remove_nones(v) for k, v in obj.items() if v is not None}
elif isinstance(obj, list):
return [remove_nones(item) for item in obj if item is not None]
return obj
# Serialize with circular reference and depth protection
serialized = custom_serializer(obj)
# Convert to JSON string and back to ensure JSON compatibility
json_str = json.dumps(serialized)
parsed = json.loads(json_str)
# Final cleanup of any remaining None values
return remove_nones(parsed)
def sanitize_message(msg: Any) -> Any:
"""Return a copy of the message with image_url omitted for computer_call_output messages."""
if msg.get("type") == "computer_call_output":
@@ -143,19 +151,24 @@ def sanitize_message(msg: Any) -> Any:
return sanitized
return msg
def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
call_ids = []
for message in messages:
if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
if (
message.get("type") == "computer_call_output"
or message.get("type") == "function_call_output"
):
call_ids.append(message.get("call_id"))
return call_ids
class ComputerAgent:
"""
Main agent class that automatically selects the appropriate agent loop
based on the model and executes tool calls.
"""
def __init__(
self,
model: str,
@@ -172,11 +185,11 @@ class ComputerAgent:
max_trajectory_budget: Optional[float | dict] = None,
telemetry_enabled: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
**kwargs
**kwargs,
):
"""
Initialize ComputerAgent.
Args:
model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
tools: List of tools (computer objects, decorated functions, etc.)
@@ -193,11 +206,11 @@ class ComputerAgent:
telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
**kwargs: Additional arguments passed to the agent loop
"""
"""
# If the loop is "human/human", we need to prefix a grounding model fallback
if model in ["human/human", "human"]:
model = "openai/computer-use-preview+human/human"
self.model = model
self.tools = tools or []
self.custom_loop = custom_loop
@@ -236,34 +249,33 @@ class ComputerAgent:
# Add image retention callback if only_n_most_recent_images is set
if self.only_n_most_recent_images:
self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))
# Add trajectory saver callback if trajectory_dir is set
if self.trajectory_dir:
if isinstance(self.trajectory_dir, dict):
self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
elif isinstance(self.trajectory_dir, (str, Path)):
self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))
# Add budget manager if max_trajectory_budget is set
if max_trajectory_budget:
if isinstance(max_trajectory_budget, dict):
self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
else:
self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))
# == Enable local model providers w/ LiteLLM ==
# Register local model providers
hf_adapter = HuggingFaceLocalAdapter(
device="auto",
trust_remote_code=self.trust_remote_code or False
device="auto", trust_remote_code=self.trust_remote_code or False
)
human_adapter = HumanAdapter()
mlx_adapter = MLXVLMAdapter()
litellm.custom_provider_map = [
{"provider": "huggingface-local", "custom_handler": hf_adapter},
{"provider": "human", "custom_handler": human_adapter},
{"provider": "mlx", "custom_handler": mlx_adapter}
{"provider": "mlx", "custom_handler": mlx_adapter},
]
litellm.suppress_debug_info = True
@@ -280,16 +292,16 @@ class ComputerAgent:
# Instantiate the agent config class
self.agent_loop = config_info.agent_class()
self.agent_config_info = config_info
self.tool_schemas = []
self.computer_handler = None
async def _initialize_computers(self):
"""Initialize computer objects"""
if not self.tool_schemas:
# Process tools and create tool schemas
self.tool_schemas = self._process_tools()
# Find computer tool and create interface adapter
computer_handler = None
for schema in self.tool_schemas:
@@ -297,7 +309,7 @@ class ComputerAgent:
computer_handler = await make_computer_handler(schema["computer"])
break
self.computer_handler = computer_handler
def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
"""Process input messages and create schemas for the agent loop"""
if isinstance(input, str):
@@ -307,69 +319,73 @@ class ComputerAgent:
def _process_tools(self) -> List[Dict[str, Any]]:
"""Process tools and create schemas for the agent loop"""
schemas = []
for tool in self.tools:
# Check if it's a computer object (has interface attribute)
if is_agent_computer(tool):
# This is a computer tool - will be handled by agent loop
schemas.append({
"type": "computer",
"computer": tool
})
schemas.append({"type": "computer", "computer": tool})
elif callable(tool):
# Use litellm.utils.function_to_dict to extract schema from docstring
try:
function_schema = litellm.utils.function_to_dict(tool)
schemas.append({
"type": "function",
"function": function_schema
})
schemas.append({"type": "function", "function": function_schema})
except Exception as e:
print(f"Warning: Could not process tool {tool}: {e}")
else:
print(f"Warning: Unknown tool type: {tool}")
return schemas
def _get_tool(self, name: str) -> Optional[Callable]:
"""Get a tool by name"""
for tool in self.tools:
if hasattr(tool, '__name__') and tool.__name__ == name:
if hasattr(tool, "__name__") and tool.__name__ == name:
return tool
elif hasattr(tool, 'func') and tool.func.__name__ == name:
elif hasattr(tool, "func") and tool.func.__name__ == name:
return tool
return None
# ============================================================================
# AGENT RUN LOOP LIFECYCLE HOOKS
# ============================================================================
async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Initialize run tracking by calling callbacks."""
for callback in self.callbacks:
if hasattr(callback, 'on_run_start'):
if hasattr(callback, "on_run_start"):
await callback.on_run_start(kwargs, old_items)
async def _on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
async def _on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Finalize run tracking by calling callbacks."""
for callback in self.callbacks:
if hasattr(callback, 'on_run_end'):
if hasattr(callback, "on_run_end"):
await callback.on_run_end(kwargs, old_items, new_items)
async def _on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
async def _on_run_continue(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> bool:
"""Check if run should continue by calling callbacks."""
for callback in self.callbacks:
if hasattr(callback, 'on_run_continue'):
if hasattr(callback, "on_run_continue"):
should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
if not should_continue:
return False
return True
async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Prepare messages for the LLM call by applying callbacks."""
result = messages
for callback in self.callbacks:
if hasattr(callback, 'on_llm_start'):
if hasattr(callback, "on_llm_start"):
result = await callback.on_llm_start(result)
return result
@@ -377,82 +393,91 @@ class ComputerAgent:
"""Postprocess messages after the LLM call by applying callbacks."""
result = messages
for callback in self.callbacks:
if hasattr(callback, 'on_llm_end'):
if hasattr(callback, "on_llm_end"):
result = await callback.on_llm_end(result)
return result
async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
"""Called when responses are received."""
for callback in self.callbacks:
if hasattr(callback, 'on_responses'):
if hasattr(callback, "on_responses"):
await callback.on_responses(get_json(kwargs), get_json(responses))
async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
"""Called when a computer call is about to start."""
for callback in self.callbacks:
if hasattr(callback, 'on_computer_call_start'):
if hasattr(callback, "on_computer_call_start"):
await callback.on_computer_call_start(get_json(item))
async def _on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
async def _on_computer_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""Called when a computer call has completed."""
for callback in self.callbacks:
if hasattr(callback, 'on_computer_call_end'):
if hasattr(callback, "on_computer_call_end"):
await callback.on_computer_call_end(get_json(item), get_json(result))
async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
"""Called when a function call is about to start."""
for callback in self.callbacks:
if hasattr(callback, 'on_function_call_start'):
if hasattr(callback, "on_function_call_start"):
await callback.on_function_call_start(get_json(item))
async def _on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
async def _on_function_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""Called when a function call has completed."""
for callback in self.callbacks:
if hasattr(callback, 'on_function_call_end'):
if hasattr(callback, "on_function_call_end"):
await callback.on_function_call_end(get_json(item), get_json(result))
async def _on_text(self, item: Dict[str, Any]) -> None:
"""Called when a text message is encountered."""
for callback in self.callbacks:
if hasattr(callback, 'on_text'):
if hasattr(callback, "on_text"):
await callback.on_text(get_json(item))
async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
"""Called when an LLM API call is about to start."""
for callback in self.callbacks:
if hasattr(callback, 'on_api_start'):
if hasattr(callback, "on_api_start"):
await callback.on_api_start(get_json(kwargs))
async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
"""Called when an LLM API call has completed."""
for callback in self.callbacks:
if hasattr(callback, 'on_api_end'):
if hasattr(callback, "on_api_end"):
await callback.on_api_end(get_json(kwargs), get_json(result))
async def _on_usage(self, usage: Dict[str, Any]) -> None:
"""Called when usage information is received."""
for callback in self.callbacks:
if hasattr(callback, 'on_usage'):
if hasattr(callback, "on_usage"):
await callback.on_usage(get_json(usage))
async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
"""Called when a screenshot is taken."""
for callback in self.callbacks:
if hasattr(callback, 'on_screenshot'):
if hasattr(callback, "on_screenshot"):
await callback.on_screenshot(screenshot, name)
# ============================================================================
# AGENT OUTPUT PROCESSING
# ============================================================================
async def _handle_item(self, item: Any, computer: Optional[AsyncComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
async def _handle_item(
self,
item: Any,
computer: Optional[AsyncComputerHandler] = None,
ignore_call_ids: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
"""Handle each item; may cause a computer action + screenshot."""
call_id = item.get("call_id")
if ignore_call_ids and call_id and call_id in ignore_call_ids:
return []
item_type = item.get("type", None)
if item_type == "message":
await self._on_text(item)
# # Print messages
@@ -461,7 +486,7 @@ class ComputerAgent:
# if content_item.get("text"):
# print(content_item.get("text"))
return []
try:
if item_type == "computer_call":
await self._on_computer_call_start(item)
@@ -472,14 +497,16 @@ class ComputerAgent:
action = item.get("action")
action_type = action.get("type")
if action_type is None:
print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
print(
f"Action type cannot be `None`: action={action}, action_type={action_type}"
)
return []
# Extract action arguments (all fields except 'type')
action_args = {k: v for k, v in action.items() if k != "type"}
# print(f"{action_type}({action_args})")
# Execute the computer action
computer_method = getattr(computer, action_type, None)
if computer_method:
@@ -487,13 +514,13 @@ class ComputerAgent:
await computer_method(**action_args)
else:
raise ToolError(f"Unknown computer action: {action_type}")
# Take screenshot after action
if self.screenshot_delay and self.screenshot_delay > 0:
await asyncio.sleep(self.screenshot_delay)
screenshot_base64 = await computer.screenshot()
await self._on_screenshot(screenshot_base64, "screenshot_after")
# Handle safety checks
pending_checks = item.get("pending_safety_checks", [])
acknowledged_checks = []
@@ -505,7 +532,7 @@ class ComputerAgent:
# acknowledged_checks.append(check)
# else:
# raise ValueError(f"Safety check failed: {check_message}")
# Create call output
call_output = {
"type": "computer_call_output",
@@ -516,25 +543,25 @@ class ComputerAgent:
"image_url": f"data:image/png;base64,{screenshot_base64}",
},
}
# # Additional URL safety checks for browser environments
# if await computer.get_environment() == "browser":
# current_url = await computer.get_current_url()
# call_output["output"]["current_url"] = current_url
# # TODO: implement a callback for URL safety checks
# # check_blocklisted_url(current_url)
result = [call_output]
await self._on_computer_call_end(item, result)
return result
if item_type == "function_call":
await self._on_function_call_start(item)
# Perform function call
function = self._get_tool(item.get("name"))
if not function:
raise ToolError(f"Function {item.get('name')} not found")
args = json.loads(item.get("arguments"))
# Validate arguments before execution
@@ -545,14 +572,14 @@ class ComputerAgent:
result = await function(**args)
else:
result = await asyncio.to_thread(function, **args)
# Create function call output
call_output = {
"type": "function_call_output",
"call_id": item.get("call_id"),
"output": str(result),
}
result = [call_output]
await self._on_function_call_end(item, result)
return result
@@ -564,36 +591,35 @@ class ComputerAgent:
# ============================================================================
# MAIN AGENT LOOP
# ============================================================================
async def run(
self,
messages: Messages,
stream: bool = False,
**kwargs
self, messages: Messages, stream: bool = False, **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
"""
Run the agent with the given messages using Computer protocol handler pattern.
Args:
messages: List of message dictionaries
stream: Whether to stream the response
**kwargs: Additional arguments
Returns:
AsyncGenerator that yields response chunks
"""
if not self.agent_config_info:
raise ValueError("Agent configuration not found")
capabilities = self.get_capabilities()
if "step" not in capabilities:
raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
raise ValueError(
f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions"
)
await self._initialize_computers()
# Merge kwargs
merged_kwargs = {**self.kwargs, **kwargs}
old_items = self._process_input(messages)
new_items = []
@@ -603,7 +629,7 @@ class ComputerAgent:
"stream": stream,
"model": self.model,
"agent_loop": self.agent_config_info.agent_class.__name__,
**merged_kwargs
**merged_kwargs,
}
await self._on_run_start(run_kwargs, old_items)
@@ -620,7 +646,7 @@ class ComputerAgent:
combined_messages = old_items + new_items
combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
preprocessed_messages = await self._on_llm_start(combined_messages)
loop_kwargs = {
"messages": preprocessed_messages,
"model": self.model,
@@ -629,7 +655,7 @@ class ComputerAgent:
"computer_handler": self.computer_handler,
"max_retries": self.max_retries,
"use_prompt_caching": self.use_prompt_caching,
**merged_kwargs
**merged_kwargs,
}
# Run agent loop iteration
@@ -641,13 +667,13 @@ class ComputerAgent:
_on_screenshot=self._on_screenshot,
)
result = get_json(result)
# Lifecycle hook: Postprocess messages after the LLM call
# Use cases:
# - PII deanonymization (if you want tool calls to see PII)
result["output"] = await self._on_llm_end(result.get("output", []))
await self._on_responses(loop_kwargs, result)
# Yield agent response
yield result
@@ -659,7 +685,9 @@ class ComputerAgent:
# Handle computer actions
for item in result.get("output"):
partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
partial_items = await self._handle_item(
item, self.computer_handler, ignore_call_ids=output_call_ids
)
new_items += partial_items
# Yield partial response
@@ -669,54 +697,52 @@ class ComputerAgent:
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
),
}
await self._on_run_end(loop_kwargs, old_items, new_items)
async def predict_click(
self,
instruction: str,
image_b64: Optional[str] = None
self, instruction: str, image_b64: Optional[str] = None
) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates based on image and instruction.
Args:
instruction: Instruction for where to click
image_b64: Base64 encoded image (optional, will take screenshot if not provided)
Returns:
None or tuple with (x, y) coordinates
"""
if not self.agent_config_info:
raise ValueError("Agent configuration not found")
capabilities = self.get_capabilities()
if "click" not in capabilities:
raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
if hasattr(self.agent_loop, 'predict_click'):
raise ValueError(
f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions"
)
if hasattr(self.agent_loop, "predict_click"):
if not image_b64:
if not self.computer_handler:
raise ValueError("Computer tool or image_b64 is required for predict_click")
image_b64 = await self.computer_handler.screenshot()
return await self.agent_loop.predict_click(
model=self.model,
image_b64=image_b64,
instruction=instruction
model=self.model, image_b64=image_b64, instruction=instruction
)
return None
def get_capabilities(self) -> List[AgentCapability]:
"""
Get list of capabilities supported by the current agent config.
Returns:
List of capability strings (e.g., ["step", "click"])
"""
if not self.agent_config_info:
raise ValueError("Agent configuration not found")
if hasattr(self.agent_loop, 'get_capabilities'):
if hasattr(self.agent_loop, "get_capabilities"):
return self.agent_loop.get_capabilities()
return ["step"] # Default capability
return ["step"] # Default capability

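A hedged usage sketch (not from this commit) of the `ComputerAgent` API reformatted above. The import path and model name are placeholders; a real run needs a computer tool accepted by `is_agent_computer` plus credentials for the chosen provider.

```python
import asyncio

from agent import ComputerAgent  # illustrative import path


async def main() -> None:
    agent = ComputerAgent(
        model="computer-use-preview",  # placeholder; see the docstring's examples
        tools=[],                      # normally includes a computer object
        only_n_most_recent_images=3,   # adds ImageRetentionCallback
        max_trajectory_budget=1.0,     # adds BudgetManagerCallback
    )
    # run() accepts a plain string or a list of messages and yields result dicts.
    async for result in agent.run("Open the settings app"):
        for item in result.get("output", []):
            print(item.get("type"))


asyncio.run(main())
```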
View File

@@ -3,17 +3,17 @@ Callback system for ComputerAgent preprocessing and postprocessing hooks.
"""
from .base import AsyncCallbackHandler
from .budget_manager import BudgetManagerCallback
from .image_retention import ImageRetentionCallback
from .logging import LoggingCallback
from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
from .operator_validator import OperatorNormalizerCallback
from .prompt_instructions import PromptInstructionsCallback
from .telemetry import TelemetryCallback
from .trajectory_saver import TrajectorySaverCallback
__all__ = [
"AsyncCallbackHandler",
"ImageRetentionCallback",
"ImageRetentionCallback",
"LoggingCallback",
"TrajectorySaverCallback",
"BudgetManagerCallback",

View File

@@ -3,7 +3,7 @@ Base callback handler interface for ComputerAgent preprocessing and postprocessi
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union
class AsyncCallbackHandler(ABC):
@@ -16,42 +16,52 @@ class AsyncCallbackHandler(ABC):
"""Called at the start of an agent run loop."""
pass
async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
async def on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Called at the end of an agent run loop."""
pass
async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
async def on_run_continue(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> bool:
"""Called during agent run loop to determine if execution should continue.
Args:
kwargs: Run arguments
old_items: Original messages
new_items: New messages generated during run
Returns:
True to continue execution, False to stop
"""
return True
async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Called before messages are sent to the agent loop.
Args:
messages: List of message dictionaries to preprocess
Returns:
List of preprocessed message dictionaries
"""
return messages
async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Called after the agent loop returns output.
Args:
output: List of output message dictionaries to postprocess
Returns:
List of postprocessed output dictionaries
"""
@@ -60,63 +70,67 @@ class AsyncCallbackHandler(ABC):
async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
"""
Called when a computer call is about to start.
Args:
item: The computer call item dictionary
"""
pass
async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
async def on_computer_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""
Called when a computer call has completed.
Args:
item: The computer call item dictionary
result: The result of the computer call
"""
pass
async def on_function_call_start(self, item: Dict[str, Any]) -> None:
"""
Called when a function call is about to start.
Args:
item: The function call item dictionary
"""
pass
async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
async def on_function_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""
Called when a function call has completed.
Args:
item: The function call item dictionary
result: The result of the function call
"""
pass
async def on_text(self, item: Dict[str, Any]) -> None:
"""
Called when a text message is encountered.
Args:
item: The message item dictionary
"""
pass
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
"""
Called when an API call is about to start.
Args:
kwargs: The kwargs being passed to the API call
"""
pass
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
"""
Called when an API call has completed.
Args:
kwargs: The kwargs that were passed to the API call
result: The result of the API call
@@ -126,7 +140,7 @@ class AsyncCallbackHandler(ABC):
async def on_usage(self, usage: Dict[str, Any]) -> None:
"""
Called when usage information is received.
Args:
usage: The usage information
"""
@@ -135,7 +149,7 @@ class AsyncCallbackHandler(ABC):
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
"""
Called when a screenshot is taken.
Args:
screenshot: The screenshot image
name: The name of the screenshot
@@ -145,9 +159,9 @@ class AsyncCallbackHandler(ABC):
async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
"""
Called when responses are received.
Args:
kwargs: The kwargs being passed to the agent loop
responses: The responses received
"""
pass
pass
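Every hook on this base class is an optional override, so a custom handler only needs the hooks it actually uses; the concrete callbacks later in this diff do exactly that. A minimal sketch, assuming the module path `agent.callbacks.base` (inferred from the relative imports shown here, not stated in the diff):

```python
from typing import Any, Dict, List

from agent.callbacks.base import AsyncCallbackHandler  # inferred module path


class CostPrinterCallback(AsyncCallbackHandler):
    """Toy handler: accumulate per-response cost and print a summary at run end."""

    def __init__(self) -> None:
        self.total_cost = 0.0

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        # Usage dicts may carry a "response_cost", as BudgetManagerCallback below assumes.
        self.total_cost += usage.get("response_cost", 0.0)

    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        print(f"Run produced {len(new_items)} new items, ~${self.total_cost:.4f}")
```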
View File
@@ -1,17 +1,23 @@
from typing import Dict, List, Any
from typing import Any, Dict, List
from .base import AsyncCallbackHandler
class BudgetExceededError(Exception):
"""Exception raised when budget is exceeded."""
pass
class BudgetManagerCallback(AsyncCallbackHandler):
"""Budget manager callback that tracks usage costs and can stop execution when budget is exceeded."""
def __init__(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False):
def __init__(
self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False
):
"""
Initialize BudgetManagerCallback.
Args:
max_budget: Maximum budget allowed
reset_after_each_run: Whether to reset budget after each run
@@ -21,24 +27,30 @@ class BudgetManagerCallback(AsyncCallbackHandler):
self.reset_after_each_run = reset_after_each_run
self.raise_error = raise_error
self.total_cost = 0.0
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Reset budget if configured to do so."""
if self.reset_after_each_run:
self.total_cost = 0.0
async def on_usage(self, usage: Dict[str, Any]) -> None:
"""Track usage costs."""
if "response_cost" in usage:
self.total_cost += usage["response_cost"]
async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
async def on_run_continue(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> bool:
"""Check if budget allows continuation."""
if self.total_cost >= self.max_budget:
if self.raise_error:
raise BudgetExceededError(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
raise BudgetExceededError(
f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}"
)
else:
print(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
return False
return True
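A self-contained sketch of how this budget gate behaves when the hooks are driven directly (in real use the agent calls them; the import path `agent.callbacks.budget_manager` is inferred from the package's `__init__` shown earlier in this diff):

```python
import asyncio

from agent.callbacks.budget_manager import BudgetManagerCallback  # inferred path


async def demo() -> None:
    budget = BudgetManagerCallback(max_budget=0.05, raise_error=False)

    await budget.on_run_start({}, [])                # resets cost (default reset_after_each_run=True)
    await budget.on_usage({"response_cost": 0.03})   # accumulate per-response cost
    print(await budget.on_run_continue({}, [], []))  # True: still under budget
    await budget.on_usage({"response_cost": 0.04})
    print(await budget.on_run_continue({}, [], []))  # False: over budget, warning is printed


asyncio.run(demo())
```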
View File
@@ -2,7 +2,8 @@
Image retention callback handler that limits the number of recent images in message history.
"""
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional
from .base import AsyncCallbackHandler
@@ -11,40 +12,40 @@ class ImageRetentionCallback(AsyncCallbackHandler):
Callback handler that applies image retention policy to limit the number
of recent images in message history to prevent context window overflow.
"""
def __init__(self, only_n_most_recent_images: Optional[int] = None):
"""
Initialize the image retention callback.
Args:
only_n_most_recent_images: If set, only keep the N most recent images in message history
"""
self.only_n_most_recent_images = only_n_most_recent_images
async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Apply image retention policy to messages before sending to agent loop.
Args:
messages: List of message dictionaries
Returns:
List of messages with image retention policy applied
"""
if self.only_n_most_recent_images is None:
return messages
return self._apply_image_retention(messages)
def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Apply image retention policy to keep only the N most recent images.
Removes computer_call_output items with image_url and their corresponding computer_call items,
keeping only the most recent N image pairs based on only_n_most_recent_images setting.
Args:
messages: List of message dictionaries
Returns:
Filtered list of messages with image retention applied
"""
@@ -78,7 +79,11 @@ class ImageRetentionCallback(AsyncCallbackHandler):
# Remove the immediately preceding computer_call with matching call_id (if present)
call_id = messages[idx].get("call_id")
prev_idx = idx - 1
if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
if (
prev_idx >= 0
and messages[prev_idx].get("type") == "computer_call"
and messages[prev_idx].get("call_id") == call_id
):
to_remove.add(prev_idx)
# Check a single reasoning immediately before that computer_call
r_idx = prev_idx - 1
@@ -87,4 +92,4 @@ class ImageRetentionCallback(AsyncCallbackHandler):
# Construct filtered list
filtered = [m for i, m in enumerate(messages) if i not in to_remove]
return filtered
return filtered
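A hedged sketch of the retention policy on a toy history (import path inferred; the exact message shape follows the `computer_call` / `computer_call_output` items used elsewhere in this diff). With `only_n_most_recent_images=1`, the older screenshot pair should be dropped:

```python
import asyncio

from agent.callbacks.image_retention import ImageRetentionCallback  # inferred path


async def demo() -> None:
    retention = ImageRetentionCallback(only_n_most_recent_images=1)

    messages = [
        {"type": "computer_call", "call_id": "a", "action": {"type": "screenshot"}},
        {"type": "computer_call_output", "call_id": "a",
         "output": {"type": "input_image", "image_url": "data:image/png;base64,OLD"}},
        {"type": "computer_call", "call_id": "b", "action": {"type": "screenshot"}},
        {"type": "computer_call_output", "call_id": "b",
         "output": {"type": "input_image", "image_url": "data:image/png;base64,NEW"}},
    ]

    kept = await retention.on_llm_start(messages)
    # Expect only the most recent pair ("b") to survive.
    print([m.get("call_id") for m in kept])


asyncio.run(demo())
```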
View File
@@ -4,17 +4,18 @@ Logging callback for ComputerAgent that provides configurable logging of agent l
import json
import logging
from typing import Dict, List, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union
from .base import AsyncCallbackHandler
def sanitize_image_urls(data: Any) -> Any:
"""
Recursively search for 'image_url' keys and set their values to '[omitted]'.
Args:
data: Any data structure (dict, list, or primitive type)
Returns:
A deep copy of the data with all 'image_url' values replaced with '[omitted]'
"""
@@ -28,11 +29,11 @@ def sanitize_image_urls(data: Any) -> Any:
# Recursively sanitize the value
sanitized[key] = sanitize_image_urls(value)
return sanitized
elif isinstance(data, list):
# Recursively sanitize each item in the list
return [sanitize_image_urls(item) for item in data]
else:
# For primitive types (str, int, bool, None, etc.), return as-is
return data
@@ -41,37 +42,36 @@ def sanitize_image_urls(data: Any) -> Any:
class LoggingCallback(AsyncCallbackHandler):
"""
Callback handler that logs agent lifecycle events with configurable verbosity.
Logging levels:
- DEBUG: All events including API calls, message preprocessing, and detailed outputs
- INFO: Major lifecycle events (start/end, messages, outputs)
- INFO: Major lifecycle events (start/end, messages, outputs)
- WARNING: Only warnings and errors
- ERROR: Only errors
"""
def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO):
"""
Initialize the logging callback.
Args:
logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
level: Logging level (logging.DEBUG, logging.INFO, etc.)
"""
self.logger = logger or logging.getLogger('agent.ComputerAgent')
self.logger = logger or logging.getLogger("agent.ComputerAgent")
self.level = level
# Set up logger if it doesn't have handlers
if not self.logger.handlers:
handler = logging.StreamHandler()
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
self.logger.addHandler(handler)
self.logger.setLevel(level)
def _update_usage(self, usage: Dict[str, Any]) -> None:
"""Update total usage statistics."""
def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
for key, value in source.items():
if isinstance(value, dict):
@@ -82,18 +82,25 @@ class LoggingCallback(AsyncCallbackHandler):
if key not in target:
target[key] = 0
target[key] += value
add_dicts(self.total_usage, usage)
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Called before the run starts."""
self.total_usage = {}
async def on_usage(self, usage: Dict[str, Any]) -> None:
"""Called when usage information is received."""
self._update_usage(usage)
async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
async def on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Called after the run ends."""
def format_dict(d, indent=0):
lines = []
prefix = f" - {' ' * indent}"
@@ -106,10 +113,10 @@ class LoggingCallback(AsyncCallbackHandler):
else:
lines.append(f"{prefix}{key}: {value}")
return lines
formatted_output = "\n".join(format_dict(self.total_usage))
self.logger.info(f"Total usage:\n{formatted_output}")
async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Called before LLM processing starts."""
if self.logger.isEnabledFor(logging.INFO):
@@ -118,27 +125,27 @@ class LoggingCallback(AsyncCallbackHandler):
sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}")
return messages
async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Called after LLM processing ends."""
if self.logger.isEnabledFor(logging.DEBUG):
sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}")
return messages
async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
"""Called when a computer call starts."""
action = item.get("action", {})
action_type = action.get("type", "unknown")
action_args = {k: v for k, v in action.items() if k != "type"}
# INFO level logging for the action
self.logger.info(f"Computer: {action_type}({action_args})")
# DEBUG level logging for full details
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}")
async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None:
"""Called when a computer call ends."""
if self.logger.isEnabledFor(logging.DEBUG):
@@ -147,48 +154,52 @@ class LoggingCallback(AsyncCallbackHandler):
if result:
sanitized_result = sanitize_image_urls(result)
self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}")
async def on_function_call_start(self, item: Dict[str, Any]) -> None:
"""Called when a function call starts."""
name = item.get("name", "unknown")
arguments = item.get("arguments", "{}")
# INFO level logging for the function call
self.logger.info(f"Function: {name}({arguments})")
# DEBUG level logging for full details
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(f"Function call started: {name}")
async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None:
"""Called when a function call ends."""
# INFO level logging for function output (similar to function_call_output)
if result:
# Handle both list and direct result formats
if isinstance(result, list) and len(result) > 0:
output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0])
output = (
result[0].get("output", str(result))
if isinstance(result[0], dict)
else str(result[0])
)
else:
output = str(result)
# Truncate long outputs
if len(output) > 100:
output = output[:100] + "..."
self.logger.info(f"Output: {output}")
# DEBUG level logging for full details
if self.logger.isEnabledFor(logging.DEBUG):
name = item.get("name", "unknown")
self.logger.debug(f"Function call completed: {name}")
if result:
self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}")
async def on_text(self, item: Dict[str, Any]) -> None:
"""Called when a text message is encountered."""
# Get the role to determine if it's Agent or User
role = item.get("role", "unknown")
content_items = item.get("content", [])
# Process content items to build display text
text_parts = []
for content_item in content_items:
@@ -206,10 +217,10 @@ class LoggingCallback(AsyncCallbackHandler):
else:
# Non-text content, show as [type]
text_parts.append(f"[{content_type}]")
# Join all text parts
display_text = ''.join(text_parts) if text_parts else "[empty]"
display_text = "".join(text_parts) if text_parts else "[empty]"
# Log with appropriate level and format
if role == "assistant":
self.logger.info(f"Agent: {display_text}")
@@ -219,7 +230,7 @@ class LoggingCallback(AsyncCallbackHandler):
# Fallback for unknown roles, use debug level
if self.logger.isEnabledFor(logging.DEBUG):
self.logger.debug(f"Text message ({role}): {display_text}")
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
"""Called when an API call is about to start."""
if self.logger.isEnabledFor(logging.DEBUG):
@@ -232,16 +243,18 @@ class LoggingCallback(AsyncCallbackHandler):
elif "input" in kwargs:
sanitized_input = sanitize_image_urls(kwargs["input"])
self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}")
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
"""Called when an API call has completed."""
if self.logger.isEnabledFor(logging.DEBUG):
model = kwargs.get("model", "unknown")
self.logger.debug(f"API call completed for model: {model}")
self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}")
self.logger.debug(
f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}"
)
async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None:
"""Called when a screenshot is taken."""
if self.logger.isEnabledFor(logging.DEBUG):
image_size = len(item) / 1024
self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
View File
@@ -9,6 +9,7 @@ Ensures agent output actions conform to expected schemas by fixing common issues
This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
"""
from __future__ import annotations
from typing import Any, Dict, List
@@ -48,6 +49,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
action["type"] = "type"
action_type = action.get("type")
def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
"""Keep only the provided keys on action; delete everything else.
Always ensures required 'type' is present if listed in keys_to_keep.
@@ -55,6 +57,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
for key in list(action.keys()):
if key not in keys_to_keep:
del action[key]
# rename "coordinate" to "x", "y"
if "coordinate" in action:
action["x"] = action["coordinate"][0]
@@ -100,7 +103,6 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
keep = required_keys_by_type.get(action_type or "")
if keep:
_keep_keys(action, keep)
# # Second pass: if an assistant message is immediately followed by a computer_call,
# # replace the assistant message itself with a reasoning message with summary text.
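Only fragments of this normalizer appear in the hunk above. As a standalone illustration of the idea rather than the repository's exact rules, normalizing a single action dict looks roughly like the sketch below; the `REQUIRED_KEYS_BY_TYPE` table and the processing order are assumptions.

```python
from typing import Any, Dict, List

# Hypothetical schema: which keys each action type is allowed to keep.
REQUIRED_KEYS_BY_TYPE: Dict[str, List[str]] = {
    "click": ["type", "x", "y", "button"],
    "type": ["type", "text"],
}


def normalize_action(action: Dict[str, Any]) -> Dict[str, Any]:
    # Rename "coordinate": [x, y] to separate "x"/"y" keys, as the callback does.
    if "coordinate" in action:
        action["x"], action["y"] = action["coordinate"][0], action["coordinate"][1]
        del action["coordinate"]
    # Drop everything the schema does not expect for this action type.
    keep = REQUIRED_KEYS_BY_TYPE.get(action.get("type", ""), [])
    if keep:
        for key in list(action.keys()):
            if key not in keep:
                del action[key]
    return action


print(normalize_action({"type": "click", "coordinate": [120, 48], "extra": "dropped"}))
# -> {'type': 'click', 'x': 120, 'y': 48}
```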
View File
@@ -2,38 +2,41 @@
PII anonymization callback handler using Microsoft Presidio for text and image redaction.
"""
from typing import List, Dict, Any, Optional, Tuple
from .base import AsyncCallbackHandler
import base64
import io
import logging
from typing import Any, Dict, List, Optional, Tuple
from .base import AsyncCallbackHandler
try:
# TODO: Add Presidio dependencies
from PIL import Image
PRESIDIO_AVAILABLE = True
except ImportError:
PRESIDIO_AVAILABLE = False
logger = logging.getLogger(__name__)
class PIIAnonymizationCallback(AsyncCallbackHandler):
"""
Callback handler that anonymizes PII in text and images using Microsoft Presidio.
This handler:
1. Anonymizes PII in messages before sending to the agent loop
2. Deanonymizes PII in tool calls and message outputs after the agent loop
3. Redacts PII from images in computer_call_output messages
"""
def __init__(
self,
# TODO: Any extra kwargs if needed
):
"""
Initialize the PII anonymization callback.
Args:
anonymize_text: Whether to anonymize text content
anonymize_images: Whether to redact images
@@ -46,16 +49,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
"Presidio is not available. Install with: "
"pip install cua-agent[pii-anonymization]"
)
# TODO: Implement __init__
async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Anonymize PII in messages before sending to agent loop.
Args:
messages: List of message dictionaries
Returns:
List of messages with PII anonymized
"""
@@ -63,16 +66,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
for msg in messages:
anonymized_msg = await self._anonymize_message(msg)
anonymized_messages.append(anonymized_msg)
return anonymized_messages
async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Deanonymize PII in tool calls and message outputs after agent loop.
Args:
output: List of output dictionaries
Returns:
List of output with PII deanonymized for tool calls
"""
@@ -84,13 +87,13 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
deanonymized_output.append(deanonymized_item)
else:
deanonymized_output.append(item)
return deanonymized_output
async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
# TODO: Implement _anonymize_message
return message
async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
# TODO: Implement _deanonymize_item
return item
View File
@@ -2,17 +2,17 @@
Telemetry callback handler for Computer-Use Agent (cua-agent)
"""
import platform
import time
import uuid
from typing import List, Dict, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union
from .base import AsyncCallbackHandler
from core.telemetry import (
record_event,
is_telemetry_enabled,
record_event,
)
import platform
from .base import AsyncCallbackHandler
SYSTEM_INFO = {
"os": platform.system().lower(),
@@ -20,32 +20,29 @@ SYSTEM_INFO = {
"python_version": platform.python_version(),
}
class TelemetryCallback(AsyncCallbackHandler):
"""
Telemetry callback handler for Computer-Use Agent (cua-agent)
Tracks agent usage, performance metrics, and optionally trajectory data.
"""
def __init__(
self,
agent,
log_trajectory: bool = False
):
def __init__(self, agent, log_trajectory: bool = False):
"""
Initialize telemetry callback.
Args:
agent: The ComputerAgent instance
log_trajectory: Whether to log full trajectory items (opt-in)
"""
self.agent = agent
self.log_trajectory = log_trajectory
# Generate session/run IDs
self.session_id = str(uuid.uuid4())
self.run_id = None
# Track timing and metrics
self.run_start_time = None
self.step_count = 0
@@ -54,126 +51,133 @@ class TelemetryCallback(AsyncCallbackHandler):
"prompt_tokens": 0,
"completion_tokens": 0,
"total_tokens": 0,
"response_cost": 0.0
"response_cost": 0.0,
}
# Record agent initialization
if is_telemetry_enabled():
self._record_agent_initialization()
def _record_agent_initialization(self) -> None:
"""Record agent type/model and session initialization."""
agent_info = {
"session_id": self.session_id,
"agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
"model": getattr(self.agent, 'model', 'unknown'),
**SYSTEM_INFO
"agent_type": (
self.agent.agent_loop.__name__ if hasattr(self.agent, "agent_loop") else "unknown"
),
"model": getattr(self.agent, "model", "unknown"),
**SYSTEM_INFO,
}
record_event("agent_session_start", agent_info)
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Called at the start of an agent run loop."""
if not is_telemetry_enabled():
return
self.run_id = str(uuid.uuid4())
self.run_start_time = time.time()
self.step_count = 0
# Calculate input context size
input_context_size = self._calculate_context_size(old_items)
run_data = {
"session_id": self.session_id,
"run_id": self.run_id,
"start_time": self.run_start_time,
"input_context_size": input_context_size,
"num_existing_messages": len(old_items)
"num_existing_messages": len(old_items),
}
# Log trajectory if opted in
if self.log_trajectory:
trajectory = self._extract_trajectory(old_items)
if trajectory:
run_data["uploaded_trajectory"] = trajectory
record_event("agent_run_start", run_data)
async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
async def on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Called at the end of an agent run loop."""
if not is_telemetry_enabled() or not self.run_start_time:
return
run_duration = time.time() - self.run_start_time
run_data = {
"session_id": self.session_id,
"run_id": self.run_id,
"end_time": time.time(),
"duration_seconds": run_duration,
"num_steps": self.step_count,
"total_usage": self.total_usage.copy()
"total_usage": self.total_usage.copy(),
}
# Log trajectory if opted in
if self.log_trajectory:
trajectory = self._extract_trajectory(new_items)
if trajectory:
run_data["uploaded_trajectory"] = trajectory
record_event("agent_run_end", run_data)
async def on_usage(self, usage: Dict[str, Any]) -> None:
"""Called when usage information is received."""
if not is_telemetry_enabled():
return
# Accumulate usage stats
self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
self.total_usage["total_tokens"] += usage.get("total_tokens", 0)
self.total_usage["response_cost"] += usage.get("response_cost", 0.0)
# Record individual usage event
usage_data = {
"session_id": self.session_id,
"run_id": self.run_id,
"step": self.step_count,
**usage
**usage,
}
record_event("agent_usage", usage_data)
async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
"""Called when responses are received."""
if not is_telemetry_enabled():
return
self.step_count += 1
step_duration = None
if self.step_start_time:
step_duration = time.time() - self.step_start_time
self.step_start_time = time.time()
step_data = {
"session_id": self.session_id,
"run_id": self.run_id,
"step": self.step_count,
"timestamp": self.step_start_time
"timestamp": self.step_start_time,
}
if step_duration is not None:
step_data["duration_seconds"] = step_duration
record_event("agent_step", step_data)
def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int:
"""Calculate approximate context size in tokens/characters."""
total_size = 0
for item in items:
if item.get("type") == "message" and "content" in item:
content = item["content"]
@@ -185,25 +189,27 @@ class TelemetryCallback(AsyncCallbackHandler):
total_size += len(part["text"])
elif "content" in item and isinstance(item["content"], str):
total_size += len(item["content"])
return total_size
def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Extract trajectory items that should be logged."""
trajectory = []
for item in items:
# Include user messages, assistant messages, reasoning, computer calls, and computer outputs
if (
item.get("role") == "user" or # User inputs
(item.get("type") == "message" and item.get("role") == "assistant") or # Model outputs
item.get("type") == "reasoning" or # Reasoning traces
item.get("type") == "computer_call" or # Computer actions
item.get("type") == "computer_call_output" # Computer outputs
item.get("role") == "user" # User inputs
or (
item.get("type") == "message" and item.get("role") == "assistant"
) # Model outputs
or item.get("type") == "reasoning" # Reasoning traces
or item.get("type") == "computer_call" # Computer actions
or item.get("type") == "computer_call_output" # Computer outputs
):
# Create a copy of the item with timestamp
trajectory_item = item.copy()
trajectory_item["logged_at"] = time.time()
trajectory.append(trajectory_item)
return trajectory
return trajectory
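The "input context size" reported by this callback is a plain character count over message text. A standalone sketch of the same idea (my own function roughly mirroring `_calculate_context_size`, not an import from the package):

```python
from typing import Any, Dict, List


def approx_context_size(items: List[Dict[str, Any]]) -> int:
    """Roughly mirror TelemetryCallback._calculate_context_size: count characters of text content."""
    total = 0
    for item in items:
        content = item.get("content")
        if item.get("type") == "message" and isinstance(content, list):
            for part in content:
                if isinstance(part, dict) and "text" in part:
                    total += len(part["text"])
        elif isinstance(content, str):
            total += len(content)
    return total


history = [
    {"role": "user", "content": "Open the settings page"},
    {"type": "message", "role": "assistant",
     "content": [{"type": "output_text", "text": "Opening settings now."}]},
]
print(approx_context_size(history))  # 22 + 21 = 43
```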
View File
@@ -2,26 +2,28 @@
Trajectory saving callback handler for ComputerAgent.
"""
import os
import json
import uuid
from datetime import datetime
import base64
from pathlib import Path
from typing import List, Dict, Any, Optional, Union, override
from PIL import Image, ImageDraw
import io
import json
import os
import uuid
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, override
from PIL import Image, ImageDraw
from .base import AsyncCallbackHandler
def sanitize_image_urls(data: Any) -> Any:
"""
Recursively search for 'image_url' keys and set their values to '[omitted]'.
Args:
data: Any data structure (dict, list, or primitive type)
Returns:
A deep copy of the data with all 'image_url' values replaced with '[omitted]'
"""
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
# Recursively sanitize the value
sanitized[key] = sanitize_image_urls(value)
return sanitized
elif isinstance(data, list):
# Recursively sanitize each item in the list
return [sanitize_image_urls(item) for item in data]
else:
# For primitive types (str, int, bool, None, etc.), return as-is
return data
def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
def extract_computer_call_outputs(
items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
) -> List[Dict[str, Any]]:
"""
Save any base64-encoded screenshots from computer_call_output entries to files and
replace their image_url with the saved file path when a call_id is present.
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
updated.append(msg)
return updated
class TrajectorySaverCallback(AsyncCallbackHandler):
"""
Callback handler that saves agent trajectories to disk.
Saves each run as a separate trajectory with unique ID, and each turn
within the trajectory gets its own folder with screenshots and responses.
"""
def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
def __init__(
self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
):
"""
Initialize trajectory saver.
Args:
trajectory_dir: Base directory to save trajectories
reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -129,7 +136,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
self.reset_on_run = reset_on_run
# Optional directory to store extracted screenshots from metadata/new_items
self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
# Ensure trajectory directory exists
self.trajectory_dir.mkdir(parents=True, exist_ok=True)
@@ -137,7 +144,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
"""Get the directory for the current turn."""
if not self.trajectory_id:
raise ValueError("Trajectory not initialized - call _on_run_start first")
# format: trajectory_id/turn_000
turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
turn_dir.mkdir(parents=True, exist_ok=True)
@@ -166,6 +173,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
def _update_usage(self, usage: Dict[str, Any]) -> None:
"""Update total usage statistics."""
def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
for key, value in source.items():
if isinstance(value, dict):
@@ -176,20 +184,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
if key not in target:
target[key] = 0
target[key] += value
add_dicts(self.total_usage, usage)
@override
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Initialize trajectory tracking for a new run."""
model = kwargs.get("model", "unknown")
# Only reset trajectory state if reset_on_run is True or no trajectory exists
if self.reset_on_run or not self.trajectory_id:
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
if "+" in model:
model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
# strip non-alphanumeric characters from model_name_short
model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
# id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
now = datetime.now()
@@ -198,11 +207,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
self.current_artifact = 0
self.model = model
self.total_usage = {}
# Create trajectory directory
trajectory_path = self.trajectory_dir / self.trajectory_id
trajectory_path.mkdir(parents=True, exist_ok=True)
# Save trajectory metadata (optionally extract screenshots to screenshot_dir)
kwargs_to_save = kwargs.copy()
try:
@@ -219,7 +228,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
"status": "running",
"kwargs": kwargs_to_save,
}
with open(trajectory_path / "metadata.json", "w") as f:
json.dump(metadata, f, indent=2)
else:
@@ -227,22 +236,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
self.model = model
@override
async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
async def on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Finalize run tracking by updating metadata with completion status, usage, and new items."""
if not self.trajectory_id:
return
# Update metadata with completion status, total usage, and new items
trajectory_path = self.trajectory_dir / self.trajectory_id
metadata_path = trajectory_path / "metadata.json"
# Read existing metadata
if metadata_path.exists():
with open(metadata_path, "r") as f:
metadata = json.load(f)
else:
metadata = {}
# Update metadata with completion info
# Optionally extract screenshots from new_items before persisting
new_items_to_save = new_items
@@ -251,32 +265,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
except Exception:
pass
metadata.update({
"status": "completed",
"completed_at": str(uuid.uuid1().time),
"total_usage": self.total_usage,
"new_items": new_items_to_save,
"total_turns": self.current_turn
})
metadata.update(
{
"status": "completed",
"completed_at": str(uuid.uuid1().time),
"total_usage": self.total_usage,
"new_items": new_items_to_save,
"total_turns": self.current_turn,
}
)
# Save updated metadata
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
@override
@override
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
if not self.trajectory_id:
return
self._save_artifact("api_start", { "kwargs": kwargs })
self._save_artifact("api_start", {"kwargs": kwargs})
@override
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
"""Save API call result."""
if not self.trajectory_id:
return
self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
@override
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -295,77 +311,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
"""Save responses to the current turn directory and update usage statistics."""
if not self.trajectory_id:
return
# Save responses
turn_dir = self._get_turn_dir()
response_data = {
"timestamp": str(uuid.uuid1().time),
"model": self.model,
"kwargs": kwargs,
"response": responses
"response": responses,
}
self._save_artifact("agent_response", response_data)
# Increment turn counter
self.current_turn += 1
def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
"""
Draw a red dot and crosshair at the specified coordinates on the image.
Args:
image_bytes: The original image as bytes
x: X coordinate for the crosshair
y: Y coordinate for the crosshair
Returns:
Modified image as bytes with red dot and crosshair
"""
# Open the image
image = Image.open(io.BytesIO(image_bytes))
draw = ImageDraw.Draw(image)
# Draw crosshair lines (red, 2px thick)
crosshair_size = 20
line_width = 2
color = "red"
# Horizontal line
draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
# Vertical line
draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
# Draw center dot (filled circle)
dot_radius = 3
draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
draw.ellipse(
[(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
)
# Convert back to bytes
output = io.BytesIO()
image.save(output, format='PNG')
image.save(output, format="PNG")
return output.getvalue()
@override
async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
async def on_computer_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""
Called when a computer call has completed.
Saves screenshots and computer call output.
"""
if not self.trajectory_id:
return
self._save_artifact("computer_call_result", { "item": item, "result": result })
self._save_artifact("computer_call_result", {"item": item, "result": result})
# Check if action has x/y coordinates and there's a screenshot in the result
action = item.get("action", {})
if "x" in action and "y" in action:
# Look for screenshot in the result
for result_item in result:
if (result_item.get("type") == "computer_call_output" and
result_item.get("output", {}).get("type") == "input_image"):
if (
result_item.get("type") == "computer_call_output"
and result_item.get("output", {}).get("type") == "input_image"
):
image_url = result_item["output"]["image_url"]
# Extract base64 image data
if image_url.startswith("data:image/"):
# Format: data:image/png;base64,<base64_data>
@@ -373,26 +395,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
else:
# Assume it's just base64 data
base64_data = image_url
try:
# Decode the image
image_bytes = base64.b64decode(base64_data)
# Draw crosshair at the action coordinates
annotated_image = self._draw_crosshair_on_image(
image_bytes,
int(action["x"]),
int(action["y"])
image_bytes, int(action["x"]), int(action["y"])
)
# Save as screenshot_action
self._save_artifact("screenshot_action", annotated_image)
except Exception as e:
# If annotation fails, just log and continue
print(f"Failed to annotate screenshot: {e}")
break # Only process the first screenshot found
# Increment turn counter
self.current_turn += 1
self.current_turn += 1
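The crosshair annotation above is plain Pillow drawing, so the same technique can be shown without any agent dependencies. A self-contained sketch on a generated stand-in screenshot (file name and coordinates are arbitrary); in practice the CLI enables the callback by passing `trajectory_dir="trajectories"`:

```python
import io

from PIL import Image, ImageDraw


def draw_crosshair(png_bytes: bytes, x: int, y: int) -> bytes:
    """Red crosshair plus centre dot, as TrajectorySaverCallback does for click actions."""
    image = Image.open(io.BytesIO(png_bytes))
    draw = ImageDraw.Draw(image)
    size, width, color = 20, 2, "red"
    draw.line([(x - size, y), (x + size, y)], fill=color, width=width)  # horizontal bar
    draw.line([(x, y - size), (x, y + size)], fill=color, width=width)  # vertical bar
    draw.ellipse([(x - 3, y - 3), (x + 3, y + 3)], fill=color)          # centre dot
    out = io.BytesIO()
    image.save(out, format="PNG")
    return out.getvalue()


# Generate a blank 200x200 "screenshot" and annotate the point (120, 80).
buf = io.BytesIO()
Image.new("RGB", (200, 200), "white").save(buf, format="PNG")
with open("screenshot_action.png", "wb") as f:
    f.write(draw_crosshair(buf.getvalue(), 120, 80))
```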
View File
@@ -3,7 +3,7 @@ CLI chat interface for agent - Computer Use Agent
Usage:
python -m agent.cli <model_string>
Examples:
python -m agent.cli openai/computer-use-preview
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
@@ -11,19 +11,22 @@ Examples:
"""
try:
import asyncio
import argparse
import os
import sys
import json
from typing import List, Dict, Any
import dotenv
import asyncio
import base64
import time
import json
import os
import platform
import sys
import time
from pathlib import Path
from typing import Any, Dict, List
import dotenv
try:
from PIL import Image, ImageDraw
PIL_AVAILABLE = True
except Exception:
PIL_AVAILABLE = False
@@ -31,36 +34,44 @@ try:
except ImportError:
if __name__ == "__main__":
raise ImportError(
"CLI dependencies not found. "
"Please install with: pip install \"cua-agent[cli]\""
"CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
)
# Load environment variables
dotenv.load_dotenv()
# Color codes for terminal output
class Colors:
RESET = '\033[0m'
BOLD = '\033[1m'
DIM = '\033[2m'
# Text colors
RED = '\033[31m'
GREEN = '\033[32m'
YELLOW = '\033[33m'
BLUE = '\033[34m'
MAGENTA = '\033[35m'
CYAN = '\033[36m'
WHITE = '\033[37m'
GRAY = '\033[90m'
# Background colors
BG_RED = '\033[41m'
BG_GREEN = '\033[42m'
BG_YELLOW = '\033[43m'
BG_BLUE = '\033[44m'
RESET = "\033[0m"
BOLD = "\033[1m"
DIM = "\033[2m"
def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
# Text colors
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
BLUE = "\033[34m"
MAGENTA = "\033[35m"
CYAN = "\033[36m"
WHITE = "\033[37m"
GRAY = "\033[90m"
# Background colors
BG_RED = "\033[41m"
BG_GREEN = "\033[42m"
BG_YELLOW = "\033[43m"
BG_BLUE = "\033[44m"
def print_colored(
text: str,
color: str = "",
bold: bool = False,
dim: bool = False,
end: str = "\n",
right: str = "",
):
"""Print colored text to terminal with optional right-aligned text."""
prefix = ""
if bold:
@@ -69,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
prefix += Colors.DIM
if color:
prefix += color
if right:
# Get terminal width (default to 80 if unable to determine)
try:
import shutil
terminal_width = shutil.get_terminal_size().columns
except:
terminal_width = 80
# Add right margin
terminal_width -= 1
# Calculate padding needed
# Account for ANSI escape codes not taking visual space
visible_left_len = len(text)
visible_right_len = len(right)
padding = terminal_width - visible_left_len - visible_right_len
if padding > 0:
output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
else:
@@ -94,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
output = f"{prefix}{text} {right}{Colors.RESET}"
else:
output = f"{prefix}{text}{Colors.RESET}"
print(output, end=end)
@@ -113,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
args_str = f"('{details['text']}')"
elif action_type == "scroll" and "x" in details and "y" in details:
args_str = f"({details['x']}, {details['y']})"
if total_cost > 0:
print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
else:
print_colored(f"🛠️ {action_type}{args_str}", dim=True)
def print_welcome(model: str, agent_loop: str, container_name: str):
"""Print welcome message."""
print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
print_colored("Type 'exit' to quit.", dim=True)
async def ainput(prompt: str = ""):
return await asyncio.to_thread(input, prompt)
async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
async def chat_loop(
agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
):
"""Main chat loop with the agent."""
print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
history = []
if initial_prompt:
history.append({"role": "user", "content": initial_prompt})
total_cost = 0
while True:
@@ -143,28 +160,28 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
# Get user input with prompt
print_colored("> ", end="")
user_input = await ainput()
if user_input.lower() in ['exit', 'quit', 'q']:
if user_input.lower() in ["exit", "quit", "q"]:
print_colored("\n👋 Goodbye!")
break
if not user_input:
continue
# Add user message to history
history.append({"role": "user", "content": user_input})
# Stream responses from the agent with spinner
with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
spinner.hide()
async for result in agent.run(history):
# Add agent responses to history
history.extend(result.get("output", []))
if show_usage:
total_cost += result.get("usage", {}).get("response_cost", 0)
# Process and display the output
for item in result.get("output", []):
if item.get("type") == "message" and item.get("role") == "assistant":
@@ -176,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
if text:
spinner.hide()
print_colored(text)
elif item.get("type") == "computer_call":
# Display computer action
action = item.get("action", {})
@@ -186,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
print_action(action_type, action, total_cost)
spinner.text = f"Performing {action_type}..."
spinner.show()
elif item.get("type") == "function_call":
# Display function call
function_name = item.get("name", "")
@@ -194,18 +211,18 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
print_colored(f"🔧 Calling function: {function_name}", dim=True)
spinner.text = f"Calling {function_name}..."
spinner.show()
elif item.get("type") == "function_call_output":
# Display function output (dimmed)
output = item.get("output", "")
if output and len(output.strip()) > 0:
spinner.hide()
print_colored(f"📤 {output}", dim=True)
spinner.hide()
if show_usage and total_cost > 0:
print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
async def main():
"""Main CLI function."""
@@ -218,90 +235,74 @@ Examples:
python -m agent.cli anthropic/claude-3-5-sonnet-20241022
python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
"""
""",
)
parser.add_argument(
"model",
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')",
)
parser.add_argument(
"--provider",
choices=["cloud", "lume", "winsandbox", "docker"],
default="cloud",
help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
)
parser.add_argument(
"--images",
type=int,
default=3,
help="Number of recent images to keep in context (default: 3)"
)
parser.add_argument(
"--trajectory",
action="store_true",
help="Save trajectory for debugging"
)
parser.add_argument(
"--budget",
type=float,
help="Maximum budget for the session (in dollars)"
)
parser.add_argument(
"--verbose",
action="store_true",
help="Enable verbose logging"
help="Number of recent images to keep in context (default: 3)",
)
parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")
parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")
parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
parser.add_argument(
"-p", "--prompt",
"-p",
"--prompt",
type=str,
help="Initial prompt to send to the agent. Leave blank for interactive mode."
help="Initial prompt to send to the agent. Leave blank for interactive mode.",
)
parser.add_argument(
"--prompt-file",
type=Path,
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
)
parser.add_argument(
"--predict-click",
dest="predict_click",
type=str,
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
)
parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")
parser.add_argument(
"-u", "--usage", action="store_true", help="Show total cost of the agent runs"
)
parser.add_argument(
"-c", "--cache",
action="store_true",
help="Tell the API to enable caching"
)
parser.add_argument(
"-u", "--usage",
action="store_true",
help="Show total cost of the agent runs"
)
parser.add_argument(
"-r", "--max-retries",
"-r",
"--max-retries",
type=int,
default=3,
help="Maximum number of retries for the LLM API calls"
help="Maximum number of retries for the LLM API calls",
)
args = parser.parse_args()
# Check for required environment variables
container_name = os.getenv("CUA_CONTAINER_NAME")
cua_api_key = os.getenv("CUA_API_KEY")
# Prompt for missing environment variables (container name always required)
if not container_name:
if args.provider == "cloud":
@@ -321,13 +322,13 @@ Examples:
if not cua_api_key:
print_colored("❌ API key is required for cloud provider.")
sys.exit(1)
# Check for provider-specific API keys based on model
provider_api_keys = {
"openai/": "OPENAI_API_KEY",
"anthropic/": "ANTHROPIC_API_KEY",
}
# Find matching provider and check for API key
for prefix, env_var in provider_api_keys.items():
if prefix in args.model:
@@ -340,7 +341,7 @@ Examples:
# Set the environment variable for the session
os.environ[env_var] = api_key
break
# Import here to avoid import errors if dependencies are missing
try:
from agent import ComputerAgent
@@ -349,7 +350,7 @@ Examples:
print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
sys.exit(1)
# Resolve provider -> os_type, provider_type, api key requirement
provider_map = {
"cloud": ("linux", "cloud", True),
@@ -365,42 +366,46 @@ Examples:
"name": container_name,
}
if needs_api_key:
computer_kwargs["api_key"] = cua_api_key # type: ignore
computer_kwargs["api_key"] = cua_api_key # type: ignore
# Create computer instance
async with Computer(**computer_kwargs) as computer: # type: ignore
async with Computer(**computer_kwargs) as computer: # type: ignore
# Create agent
agent_kwargs = {
"model": args.model,
"tools": [computer],
"trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
"trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
"verbosity": 20 if args.verbose else 30, # DEBUG vs WARNING
"max_retries": args.max_retries
"max_retries": args.max_retries,
}
if args.images > 0:
agent_kwargs["only_n_most_recent_images"] = args.images
if args.trajectory:
agent_kwargs["trajectory_dir"] = "trajectories"
if args.budget:
agent_kwargs["max_trajectory_budget"] = {
"max_budget": args.budget,
"raise_error": True,
"reset_after_each_run": False
"reset_after_each_run": False,
}
if args.cache:
agent_kwargs["use_prompt_caching"] = True
agent = ComputerAgent(**agent_kwargs)
# If predict-click mode is requested, run once and exit
if args.predict_click:
if not PIL_AVAILABLE:
print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
print_colored(
"❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
Colors.RED,
bold=True,
)
sys.exit(1)
instruction = args.predict_click
@@ -435,6 +440,7 @@ Examples:
try:
from io import BytesIO
with Image.open(BytesIO(img_bytes)) as img:
img = img.convert("RGB")
draw = ImageDraw.Draw(img)
@@ -457,9 +463,9 @@ Examples:
if system == "windows":
os.startfile(str(out_path)) # type: ignore[attr-defined]
elif system == "darwin":
os.system(f"open \"{out_path}\"")
os.system(f'open "{out_path}"')
else:
os.system(f"xdg-open \"{out_path}\"")
os.system(f'xdg-open "{out_path}"')
except Exception:
pass
except Exception as e:
@@ -482,9 +488,8 @@ Examples:
await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
if __name__ == "__main__":
try:
asyncio.run(main())
except (KeyboardInterrupt, EOFError) as _:
print_colored("\n\n👋 Goodbye!")
print_colored("\n\n👋 Goodbye!")
View File
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
Computer library interface.
"""
from computer import Computer as cuaComputer
from .base import AsyncComputerHandler
from .cua import cuaComputerHandler
from .custom import CustomComputerHandler
from computer import Computer as cuaComputer
def is_agent_computer(computer):
"""Check if the given computer is a ComputerHandler or CUA Computer."""
return isinstance(computer, AsyncComputerHandler) or \
isinstance(computer, cuaComputer) or \
(isinstance(computer, dict)) #and "screenshot" in computer)
return (
isinstance(computer, AsyncComputerHandler)
or isinstance(computer, cuaComputer)
or (isinstance(computer, dict))
) # and "screenshot" in computer)
async def make_computer_handler(computer):
"""
Create a computer handler from a computer interface.
Args:
computer: Either a ComputerHandler instance, Computer instance, or dict of functions
Returns:
ComputerHandler: A computer handler instance
Raises:
ValueError: If the computer type is not supported
"""
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
return computer_handler
if isinstance(computer, dict):
return CustomComputerHandler(computer)
raise ValueError(f"Unsupported computer type: {type(computer)}")
raise ValueError(f"Unsupported computer type: {type(computer)}")
View File
@@ -2,69 +2,78 @@
Base computer interface protocol for agent interactions.
"""
from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
from typing import (
Any,
Dict,
List,
Literal,
Optional,
Protocol,
Union,
runtime_checkable,
)
@runtime_checkable
class AsyncComputerHandler(Protocol):
"""Protocol defining the interface for computer interactions."""
# ==== Computer-Use-Preview Action Space ====
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
...
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
...
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
...
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
...
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
...
async def type(self, text: str) -> None:
"""Type text."""
...
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
...
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
...
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
...
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
...
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
...
# ==== Anthropic Action Space ====
# ==== Anthropic Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
...
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
...
View File
@@ -3,24 +3,27 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
"""
import base64
from typing import Dict, List, Any, Literal, Union, Optional
from .base import AsyncComputerHandler
from typing import Any, Dict, List, Literal, Optional, Union
from computer import Computer
from .base import AsyncComputerHandler
class cuaComputerHandler(AsyncComputerHandler):
"""Computer handler that implements the Computer protocol using the computer interface."""
def __init__(self, cua_computer: Computer):
"""Initialize with a computer interface (from tool schema)."""
self.cua_computer = cua_computer
self.interface = None
async def _initialize(self):
if hasattr(self.cua_computer, '_initialized') and not self.cua_computer._initialized:
if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
await self.cua_computer.run()
self.interface = self.cua_computer.interface
# ==== Computer-Use-Preview Action Space ====
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
@@ -32,13 +35,13 @@ class cuaComputerHandler(AsyncComputerHandler):
assert self.interface is not None
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode('utf-8')
return base64.b64encode(screenshot_bytes).decode("utf-8")
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
assert self.interface is not None
@@ -49,34 +52,35 @@ class cuaComputerHandler(AsyncComputerHandler):
else:
# Default to left click for unknown buttons
await self.interface.left_click(x, y)
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
assert self.interface is not None
await self.interface.double_click(x, y)
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
assert self.interface is not None
await self.interface.move_cursor(x, y)
await self.interface.scroll(scroll_x, scroll_y)
async def type(self, text: str) -> None:
"""Type text."""
assert self.interface is not None
await self.interface.type_text(text)
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
assert self.interface is not None
import asyncio
await asyncio.sleep(ms / 1000.0)
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
assert self.interface is not None
await self.interface.move_cursor(x, y)
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
assert self.interface is not None
@@ -87,38 +91,38 @@ class cuaComputerHandler(AsyncComputerHandler):
else:
# Handle key combinations
await self.interface.hotkey(*keys)
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
assert self.interface is not None
if not path:
return
# Start drag from first point
start = path[0]
await self.interface.mouse_down(start["x"], start["y"])
# Move through path
for point in path[1:]:
await self.interface.move_cursor(point["x"], point["y"])
# End drag at last point
end = path[-1]
await self.interface.mouse_up(end["x"], end["y"])
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
# This would need to be implemented based on the specific browser interface
# For now, return empty string
return ""
# ==== Anthropic Computer Action Space ====
# ==== Anthropic Computer Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
assert self.interface is not None
await self.interface.mouse_down(x, y, button="left")
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
assert self.interface is not None
await self.interface.mouse_up(x, y, button="left")
await self.interface.mouse_up(x, y, button="left")
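A minimal usage sketch of the handler above (the import path for `cuaComputerHandler` and the default `Computer()` constructor are assumptions, not part of this diff):

```python
# Minimal sketch: wrap a Computer in the handler and drive it directly.
# Assumptions: Computer() works with defaults; the handler's module path is a guess.
import asyncio

from computer import Computer  # same import as in the diff above
from agent.computers.cua import cuaComputerHandler  # hypothetical import path


async def main():
    handler = cuaComputerHandler(Computer())  # wraps the computer interface
    await handler._initialize()               # presumably called once before use
    screenshot_b64 = await handler.screenshot()  # base64-encoded PNG string
    await handler.click(100, 200, button="left")
    print(f"screenshot length: {len(screenshot_b64)}")


asyncio.run(main())
```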

View File

@@ -3,47 +3,49 @@ Custom computer handler implementation that accepts a dictionary of functions.
"""
import base64
from typing import Dict, List, Any, Literal, Union, Optional, Callable
from PIL import Image
import io
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from PIL import Image
from .base import AsyncComputerHandler
class CustomComputerHandler(AsyncComputerHandler):
"""Computer handler that implements the Computer protocol using a dictionary of custom functions."""
def __init__(self, functions: Dict[str, Callable]):
"""
Initialize with a dictionary of functions.
Args:
functions: Dictionary where keys are method names and values are callable functions.
Only 'screenshot' is required, all others are optional.
Raises:
ValueError: If required 'screenshot' function is not provided.
"""
if 'screenshot' not in functions:
if "screenshot" not in functions:
raise ValueError("'screenshot' function is required in functions dictionary")
self.functions = functions
self._last_screenshot_size: Optional[tuple[int, int]] = None
async def _call_function(self, func, *args, **kwargs):
"""
Call a function, handling both async and sync functions.
Args:
func: The function to call
*args: Positional arguments to pass to the function
**kwargs: Keyword arguments to pass to the function
Returns:
The result of the function call
"""
import asyncio
import inspect
if callable(func):
if inspect.iscoroutinefunction(func):
return await func(*args, **kwargs)
@@ -51,14 +53,14 @@ class CustomComputerHandler(AsyncComputerHandler):
return func(*args, **kwargs)
else:
return func
async def _get_value(self, attribute: str):
"""
Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.
Args:
attribute: The attribute name to look for
Returns:
The value from the functions dict, called if callable, returned directly if not
"""
@@ -66,20 +68,20 @@ class CustomComputerHandler(AsyncComputerHandler):
get_key = f"get_{attribute}"
if get_key in self.functions:
return await self._call_function(self.functions[get_key])
# Check for '{attribute}'
# Check for '{attribute}'
if attribute in self.functions:
return await self._call_function(self.functions[attribute])
return None
def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
"""
Convert image to base64 string.
Args:
img: Image as bytes, PIL Image, or base64 string
Returns:
str: Base64 encoded image string
"""
@@ -88,43 +90,43 @@ class CustomComputerHandler(AsyncComputerHandler):
return img
elif isinstance(img, bytes):
# Raw bytes
return base64.b64encode(img).decode('utf-8')
return base64.b64encode(img).decode("utf-8")
elif isinstance(img, Image.Image):
# PIL Image
buffer = io.BytesIO()
img.save(buffer, format='PNG')
return base64.b64encode(buffer.getvalue()).decode('utf-8')
img.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
raise ValueError(f"Unsupported image type: {type(img)}")
# ==== Computer-Use-Preview Action Space ====
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
result = await self._get_value('environment')
result = await self._get_value("environment")
if result is None:
return "linux"
assert result in ["windows", "mac", "linux", "browser"]
return result # type: ignore
return result # type: ignore
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
result = await self._get_value('dimensions')
result = await self._get_value("dimensions")
if result is not None:
return result # type: ignore
return result # type: ignore
# Fallback: use last screenshot size if available
if not self._last_screenshot_size:
await self.screenshot()
assert self._last_screenshot_size is not None, "Failed to get screenshot size"
return self._last_screenshot_size
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
result = await self._call_function(self.functions['screenshot'])
b64_str = self._to_b64_str(result) # type: ignore
result = await self._call_function(self.functions["screenshot"])
b64_str = self._to_b64_str(result) # type: ignore
# Try to extract dimensions for fallback use
try:
if isinstance(result, Image.Image):
@@ -136,74 +138,75 @@ class CustomComputerHandler(AsyncComputerHandler):
except Exception:
# If we can't get dimensions, that's okay
pass
return b64_str
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
if 'click' in self.functions:
await self._call_function(self.functions['click'], x, y, button)
if "click" in self.functions:
await self._call_function(self.functions["click"], x, y, button)
# No-op if not implemented
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
if 'double_click' in self.functions:
await self._call_function(self.functions['double_click'], x, y)
if "double_click" in self.functions:
await self._call_function(self.functions["double_click"], x, y)
# No-op if not implemented
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
if 'scroll' in self.functions:
await self._call_function(self.functions['scroll'], x, y, scroll_x, scroll_y)
if "scroll" in self.functions:
await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
# No-op if not implemented
async def type(self, text: str) -> None:
"""Type text."""
if 'type' in self.functions:
await self._call_function(self.functions['type'], text)
if "type" in self.functions:
await self._call_function(self.functions["type"], text)
# No-op if not implemented
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
if 'wait' in self.functions:
await self._call_function(self.functions['wait'], ms)
if "wait" in self.functions:
await self._call_function(self.functions["wait"], ms)
else:
# Default implementation
import asyncio
await asyncio.sleep(ms / 1000.0)
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
if 'move' in self.functions:
await self._call_function(self.functions['move'], x, y)
if "move" in self.functions:
await self._call_function(self.functions["move"], x, y)
# No-op if not implemented
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
if 'keypress' in self.functions:
await self._call_function(self.functions['keypress'], keys)
if "keypress" in self.functions:
await self._call_function(self.functions["keypress"], keys)
# No-op if not implemented
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
if 'drag' in self.functions:
await self._call_function(self.functions['drag'], path)
if "drag" in self.functions:
await self._call_function(self.functions["drag"], path)
# No-op if not implemented
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
if 'get_current_url' in self.functions:
return await self._get_value('current_url') # type: ignore
if "get_current_url" in self.functions:
return await self._get_value("current_url") # type: ignore
return "" # Default fallback
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
if 'left_mouse_down' in self.functions:
await self._call_function(self.functions['left_mouse_down'], x, y)
if "left_mouse_down" in self.functions:
await self._call_function(self.functions["left_mouse_down"], x, y)
# No-op if not implemented
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
if 'left_mouse_up' in self.functions:
await self._call_function(self.functions['left_mouse_up'], x, y)
if "left_mouse_up" in self.functions:
await self._call_function(self.functions["left_mouse_up"], x, y)
# No-op if not implemented
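A minimal sketch of constructing the custom handler above with only the required 'screenshot' function; every unimplemented action is a no-op. The import path is an assumption:

```python
# Sketch: only 'screenshot' is supplied, so click() and friends are no-ops.
import asyncio

from PIL import Image

from agent.computers.custom import CustomComputerHandler  # hypothetical import path


def take_screenshot():
    # Sync callables are fine; _call_function only awaits coroutine functions.
    return Image.new("RGB", (1024, 768), "white")


async def main():
    handler = CustomComputerHandler({"screenshot": take_screenshot})
    b64_png = await handler.screenshot()             # base64 PNG string
    width, height = await handler.get_dimensions()   # falls back to screenshot size
    await handler.click(10, 10)                      # no-op: 'click' not provided
    print(width, height, len(b64_png))


asyncio.run(main())
```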

View File

@@ -3,47 +3,56 @@ Decorators for agent - agent_loop decorator
"""
from typing import List, Optional
from .types import AgentConfigInfo
# Global registry
_agent_configs: List[AgentConfigInfo] = []
def register_agent(models: str, priority: int = 0):
"""
Decorator to register an AsyncAgentConfig class.
Args:
models: Regex pattern to match supported models
priority: Priority for agent selection (higher = more priority)
"""
def decorator(agent_class: type):
# Validate that the class implements AsyncAgentConfig protocol
if not hasattr(agent_class, 'predict_step'):
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
if not hasattr(agent_class, 'predict_click'):
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
if not hasattr(agent_class, 'get_capabilities'):
raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")
if not hasattr(agent_class, "predict_step"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement predict_step method"
)
if not hasattr(agent_class, "predict_click"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement predict_click method"
)
if not hasattr(agent_class, "get_capabilities"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement get_capabilities method"
)
# Register the agent config
config_info = AgentConfigInfo(
agent_class=agent_class,
models_regex=models,
priority=priority
agent_class=agent_class, models_regex=models, priority=priority
)
_agent_configs.append(config_info)
# Sort by priority (highest first)
_agent_configs.sort(key=lambda x: x.priority, reverse=True)
return agent_class
return decorator
def get_agent_configs() -> List[AgentConfigInfo]:
"""Get all registered agent configs"""
return _agent_configs.copy()
def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
"""Find the best matching agent config for a model"""
for config_info in _agent_configs:
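A sketch of how a config class might be registered with the decorator above; the import path, model regex, and method bodies are placeholders, and only the three checked attributes are required:

```python
# Sketch: any class exposing predict_step, predict_click, and get_capabilities
# passes the decorator's checks and is added to the registry by priority.
from agent.decorators import register_agent  # hypothetical import path


@register_agent(models=r"my-provider/.*", priority=10)
class MyAgentConfig:
    async def predict_step(self, *args, **kwargs):
        raise NotImplementedError  # placeholder body

    async def predict_click(self, *args, **kwargs):
        raise NotImplementedError  # placeholder body

    def get_capabilities(self):
        return ["step", "click"]  # assumed capability labels
```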

View File

@@ -12,7 +12,7 @@ Components:
Usage:
# Run the server and UI
python -m agent.human_tool
# Or run components separately
python -m agent.human_tool.server # API server only
python -m agent.human_tool.ui # UI only
@@ -21,9 +21,4 @@ Usage:
from .server import CompletionQueue, completion_queue
from .ui import HumanCompletionUI, create_ui
__all__ = [
"CompletionQueue",
"completion_queue",
"HumanCompletionUI",
"create_ui"
]
__all__ = ["CompletionQueue", "completion_queue", "HumanCompletionUI", "create_ui"]

View File

@@ -8,6 +8,7 @@ with a Gradio UI for human interaction.
import gradio as gr
from fastapi import FastAPI
from .server import app as fastapi_app
from .ui import create_ui
@@ -18,6 +19,7 @@ gradio_demo = create_ui()
CUSTOM_PATH = "/gradio"
app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)
# Add a redirect from root to Gradio UI
@fastapi_app.get("/")
async def redirect_to_ui():
@@ -25,14 +27,16 @@ async def redirect_to_ui():
return {
"message": "Human Completion Server is running",
"ui_url": "/gradio",
"api_docs": "/docs"
"api_docs": "/docs",
}
if __name__ == "__main__":
import uvicorn
print("🚀 Starting Human-in-the-Loop Completion Server...")
print("📊 API Server: http://localhost:8002")
print("🎨 Gradio UI: http://localhost:8002/gradio")
print("📚 API Docs: http://localhost:8002/docs")
uvicorn.run(app, host="0.0.0.0", port=8002)

View File

@@ -1,9 +1,9 @@
import asyncio
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from enum import Enum
from typing import Any, Dict, List, Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
@@ -49,7 +49,7 @@ class CompletionQueue:
self._queue: Dict[str, CompletionCall] = {}
self._pending_order: List[str] = []
self._lock = asyncio.Lock()
async def add_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
"""Add a completion call to the queue."""
async with self._lock:
@@ -59,42 +59,47 @@ class CompletionQueue:
messages=messages,
model=model,
status=CompletionStatus.PENDING,
created_at=datetime.now()
created_at=datetime.now(),
)
self._queue[call_id] = completion_call
self._pending_order.append(call_id)
return call_id
async def get_pending_calls(self) -> List[Dict[str, Any]]:
"""Get all pending completion calls."""
async with self._lock:
pending_calls = []
for call_id in self._pending_order:
if call_id in self._queue and self._queue[call_id].status == CompletionStatus.PENDING:
if (
call_id in self._queue
and self._queue[call_id].status == CompletionStatus.PENDING
):
call = self._queue[call_id]
pending_calls.append({
"id": call.id,
"model": call.model,
"created_at": call.created_at.isoformat(),
"messages": call.messages
})
pending_calls.append(
{
"id": call.id,
"model": call.model,
"created_at": call.created_at.isoformat(),
"messages": call.messages,
}
)
return pending_calls
async def get_call_status(self, call_id: str) -> Optional[Dict[str, Any]]:
"""Get the status of a specific completion call."""
async with self._lock:
if call_id not in self._queue:
return None
call = self._queue[call_id]
result = {
"id": call.id,
"status": call.status.value,
"created_at": call.created_at.isoformat(),
"model": call.model,
"messages": call.messages
"messages": call.messages,
}
if call.completed_at:
result["completed_at"] = call.completed_at.isoformat()
if call.response:
@@ -103,69 +108,74 @@ class CompletionQueue:
result["tool_calls"] = call.tool_calls
if call.error:
result["error"] = call.error
return result
async def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
async def complete_call(
self,
call_id: str,
response: Optional[str] = None,
tool_calls: Optional[List[Dict[str, Any]]] = None,
) -> bool:
"""Mark a completion call as completed with a response or tool calls."""
async with self._lock:
if call_id not in self._queue:
return False
call = self._queue[call_id]
if call.status != CompletionStatus.PENDING:
return False
call.status = CompletionStatus.COMPLETED
call.completed_at = datetime.now()
call.response = response
call.tool_calls = tool_calls
# Remove from pending order
if call_id in self._pending_order:
self._pending_order.remove(call_id)
return True
async def fail_call(self, call_id: str, error: str) -> bool:
"""Mark a completion call as failed with an error."""
async with self._lock:
if call_id not in self._queue:
return False
call = self._queue[call_id]
if call.status != CompletionStatus.PENDING:
return False
call.status = CompletionStatus.FAILED
call.completed_at = datetime.now()
call.error = error
# Remove from pending order
if call_id in self._pending_order:
self._pending_order.remove(call_id)
return True
async def wait_for_completion(self, call_id: str, timeout: float = 300.0) -> Optional[str]:
"""Wait for a completion call to be completed and return the response."""
start_time = asyncio.get_event_loop().time()
while True:
status = await self.get_call_status(call_id)
if not status:
return None
if status["status"] == CompletionStatus.COMPLETED.value:
return status.get("response")
elif status["status"] == CompletionStatus.FAILED.value:
raise Exception(f"Completion failed: {status.get('error', 'Unknown error')}")
# Check timeout
if asyncio.get_event_loop().time() - start_time > timeout:
await self.fail_call(call_id, "Timeout waiting for human response")
raise TimeoutError("Timeout waiting for human response")
# Wait a bit before checking again
await asyncio.sleep(0.5)
@@ -204,9 +214,7 @@ async def get_status(call_id: str):
async def complete_call(call_id: str, response: CompletionResponse):
"""Complete a call with a human response."""
success = await completion_queue.complete_call(
call_id,
response=response.response,
tool_calls=response.tool_calls
call_id, response=response.response, tool_calls=response.tool_calls
)
if success:
return {"status": "success", "message": "Call completed"}
@@ -219,7 +227,9 @@ async def fail_call(call_id: str, error: Dict[str, str]):
"""Mark a call as failed."""
success = await completion_queue.fail_call(call_id, error.get("error", "Unknown error"))
if not success:
raise HTTPException(status_code=404, detail="Completion call not found or already completed")
raise HTTPException(
status_code=404, detail="Completion call not found or already completed"
)
return {"status": "failed"}
@@ -231,4 +241,5 @@ async def root():
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8002)
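A sketch of the queue API above end to end; the import path is an assumption, and in practice the FastAPI endpoints and the Gradio UI are what drive `complete_call`:

```python
# Sketch: enqueue a completion, complete it, then await the response.
import asyncio

from agent.human_tool.server import completion_queue  # hypothetical import path


async def demo():
    call_id = await completion_queue.add_completion(
        messages=[{"role": "user", "content": "Approve this step?"}],
        model="human",
    )
    # Normally a human answers via the UI; here we complete the call directly.
    await completion_queue.complete_call(call_id, response="Approved")
    answer = await completion_queue.wait_for_completion(call_id, timeout=5.0)
    print(answer)  # -> "Approved"


asyncio.run(demo())
```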

View File

@@ -1,14 +1,17 @@
import gradio as gr
import json
import time
from typing import List, Dict, Any, Optional
from datetime import datetime
import requests
from .server import completion_queue
import base64
import io
import json
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
import gradio as gr
import requests
from PIL import Image
from .server import completion_queue
class HumanCompletionUI:
def __init__(self, server_url: str = "http://localhost:8002"):
self.server_url = server_url
@@ -20,7 +23,7 @@ class HumanCompletionUI:
self.current_button: str = "left"
self.current_scroll_x: int = 0
self.current_scroll_y: int = -120
def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Format messages for display in gr.Chatbot with type='messages'."""
formatted = []
@@ -28,7 +31,7 @@ class HumanCompletionUI:
role = msg.get("role", "user")
content = msg.get("content", "")
tool_calls = msg.get("tool_calls", [])
# Handle different content formats
if isinstance(content, list):
# Multi-modal content - can include text and images
@@ -55,7 +58,7 @@ class HumanCompletionUI:
else:
# For URL images, create gr.Image with URL
formatted_content.append(gr.Image(value=image_url))
# Determine final content format
if len(formatted_content) == 1:
content = formatted_content[0]
@@ -63,28 +66,28 @@ class HumanCompletionUI:
content = formatted_content
else:
content = "[Empty content]"
# Ensure role is valid for Gradio Chatbot
if role not in ["user", "assistant"]:
role = "assistant" if role == "system" else "user"
# Invert roles for better display in human UI context
# (what the AI says becomes "user", what human should respond becomes "assistant")
if role == "user":
role = "assistant"
else:
role = "user"
# Add the main message if it has content
if content and str(content).strip():
formatted.append({"role": role, "content": content})
# Handle tool calls - create separate messages for each tool call
if tool_calls:
for tool_call in tool_calls:
function_name = tool_call.get("function", {}).get("name", "unknown")
arguments_str = tool_call.get("function", {}).get("arguments", "{}")
try:
# Parse arguments to format them nicely
arguments = json.loads(arguments_str)
@@ -92,18 +95,20 @@ class HumanCompletionUI:
except json.JSONDecodeError:
# If parsing fails, use the raw string
formatted_args = arguments_str
# Create a formatted message for the tool call
tool_call_content = f"```json\n{formatted_args}\n```"
formatted.append({
"role": role,
"content": tool_call_content,
"metadata": {"title": f"🛠️ Used {function_name}"}
})
formatted.append(
{
"role": role,
"content": tool_call_content,
"metadata": {"title": f"🛠️ Used {function_name}"},
}
)
return formatted
def get_pending_calls(self) -> List[Dict[str, Any]]:
"""Get pending calls from the server."""
try:
@@ -113,38 +118,39 @@ class HumanCompletionUI:
except Exception as e:
print(f"Error fetching pending calls: {e}")
return []
def complete_call_with_response(self, call_id: str, response: str) -> bool:
"""Complete a call with a text response."""
try:
response_data = {"response": response}
response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False
def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
"""Complete a call with tool calls."""
try:
response_data = {"tool_calls": tool_calls}
response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False
def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:
def complete_call(
self,
call_id: str,
response: Optional[str] = None,
tool_calls: Optional[List[Dict[str, Any]]] = None,
) -> bool:
"""Complete a call with either a response or tool calls."""
try:
response_data = {}
@@ -152,25 +158,23 @@ class HumanCompletionUI:
response_data["response"] = response
if tool_calls:
response_data["tool_calls"] = tool_calls
response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False
def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
"""Extract the last image from the messages for display above conversation."""
last_image = None
for msg in reversed(messages): # Start from the last message
content = msg.get("content", "")
if isinstance(content, list):
for item in reversed(content): # Get the last image in the message
if item.get("type") == "image_url":
@@ -189,13 +193,13 @@ class HumanCompletionUI:
else:
# For URL images, return the URL
return image_url
return last_image
def refresh_pending_calls(self):
"""Refresh the list of pending calls."""
pending_calls = self.get_pending_calls()
if not pending_calls:
return (
gr.update(choices=["latest"], value="latest"), # dropdown
@@ -205,27 +209,27 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)
# Sort pending calls by created_at to get oldest first
sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))
# Create choices for dropdown
choices = [("latest", "latest")] # Add "latest" option first
for call in sorted_calls:
call_id = call["id"]
model = call.get("model", "unknown")
created_at = call.get("created_at", "")
# Format timestamp
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
time_str = dt.strftime("%H:%M:%S")
except:
time_str = created_at
choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
choices.append((choice_label, call_id))
# Default to "latest" which shows the oldest pending conversation
selected_call_id = "latest"
if selected_call_id == "latest" and sorted_calls:
@@ -239,7 +243,7 @@ class HumanCompletionUI:
conversation = []
self.current_call_id = None
self.last_image = None
return (
gr.update(choices=choices, value="latest"),
gr.update(value=self.last_image),
@@ -248,7 +252,7 @@ class HumanCompletionUI:
gr.update(visible=True), # click_actions_group visible when there is a call
gr.update(visible=True), # actions_group visible when there is a call
)
def on_call_selected(self, selected_choice):
"""Handle when a call is selected from the dropdown."""
if not selected_choice:
@@ -259,7 +263,7 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)
pending_calls = self.get_pending_calls()
if not pending_calls:
return (
@@ -269,7 +273,7 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)
# Handle "latest" option
if selected_choice == "latest":
# Sort calls by created_at to get oldest first
@@ -284,17 +288,17 @@ class HumanCompletionUI:
if call_id_short in selected_choice:
call_id = call["id"]
break
if not call_id:
return (
gr.update(value=None), # no image
gr.update(value=[]), # empty chatbot
gr.update(interactive=False)
gr.update(interactive=False),
)
# Find the selected call
selected_call = next((c for c in pending_calls if c["id"] == call_id), None)
if not selected_call:
return (
gr.update(value=None), # no image
@@ -303,12 +307,12 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)
conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
self.current_call_id = call_id
# Get the last image from messages
self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))
return (
gr.update(value=self.last_image),
gr.update(value=conversation),
@@ -316,110 +320,111 @@ class HumanCompletionUI:
gr.update(visible=True), # click_actions_group visible
gr.update(visible=True), # actions_group visible
)
def submit_response(self, response_text: str):
"""Submit a text response to the current call."""
if not self.current_call_id:
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ No call selected") # status
gr.update(value="❌ No call selected"), # status
)
if not response_text.strip():
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ Response cannot be empty") # status
gr.update(value="❌ Response cannot be empty"), # status
)
success = self.complete_call_with_response(self.current_call_id, response_text)
if success:
status_msg = "✅ Response submitted successfully!"
return (
gr.update(value=""), # clear response text
gr.update(value=status_msg) # status
gr.update(value=status_msg), # status
)
else:
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ Failed to submit response") # status
gr.update(value="❌ Failed to submit response"), # status
)
def submit_action(self, action_type: str, **kwargs) -> str:
"""Submit a computer action as a tool call."""
if not self.current_call_id:
return "❌ No call selected"
import uuid
# Create tool call structure
action_data = {"type": action_type, **kwargs}
tool_call = {
"id": f"call_{uuid.uuid4().hex[:24]}",
"type": "function",
"function": {
"name": "computer",
"arguments": json.dumps(action_data)
}
"function": {"name": "computer", "arguments": json.dumps(action_data)},
}
success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])
if success:
return f"{action_type.capitalize()} action submitted as tool call"
else:
return f"❌ Failed to submit {action_type} action"
def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:
def submit_click_action(
self, x: int, y: int, action_type: str = "click", button: str = "left"
) -> str:
"""Submit a coordinate-based action."""
if action_type == "click":
return self.submit_action(action_type, x=x, y=y, button=button)
else:
return self.submit_action(action_type, x=x, y=y)
def submit_type_action(self, text: str) -> str:
"""Submit a type action."""
return self.submit_action("type", text=text)
def submit_hotkey_action(self, keys: str) -> str:
"""Submit a hotkey action."""
return self.submit_action("keypress", keys=keys)
def submit_wait_action(self) -> str:
"""Submit a wait action with no kwargs."""
return self.submit_action("wait")
def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:
def submit_description_click(
self, description: str, action_type: str = "click", button: str = "left"
) -> str:
"""Submit a description-based action."""
if action_type == "click":
return self.submit_action(action_type, element_description=description, button=button)
else:
return self.submit_action(action_type, element_description=description)
def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
"""Wait for pending calls to appear or until max_seconds elapsed.
This method loops and checks for pending calls at regular intervals,
returning as soon as a pending call is found or the maximum wait time is reached.
Args:
max_seconds: Maximum number of seconds to wait
check_interval: How often to check for pending calls (in seconds)
"""
import time
start_time = time.time()
while time.time() - start_time < max_seconds:
# Check if there are any pending calls
pending_calls = self.get_pending_calls()
if pending_calls:
# Found pending calls, return immediately
return self.refresh_pending_calls()
# Wait before checking again
time.sleep(check_interval)
# Max wait time reached, return current state
return self.refresh_pending_calls()
@@ -427,79 +432,73 @@ class HumanCompletionUI:
def create_ui():
"""Create the Gradio interface."""
ui_handler = HumanCompletionUI()
with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
gr.Markdown("Review AI conversation requests and provide human responses.")
with gr.Row():
with gr.Column(scale=2):
with gr.Group():
screenshot_image = gr.Image(
label="Interactive Screenshot",
interactive=False,
height=600
label="Interactive Screenshot", interactive=False, height=600
)
# Action type selection for image clicks (wrapped for visibility control)
with gr.Group(visible=False) as click_actions_group:
with gr.Row():
action_type_radio = gr.Dropdown(
label="Interactive Action",
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
choices=[
"click",
"double_click",
"move",
"left_mouse_up",
"left_mouse_down",
"scroll",
],
value="click",
scale=2
scale=2,
)
action_button_radio = gr.Dropdown(
label="Button",
choices=["left", "right", "wheel", "back", "forward"],
value="left",
visible=True,
scale=1
scale=1,
)
scroll_x_input = gr.Number(
label="scroll_x",
value=0,
visible=False,
scale=1
label="scroll_x", value=0, visible=False, scale=1
)
scroll_y_input = gr.Number(
label="scroll_y",
value=-120,
visible=False,
scale=1
label="scroll_y", value=-120, visible=False, scale=1
)
conversation_chatbot = gr.Chatbot(
label="Conversation",
type="messages",
height=500,
show_copy_button=True
label="Conversation", type="messages", height=500, show_copy_button=True
)
with gr.Column(scale=1):
with gr.Group():
call_dropdown = gr.Dropdown(
label="Select a pending conversation request",
choices=["latest"],
interactive=True,
value="latest"
value="latest",
)
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
status_display = gr.Textbox(
label="Status",
interactive=False,
value="Ready to receive requests..."
label="Status", interactive=False, value="Ready to receive requests..."
)
with gr.Group():
response_text = gr.Textbox(
label="Message",
lines=3,
placeholder="Enter your message here..."
label="Message", lines=3, placeholder="Enter your message here..."
)
submit_btn = gr.Button("📤 Submit Message", variant="primary", interactive=False)
submit_btn = gr.Button(
"📤 Submit Message", variant="primary", interactive=False
)
# Action Accordions (wrapped for visibility control)
with gr.Group(visible=False) as actions_group:
with gr.Tabs():
@@ -507,58 +506,73 @@ def create_ui():
with gr.Group():
description_text = gr.Textbox(
label="Element Description",
placeholder="e.g., 'Privacy and security option in left sidebar'"
placeholder="e.g., 'Privacy and security option in left sidebar'",
)
with gr.Row():
description_action_type = gr.Dropdown(
label="Action",
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
value="click"
choices=[
"click",
"double_click",
"move",
"left_mouse_up",
"left_mouse_down",
],
value="click",
)
description_button = gr.Dropdown(
label="Button",
choices=["left", "right", "wheel", "back", "forward"],
value="left"
value="left",
)
description_submit_btn = gr.Button("Submit Click Action")
with gr.Tab("📝 Type Action"):
with gr.Group():
type_text = gr.Textbox(
label="Text to Type",
placeholder="Enter text to type..."
label="Text to Type", placeholder="Enter text to type..."
)
type_submit_btn = gr.Button("Submit Type")
with gr.Tab("⌨️ Keypress Action"):
with gr.Group():
keypress_text = gr.Textbox(
label="Keys",
placeholder="e.g., ctrl+c, alt+tab"
label="Keys", placeholder="e.g., ctrl+c, alt+tab"
)
keypress_submit_btn = gr.Button("Submit Keypress")
with gr.Tab("🧰 Misc Actions"):
with gr.Group():
misc_action_dropdown = gr.Dropdown(
label="Action",
choices=["wait"],
value="wait"
label="Action", choices=["wait"], value="wait"
)
misc_submit_btn = gr.Button("Submit Action")
# Event handlers
refresh_btn.click(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
call_dropdown.change(
fn=ui_handler.on_call_selected,
inputs=[call_dropdown],
outputs=[screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
def handle_image_click(evt: gr.SelectData):
if evt.index is not None:
x, y = evt.index
@@ -568,31 +582,44 @@ def create_ui():
sx_i = int(ui_handler.current_scroll_x or 0)
sy_i = int(ui_handler.current_scroll_y or 0)
# Submit a scroll action with x,y position and scroll deltas
result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
result = ui_handler.submit_action(
"scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
)
else:
result = ui_handler.submit_click_action(x, y, action_type, button)
ui_handler.wait_for_pending_calls()
return result
return "No coordinates selected"
screenshot_image.select(
fn=handle_image_click,
outputs=[status_display]
).then(
screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
# Response submission
submit_btn.click(
fn=ui_handler.submit_response,
inputs=[response_text],
outputs=[response_text, status_display]
outputs=[response_text, status_display],
).then(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
# Toggle visibility of controls based on action type
def toggle_action_controls(action_type):
# Button visible only for click
@@ -603,59 +630,63 @@ def create_ui():
# Update state
ui_handler.current_action_type = action_type or "click"
return button_vis, scroll_x_vis, scroll_y_vis
action_type_radio.change(
fn=toggle_action_controls,
inputs=[action_type_radio],
outputs=[action_button_radio, scroll_x_input, scroll_y_input]
outputs=[action_button_radio, scroll_x_input, scroll_y_input],
)
# Keep other control values in ui_handler state
def on_button_change(val):
ui_handler.current_button = (val or "left")
action_button_radio.change(
fn=on_button_change,
inputs=[action_button_radio]
)
ui_handler.current_button = val or "left"
action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])
def on_scroll_x_change(val):
try:
ui_handler.current_scroll_x = int(val) if val is not None else 0
except Exception:
ui_handler.current_scroll_x = 0
scroll_x_input.change(
fn=on_scroll_x_change,
inputs=[scroll_x_input]
)
scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])
def on_scroll_y_change(val):
try:
ui_handler.current_scroll_y = int(val) if val is not None else 0
except Exception:
ui_handler.current_scroll_y = 0
scroll_y_input.change(
fn=on_scroll_y_change,
inputs=[scroll_y_input]
)
scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])
type_submit_btn.click(
fn=ui_handler.submit_type_action,
inputs=[type_text],
outputs=[status_display]
fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
keypress_submit_btn.click(
fn=ui_handler.submit_hotkey_action,
inputs=[keypress_text],
outputs=[status_display]
fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
def handle_description_submit(description, action_type, button):
if description:
result = ui_handler.submit_description_click(description, action_type, button)
@@ -666,12 +697,19 @@ def create_ui():
description_submit_btn.click(
fn=handle_description_submit,
inputs=[description_text, description_action_type, description_button],
outputs=[status_display]
outputs=[status_display],
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
# Misc action handler
def handle_misc_submit(selected_action):
if selected_action == "wait":
@@ -681,20 +719,32 @@ def create_ui():
return f"Unsupported misc action: {selected_action}"
misc_submit_btn.click(
fn=handle_misc_submit,
inputs=[misc_action_dropdown],
outputs=[status_display]
fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
# Load initial data
demo.load(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)
return demo
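A sketch of launching this UI on its own (import path assumed); the packaged entry point instead mounts the same Blocks app onto FastAPI at /gradio, as shown earlier in this diff:

```python
# Standalone launch sketch; assumes the API server is reachable at localhost:8002.
from agent.human_tool.ui import create_ui  # hypothetical import path

demo = create_ui()
demo.launch(server_name="0.0.0.0", server_port=7860)
```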

View File

@@ -8,21 +8,22 @@ Exports:
- run_full_dataset(dataset, ...)
- MCPComputerAgent
"""
import time
from typing import Any, Optional
from agent.computers import is_agent_computer
from datasets import load_dataset, Dataset
from hud.datasets import Task, run_dataset
from datasets import Dataset, load_dataset
from hud import trace
from hud.datasets import Task, run_dataset
from .agent import MCPComputerAgent
# ---------------------------------------------------------------------------
# Single-task runner
# ---------------------------------------------------------------------------
async def run_single_task(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -47,24 +48,20 @@ async def run_single_task(
# Load dataset and pick a sample
if isinstance(dataset, str):
dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
elif isinstance(dataset, list):
dataset = dataset
else:
dataset = dataset["train"]
sample_task = dataset[task_id] # type: ignore[index]
task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}") # type: ignore[attr-defined]
# Filter any existing Computer tools
# The eval framework will add its own Computer tool per task
if tools:
tools = [
tool
for tool in tools
if not is_agent_computer(tool)
]
tools = [tool for tool in tools if not is_agent_computer(tool)]
with trace(name=task_prompt):
task = Task(**sample_task) # type: ignore[arg-type]
@@ -87,13 +84,14 @@ async def run_single_task(
)
print(f"Running: {task_prompt}")
result = await agent.run(task, max_steps=10)
print(f"✅ Reward: {getattr(result, 'reward')}")
print(f"✅ Reward: {result.reward}")
# ---------------------------------------------------------------------------
# Full-dataset runner
# ---------------------------------------------------------------------------
async def run_full_dataset(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -121,9 +119,9 @@ async def run_full_dataset(
# Run with our MCP-based agent class.
if isinstance(dataset, str):
dataset_name = dataset.split('/')[-1]
dataset_name = dataset.split("/")[-1]
job_name = job_name or f"Evaluation {dataset_name}"
dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
else:
dataset_name = "custom"
job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
@@ -131,12 +129,8 @@ async def run_full_dataset(
# Filter any existing Computer tools
# The eval framework will add its own Computer tool per task
if tools:
tools = [
tool
for tool in tools
if not is_agent_computer(tool)
]
tools = [tool for tool in tools if not is_agent_computer(tool)]
# Execute evaluation
return await run_dataset(
name=job_name,
@@ -170,4 +164,4 @@ __all__ = [
"run_single_task",
"run_full_dataset",
"MCPComputerAgent",
]
]
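A sketch of invoking the single-task runner above; the dataset id is a placeholder and the keyword argument is inferred from the function body, since the full signature is elided in this hunk:

```python
# Sketch: run one sample task from a HUD-style dataset.
import asyncio

from agent.hud import run_single_task  # hypothetical import path

asyncio.run(
    run_single_task(
        "hud-evals/sample-tasks",  # placeholder dataset id
        task_id=0,                 # assumed keyword: index of the sample to run
    )
)
```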

View File

@@ -9,26 +9,26 @@ Key differences from the OpenAI OperatorAgent variant:
- Planning is executed via `ComputerAgent.run(messages)`.
- The first yielded result per step is returned as the agent response.
"""
from __future__ import annotations
import base64
import io
import uuid
from pathlib import Path
from typing import Any, ClassVar, Optional
import hud
import mcp.types as types
from agent.agent import ComputerAgent as BaseComputerAgent
from agent.callbacks import PromptInstructionsCallback
from agent.callbacks.trajectory_saver import TrajectorySaverCallback
from agent.computers import is_agent_computer
from agent.responses import make_failed_tool_call_items
from hud.agents import MCPAgent
from hud.tools.computer.settings import computer_settings
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
from agent.responses import make_failed_tool_call_items
from agent.computers import is_agent_computer
from PIL import Image
import mcp.types as types
import hud
import uuid
import base64
from pathlib import Path
class MCPComputerAgent(MCPAgent):
@@ -114,8 +114,10 @@ class MCPComputerAgent(MCPAgent):
self.last_screenshot_b64 = None
buffer = io.BytesIO()
Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
buffer, format="PNG"
)
self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
# Ensure a computer shim is present so width/height/environment are known
computer_shim = {
@@ -128,12 +130,8 @@ class MCPComputerAgent(MCPAgent):
}
agent_tools: list[Any] = [computer_shim]
if tools:
agent_tools.extend([
tool
for tool in tools
if not is_agent_computer(tool)
])
agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])
agent_kwargs = {
"model": self.model,
"trajectory_dir": trajectory_dir,
@@ -150,9 +148,7 @@ class MCPComputerAgent(MCPAgent):
"telemetry_enabled": telemetry_enabled,
}
self.computer_agent = BaseComputerAgent(
**agent_kwargs
)
self.computer_agent = BaseComputerAgent(**agent_kwargs)
async def get_system_messages(self) -> list[Any]:
"""Create initial messages.
@@ -161,9 +157,7 @@ class MCPComputerAgent(MCPAgent):
"""
return []
async def format_blocks(
self, blocks: list[types.ContentBlock]
) -> list[dict[str, Any]]:
async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
"""
Format blocks for OpenAI input format.
@@ -200,42 +194,49 @@ class MCPComputerAgent(MCPAgent):
# Call the ComputerAgent LLM API
async for result in self.computer_agent.run(messages): # type: ignore[arg-type]
items = result['output']
items = result["output"]
if not items or tool_calls:
break
for item in items:
if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
if item["type"] in [
"reasoning",
"message",
"computer_call",
"function_call",
"function_call_output",
]:
agent_result.append(item)
# Add messages to output text
if item['type'] == 'reasoning':
if item["type"] == "reasoning":
output_text.extend(
f"Reasoning: {summary['text']}"
for summary in item['summary']
f"Reasoning: {summary['text']}" for summary in item["summary"]
)
elif item['type'] == 'message':
if isinstance(item['content'], list):
elif item["type"] == "message":
if isinstance(item["content"], list):
output_text.extend(
item['text']
for item in item['content']
if item['type'] == 'output_text'
item["text"]
for item in item["content"]
if item["type"] == "output_text"
)
elif isinstance(item['content'], str):
output_text.append(item['content'])
elif isinstance(item["content"], str):
output_text.append(item["content"])
# If we get a tool call, we're not done
if item['type'] == 'computer_call':
if item["type"] == "computer_call":
id = item["call_id"]
tool_calls.append(MCPToolCall(
name="openai_computer",
arguments=item["action"],
id=id,
))
tool_calls.append(
MCPToolCall(
name="openai_computer",
arguments=item["action"],
id=id,
)
)
is_done = False
self.tool_call_inputs[id] = agent_result
break
# if we have tool calls, we should exit the loop
if tool_calls:
break
@@ -247,7 +248,7 @@ class MCPComputerAgent(MCPAgent):
tool_calls=tool_calls,
done=is_done,
)
def _log_image(self, image_b64: str):
callbacks = self.computer_agent.callbacks
for callback in callbacks:
@@ -257,9 +258,7 @@ class MCPComputerAgent(MCPAgent):
callback._save_artifact("screenshot_after", image_bytes)
async def format_tool_results(
self,
tool_calls: list[MCPToolCall],
tool_results: list[MCPToolResult]
self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
) -> list[dict[str, Any]]:
"""Extract latest screenshot from tool results in dict form.
@@ -274,45 +273,60 @@ class MCPComputerAgent(MCPAgent):
previous_output = self.previous_output.copy() or []
# First we need to remove any pending computer_calls from the end of previous_output
while previous_output and previous_output[-1]['type'] == 'computer_call':
while previous_output and previous_output[-1]["type"] == "computer_call":
previous_output.pop()
messages.extend(previous_output)
# If the call is a 'response', don't add the result
if call.name == 'response':
if call.name == "response":
continue
# Otherwise, if we have a result, we should add it to the messages
content = [
{ "type": "input_text", "text": content.text } if isinstance(content, types.TextContent)
else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent)
else { "type": "input_text", "text": "" }
(
{"type": "input_text", "text": content.text}
if isinstance(content, types.TextContent)
else (
{
"type": "input_image",
"image_url": f"data:image/png;base64,{content.data}",
}
if isinstance(content, types.ImageContent)
else {"type": "input_text", "text": ""}
)
)
for content in result.content
]
messages.append({
"role": "user",
"content": content,
})
messages.append(
{
"role": "user",
"content": content,
}
)
continue
# Add the assistant's computer call
messages.extend(self.tool_call_inputs[call.id])
if result.isError:
error_text = "".join([
content.text
for content in result.content
if isinstance(content, types.TextContent)
])
error_text = "".join(
[
content.text
for content in result.content
if isinstance(content, types.TextContent)
]
)
# Replace computer call with failed tool call
messages.pop()
messages.extend(make_failed_tool_call_items(
tool_name=call.name,
tool_kwargs=call.arguments or {},
error_message=error_text,
call_id=call.id,
))
messages.extend(
make_failed_tool_call_items(
tool_name=call.name,
tool_kwargs=call.arguments or {},
error_message=error_text,
call_id=call.id,
)
)
else:
# Get the latest screenshot
screenshots = [
@@ -325,23 +339,27 @@ class MCPComputerAgent(MCPAgent):
if screenshots:
self._log_image(screenshots[0])
self.last_screenshot_b64 = screenshots[0]
messages.append({
"type": "computer_call_output",
"call_id": call.id,
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshots[0]}"
},
})
messages.append(
{
"type": "computer_call_output",
"call_id": call.id,
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshots[0]}",
},
}
)
else:
# Otherwise, replace computer call with failed tool call
messages.pop()
messages.extend(make_failed_tool_call_items(
tool_name=call.name,
tool_kwargs=call.arguments or {},
error_message="No screenshots returned.",
call_id=call.id,
))
messages.extend(
make_failed_tool_call_items(
tool_name=call.name,
tool_kwargs=call.arguments or {},
error_message="No screenshots returned.",
call_id=call.id,
)
)
return messages

View File

@@ -7,30 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
by consuming the first yielded result from `ComputerAgent.run()`.
"""
import traceback
import time
import traceback
import uuid
from typing import Any, Dict, List, Optional
from agent.agent import ComputerAgent as BaseComputerAgent
from agent.callbacks import PromptInstructionsCallback
from hud.tools.computer.settings import computer_settings
from PIL import Image
from hud.agents import OperatorAgent
from hud.tools.computer.settings import computer_settings
# OpenAI Responses typed models (required)
from openai.types.responses import (
Response,
ResponseComputerToolCall,
ResponseInputParam,
ResponseOutputItem,
ResponseComputerToolCall,
ResponseOutputMessage,
ResponseOutputText,
ResponseReasoningItem,
ResponseUsage,
)
from PIL import Image
def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
def _map_agent_output_to_openai_blocks(
output_items: List[Dict[str, Any]],
) -> List[ResponseOutputItem]:
"""Map our agent output items to OpenAI ResponseOutputItem typed models.
Only a subset is supported: computer_call, assistant message (text), and reasoning.
@@ -40,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
for item in output_items or []:
t = item.get("type")
if t == "computer_call":
comp = ResponseComputerToolCall.model_validate({
"id": item.get("id") or f"cu_{uuid.uuid4().hex}",
"type": "computer_call",
"call_id": item["call_id"],
"action": item["action"],
"pending_safety_checks": item.get("pending_safety_checks", []),
"status": "completed",
})
comp = ResponseComputerToolCall.model_validate(
{
"id": item.get("id") or f"cu_{uuid.uuid4().hex}",
"type": "computer_call",
"call_id": item["call_id"],
"action": item["action"],
"pending_safety_checks": item.get("pending_safety_checks", []),
"status": "completed",
}
)
blocks.append(comp)
# we will exit early here as the responses api only supports a single step
break
@@ -55,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
content_blocks: List[ResponseOutputText] = []
for c in item.get("content", []) or []:
content_blocks.append(
ResponseOutputText.model_validate({
"type": "output_text",
"text": c["text"],
"annotations": [],
})
ResponseOutputText.model_validate(
{
"type": "output_text",
"text": c["text"],
"annotations": [],
}
)
)
if content_blocks:
msg = ResponseOutputMessage.model_validate({
"id": item.get("id") or f"msg_{uuid.uuid4()}",
"type": "message",
"role": "assistant",
"status": "completed",
"content": [ct.model_dump() for ct in content_blocks],
})
msg = ResponseOutputMessage.model_validate(
{
"id": item.get("id") or f"msg_{uuid.uuid4()}",
"type": "message",
"role": "assistant",
"status": "completed",
"content": [ct.model_dump() for ct in content_blocks],
}
)
blocks.append(msg)
elif t == "reasoning":
reasoning = ResponseReasoningItem.model_validate({
"id": item.get("id") or f"rsn_{uuid.uuid4()}",
"type": "reasoning",
"summary": item["summary"],
})
reasoning = ResponseReasoningItem.model_validate(
{
"id": item.get("id") or f"rsn_{uuid.uuid4()}",
"type": "reasoning",
"summary": item["summary"],
}
)
blocks.append(reasoning)
# Unhandled types are ignored
return blocks
def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
out: List[Dict[str, Any]] = []
for it in list(items):
@@ -92,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
out.append(dict(it)) # may raise if not mapping
return out
class FakeAsyncOpenAI:
"""Minimal fake OpenAI client with only `responses.create` implemented.
@@ -132,10 +145,12 @@ class FakeAsyncOpenAI:
# Pre-pend instructions message
effective_input = full_input
if instructions:
effective_input = [{
"role": "user",
"content": instructions,
}] + full_input
effective_input = [
{
"role": "user",
"content": instructions,
}
] + full_input
# Run a single iteration of the ComputerAgent
agent_result: Optional[Dict[str, Any]] = None
@@ -152,32 +167,43 @@ class FakeAsyncOpenAI:
blocks_to_cache = full_input + output
for b in blocks_to_cache:
bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
self.blocks_cache[bid] = b # type: ignore[assignment]
self.blocks_cache[bid] = b # type: ignore[assignment]
block_ids.append(bid)
response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
self.context_cache[response_id] = block_ids
try:
return Response.model_validate({
"id": response_id,
"created_at": time.time(),
"object": "response",
"model": model,
"output": output,
"parallel_tool_calls": False,
"tool_choice": "auto",
"tools": [],
"previous_response_id": previous_response_id,
"usage": ResponseUsage.model_validate({
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
"input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
"output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
}),
})
return Response.model_validate(
{
"id": response_id,
"created_at": time.time(),
"object": "response",
"model": model,
"output": output,
"parallel_tool_calls": False,
"tool_choice": "auto",
"tools": [],
"previous_response_id": previous_response_id,
"usage": ResponseUsage.model_validate(
{
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
"input_tokens_details": usage.get(
"input_tokens_details", {"cached_tokens": 0}
),
"output_tokens_details": usage.get(
"output_tokens_details", {"reasoning_tokens": 0}
),
}
),
}
)
except Exception as e:
print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
print(
f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
e,
)
if attempt == max_retries - 1:
print(traceback.format_exc())
raise e
@@ -221,9 +247,15 @@ class ProxyOperatorAgent(OperatorAgent):
allowed_tools = allowed_tools or ["openai_computer"]
computer_shim = {
'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
'environment': 'linux',
'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
"screenshot": lambda: Image.new(
"RGB",
(computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
),
"environment": "linux",
"dimensions": (
computer_settings.OPENAI_COMPUTER_WIDTH,
computer_settings.OPENAI_COMPUTER_HEIGHT,
),
}
# Build tools ensuring the computer_shim is included
agent_tools: list[Any] = [computer_shim]
@@ -258,6 +290,7 @@ class ProxyOperatorAgent(OperatorAgent):
**kwargs,
)
__all__ = [
"FakeAsyncOpenAI",
"ProxyOperatorAgent",

Some files were not shown because too many files have changed in this diff.