Mirror of https://github.com/trycua/computer.git, synced 2025-12-20 20:40:42 -06:00
Format codebase with uv run pre-commit run --all-files
.github/scripts/get_pyproject_version.py (vendored, 22 changes)
@@ -11,6 +11,7 @@ Exit codes:
 """
 
 import sys
 
 try:
     import tomllib
 except ImportError:
@@ -20,7 +21,10 @@ except ImportError:
 
 def main():
     if len(sys.argv) != 3:
-        print("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>", file=sys.stderr)
+        print(
+            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
+            file=sys.stderr,
+        )
         sys.exit(1)
 
     pyproject_path = sys.argv[1]
@@ -28,7 +32,7 @@ def main():
 
     # tomllib requires binary mode
     try:
-        with open(pyproject_path, 'rb') as f:
+        with open(pyproject_path, "rb") as f:
             data = tomllib.load(f)
     except FileNotFoundError:
         print(f"❌ ERROR: File not found: {pyproject_path}", file=sys.stderr)
@@ -37,6 +41,7 @@ def main():
         # Fallback to toml if using the old library or handle other errors
         try:
             import toml
+
             data = toml.load(pyproject_path)
         except FileNotFoundError:
             print(f"❌ ERROR: File not found: {pyproject_path}", file=sys.stderr)
@@ -45,7 +50,7 @@ def main():
         print(f"❌ ERROR: Failed to parse TOML file: {e}", file=sys.stderr)
         sys.exit(1)
 
-    actual_version = data.get('project', {}).get('version')
+    actual_version = data.get("project", {}).get("version")
 
     if not actual_version:
         print("❌ ERROR: No version found in pyproject.toml", file=sys.stderr)
@@ -56,13 +61,18 @@
         print(f"  pyproject.toml version: {actual_version}", file=sys.stderr)
         print(f"  Expected version: {expected_version}", file=sys.stderr)
         print("", file=sys.stderr)
-        print("The version in pyproject.toml must match the version being published.", file=sys.stderr)
-        print(f"Please update pyproject.toml to version {expected_version} or use the correct tag.", file=sys.stderr)
+        print(
+            "The version in pyproject.toml must match the version being published.", file=sys.stderr
+        )
+        print(
+            f"Please update pyproject.toml to version {expected_version} or use the correct tag.",
+            file=sys.stderr,
+        )
         sys.exit(1)
 
     print(f"✅ Version consistency check passed: {actual_version}")
     sys.exit(0)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
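For reference, the script's command-line contract is visible in the diff above; a hypothetical invocation (the version number is an example value, not from the repo) looks like:

```bash
# Hypothetical run of the version-consistency check; 0.1.0 is illustrative.
python .github/scripts/get_pyproject_version.py pyproject.toml 0.1.0
# Success prints "✅ Version consistency check passed: 0.1.0" and exits 0;
# a mismatch prints diagnostics to stderr and exits 1.
```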
.github/scripts/tests/README.md (vendored, 6 changes)
@@ -7,6 +7,7 @@ This directory contains comprehensive tests for the GitHub workflow scripts usin
 **No external dependencies required!**
 
 This test suite uses:
+
 - `unittest` - Python's built-in testing framework
 - `tomllib` - Python 3.11+ built-in TOML parser
 
@@ -15,27 +16,32 @@ For Python < 3.11, the `toml` package is used as a fallback.
 ## Running Tests
 
 ### Run all tests
+
 ```bash
 cd .github/scripts/tests
 python3 -m unittest discover -v
 ```
 
 ### Run a specific test file
+
 ```bash
 python3 -m unittest test_get_pyproject_version -v
 ```
 
 ### Run a specific test class
+
 ```bash
 python3 -m unittest test_get_pyproject_version.TestGetPyprojectVersion -v
 ```
 
 ### Run a specific test method
+
 ```bash
 python3 -m unittest test_get_pyproject_version.TestGetPyprojectVersion.test_matching_versions -v
 ```
 
 ### Run tests directly from the test file
+
 ```bash
 python3 test_get_pyproject_version.py
 ```
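The tomllib/toml fallback this README describes is the same pattern `get_pyproject_version.py` uses; a minimal sketch (the `load_toml` helper name is illustrative, not part of the repo):

```python
# Minimal sketch of the TOML-loading fallback described above.
try:
    import tomllib  # built into Python 3.11+

    def load_toml(path: str) -> dict:
        # tomllib requires binary mode
        with open(path, "rb") as f:
            return tomllib.load(f)

except ImportError:
    import toml  # third-party fallback for Python < 3.11

    def load_toml(path: str) -> dict:
        return toml.load(path)
```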
.github/scripts/tests/test_get_pyproject_version.py (vendored, 120 changes)
@@ -10,10 +10,10 @@ This test suite covers:
 """
 
 import sys
-import unittest
 import tempfile
-from pathlib import Path
+import unittest
 from io import StringIO
+from pathlib import Path
 from unittest.mock import patch
 
 # Add parent directory to path to import the module
@@ -36,46 +36,54 @@ class TestGetPyprojectVersion(unittest.TestCase):
 
     def create_pyproject_toml(self, version: str) -> Path:
         """Helper to create a temporary pyproject.toml file with a given version."""
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
-        temp_file.write(f"""
+        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
+        temp_file.write(
+            f"""
 [project]
 name = "test-project"
 version = "{version}"
 description = "A test project"
-""")
+"""
+        )
         temp_file.close()
         return Path(temp_file.name)
 
     def create_pyproject_toml_no_version(self) -> Path:
         """Helper to create a pyproject.toml without a version field."""
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
-        temp_file.write("""
+        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
+        temp_file.write(
+            """
 [project]
 name = "test-project"
 description = "A test project without version"
-""")
+"""
+        )
         temp_file.close()
         return Path(temp_file.name)
 
     def create_pyproject_toml_no_project(self) -> Path:
         """Helper to create a pyproject.toml without a project section."""
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
-        temp_file.write("""
+        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
+        temp_file.write(
+            """
 [tool.poetry]
 name = "test-project"
 version = "1.0.0"
-""")
+"""
+        )
         temp_file.close()
         return Path(temp_file.name)
 
     def create_malformed_toml(self) -> Path:
         """Helper to create a malformed TOML file."""
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.toml', delete=False)
-        temp_file.write("""
+        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".toml", delete=False)
+        temp_file.write(
+            """
 [project
 name = "test-project
 version = "1.0.0"
-""")
+"""
+        )
         temp_file.close()
         return Path(temp_file.name)
 
@@ -85,11 +93,11 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml("1.2.3")
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3"]
 
             # Capture stdout
             captured_output = StringIO()
-            with patch('sys.stdout', captured_output):
+            with patch("sys.stdout", captured_output):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
@@ -104,11 +112,11 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml("1.2.3")
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.4']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.4"]
 
             # Capture stderr
             captured_error = StringIO()
-            with patch('sys.stderr', captured_error):
+            with patch("sys.stderr", captured_error):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
@@ -127,10 +135,10 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml_no_version()
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
 
             captured_error = StringIO()
-            with patch('sys.stderr', captured_error):
+            with patch("sys.stderr", captured_error):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
@@ -145,10 +153,10 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml_no_project()
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
 
             captured_error = StringIO()
-            with patch('sys.stderr', captured_error):
+            with patch("sys.stderr", captured_error):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
@@ -160,7 +168,7 @@ version = "1.0.0"
     # Test: File not found
     def test_file_not_found(self):
         """Test handling of non-existent pyproject.toml file."""
-        sys.argv = ['get_pyproject_version.py', '/nonexistent/pyproject.toml', '1.0.0']
+        sys.argv = ["get_pyproject_version.py", "/nonexistent/pyproject.toml", "1.0.0"]
 
         with self.assertRaises(SystemExit) as cm:
             get_pyproject_version.main()
@@ -173,7 +181,7 @@ version = "1.0.0"
         pyproject_file = self.create_malformed_toml()
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
 
             with self.assertRaises(SystemExit) as cm:
                 get_pyproject_version.main()
@@ -185,44 +193,50 @@ version = "1.0.0"
     # Test: Incorrect number of arguments - too few
     def test_too_few_arguments(self):
         """Test that providing too few arguments results in usage error."""
-        sys.argv = ['get_pyproject_version.py', 'pyproject.toml']
+        sys.argv = ["get_pyproject_version.py", "pyproject.toml"]
 
         captured_error = StringIO()
-        with patch('sys.stderr', captured_error):
+        with patch("sys.stderr", captured_error):
             with self.assertRaises(SystemExit) as cm:
                 get_pyproject_version.main()
 
         self.assertEqual(cm.exception.code, 1)
-        self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
-                      captured_error.getvalue())
+        self.assertIn(
+            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
+            captured_error.getvalue(),
+        )
 
     # Test: Incorrect number of arguments - too many
     def test_too_many_arguments(self):
         """Test that providing too many arguments results in usage error."""
-        sys.argv = ['get_pyproject_version.py', 'pyproject.toml', '1.0.0', 'extra']
+        sys.argv = ["get_pyproject_version.py", "pyproject.toml", "1.0.0", "extra"]
 
         captured_error = StringIO()
-        with patch('sys.stderr', captured_error):
+        with patch("sys.stderr", captured_error):
             with self.assertRaises(SystemExit) as cm:
                 get_pyproject_version.main()
 
         self.assertEqual(cm.exception.code, 1)
-        self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
-                      captured_error.getvalue())
+        self.assertIn(
+            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
+            captured_error.getvalue(),
+        )
 
     # Test: No arguments
     def test_no_arguments(self):
         """Test that providing no arguments results in usage error."""
-        sys.argv = ['get_pyproject_version.py']
+        sys.argv = ["get_pyproject_version.py"]
 
         captured_error = StringIO()
-        with patch('sys.stderr', captured_error):
+        with patch("sys.stderr", captured_error):
             with self.assertRaises(SystemExit) as cm:
                 get_pyproject_version.main()
 
         self.assertEqual(cm.exception.code, 1)
-        self.assertIn("Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
-                      captured_error.getvalue())
+        self.assertIn(
+            "Usage: python get_pyproject_version.py <pyproject_path> <expected_version>",
+            captured_error.getvalue(),
+        )
 
     # Test: Version with pre-release tags
     def test_version_with_prerelease_tags(self):
@@ -230,15 +244,17 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml("1.2.3-rc.1")
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3-rc.1']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3-rc.1"]
 
             captured_output = StringIO()
-            with patch('sys.stdout', captured_output):
+            with patch("sys.stdout", captured_output):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
             self.assertEqual(cm.exception.code, 0)
-            self.assertIn("✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue())
+            self.assertIn(
+                "✅ Version consistency check passed: 1.2.3-rc.1", captured_output.getvalue()
+            )
         finally:
             pyproject_file.unlink()
 
@@ -248,15 +264,17 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml("1.2.3+build.123")
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.2.3+build.123']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.2.3+build.123"]
 
             captured_output = StringIO()
-            with patch('sys.stdout', captured_output):
+            with patch("sys.stdout", captured_output):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
             self.assertEqual(cm.exception.code, 0)
-            self.assertIn("✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue())
+            self.assertIn(
+                "✅ Version consistency check passed: 1.2.3+build.123", captured_output.getvalue()
+            )
         finally:
             pyproject_file.unlink()
 
@@ -290,15 +308,17 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml(version)
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), version]
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), version]
 
             captured_output = StringIO()
-            with patch('sys.stdout', captured_output):
+            with patch("sys.stdout", captured_output):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
             self.assertEqual(cm.exception.code, 0)
-            self.assertIn(f"✅ Version consistency check passed: {version}", captured_output.getvalue())
+            self.assertIn(
+                f"✅ Version consistency check passed: {version}", captured_output.getvalue()
+            )
         finally:
             pyproject_file.unlink()
 
@@ -308,10 +328,10 @@ version = "1.0.0"
         pyproject_file = self.create_pyproject_toml("")
 
         try:
-            sys.argv = ['get_pyproject_version.py', str(pyproject_file), '1.0.0']
+            sys.argv = ["get_pyproject_version.py", str(pyproject_file), "1.0.0"]
 
             captured_error = StringIO()
-            with patch('sys.stderr', captured_error):
+            with patch("sys.stderr", captured_error):
                 with self.assertRaises(SystemExit) as cm:
                     get_pyproject_version.main()
 
@@ -327,14 +347,14 @@ class TestSuiteInfo(unittest.TestCase):
 
     def test_suite_info(self):
         """Display test suite information."""
-        print("\n" + "="*70)
+        print("\n" + "=" * 70)
         print("Test Suite: get_pyproject_version.py")
         print("Framework: unittest (Python built-in)")
         print("TOML Library: tomllib (Python 3.11+ built-in)")
-        print("="*70)
+        print("=" * 70)
         self.assertTrue(True)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     # Run tests with verbose output
     unittest.main(verbosity=2)
.github/workflows/test-validation-script.yml (vendored, 10 changes)
@@ -3,14 +3,14 @@ name: Test valididation script
 on:
   pull_request:
     paths:
-      - '.github/scripts/**'
-      - '.github/workflows/test-scripts.yml'
+      - ".github/scripts/**"
+      - ".github/workflows/test-scripts.yml"
   push:
     branches:
       - main
     paths:
-      - '.github/scripts/**'
-      - '.github/workflows/test-scripts.yml'
+      - ".github/scripts/**"
+      - ".github/workflows/test-scripts.yml"
 
 jobs:
   test:
@@ -23,7 +23,7 @@ jobs:
       - name: Set up Python
        uses: actions/setup-python@v5
        with:
-          python-version: '3.11'
+          python-version: "3.11"
 
       - name: Install dependencies
         run: |

.pre-commit-config.yaml

@@ -20,7 +20,7 @@ repos:
     hooks:
       - id: isort
         name: isort code formatter
-        args: ['--profile', 'black']
+        args: ["--profile", "black"]
         files: \.(py)$
 
   - repo: https://github.com/psf/black
@@ -35,7 +35,7 @@ repos:
     hooks:
       - id: ruff
         name: ruff linter
-        args: ['--fix']
+        args: ["--fix"]
         files: \.(py)$
 
 # Temporarily disabled due to untyped codebase
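Every quote and line-wrapping change in this commit is the output of the hooks configured above; per the commit message, the whole pass reduces to a single command:

```bash
# Reproduce this commit's formatting pass (taken from the commit message)
uv run pre-commit run --all-files
```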
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
README.md (86 changes)
@@ -5,18 +5,21 @@
   <img alt="Cua logo" height="150" src="img/logo_black.png">
 </picture>
 
 [](#)
 [](#)
 [](#)
 [](https://discord.com/invite/mVnXXpdE85)
 <br>
 <a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 
 </div>
 
 > We're hosting **The Computer-Use Agents SOTA Challenge** (now concluded) at [Hack the North](https://hackthenorth.com) and online!
->> **Track A (On-site @ UWaterloo)**: 🏆 ~~Prize: **YC interview guaranteed**.~~ **Concluded**
->> **Track B (Remote)**: 🏆 ~~Prize: **Cash award**.~~ **Concluded - Winners will be announced soon**
->>> ~~👉 Sign up here: [trycua.com/hackathon](https://www.trycua.com/hackathon)~~
+>
+> > **Track A (On-site @ UWaterloo)**: 🏆 ~~Prize: **YC interview guaranteed**.~~ **Concluded**
+> > **Track B (Remote)**: 🏆 ~~Prize: **Cash award**.~~ **Concluded - Winners will be announced soon**
+> >
+> > > ~~👉 Sign up here: [trycua.com/hackathon](https://www.trycua.com/hackathon)~~
 
 **Cua** ("koo-ah") is Docker for [Computer-Use Agents](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse) - it enables AI agents to control full operating systems in virtual containers and deploy them locally or to the cloud.
 
@@ -25,10 +28,12 @@
 </div>
 
 With the Computer SDK, you can:
+
 - automate Windows, Linux, and macOS VMs with a consistent, [pyautogui-like API](https://docs.trycua.com/docs/libraries/computer#interface-actions)
 - create & manage VMs [locally](https://docs.trycua.com/docs/computer-sdk/computers#cua-local-containers) or using [Cua cloud](https://www.trycua.com/)
 
 With the Agent SDK, you can:
+
 - run computer-use models with a [consistent schema](https://docs.trycua.com/docs/agent-sdk/message-format)
 - benchmark on OSWorld-Verified, SheetBench-V2, and more [with a single line of code using HUD](https://docs.trycua.com/docs/agent-sdk/integrations/hud) ([Notebook](https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb))
 - combine UI grounding models with any LLM using [composed agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
@@ -38,16 +43,16 @@ With the Agent SDK, you can:
 ### CUA Model Zoo 🐨
 
 | [All-in-one CUAs](https://docs.trycua.com/docs/agent-sdk/supported-agents/computer-use-agents) | [UI Grounding Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) | [UI Planning Models](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents) |
-|---|---|---|
+| --- | --- | --- |
 | `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-haiku-4-5-20251001` | `huggingface-local/xlangai/OpenCUA-{7B,32B}` | any all-in-one CUA |
 | `openai/computer-use-preview` | `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}` | any VLM (using liteLLM, requires `tools` parameter) |
 | `openrouter/z-ai/glm-4.5v` | `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}` | any LLM (using liteLLM, requires `moondream3+` prefix) |
 | `gemini-2.5-computer-use-preview-10-2025` | any all-in-one CUA | |
 | `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}` | | |
 | `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` | | |
 | `moondream3+{ui planning}` (supports text-only models) | | |
 | `omniparser+{ui planning}` | | |
 | `{ui grounding}+{ui planning}` | | |
 - `human/human` → [Human-in-the-Loop](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)
 
@@ -55,7 +60,7 @@ Missing a model? [Raise a feature request](https://github.com/trycua/cua/issues/
 
 <br/>
 
 # Quick Start
 
 - [Clone a starter template and run the code in <1 min](https://github.com/trycua/agent-template) (⭐️ Recommended!)
 - [Get started with the Computer-Use Agent CLI](https://docs.trycua.com/docs/quickstart-cli)
@@ -68,6 +73,7 @@ Missing a model? [Raise a feature request](https://github.com/trycua/cua/issues/
 ```bash
 pip install cua-agent[all]
 ```
+
 ```python
 from agent import ComputerAgent
 
@@ -86,8 +92,9 @@ async for result in agent.run(messages):
 ```
 
 ### Output format (OpenAI Agent Responses Format):
+
 ```json
 {
   "output": [
     # user input
     {
@@ -133,7 +140,7 @@ async for result in agent.run(messages):
         }
       ]
     }
   ],
   "usage": {
     "prompt_tokens": 150,
     "completion_tokens": 75,
@@ -148,6 +155,7 @@ async for result in agent.run(messages):
 ```bash
 pip install cua-computer[all]
 ```
+
 ```python
 from computer import Computer
 
@@ -174,18 +182,18 @@ async with Computer(
 
 ## Modules
 
 | Module | Description | Installation |
-|--------|-------------|---------------|
+| ------ | ----------- | ------------ |
 | [**Lume**](./libs/lume/README.md) | VM management for macOS/Linux using Apple's Virtualization.Framework | `curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh \| bash` |
 | [**Lumier**](./libs/lumier/README.md) | Docker interface for macOS and Linux VMs | `docker pull trycua/lumier:latest` |
 | [**Computer (Python)**](./libs/python/computer/README.md) | Python interface for controlling virtual machines | `pip install "cua-computer[all]"` |
 | [**Computer (Typescript)**](./libs/typescript/computer/README.md) | Typescript interface for controlling virtual machines | `npm install @trycua/computer` |
 | [**Agent**](./libs/python/agent/README.md) | AI agent framework for automating tasks | `pip install "cua-agent[all]"` |
 | [**MCP Server**](./libs/python/mcp-server/README.md) | MCP server for using CUA with Claude Desktop | `pip install cua-mcp-server` |
 | [**SOM**](./libs/python/som/README.md) | Set-of-Mark library for Agent | `pip install cua-som` |
 | [**Computer Server**](./libs/python/computer-server/README.md) | Server component for Computer | `pip install cua-computer-server` |
 | [**Core (Python)**](./libs/python/core/README.md) | Python core utilities | `pip install cua-core` |
 | [**Core (Typescript)**](./libs/typescript/core/README.md) | Typescript core utilities | `npm install @trycua/core` |
 
 ## Community
 
@@ -193,7 +201,7 @@ Join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss i
 
 ## License
 
 Cua is open-sourced under the MIT License - see the [LICENSE](LICENSE.md) file for details.
 
 Portions of this project, specifically components adapted from Kasm Technologies Inc., are also licensed under the MIT License. See [libs/kasm/LICENSE](libs/kasm/LICENSE) for details.
 
@@ -211,16 +219,16 @@ When you choose to install and use such optional extras, your use, modification,
 
 Cua uses `bump2version` to manage package versions across all Python modules. A Makefile is provided to simplify the release process.
 
 ### Prerequisites
 
 #### Install `bump2version`
 
 Using brew:
 
 ```
 brew install bumpversion
 ```
 
 ### View Current Versions
 
 ```bash
@@ -282,7 +290,7 @@ We welcome contributions to Cua! Please refer to our [Contributing Guidelines](C
 
 Apple, macOS, and Apple Silicon are trademarks of Apple Inc.
 Ubuntu and Canonical are registered trademarks of Canonical Ltd.
 Microsoft is a registered trademark of Microsoft Corporation.
 
 This project is not affiliated with, endorsed by, or sponsored by Apple Inc., Canonical Ltd., Microsoft Corporation, or Kasm Technologies.
@@ -1,6 +1,6 @@
 # App-Use: Control Individual Applications with Cua Agents
 
-*Published on May 31, 2025 by The Cua Team*
+_Published on May 31, 2025 by The Cua Team_
 
 Today, we are excited to introduce a new experimental feature landing in the [Cua GitHub repository](https://github.com/trycua/cua): **App-Use**. App-Use allows you to create lightweight virtual desktops that limit agent access to specific applications, improving the precision of your agent's trajectory. Perfect for parallel workflows and focused task execution.
 
@@ -33,9 +35,11 @@ agent = ComputerAgent(
 ## Key Benefits
 
 ### 1. Lightweight and Fast
+
 App-Use creates visual filters, not new processes. Your apps continue running normally - we just control what the agent can see and click on. The virtual desktops are composited views that require no additional compute resources beyond the existing window manager operations.
 
 ### 2. Run Multiple Agents in Parallel
+
 Deploy a team of specialized agents, each focused on their own apps:
 
 ```python
@@ -46,7 +48,7 @@ computer = Computer(experiments=["app-use"])
 research_desktop = computer.create_desktop_from_apps(["Safari"])
 research_agent = ComputerAgent(tools=[research_desktop], ...)
 
 # Writing agent focuses on documents
 writing_desktop = computer.create_desktop_from_apps(["Pages", "Notes"])
 writing_agent = ComputerAgent(tools=[writing_desktop], ...)
 
@@ -66,6 +68,7 @@ await asyncio.gather(
 ### Requirements
 
 To get started with App-Use, you'll need:
+
 - Python 3.11+
 - macOS Sequoia (15.0) or later
 
@@ -85,21 +88,21 @@ from agent import ComputerAgent
 async def main():
     computer = Computer()
     await computer.run()
 
     # Create app-specific desktop sessions
     desktop = computer.create_desktop_from_apps(["Notes"])
 
     # Initialize an agent
     agent = ComputerAgent(
         model="anthropic/claude-3-5-sonnet-20241022",
         tools=[desktop]
     )
 
     # Take a screenshot (returns bytes by default)
     screenshot = await desktop.interface.screenshot()
     with open("app_screenshot.png", "wb") as f:
         f.write(screenshot)
 
     # Run an agent task
     async for result in agent.run("Create a new note titled 'Meeting Notes' and add today's agenda items"):
         print(f"Agent: {result.get('text', '')}")
@@ -113,6 +116,7 @@ if __name__ == "__main__":
 ### ⚠️ Important Warning
 
 Computer-use agents are powerful tools that can interact with your devices. This guide involves using your own macOS and iPhone instead of a VM. **Proceed at your own risk.** Always:
+
 - Review agent actions before running
 - Start with non-critical tasks
 - Monitor agent behavior closely
@@ -150,20 +154,20 @@ async def automate_iphone():
     # Connect to your local computer server
     my_mac = Computer(use_host_computer_server=True, os_type="macos", experiments=["app-use"])
     await my_mac.run()
 
     # Create a desktop focused on iPhone Mirroring
     my_iphone = my_mac.create_desktop_from_apps(["iPhone Mirroring"])
 
     # Initialize an agent for iPhone automation
     agent = ComputerAgent(
         model="anthropic/claude-3-5-sonnet-20241022",
         tools=[my_iphone]
     )
 
     # Example: Send a message
     async for result in agent.run("Open Messages and send 'Hello from Cua!' to John"):
         print(f"Agent: {result.get('text', '')}")
 
     # Example: Set a reminder
     async for result in agent.run("Create a reminder to call mom at 5 PM today"):
         print(f"Agent: {result.get('text', '')}")
@@ -175,6 +179,7 @@ if __name__ == "__main__":
 ### iPhone Automation Use Cases
 
 With Cua's iPhone automation, you can:
+
 - **Automate messaging**: Send texts, respond to messages, manage conversations
 - **Control apps**: Navigate any iPhone app using natural language
 - **Manage settings**: Adjust iPhone settings programmatically
@@ -191,6 +196,7 @@ With Cua's iPhone automation, you can:
 ## When to Use What: App-Use vs Multiple Cua Containers
 
 ### Use App-Use within the same macOS Cua Container:
+
 - ✅ You need lightweight, fast agent focusing (macOS only)
 - ✅ You want to run multiple agents on one desktop
 - ✅ You're automating personal devices like iPhones
@@ -198,6 +204,7 @@ With Cua's iPhone automation, you can:
 - ✅ You want low computational overhead
 
 ### Use Multiple Cua Containers:
+
 - ✅ You need maximum isolation between agents
 - ✅ You require cross-platform support (Mac/Linux/Windows)
 - ✅ You need guaranteed resource allocation
@@ -215,6 +222,7 @@ With Cua's iPhone automation, you can:
 ### How It Works
 
 When you create a desktop session with `create_desktop_from_apps()`, App-Use:
+
 - Filters the visual output to show only specified application windows
 - Routes input events only to those applications
 - Maintains window layout isolation between different sessions
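Taken together, those three behaviors mean a desktop session is just a scoped view over apps that are already running. A minimal sketch combining the calls shown earlier in this post (the flow is illustrative, not an official recipe):

```python
# Sketch: two isolated desktop sessions over one running macOS, using the
# App-Use calls shown earlier in this post.
import asyncio

from computer import Computer


async def main():
    computer = Computer(experiments=["app-use"])  # opt in to the experiment
    await computer.run()

    # Each desktop sees, and routes input to, only its own apps.
    safari_desktop = computer.create_desktop_from_apps(["Safari"])
    notes_desktop = computer.create_desktop_from_apps(["Notes"])

    # Screenshots are scoped per session (returns bytes by default).
    shot = await safari_desktop.interface.screenshot()
    with open("safari_only.png", "wb") as f:
        f.write(shot)


asyncio.run(main())
```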
@@ -1,6 +1,6 @@
 # Bringing Computer-Use to the Web
 
-*Published on August 5, 2025 by Morgan Dean*
+_Published on August 5, 2025 by Morgan Dean_
 
 In one of our original posts, we explored building Computer-Use Operators on macOS - first with a [manual implementation](build-your-own-operator-on-macos-1.md) using OpenAI's `computer-use-preview` model, then with our [cua-agent framework](build-your-own-operator-on-macos-2.md) for Python developers. While these tutorials have been incredibly popular, we've received consistent feedback from our community: **"Can we use Cua with JavaScript and TypeScript?"**
 
@@ -96,7 +96,7 @@ const res = await openai.responses.create({
     ],
   },
 ],
-  truncation: 'auto'
+  truncation: 'auto',
 });
 ```
 
@@ -144,30 +144,30 @@ Each response contains:
 
 ### Provision a Cua Cloud Container
 
 1. Visit [trycua.com](https://trycua.com), sign up, purchase [credits](https://trycua.com/pricing), and create a new container instance from the [dashboard](https://trycua.com/dashboard).
 2. Create an API key from the dashboard — be sure to save it in a secure location before continuing.
 3. Start the cloud container from the dashboard.
 
 ### Environment Setup
 
 1. Install required packages with your preferred package manager:
 
    ```bash
   npm install --save @trycua/computer # or yarn, pnpm, bun
   npm install --save openai # or yarn, pnpm, bun
   ```
 
   Works with any JavaScript/TypeScript project setup - whether you're using Create React App, Next.js, Vue, Angular, or plain JavaScript.
 
 2. Save your OpenAI API key, Cua API key, and container name to a `.env` file:
 
   ```bash
   OPENAI_API_KEY=openai-api-key
   CUA_API_KEY=cua-api-key
   CUA_CONTAINER_NAME=cua-cloud-container-name
   ```
 
   These environment variables work the same whether you're using vanilla JavaScript, TypeScript, or any web framework.
 
 ## Building the Agent
@@ -1,6 +1,6 @@
 # Build Your Own Operator on macOS - Part 1
 
-*Published on March 31, 2025 by Francesco Bonacci*
+_Published on March 31, 2025 by Francesco Bonacci_
 
 In this first blogpost, we'll learn how to build our own Computer-Use Operator using OpenAI's `computer-use-preview` model. But first, let's understand what some common terms mean:
 
@@ -19,6 +19,7 @@ Check out what it looks like to use your own Operator from a Gradio app:
 ## What You'll Learn
 
 By the end of this tutorial, you'll be able to:
+
 - Set up a macOS virtual machine for AI automation
 - Connect OpenAI's computer-use model to your VM
 - Create a basic loop for the AI to interact with your VM
@@ -26,6 +27,7 @@ By the end of this tutorial, you'll be able to:
 - Implement safety checks and error handling
 
 **Prerequisites:**
+
 - macOS Sonoma (14.0) or later
 - 8GB RAM minimum (16GB recommended)
 - OpenAI API access (Tier 3+)
@@ -41,15 +43,17 @@ Last March OpenAI released a fine-tuned version of GPT-4o, namely [CUA](https://
 Professor Ethan Mollick provides an excellent explanation of computer-use agents in this article: [When you give a Claude a mouse](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse).
 
 ### ChatGPT Operator
+
 OpenAI's computer-use model powers [ChatGPT Operator](https://openai.com/index/introducing-operator), a Chromium-based interface exclusively available to ChatGPT Pro subscribers. Users leverage this functionality to automate web-based tasks such as online shopping, expense report submission, and booking reservations by interacting with websites in a human-like manner.
 
 ## Benefits of Custom Operators
 
 ### Why Build Your Own?
+
 While OpenAI's Operator uses a controlled Chromium VM instance, there are scenarios where you may want to use your own VM with full desktop capabilities. Here are some examples:
 
 - Automating native macOS apps like Finder and Xcode
 - Managing files, changing settings, and running terminal commands
 - Testing desktop software and applications
 - Creating workflows that combine web and desktop tasks
 - Automating media editing in apps like Final Cut Pro and Blender
@@ -59,7 +63,9 @@ This gives you more control and flexibility to automate tasks beyond just web br
 ## Access Requirements
 
 ### Model Availability
+
 As we speak, the **computer-use-preview** model has limited availability:
+
 - Only accessible to OpenAI tier 3+ users
 - Additional application process may be required even for eligible users
 - Cannot be used in the OpenAI Playground
@@ -68,15 +74,18 @@ As we speak, the **computer-use-preview** model has limited availability:
 ## Understanding the OpenAI API
 
 ### Responses API Overview
+
 Let's start with the basics. In our case, we'll use OpenAI's Responses API to communicate with their computer-use model.
 
 Think of it like this:
+
 1. We send the model a screenshot of our VM and tell it what we want it to do
 2. The model looks at the screenshot and decides what actions to take
 3. It sends back instructions (like "click here" or "type this")
 4. We execute those instructions in our VM
 
 The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI's newest way to interact with their AI models. It comes with several built-in tools:
+
 - **Web search**: Let the AI search the internet
 - **File search**: Help the AI find documents
 - **Computer use**: Allow the AI to control a computer (what we'll be using)
@@ -84,9 +93,11 @@ The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI
 As we speak, the computer-use model is only available through the Responses API.
 
 ### Responses API Examples
+
 Let's look at some simple examples. We'll start with the traditional way of using OpenAI's API with Chat Completions, then show the new Responses API primitive.
 
 Chat Completions:
+
 ```python
 # The old way required managing conversation history manually
 messages = [{"role": "user", "content": "Hello"}]
@@ -98,13 +109,14 @@ messages.append(response.choices[0].message)  # Manual message tracking
 ```
 
 Responses API:
+
 ```python
 # Example 1: Simple web search
 # The API handles all the complexity for us
 response = client.responses.create(
     model="gpt-4",
     input=[{
         "role": "user",
         "content": "What's the latest news about AI?"
     }],
     tools=[{
@@ -118,7 +130,7 @@ response = client.responses.create(
 response = client.responses.create(
     model="gpt-4",
     input=[{
         "role": "user",
         "content": "Find documents about project X"
    }],
     tools=[{
@@ -130,6 +142,7 @@ response = client.responses.create(
 ```
 
 ### Computer-Use Model Setup
+
 For our operator, we'll use the computer-use model. Here's how we set it up:
 
 ```python
@@ -144,7 +157,7 @@ response = client.responses.create(
     }],
     input=[
         {
             "role": "user",
             "content": [
                 # What we want the AI to do
                 {"type": "input_text", "text": "Open Safari and go to google.com"},
@@ -158,6 +171,7 @@ response = client.responses.create(
 ```
 
 ### Understanding the Response
+
 When we send a request, the API sends back a response that looks like this:
 
 ```json
@@ -189,6 +203,7 @@ When we send a request, the API sends back a response that looks like this:
 ```
 
 Each response contains:
+
 1. **Reasoning**: The AI's explanation of what it's doing
 2. **Action**: The specific computer action to perform
 3. **Safety Checks**: Any potential risks to review
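To make those three parts concrete, here is a hypothetical sketch of a single `computer_call` item as a Python dict. The `call_id` and action fields mirror the loop code later in this post; the exact name and shape of the safety-check field are assumptions:

```python
# Hypothetical computer_call item; field names beyond call_id/action are assumptions.
computer_call = {
    "type": "computer_call",
    "call_id": "call_abc123",  # links this action to its later output
    "action": {
        # one of: click, type, scroll, keypress, wait, screenshot
        "type": "click",
        "button": "left",
        "x": 512,
        "y": 384,
    },
    "pending_safety_checks": [],  # risks to review before executing (assumed name)
}
```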
@@ -197,15 +212,18 @@ Each response contains:
 ## CUA-Computer Interface
 
 ### Architecture Overview
+
 Let's break down the main components of our system and how they work together:
 
 1. **The Virtual Machine (VM)**
+
    - Think of this as a safe playground for our AI
    - It's a complete macOS system running inside your computer
    - Anything the AI does stays inside this VM, keeping your main system safe
    - We use `lume` to create and manage this VM
 
 2. **The Computer Interface (CUI)**
+
    - This is how we control the VM
    - It can move the mouse, type text, and take screenshots
    - Works like a remote control for the VM
@@ -238,7 +256,7 @@ sequenceDiagram
     VM-->>CUI: Return current screen
     CUI->>AI: Send screenshot + instructions
     AI-->>CUI: Return next action
 
     Note over CUI,VM: Execute the action
     alt Mouse Click
         CUI->>VM: Move and click mouse
@@ -259,6 +277,7 @@ sequenceDiagram
 ```
 
 The diagram above shows how information flows through our system:
+
 1. You start the operator
 2. The Computer Interface creates a virtual macOS
 3. Then it enters a loop:
@@ -284,23 +303,26 @@ This design keeps everything organized and safe. The AI can only interact with t
 ```
 
 **Important Storage Notes:**
+
 - Initial download requires 80GB of free space
 - After first run, space usage reduces to ~30GB due to macOS's sparse file system
 - VMs are stored in `~/.lume`
 - Cached images are stored in `~/.lume/cache`
 
 You can check your downloaded VM images anytime:
+
 ```bash
 lume ls
 ```
 
 Example output:
 
 | name | os | cpu | memory | disk | display | status | ip | vnc |
-|--------------------------|---------|-------|---------|----------------|-----------|-----------|----------------|---------------------------------------------------|
+| ---- | --- | --- | ------ | ---- | ------- | ------ | --- | --- |
 | macos-sequoia-cua:latest | macOS | 12 | 16.00G | 64.5GB/80.0GB | 1024x768 | running | 192.168.64.78 | vnc://:kind-forest-zulu-island@127.0.0.1:56085 |
 
 After checking your available images, you can run the VM to ensure everything is working correctly:
 
 ```bash
 lume run macos-sequoia-cua:latest
 ```
@@ -309,12 +331,14 @@ This design keeps everything organized and safe. The AI can only interact with t
 **Note**: The `cua-computer` package requires Python 3.10 or later. We recommend creating a dedicated Python environment:
 
 **Using venv:**
+
 ```bash
 python -m venv cua-env
 source cua-env/bin/activate
 ```
 
 **Using conda:**
+
 ```bash
 conda create -n cua-env python=3.10
 conda activate cua-env
@@ -332,6 +356,7 @@ This design keeps everything organized and safe. The AI can only interact with t
 ### Building the Operator
 
 #### Importing Required Modules
+
 With the prerequisites installed and configured, we're ready to build our first operator.
 The following example uses asynchronous Python (async/await). You can run it either in a VS Code Notebook or as a standalone Python script.
 
@@ -344,12 +369,13 @@ from computer import Computer
 ```
 
 #### Mapping API Actions to CUA Methods
+
 The following helper function converts a `computer_call` action from the OpenAI Responses API into corresponding commands on the CUI interface. For example, if the API instructs a `click` action, we move the cursor and perform a left click on the lume VM sandbox. We will use the computer interface to execute the actions.
 
 ```python
 async def execute_action(computer, action):
     action_type = action.type
 
     if action_type == "click":
         x = action.x
         y = action.y
@@ -360,12 +386,12 @@ async def execute_action(computer, action):
             await computer.interface.right_click()
         else:
             await computer.interface.left_click()
 
     elif action_type == "type":
         text = action.text
         print(f"Typing text: {text}")
         await computer.interface.type_text(text)
 
     elif action_type == "scroll":
         x = action.x
         y = action.y
@@ -374,7 +400,7 @@ async def execute_action(computer, action):
         print(f"Scrolling at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})")
         await computer.interface.move_cursor(x, y)
         await computer.interface.scroll(scroll_y)  # Using vertical scroll only
 
     elif action_type == "keypress":
         keys = action.keys
         for key in keys:
@@ -386,23 +412,24 @@ async def execute_action(computer, action):
                 await computer.interface.press_key("space")
             else:
                 await computer.interface.press_key(key)
 
     elif action_type == "wait":
         wait_time = action.time
         print(f"Waiting for {wait_time} seconds")
         await asyncio.sleep(wait_time)
 
     elif action_type == "screenshot":
         print("Taking screenshot")
         # This is handled automatically in the main loop, but we can take an extra one if requested
         screenshot = await computer.interface.screenshot()
         return screenshot
 
     else:
         print(f"Unrecognized action: {action_type}")
 ```
 
 #### Implementing the Computer-Use Loop
+
 This section defines a loop that:
+
 1. Initializes the cua-computer instance (connecting to a macOS sandbox).
@@ -423,7 +450,7 @@ async def cua_openai_loop():
         os_type="macos"
     ) as computer:
         await computer.run()  # Start the lume VM
 
         # Capture the initial screenshot
         screenshot = await computer.interface.screenshot()
         screenshot_base64 = base64.b64encode(screenshot).decode('utf-8')
@@ -438,8 +465,8 @@ async def cua_openai_loop():
                 "environment": "mac"
             }],
             input=[
                 {
                     "role": "user",
                     "content": [
                         {"type": "input_text", "text": "Open Safari, download and install Cursor."},
                         {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"}
@@ -488,7 +515,7 @@ async def cua_openai_loop():
                 "display_height": 768,
                 "environment": "mac"
             }],
             input=[{
                 "type": "computer_call_output",
                 "call_id": last_call_id,
                 "acknowledged_safety_checks": acknowledged_checks,
@@ -511,12 +538,15 @@ if __name__ == "__main__":
 You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-1.ipynb).
 
 #### Request Handling Differences
+
 The first request to the OpenAI Responses API is special in that it includes the initial screenshot and prompt. Subsequent requests are handled differently, using the `computer_call_output` type to provide feedback on the executed action.
 
 ##### Initial Request Format
+
 - We use `role: "user"` with `content` that contains both `input_text` (the prompt) and `input_image` (the screenshot)
 
 ##### Subsequent Request Format
+
 - We use `type: "computer_call_output"` instead of the user role
 - We include the `call_id` to link the output to the specific previous action that was executed
 - We provide any `acknowledged_safety_checks` that were approved
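Condensed from the loop code above, the follow-up input item looks roughly like this; `last_call_id`, `acknowledged_checks`, and `screenshot_base64` are the loop's variables, and the shape of the `output` field is an assumption based on the initial request format:

```python
# Sketch of the subsequent-request payload; variable names come from the loop above.
followup_input = [{
    "type": "computer_call_output",
    "call_id": last_call_id,  # ties this output to the executed action
    "acknowledged_safety_checks": acknowledged_checks,
    "output": {  # assumed to mirror the input_image format of the first request
        "type": "input_image",
        "image_url": f"data:image/png;base64,{screenshot_base64}",
    },
}]
```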
@@ -529,6 +559,7 @@ This structured approach allows the API to maintain context and continuity throu
 ## Conclusion
 
 ### Summary
+
 This blogpost demonstrates a single iteration of an OpenAI Computer-Use loop where:
 
 - A macOS sandbox is controlled using the CUA interface.
@@ -538,9 +569,11 @@ This blogpost demonstrates a single iteration of a OpenAI Computer-Use loop wher
 In a production setting, you would wrap the action-response cycle in a loop, handling multiple actions and safety checks as needed.
 
 ### Next Steps
+
 In the next blogpost, we'll introduce our Agent framework, which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/agent) package!
 
 ### Resources
+
 - [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use)
 - [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer)
 - [lume](https://github.com/trycua/cua/tree/main/libs/lume)
@@ -1,6 +1,6 @@
|
||||
# Build Your Own Operator on macOS - Part 2
|
||||
|
||||
*Published on April 27, 2025 by Francesco Bonacci*
|
||||
_Published on April 27, 2025 by Francesco Bonacci_
|
||||
|
||||
In our [previous post](build-your-own-operator-on-macos-1.md), we built a basic Computer-Use Operator from scratch using OpenAI's `computer-use-preview` model and our [cua-computer](https://pypi.org/project/cua-computer) package. While educational, implementing the control loop manually can be tedious and error-prone.
|
||||
|
||||
@@ -13,12 +13,14 @@ In this follow-up, we'll explore our [cua-agent](https://pypi.org/project/cua-ag
|
||||
## What You'll Learn
|
||||
|
||||
By the end of this tutorial, you'll be able to:
|
||||
|
||||
- Set up the `cua-agent` framework with various agent loop types and model providers
|
||||
- Understand the different agent loop types and their capabilities
|
||||
- Work with local models for cost-effective workflows
|
||||
- Use a simple UI for your operator
|
||||
|
||||
**Prerequisites:**
|
||||
|
||||
- Completed setup from Part 1 ([lume CLI installed](https://github.com/trycua/cua?tab=readme-ov-file#option-2-full-computer-use-agent-capabilities), macOS CUA image already pulled)
|
||||
- Python 3.10+. We recommend using Conda (or Anaconda) to create an ad hoc Python environment.
|
||||
- API keys for OpenAI and/or Anthropic (optional for local models)
|
||||
@@ -58,6 +60,7 @@ pip install "cua-agent[ui]" # Gradio UI

Before running any code examples, let's set up a proper environment:

1. **Create a new directory** for your project:

```bash
mkdir cua-agent-tutorial
cd cua-agent-tutorial
```

@@ -66,13 +69,15 @@ Before running any code examples, let's set up a proper environment:

2. **Set up a Python environment** using one of these methods:

**Option A: Using conda command line**

```bash
# Using conda
conda create -n cua-agent python=3.10
conda activate cua-agent
```

**Option B: Using Anaconda Navigator UI**

- Open Anaconda Navigator
- Click on "Environments" in the left sidebar
- Click the "Create" button at the bottom

@@ -80,36 +85,41 @@ Before running any code examples, let's set up a proper environment:

- Select Python 3.10
- Click "Create"
- Once created, select the environment and click "Open Terminal" to activate it

**Option C: Using venv**

```bash
python -m venv cua-env
source cua-env/bin/activate # On macOS/Linux
```

3. **Install the cua-agent package**:

```bash
pip install "cua-agent[all]"
```

4. **Set up your API keys as environment variables**:

```bash
# For OpenAI models
export OPENAI_API_KEY=your_openai_key_here

# For Anthropic models (if needed)
export ANTHROPIC_API_KEY=your_anthropic_key_here
```

5. **Create a Python file or notebook**:

**Option A: Create a Python script**

```bash
# For a Python script
touch cua_agent_example.py
```

**Option B: Use VS Code notebooks**

- Open VS Code
- Install the Python extension if you haven't already
- Create a new file with a `.ipynb` extension (e.g., `cua_agent_tutorial.ipynb`)
@@ -120,9 +130,10 @@ Now you're ready to run the code examples!

## Understanding Agent Loops

If you recall from Part 1, we had to implement a custom interaction loop to interact with the `computer-use-preview` model.

In the `cua-agent` framework, an **Agent Loop** is the core abstraction that implements the continuous interaction cycle between an AI model and the computer environment. It manages the flow of:

1. Capturing screenshots of the computer's state
2. Processing these screenshots (with or without UI element detection)
3. Sending this visual context to an AI model along with the task instructions
@@ -141,6 +152,7 @@ While the core concept remains the same across all agent loops, different AI mod

| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

Each loop handles the same basic pattern we implemented manually in Part 1:

1. Take a screenshot of the VM
2. Send the screenshot and task to the AI model
3. Receive an action to perform
@@ -169,13 +181,13 @@ Choosing the right agent loop depends not only on your API access and technical

The performance of different Computer-Use models varies significantly across tasks. These benchmark evaluations measure an agent's ability to follow instructions and complete real-world tasks in different computing environments.

| Benchmark type   | Benchmark                                                           | UI-TARS-1.5 | OpenAI CUA | Claude 3.7 | Previous SOTA   | Human |
| ---------------- | ------------------------------------------------------------------- | ----------- | ---------- | ---------- | --------------- | ----- |
| **Computer Use** | [OSworld](https://arxiv.org/abs/2404.07972) (100 steps)             | **42.5**    | 36.4       | 28         | 38.1 (200 step) | 72.4  |
|                  | [Windows Agent Arena](https://arxiv.org/abs/2409.08264) (50 steps)  | **42.1**    | -          | -          | 29.8            | -     |
| **Browser Use**  | [WebVoyager](https://arxiv.org/abs/2401.13919)                      | 84.8        | **87**     | 84.1       | 87              | -     |
|                  | [Online-Mind2web](https://arxiv.org/abs/2504.01382)                 | **75.8**    | 71         | 62.9       | 71              | -     |
| **Phone Use**    | [Android World](https://arxiv.org/abs/2405.14573)                   | **64.2**    | -          | -          | 59.5            | -     |

### When to Use Each Loop

@@ -210,10 +222,10 @@ async def run_simple_task():
        model="openai/computer-use-preview",
        tools=[macos_computer]
    )

    # Define a simple task
    task = "Open Safari and search for 'Python tutorials'"

    # Run the task and process responses
    async for result in agent.run(task):
        print(f"Action: {result.get('text')}")

@@ -225,6 +237,7 @@ if __name__ == "__main__":

3. Save the file
4. Open a terminal, navigate to your project directory, and run:

```bash
python simple_task.py
```

@@ -232,6 +245,7 @@ if __name__ == "__main__":

5. The code will initialize the macOS virtual machine, create an agent, and execute the task of opening Safari and searching for Python tutorials.

You can also run this in a VS Code notebook:

1. Create a new notebook in VS Code (.ipynb file)
2. Copy the code into a cell (without the `if __name__ == "__main__":` part)
3. Run the cell to execute the code
@@ -259,7 +273,7 @@ async def run_multi_task_workflow():
        model="anthropic/claude-3-5-sonnet-20241022",
        tools=[macos_computer]
    )

    tasks = [
        "Open Safari and go to github.com",
        "Search for 'trycua/cua'",
@@ -267,7 +281,7 @@ async def run_multi_task_workflow():
        "Click on the 'Issues' tab",
        "Read the first open issue"
    ]

    for i, task in enumerate(tasks):
        print(f"\nTask {i+1}/{len(tasks)}: {task}")
        async for result in agent.run(task):
@@ -301,13 +315,13 @@ async for result in agent.run(task):
    # Basic information
    print(f"Response ID: {result.get('id')}")
    print(f"Response Text: {result.get('text')}")

    # Detailed token usage statistics
    usage = result.get('usage')
    if usage:
        print(f"Input Tokens: {usage.get('input_tokens')}")
        print(f"Output Tokens: {usage.get('output_tokens')}")

    # Reasoning and actions
    for output in result.get('output', []):
        if output.get('type') == 'reasoning':
@@ -318,6 +332,7 @@ async for result in agent.run(task):
```

This structured format allows you to:

- Log detailed information about agent actions
- Provide real-time feedback to users
- Track token usage for cost monitoring
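
To make that concrete, here's a minimal sketch of a logging helper built around the fields shown above (`id`, `text`, `usage`, and `output`); the helper name and log format are our own, not part of the framework:

```python
def log_agent_result(result: dict) -> None:
    """Print a compact summary of one agent response dict."""
    print(f"[{result.get('id')}] {result.get('text')}")

    # Token accounting for cost monitoring
    usage = result.get('usage') or {}
    total = (usage.get('input_tokens') or 0) + (usage.get('output_tokens') or 0)
    print(f"  tokens used: {total}")

    # Surface any reasoning traces the model emitted
    for output in result.get('output', []):
        if output.get('type') == 'reasoning':
            print(f"  reasoning: {output}")

# Usage inside the loop from the snippet above:
# async for result in agent.run(task):
#     log_agent_result(result)
```
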
@@ -330,6 +345,7 @@ One of the most powerful features of the framework is the ability to use local m

**How to run this example:**

1. First, you'll need to install Ollama for running local models:

   - Visit [ollama.com](https://ollama.com) and download the installer for your OS
   - Follow the installation instructions
   - Pull the Gemma 3 model:
@@ -350,9 +366,9 @@ async def run_with_local_model():
        model="omniparser+ollama_chat/gemma3",
        tools=[macos_computer]
    )

    task = "Open the Calculator app and perform a simple calculation"

    async for result in agent.run(task):
        print(f"Action: {result.get('text')}")

@@ -379,12 +395,14 @@ agent = ComputerAgent(
```

Common local endpoints include:

- LM Studio: `http://localhost:1234/v1`
- vLLM: `http://localhost:8000/v1`
- LocalAI: `http://localhost:8080/v1`
- Ollama with OpenAI compat: `http://localhost:11434/v1`

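For instance, a minimal sketch pointing the agent at one of these endpoints (the `LLM`/`LLMProvider` usage follows the UI-TARS example later in this post; the import path and model name are assumptions, standing in for whatever your server hosts):

```python
from agent import ComputerAgent, LLM, LLMProvider  # import path assumed

# Point the agent at a local OpenAI-compatible server (Ollama shown here).
agent = ComputerAgent(
    model=LLM(
        provider=LLMProvider.OAICOMPAT,
        name="gemma3",  # placeholder: any model your endpoint serves
        provider_base_url="http://localhost:11434/v1",
    ),
    tools=[macos_computer]
)
```
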
This approach is perfect for:

- Development and testing without incurring API costs
- Offline or air-gapped environments where API access isn't possible
- Privacy-sensitive applications where data can't leave your network
@@ -406,8 +424,8 @@ UI-TARS is ByteDance's Computer-Use model designed for navigating OS-level inter

```python
agent = ComputerAgent(
    model=LLM(
        provider=LLMProvider.OAICOMPAT,
        name="tgi",
        provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1"
    ),
    tools=[macos_computer]
@@ -475,11 +493,13 @@ if __name__ == "__main__":
```

2. Install the UI dependencies if you haven't already:

```bash
pip install "cua-agent[ui]"
```

3. Run the script:

```bash
python launch_ui.py
```
@@ -498,12 +518,14 @@ if __name__ == "__main__":
```

When you run this, Gradio will display both a local URL and a public URL like:

```
Running on local URL: http://127.0.0.1:7860
Running on public URL: https://abcd1234.gradio.live
```

**Security Note:** Be cautious when sharing your Gradio UI publicly:

- The public URL gives anyone with the link full access to your agent
- Consider using basic authentication for additional protection (see the sketch below)
@@ -513,6 +535,7 @@ Running on public URL: https://abcd1234.gradio.live
- The temporary link expires when you stop the Gradio application

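A minimal sketch of that basic-auth option, assuming Gradio's standard `auth` parameter on `launch` and an app object named `demo`; the credentials are placeholders:

```python
# Hypothetical hardening of the launch call: Gradio's built-in
# basic-auth gate prompts for these credentials before serving the UI.
demo.launch(share=True, auth=("operator", "change-me"))
```
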
The Gradio UI provides:

- Model provider selection
- Agent loop selection
- Task input field
@@ -566,7 +589,7 @@ async def github_workflow():
        verbosity=logging.INFO,
        tools=[macos_computer]
    )

    tasks = [
        "Look for a repository named trycua/cua on GitHub.",
        "Check the open issues, open the most recent one and read it.",
@@ -575,7 +598,7 @@ async def github_workflow():
        "From Cursor, open Composer if not already open.",
        "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
    ]

    for i, task in enumerate(tasks):
        print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
        async for result in agent.run(task):
@@ -587,11 +610,13 @@ if __name__ == "__main__":
```

2. Make sure your OpenAI API key is set:

```bash
export OPENAI_API_KEY=your_openai_key_here
```

3. Run the script:

```bash
python github_workflow.py
```
@@ -604,6 +629,7 @@ if __name__ == "__main__":

- Use Cursor's AI features to work on a solution

This example:

1. Searches GitHub for a repository
2. Reads an issue
3. Clones the repository
@@ -615,6 +641,7 @@ This example:

Let's compare our manual implementation from Part 1 with the framework approach:

### Manual Implementation (Part 1)

- Required writing custom code for the interaction loop
- Needed explicit handling of different action types
- Required direct management of the OpenAI API calls
@@ -622,6 +649,7 @@ Let's compare our manual implementation from Part 1 with the framework approach:
- Limited to OpenAI's computer-use model

### Framework Implementation (Part 2)

- Abstracts the interaction loop
- Handles all action types automatically
- Manages API calls internally
@@ -634,17 +662,21 @@ Let's compare our manual implementation from Part 1 with the framework approach:

The `cua-agent` framework transforms what was a complex implementation task into a simple, high-level interface for building Computer-Use Agents. By abstracting away the technical details, it lets you focus on defining the tasks rather than the machinery.

### When to Use Each Approach

- **Manual Implementation (Part 1)**: When you need complete control over the interaction loop or are implementing a custom solution
- **Framework (Part 2)**: For most applications where you want to quickly build and deploy Computer-Use Agents

### Next Steps

With the basics covered, you might want to explore:

- Customizing the agent's behavior with additional parameters
- Building more complex workflows spanning multiple applications
- Integrating your agent into other applications
- Contributing to the open-source project on GitHub

### Resources

- [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent)
- [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb)
- [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses)

@@ -1,6 +1,6 @@

# Announcing Cua Agent framework 0.4 and Composite Agents

_Published on August 26, 2025 by Dillon DuPont_

<img src="./assets/composite-agents.png" alt="Composite Agents">

@@ -12,7 +12,7 @@ This is the kind of problem that makes you wonder if we're building the future o

## What we fixed

Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.

Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-3-5-sonnet-20241022"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.

@@ -42,7 +42,7 @@ agent = ComputerAgent(

This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository.

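As a rough sketch of that pairing (the model IDs and the `grounding+planning` composite string below are illustrative; see the composed-agents docs linked at the end of this post for the supported combinations):

```python
from agent import ComputerAgent  # import path assumed

# Illustrative grounding+planning pair: a local UI-TARS model grounds
# clicks on pixels, while Claude plans the overall task. Swap in any
# supported combination on either side of the "+".
agent = ComputerAgent(
    model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+anthropic/claude-3-5-sonnet-20241022",
    tools=[computer],
)
```
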
You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:

```python
agent = ComputerAgent(
@@ -63,12 +63,11 @@ We're building integration with HUD evals, allowing us to curate and benchmark m

If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects.

---

## Links

- **Composite Agent Docs:** [https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents](https://docs.trycua.com/docs/agent-sdk/supported-agents/composed-agents)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)

Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.

@@ -1,6 +1,6 @@

# Computer-Use Agents SOTA Challenge: Hack the North + Global Online

_Published on August 25, 2025 by Francesco Bonacci_

We’re bringing something new to [Hack the North](https://hackthenorth.com), Canada’s largest hackathon, this year: a head-to-head competition for **Computer-Use Agents** - on-site at Waterloo and a **Global online challenge**. From September 12–14, 2025, teams build on the **Cua Agent Framework** and are scored in **HUD’s OSWorld-Verified** environment to push past today’s SOTA on [OS-World](https://os-world.github.io).

@@ -14,7 +14,8 @@ There’s one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Age

**Cua** and [**Ollama**](https://ollama.com) organize a global hackathon to find the **most creative uses of local and hybrid computer-use agents**. There are no geographic restrictions on who can join — this is a worldwide competition focused on **originality, impact, and inventive applications** that showcase what's possible with local and hybrid inference.

**Prizes:**

- 1st **MacBook Air M4 (or equivalent value)** + features in Cua & Ollama channels
- 2nd **$500 CAD + swag**
- 3rd **swag + public feature**
@@ -26,36 +27,42 @@ There’s one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Age

Two different tracks, two different processes:

### On-site (Track A)

Build during the weekend and submit a repo with a one-line start command. **HUD** executes your command in a clean environment and runs **OSWorld-Verified**. Scores come from official benchmark results; ties break by median, then wall-clock time, then earliest submission. Any model setup is allowed (cloud or local).

**HUD** runs official evaluations immediately after submission. Winners are announced at the **closing ceremony**.

### Rules

- Fork and star the [Cua repo](https://github.com/trycua/cua).
- Add your agent and instructions in `samples/community/hack-the-north/<YOUR_TEAM_NAME>`.
- Include a README with details on the approach and any required notes.
- Submit a PR.

**Deadline: Sept 15, 8:00 AM EDT**

### Global Online (Track B)

Open to anyone, anywhere. Build on your own timeline and submit through the **Cua Discord form** by the deadline.

**Project Requirements:**

- Your agent must integrate **Cua and Ollama** in some way
- Your agent must be **easily runnable by judges**

Judged by **Cua** and **Ollama** teams on:

- **Creativity (30%)** – originality, usefulness, surprise factor
- **Technical Depth (30%)** – quality of engineering and agent design
- **Use of Ollama (30%)** – effective integration of local/hybrid inference
- **Polish (10%)** – presentation, clarity, demo readiness

### Submission Process

Submissions will be collected via a **form link provided in the Cua Discord**. Your submission must contain:

- **GitHub repo** containing the agent source code and a clear README with instructions on how to use the agent
- **Explanation** of the models and tools used, and what's local or hybrid about your design
- **Short demo video** (up to two minutes)

A **commit freeze** will be used to ensure that no changes are made after the deadline. Winners will be announced after judging is complete.
@@ -68,12 +75,13 @@ A **commit freeze** will be used to ensure that no changes are made after the de

Bring a team, pick a model stack, and push what agents can do on real computers. We can’t wait to see what you build at **Hack the North 2025**.

**Discord channels**

- Join the Discord first: https://discord.gg/cua-ai
- **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342
- **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226

**Contact**
Questions on Hack the North? Email **hackthenorth@trycua.com**.

_P.S. If you’re planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at docs.trycua.com; we’ll share office-hour times in both Discord channels._

@@ -1,6 +1,6 @@

# What happens when hackathon judging is a public benchmark (Hack the North edition)

_Written by Francesco Bonacci — Reviewed by Parth Patel (HUD W25) — Sept 25, 2025_

## Prologue

@@ -16,7 +16,7 @@ The rest, as they say, was a 36h story worth telling—and a playbook worth shar

## The sign-up problem we had to invent

We joined as a sponsor at the last minute, thanks to a push from our friend @Michael Chiang at Ollama—Waterloo alum, naturally. It’s kind of an open secret that UWaterloo turns out some of the sharpest hackers around (_no pun intended, HackMIT_). It was a bit of a scramble, but also great timing—our Agent framework had just finished a major refactor, with support for **100+ VLM configurations** now live. Naturally, we wanted to stress-test it at scale—and see whether teams could come up with SOTA-level setups. _This wasn’t a blank-slate, build-whatever-you-want kind of track._

From day one, though, we knew we’d have to fight for sign-ups. This was a niche track, and a guaranteed YC interview alone wouldn’t be enough to pull people in.

@@ -24,7 +24,7 @@ Unfortunately, Hack the North (HTN) didn’t offer an interest form to help us e

On top of that, we were discouraged from external promotion on [lu.ma](http://lu.ma). So we spun up our own sign-up page at **trycua.com/hackathon** and built ad-hoc Discord channels to share track details. We emphasized—repeatedly—that only students already accepted to Hack the North should register.

_(Moral: the “measure-zero effect”—no matter how many times you say it, some people won’t see it. Plenty of invalid sign-ups still slipped through.)_

Even so, having your own form is absolutely worth it: it gives you an **early funnel**, surfaces demand signals ahead of time, and—crucially—**lets you require platform sign-up before kickoff**. In our case, Hack the North didn’t provide Devpost access until the very end, so our form was the only way to build a working roster.

@@ -45,13 +45,13 @@ Day 0 on campus made the difference. We arrived a couple of hours early to colle

_(Our Founding Engineer, Morgan, hangs out with students at the stand, while Adam (runner-up) hacks on the side.)_

## 02:30 a.m. is still prime time at a hackathon

Hack the North gives sponsors a 30-minute API Workshop during the early hours of the event—a perfect moment to shift from talking to building.

Our slot landed at **2:30 a.m.** (_perks of the cheapest sponsor tier_). Thirty students showed up, energy surprisingly high. James, our new Founding DevRel Engineer, led the session and nailed it.

**Our track rules were simple:**

@@ -67,7 +67,7 @@ Our slot landed at **2:30 a.m.** (_perks of the cheapest sponsor tier_). Thirty

_(Our CUA Workshop at 2:30 AM.)_

## Making it possible to focus on the work

@@ -87,7 +87,7 @@ We provided:

**After the workshop buzz.** Morning interest was high, but Docker setup + requiring focus on a single track thinned the crowd. Most sponsor prizes are broad (“use our product and you qualify”), letting students stack tracks. Ours required commitment. Upside: those who stayed shipped sharper, higher-quality submissions.

**The bell curve of submissions.** Most entries used _claude-sonnet-4-20250514_—proof that docs and public leaderboards ([OSWorld](https://os-world.github.io/#benchmark)) guide choices. Results clustered around the safe pick, with fewer pushing boundaries.

**Who went beyond the baseline.** A few tried multi-agent/tool graphs. One standout—[**cuala**](https://github.com/YeIIcw/cuala)—was a clean reference: deterministic actions, verifiable state changes, callbacks for saving images and trajectories.

@@ -97,7 +97,7 @@ We provided:

We skipped a full end-to-end **Cua × HUD** dry-run. It showed.

- Hackers ran out of inference credits. Desktop tasks are token-heavy. A full OSWorld run (200 max steps) for _computer-use-preview_ (OpenAI Operator API) can cost >$600. Serious attempts: ~400k tokens × 14 tasks.
- Python version/build mismatches surfaced, requiring debug time across both OSS repos.
- Our Cua framework lacked a **Response Agent** to complete evaluation loops. Some runs stalled until patched.

@@ -112,28 +112,31 @@ We skipped a full end-to-end **Cua × HUD** dry-run. It showed.

_(Leaderboard on HUD)_

### Winners

**🥇 Winner — Ram**

- Devpost: https://devpost.com/software/sota-computer-use-agent-challenge
- Code: https://github.com/Ram-Raghav-S/cua/tree/ram
- Score: 68.3%

**🥈 Runner-up — Aryan**

- Devpost: https://devpost.com/software/loopdeloop-computer-use-agent-sota-attempt
- Code: https://github.com/Tumph/cua
- Score: 55.9%

**🥉 Special Mention — Adam**

- Devpost: https://devpost.com/software/cuala
- Code: https://github.com/YeIIcw/cuala
- Score: 42.1%

_(Our finalists before the award ceremony)_

## What We’d Keep

@@ -163,4 +166,4 @@ Whether you’re a hacker who wants to participate, or a company looking to spon

_(HTN Closing Ceremony — Cua Track Winner Announcement)_

@@ -1,6 +1,6 @@

# Cua × HUD - Evaluate Any Computer-Use Agent

_Published on August 27, 2025 by Dillon DuPont_

You can now benchmark any GUI-capable agent on real computer-use tasks through our new integration with [HUD](https://hud.so), the evaluation platform for computer-use agents.

@@ -70,9 +70,9 @@ Watch your agent work in real-time. Example output:

```md
Starting full dataset run...
╔═════════════════════════════════════════════════════════════════╗
║ 🚀 See your agent live at:                                       ║
╟─────────────────────────────────────────────────────────────────╢
║ https://app.hud.so/jobs/fe05805d-4da9-4fc6-84b5-5c518528fd3c     ║
╚═════════════════════════════════════════════════════════════════╝
```

@@ -90,4 +90,4 @@ Customize your evaluation with these options:

- Notebook with end‑to‑end examples: https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb
- Docs: https://docs.trycua.com/docs/agent-sdk/integrations/hud
- Live traces: https://app.hud.so

@@ -1,10 +1,10 @@

# When Agents Need Human Wisdom - Introducing Human-In-The-Loop Support

_Published on August 29, 2025 by Francesco Bonacci_

Sometimes the best AI agent is a human. Whether you're creating training demonstrations, evaluating complex scenarios, or need to intervene when automation hits a wall, our new Human-In-The-Loop integration puts you directly in control.

With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could benchmark any agent at scale. Today's update lets you _become_ the agent when it matters most—seamlessly switching between automated intelligence and human judgment.

<div align="center">
  <video src="https://github.com/user-attachments/assets/9091b50f-26e7-4981-95ce-40e5d42a1260" width="600" controls></video>
@@ -20,11 +20,12 @@ With yesterday's [HUD evaluation integration](hud-agent-evals.md), you could ben

## Why Human-In-The-Loop?

Even the most sophisticated agents encounter edge cases, ambiguous interfaces, or tasks requiring human judgment. Rather than failing gracefully, they can now fail _intelligently_—by asking for human help.

This approach bridges the gap between fully automated systems and pure manual control, letting you:

- **Demonstrate complex workflows** that agents can learn from
- **Evaluate tricky scenarios** where ground truth requires human assessment
- **Intervene selectively** when automated agents need guidance
- **Test and debug** your tools and environments manually

@@ -64,7 +65,7 @@ Combine model intelligence with human precision—let AI plan, then execute manu

```python
agent = ComputerAgent(
    "huggingface-local/HelloKKMe/GTA1-7B+human/human",
    tools=[computer]
)

@@ -81,7 +82,7 @@ Start automated, escalate to human when needed:

# Primary automated agent
primary_agent = ComputerAgent("openai/computer-use-preview", tools=[computer])

# Human fallback agent
fallback_agent = ComputerAgent("human/human", tools=[computer])

try:
@@ -101,22 +102,26 @@ except Exception:

The human-in-the-loop interface provides a rich, responsive experience:

### **Visual Environment**

- **Screenshot display** with live updates as you work
- **Click handlers** for direct interaction with UI elements
- **Zoom and pan** to see details clearly

### **Action Controls**

- **Click actions** - precise cursor positioning and clicking
- **Keyboard input** - type text naturally or send specific key combinations
- **Action history** - see the sequence of actions taken
- **Undo support** - step back when needed

### **Tool Integration**

- **Full OpenAI compatibility** - standard tool call format
- **Custom tools** - integrate your own tools seamlessly
- **Real-time feedback** - see tool responses immediately

### **Smart Polling**

- **Responsive updates** - UI refreshes when new completions arrive
- **Background processing** - continue working while waiting for tasks
- **Session persistence** - resume interrupted sessions
@@ -124,6 +129,7 @@ The human-in-the-loop interface provides a rich, responsive experience:

## Real-World Use Cases

### **Training Data Generation**

Create perfect demonstrations for fine-tuning:

```python
@@ -132,7 +138,7 @@ demo_agent = ComputerAgent("human/human", tools=[computer])

tasks = [
    "Create a budget spreadsheet with income and expense categories",
    "Apply conditional formatting to highlight overbudget items",
    "Generate a pie chart showing expense distribution"
]

@@ -143,6 +149,7 @@ for task in tasks:
```

### **Evaluation and Ground Truth**

Validate agent performance on complex scenarios:

```python
@@ -154,6 +161,7 @@ async for _ in evaluator.run("Review this completed form and rate accuracy (1-10
```

### **Interactive Debugging**

Step through agent behavior manually:

```python
@@ -165,6 +173,7 @@ async for _ in debug_agent.run("Reproduce the agent's failed login sequence"):
```

### **Edge Case Handling**

Handle scenarios that break automated agents:

```python
@@ -180,26 +189,26 @@ async for _ in edge_case_agent.run("Navigate this CAPTCHA-protected form"):

Customize the human agent experience:

- **UI refresh rate**: Adjust polling frequency for your workflow
- **Image quality**: Balance detail vs. performance for screenshots
- **Action logging**: Save detailed traces for analysis and training
- **Session timeout**: Configure idle timeouts for security
- **Tool permissions**: Restrict which tools humans can access

## When to Use Human-In-The-Loop

| **Scenario**                 | **Why Human Control**                                  |
| ---------------------------- | ------------------------------------------------------ |
| **Creating training data**   | Perfect demonstrations for model fine-tuning           |
| **Evaluating complex tasks** | Human judgment for subjective or nuanced assessment    |
| **Handling edge cases**      | CAPTCHAs, unusual UIs, context-dependent decisions     |
| **Debugging workflows**      | Step through failures to identify breaking points      |
| **High-stakes operations**   | Critical tasks requiring human oversight and approval  |
| **Testing new environments** | Validate tools and environments work as expected       |

## Learn More

- **Interactive examples**: Try human-in-the-loop control with sample tasks
- **Training data pipelines**: Learn how to convert human demonstrations into model training data
- **Evaluation frameworks**: Build human-validated test suites for your agents
- **API documentation**: Full reference for human agent configuration

@@ -207,4 +216,4 @@ Ready to put humans back in the loop? The most sophisticated AI system knows whe

---

_Questions about human-in-the-loop agents? Join the conversation in our [Discord community](https://discord.gg/cua-ai) or check out our [documentation](https://docs.trycua.com/docs/agent-sdk/supported-agents/human-in-the-loop)._

@@ -1,6 +1,6 @@

# Introducing Cua Cloud Sandbox: Computer-Use Agents in the Cloud

_Published on May 28, 2025 by Francesco Bonacci_

Welcome to the next chapter in our Computer-Use Agent journey! In [Part 1](./build-your-own-operator-on-macos-1), we showed you how to build your own Operator on macOS. In [Part 2](./build-your-own-operator-on-macos-2), we explored the cua-agent framework. Today, we're excited to introduce **Cua Cloud Sandbox** – the easiest way to deploy Computer-Use Agents at scale.

@@ -14,9 +14,9 @@ Think of Cua Cloud as **Docker for Computer-Use Agents**. Instead of managing VM

## Why Cua Cloud Sandbox?

Four months ago, we launched [**Lume**](https://github.com/trycua/cua/tree/main/libs/lume) and [**Cua**](https://github.com/trycua/cua) with the goal to bring sandboxed VMs and Computer-Use Agents on Apple Silicon. The developer community's response was incredible 🎉

Going from prototype to production revealed a problem though: **local macOS VMs don't scale**, nor are they easily portable.

Our Discord community, YC peers, and early pilot customers kept hitting the same issues. Storage constraints meant **20-40GB per VM** filled laptops fast. Different hardware architectures (Apple Silicon ARM vs Intel x86) prevented portability of local workflows. Every new user lost a day to setup and configuration.

@@ -55,7 +55,7 @@ async def run_cloud_agent():
        name=os.getenv("CUA_CONTAINER_NAME"),
        provider_type=VMProviderType.CLOUD,
    )

    # Create an agent with your preferred loop
    agent = ComputerAgent(
        model="openai/gpt-4o",
@@ -63,7 +63,7 @@ async def run_cloud_agent():
        verbosity=logging.INFO,
        tools=[computer]
    )

    # Run a task
    async for result in agent.run("Open Chrome and search for AI news"):
        print(f"Response: {result.get('text')}")
@@ -102,14 +102,14 @@ async def github_automation():
        name="github-automation",
        provider_type=VMProviderType.CLOUD,
    )

    agent = ComputerAgent(
        model="openai/gpt-4o",
        save_trajectory=True,
        verbosity=logging.INFO,
        tools=[computer]
    )

    tasks = [
        "Look for a repository named trycua/cua on GitHub.",
        "Check the open issues, open the most recent one and read it.",
@@ -119,17 +119,17 @@ async def github_automation():
        "Commit the changes with a descriptive message.",
        "Create a pull request."
    ]

    for i, task in enumerate(tasks):
        print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
        async for result in agent.run(task):
            print(f"Response: {result.get('text')}")

            # Check if any tools were used
            tools = result.get('tools')
            if tools:
                print(f"Tools used: {tools}")

        print(f"Task {i+1} completed")

# Run the automation
@@ -153,13 +153,13 @@ async def scrape_website(site_name, url):
        name=f"scraper-{site_name}",
        provider_type=VMProviderType.CLOUD,
    )

    agent = ComputerAgent(
        model="openai/gpt-4o",
        save_trajectory=True,
        tools=[computer]
    )

    results = []
    tasks = [
        f"Navigate to {url}",
@@ -167,7 +167,7 @@ async def scrape_website(site_name, url):
        "Take a screenshot of the page",
        "Save the extracted data to a file"
    ]

    for task in tasks:
        async for result in agent.run(task):
            results.append({
@@ -175,7 +175,7 @@ async def scrape_website(site_name, url):
                'task': task,
                'response': result.get('text')
            })

    return results

async def parallel_scraping():
@@ -185,11 +185,11 @@ async def parallel_scraping():
        ("HackerNews", "https://news.ycombinator.com"),
        ("TechCrunch", "https://techcrunch.com")
    ]

    # Run all scraping tasks in parallel
    tasks = [scrape_website(name, url) for name, url in sites]
    results = await asyncio.gather(*tasks)

    # Process results
    for site_results in results:
        print(f"\nResults from {site_results[0]['site']}:")

@@ -1,6 +1,6 @@

# From Lume to Containerization: Our Journey Meets Apple's Vision

_Published on June 10, 2025 by Francesco Bonacci_

Yesterday, Apple announced their new [Containerization framework](https://github.com/apple/containerization) at WWDC. Since then, our Discord and X users have been asking what this means for Cua virtualization capabilities on Apple Silicon. We've been working in this space for months - from [Lume](https://github.com/trycua/cua/tree/main/libs/lume) to [Lumier](https://github.com/trycua/cua/tree/main/libs/lumier) to [Cua Cloud Sandbox](./introducing-cua-cloud-containers). Here's our take on Apple's announcement.

@@ -40,6 +40,7 @@ How Apple's Framework Works:
```

Why is this better?

- **Better security**: Each container is completely separate
- **Better performance**: Each container gets its own resources
- **Real isolation**: If one container has problems, others aren't affected
@@ -71,6 +72,7 @@ While Apple's new framework focuses on containers, we've been building VM manage

[Lume](https://github.com/trycua/cua/tree/main/libs/lume) is our command-line tool for creating and managing VMs on Apple Silicon. We built it because setting up VMs on macOS was too complicated.

What Lume does:

- **Direct control**: Works directly with Apple's Virtualization framework
- **Ready-to-use images**: Start a macOS or Linux VM with one command
- **API server**: Control VMs from other programs (runs on port 7777)

@@ -91,6 +93,7 @@ lume run macos-sequoia-vanilla:latest

[Lumier](https://github.com/trycua/lumier) works differently. It lets you use Docker commands to manage VMs. But here's the key: **Docker is just for packaging, not for isolation**.

What makes Lumier useful:

- **Familiar commands**: If you know Docker, you know Lumier
- **Web access**: Connect to your VM through a browser
- **Save your work**: VMs remember their state
@@ -127,6 +130,7 @@ Docker → Lume → Full VM → Mac Hardware

### When to Use What

**Apple's Containerization**

- ✅ Perfect for: Running containers with maximum security
- ✅ Starts in under a second
- ✅ Uses less memory and CPU
@@ -134,6 +138,7 @@ Docker → Lume → Full VM → Mac Hardware
- ❌ Only for containers, not full VMs

**Lume**

- ✅ Perfect for: Development and testing
- ✅ Full control over macOS/Linux VMs
- ✅ Works on current macOS versions
@@ -141,6 +146,7 @@ Docker → Lume → Full VM → Mac Hardware
- ❌ Uses more resources than containers

**Lumier**

- ✅ Perfect for: Teams already using Docker
- ✅ Easy to share and deploy
- ✅ Access through your browser

@@ -173,4 +179,4 @@ Apple's announcement confirms we're on the right path. Here's what we're looking

---

_Questions about virtualization on Apple Silicon? Come chat with us on Discord!_

@@ -1,6 +1,6 @@

# Sandboxed Python Execution: Run Code Safely in Cua Containers

_Published on June 23, 2025 by Dillon DuPont_

In [Building your own Operator on macOS - Part 2](build-your-own-operator-on-macos-2.md) we touched on Cua's computer-use capabilities – your AI agents can click, scroll, type, and interact with any desktop application. But what if your agent needs to do more than just UI automation? What if it needs to process data, make API calls, analyze images, or run complex logic alongside those UI interactions, within the same virtual environment?

@@ -49,15 +49,19 @@ What's happening here? When you call `greet_and_print()`, Cua extracts the funct

Cua's sandboxed execution system employs several key architectural components:

### 1. Source Code Extraction

Cua uses Python's `inspect.getsource()` to extract your function's source code and reconstruct the function definition in the remote environment.

### 2. Virtual Environment Isolation

Each sandboxed function runs in a named virtual environment within the container. This provides complete dependency isolation between different functions and their respective environments.

### 3. Data Serialization and Transport

Arguments and return values are serialized as JSON and transported between the host and container. This ensures compatibility across different Python versions and execution environments.

### 4. Comprehensive Error Handling

The system captures both successful results and exceptions, preserving stack traces and error information for debugging purposes.

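To see the first and third of those components in isolation, here's a minimal, self-contained sketch of the extract-then-ship pattern (our own illustration, not Cua's internal code):

```python
import inspect
import json

def greet(name):
    return {"message": f"Hello, {name}!"}

# 1. Source code extraction: recover the function body as text,
#    exactly what inspect.getsource() hands back.
source = inspect.getsource(greet)

# 3. Data serialization: arguments travel as JSON, so only
#    JSON-compatible values survive the host/container boundary.
payload = json.dumps({"source": source, "args": ["world"]})

# On the container side, the source would be exec'd and the function
# re-invoked; here we just demonstrate the round-trip locally.
data = json.loads(payload)
namespace = {}
exec(data["source"], namespace)  # rebuilds greet() from its source text
print(namespace["greet"](*data["args"]))  # {'message': 'Hello, world!'}
```
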
## Getting your sandbox ready

@@ -73,10 +77,10 @@ async def main():
    # Fire up the computer
    computer = Computer()
    await computer.run()

    # Make it the default for all sandboxed functions
    set_default_computer(computer)

    # Install some packages in a virtual environment
    await computer.venv_install("demo_venv", ["requests", "beautifulsoup4"])
```
@@ -104,7 +108,7 @@ def automate_browser_with_playwright():
    import time
    import base64
    from datetime import datetime

    try:
        with sync_playwright() as p:
            # Launch browser (visible, because why not?)
@@ -112,68 +116,68 @@ def automate_browser_with_playwright():
                headless=False,
                args=['--no-sandbox', '--disable-dev-shm-usage']
            )

            page = browser.new_page()
            page.set_viewport_size({"width": 1280, "height": 720})

            actions = []
            screenshots = {}

            # Let's visit example.com and poke around
            page.goto("https://example.com")
            actions.append("Navigated to example.com")

            # Grab a screenshot because screenshots are cool
            screenshot_bytes = page.screenshot(full_page=True)
            screenshots["initial"] = base64.b64encode(screenshot_bytes).decode()

            # Get some basic info
            title = page.title()
            actions.append(f"Page title: {title}")

            # Find links and headings
            try:
                links = page.locator("a").all()
                link_texts = [link.text_content() for link in links[:5]]
                actions.append(f"Found {len(links)} links: {link_texts}")

                headings = page.locator("h1, h2, h3").all()
                heading_texts = [h.text_content() for h in headings[:3]]
                actions.append(f"Found headings: {heading_texts}")

            except Exception as e:
                actions.append(f"Element interaction error: {str(e)}")

            # Let's try a form for good measure
            try:
                page.goto("https://httpbin.org/forms/post")
                actions.append("Navigated to form page")

                # Fill out the form
                page.fill('input[name="custname"]', "Test User from Sandboxed Environment")
                page.fill('input[name="custtel"]', "555-0123")
                page.fill('input[name="custemail"]', "test@example.com")
                page.select_option('select[name="size"]', "large")

                actions.append("Filled out form fields")

                # Submit and see what happens
                page.click('input[type="submit"]')
                page.wait_for_load_state("networkidle")

                actions.append("Submitted form")

            except Exception as e:
                actions.append(f"Form interaction error: {str(e)}")

            browser.close()

            return {
                "actions_performed": actions,
                "screenshots": screenshots,
                "success": True
            }

    except Exception as e:
        return {"error": f"Browser automation failed: {str(e)}"}

@@ -196,9 +200,9 @@ def security_audit_tool(code_snippet):
    """Analyze code for potential security issues"""
    import ast
    import re

    issues = []

    # Check for the usual suspects
    dangerous_patterns = [
        (r'eval\s*\(', "Use of eval() function"),
@@ -207,11 +211,11 @@ def security_audit_tool(code_snippet):
        (r'subprocess\.', "Subprocess usage"),
        (r'os\.system\s*\(', "OS system call"),
    ]

    for pattern, description in dangerous_patterns:
        if re.search(pattern, code_snippet):
            issues.append(description)

    # Get fancy with AST analysis
    try:
        tree = ast.parse(code_snippet)
@@ -222,7 +226,7 @@ def security_audit_tool(code_snippet):
                    issues.append(f"Dangerous function call: {node.func.id}")
    except SyntaxError:
        issues.append("Syntax error in code")

    return {
        "security_issues": issues,
        "risk_level": "HIGH" if len(issues) > 2 else "MEDIUM" if issues else "LOW"
@@ -245,34 +249,34 @@ def take_screenshot_and_analyze():
    import base64
    from PIL import ImageGrab
    from datetime import datetime

    try:
        # Grab the screen
        screenshot = ImageGrab.grab()

        # Convert to base64 for easy transport
        buffer = io.BytesIO()
        screenshot.save(buffer, format='PNG')
        screenshot_data = base64.b64encode(buffer.getvalue()).decode()

        # Get some basic info
        screen_info = {
            "size": screenshot.size,
            "mode": screenshot.mode,
            "timestamp": datetime.now().isoformat()
        }

        # Analyze the colors (because why not?)
        colors = screenshot.getcolors(maxcolors=256*256*256)
        dominant_color = max(colors, key=lambda x: x[0])[1] if colors else None

        return {
            "screenshot_base64": screenshot_data,
            "screen_info": screen_info,
            "dominant_color": dominant_color,
            "unique_colors": len(colors) if colors else 0
        }

    except Exception as e:
        return {"error": f"Screenshot failed: {str(e)}"}

@@ -287,6 +291,7 @@ print("Desktop analysis complete!")

## Pro tips for sandboxed success

### Keep it self-contained

Always put your imports inside the function. Trust us on this one:

```python
@@ -294,12 +299,13 @@ Always put your imports inside the function. Trust us on this one:
def good_function():
    import os  # Import inside the function
    import json

    # Your code here
    return {"result": "success"}
```

### Install dependencies first

Don't forget to install packages before using them:

```python
@@ -314,13 +320,14 @@ def data_analysis():
```

### Use descriptive environment names

Future you will thank you:

```python
@sandboxed("data_processing_env")
def process_data(): pass

@sandboxed("web_scraping_env")
def scrape_site(): pass

@sandboxed("ml_training_env")
@@ -328,6 +335,7 @@ def train_model(): pass
```

### Always handle errors gracefully

Things break. Plan for it:

```python
@@ -345,6 +353,7 @@ def robust_function(data):

Let's be honest – there's some overhead here. Code needs to be serialized, sent over the network, and executed remotely. But for most use cases, the benefits far outweigh the costs.

If you're building something performance-critical, consider:

- Batching multiple operations into a single sandboxed function
- Minimizing data transfer between host and container
- Using persistent virtual environments
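
For example, a rough sketch of the batching idea (`fetch_pages` and its environment name are made up for illustration):

```python
@sandboxed("web_scraping_env")
def fetch_pages(urls):
    # One round-trip to the container instead of one per URL:
    # the loop, and all of its network I/O, runs remotely.
    import requests

    return {url: requests.get(url).status_code for url in urls}
```
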
@@ -369,4 +378,4 @@ Happy coding (safely)!

---

_Want to dive deeper? Check out our [sandboxed functions examples](https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py) and [virtual environment tests](https://github.com/trycua/cua/blob/main/tests/venv.py) on GitHub. Questions? Come chat with us on Discord!_

@@ -1,6 +1,6 @@
# Training Computer-Use Models: Creating Human Trajectories with Cua

*Published on May 1, 2025 by Dillon DuPont*
_Published on May 1, 2025 by Dillon DuPont_

In our previous posts, we covered [building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [using the Agent framework](build-your-own-operator-on-macos-2) to simplify development. Today, we'll focus on a critical aspect of improving computer-use agents and models: gathering high-quality demonstration data using Cua's Computer-Use Interface (CUI) and its Gradio UI to create and share human-generated trajectories.

@@ -8,10 +8,10 @@ Why is this important? Underlying models used by Computer-use agents need exampl

<video src="https://github.com/user-attachments/assets/c586d460-3877-4b5f-a736-3248886d2134" controls width="600"></video>

## What You'll Learn

By the end of this tutorial, you'll be able to:

- Set up the Computer-Use Interface (CUI) with Gradio UI support
- Record your own computer interaction trajectories
- Organize and tag your demonstrations
@@ -19,6 +19,7 @@ By the end of this tutorial, you'll be able to:
- Contribute to improving computer-use AI for everyone

**Prerequisites:**

- macOS Sonoma (14.0) or later
- Python 3.10+
- Basic familiarity with Python and terminal commands
@@ -38,6 +39,7 @@ Human trajectories, in the context of Computer-use AI Agents, are recordings of
- Time spent on different elements

These trajectories serve as examples for AI models to learn from, helping them understand the relationship between (one recorded step is sketched after this list):

1. The visual state of the screen
2. The user's goal or task
3. The most appropriate action to take
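
As a mental model, one recorded step ties those three elements together. The field names below are illustrative, not the recorder's actual schema:

```python
# Hypothetical shape of a single trajectory step (illustrative only)
trajectory_step = {
    "observation": {
        "screenshot": "step_0042.png",  # the visual state of the screen
        "resolution": (1024, 768),
    },
    "goal": "Create a note titled 'Groceries'",  # the user's task
    "action": {
        "type": "left_click",  # the action the demonstrator took
        "x": 512,
        "y": 384,
    },
    "timestamp": "2025-05-01T10:15:30Z",
}
```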
@@ -59,17 +61,19 @@ By contributing high-quality demonstrations, you're helping to create more capab
The Computer-Use Interface includes an optional Gradio UI specifically designed to make recording and sharing demonstrations easy. Let's set it up:

1. **Create a Python environment** (optional but recommended):

   ```bash
   # Using conda
   conda create -n cua-trajectories python=3.10
   conda activate cua-trajectories

   # Using venv
   python -m venv cua-trajectories
   source cua-trajectories/bin/activate  # On macOS/Linux
   ```

2. **Install the CUI package with UI support**:

   ```bash
   pip install "cua-computer[ui]"
   ```
@@ -145,6 +149,7 @@ Effective tagging and organization make your demonstrations more valuable to res
### Task-Based Tags

Describe what the demonstration accomplishes:

- `web-browsing`
- `document-editing`
- `file-management`
@@ -154,6 +159,7 @@ Describe what the demonstration accomplishes:
### Application Tags

Identify the applications used:

- `finder`
- `safari`
- `notes`
@@ -163,6 +169,7 @@ Identify the applications used:
### Complexity Tags

Indicate the difficulty level:

- `beginner`
- `intermediate`
- `advanced`
@@ -171,6 +178,7 @@ Indicate the difficulty level:
### UI Element Tags

Highlight specific UI interactions (all four tag categories are combined in a sketch after this list):

- `drag-and-drop`
- `menu-navigation`
- `form-filling`
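
Putting the categories together, the metadata for one demonstration might look like the following. The structure is hypothetical (the Gradio UI manages tags for you), but it shows how the four kinds of tags complement each other:

```python
# Illustrative only: one tag from each category
demonstration_metadata = {
    "name": "create-grocery-note",
    "tags": [
        "document-editing",  # task-based: what it accomplishes
        "notes",             # application: which app is used
        "beginner",          # complexity: difficulty level
        "menu-navigation",   # UI element: interaction involved
    ],
}
```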
@@ -229,11 +237,11 @@ from computer import Computer
computer = Computer(os_type="macos", display="1024x768", memory="8GB", cpu="4")
try:
    await computer.run()

    screenshot = await computer.interface.screenshot()
    with open("screenshot.png", "wb") as f:
        f.write(screenshot)

    await computer.interface.move_cursor(100, 100)
    await computer.interface.left_click()
    await computer.interface.right_click(300, 300)
@@ -280,6 +288,7 @@ You can also learn from existing trajectory datasets contributed by the communit
### Summary

In this guide, we've covered how to:

- Set up the Computer-Use Interface with Gradio UI
- Record high-quality human demonstrations
- Organize and tag your trajectories

@@ -1,6 +1,6 @@
# Trajectory Viewer for Cua

*Published on May 13, 2025 by Dillon DuPont*
_Published on May 13, 2025 by Dillon DuPont_

Don’t forget to check out [Part 1: Building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [Part 2: Using the Agent framework](build-your-own-operator-on-macos-2) for setting up your Cua environment and basic tips and tricks!

@@ -18,7 +18,7 @@ Think of a trajectory as a detailed video recording of your agent’s journey:
- **Observations**: What did the agent see (the exact screen content) at each point in time?
- **Actions**: What clicks, keystrokes, or commands did it perform in response?
- **Decisions**: Which options did it choose, and why?
Especially for longer and more complex tasks, your agent will make multiple steps, take multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong.

## So, what’s Cua’s Trajectory Viewer and why use it?

@@ -1,6 +1,6 @@
# Ubuntu Docker Support in Cua with Kasm

*Published Aug 26, 2025 by Francesco Bonacci*
_Published Aug 26, 2025 by Francesco Bonacci_

Today we’re shipping **Ubuntu Docker support** in Cua. You get a full Linux desktop inside a Docker container, viewable right in your browser—no VM spin-up, no extra clients. It behaves the same on macOS, Windows, and Linux.

@@ -16,17 +16,17 @@ We wanted something lightweight, isolated, and identical across machines. So we

Short answer: **portability, startup time, and ops friction.**

* **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works.
* **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration.
* **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser.
* **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI.
* **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config.
- **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works.
- **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration.
- **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser.
- **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI.
- **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config.

**When we *do* reach for QEMU/KVM:**
**When we _do_ reach for QEMU/KVM:**

* You need **true OS isolation** or to run **non-Linux** guests.
* You want **kernel-level features** or **device/GPU passthrough** (VFIO).
* You’re optimizing for **hardware realism** over startup speed and density.
- You need **true OS isolation** or to run **non-Linux** guests.
- You want **kernel-level features** or **device/GPU passthrough** (VFIO).
- You’re optimizing for **hardware realism** over startup speed and density.

For this release, the goal was a **cross-platform Linux desktop that feels instant and identical** across local dev and CI. Containers + KasmVNC hit that sweet spot.

@@ -174,10 +174,10 @@ await computer.run()

## Links

* **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker)
* **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
* **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
* **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers)
* **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
- **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker)
- **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
- **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
- **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)

Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.

@@ -1,10 +1,10 @@
# Your Windows PC is Already the Perfect Development Environment for Computer-Use Agents

*Published on June 18, 2025 by Dillon DuPont*
_Published on June 18, 2025 by Dillon DuPont_

Over the last few months, our enterprise users kept asking the same type of question: *"When are you adding support for AutoCAD?"* *"What about SAP integration?"* *"Can you automate our MES system?"* - each request was for different enterprise applications we'd never heard of.
Over the last few months, our enterprise users kept asking the same type of question: _"When are you adding support for AutoCAD?"_ _"What about SAP integration?"_ _"Can you automate our MES system?"_ - each request was for different enterprise applications we'd never heard of.

At first, we deflected. We've been building Cua to work across different environments - from [Lume for macOS VMs](./lume-to-containerization) to cloud containers. But these requests kept piling up. AutoCAD automation. SAP integration. Specialized manufacturing systems.

Then it hit us: **they all ran exclusively on Windows**.

@@ -80,6 +80,7 @@ python -m agent.ui
```

**What you get**:

- Visual interface in your browser
- Real-time agent action viewing
- Natural language task instructions
@@ -101,21 +102,21 @@ async def test_windows_agent():
        os_type="windows",
        memory="4GB",
    )

    # Start the VM (~35s)
    await computer.run()

    # Create agent with your preferred model
    agent = ComputerAgent(
        model="openai/computer-use-preview",
        save_trajectory=True,
        tools=[computer]
    )

    # Give it a task
    async for result in agent.run("Open Calculator and compute 15% tip on $47.50"):
        print(f"Agent action: {result}")

    # Shutdown the VM
    await computer.stop()

@@ -123,6 +124,7 @@ asyncio.run(test_windows_agent())
```

**What you get**:

- Full programmatic control
- Custom agent workflows
- Integration with your existing code
@@ -141,6 +143,7 @@ asyncio.run(test_windows_agent())
Let's see how different testing approaches stack up:

### Windows Sandbox + Cua

- **Perfect for**: Quick testing and development
- **Cost**: Free (built into Windows)
- **Setup time**: Under 5 minutes
@@ -149,6 +152,7 @@ Let's see how different testing approaches stack up:
- **Requires**: Windows 10/11 with 4GB+ RAM

### Traditional VMs

- **Perfect for**: Complex testing scenarios
- **Full customization**: Any Windows version
- **Heavy resource usage**: Slow to start/stop
@@ -160,6 +164,7 @@ Let's see how different testing approaches stack up:
Here's what our enterprise users are building with Windows Sandbox:

### CAD and Engineering Automation

```python
# Example: AutoCAD drawing automation
task = """
@@ -172,6 +177,7 @@ task = """
```

### Manufacturing and ERP Integration

```python
# Example: SAP workflow automation
task = """
@@ -184,6 +190,7 @@ task = """
```

### Financial Software Automation

```python
# Example: Trading platform automation
task = """
@@ -196,6 +203,7 @@ task = """
```

### Legacy Windows Application Integration

```python
# Example: Custom Windows application automation
task = """
@@ -210,12 +218,14 @@ task = """
## System Requirements and Performance

### What You Need

- **Windows 10/11**: Any edition that supports Windows Sandbox
- **Memory**: 4GB minimum (8GB recommended for CAD/professional software)
- **CPU**: Virtualization support (enabled by default on modern systems)
- **Storage**: A few GB free space

### Performance Tips

- **Close unnecessary applications** before starting Windows Sandbox
- **Allocate appropriate memory** based on your RPA workflow complexity
- **Use SSD storage** for faster sandbox startup
@@ -234,4 +244,4 @@ But for development, prototyping, and learning Windows RPA workflows, **Windows

---

*Ready to see AI agents control your Windows applications? Come share your testing experiences on Discord!*
_Ready to see AI agents control your Windows applications? Come share your testing experiences on Discord!_

@@ -1,9 +1,3 @@
{
  "pages": [
    "introduction",
    "screenspot-v2",
    "screenspot-pro",
    "interactive",
    "osworld-verified"
  ]
}
  "pages": ["introduction", "screenspot-v2", "screenspot-pro", "interactive", "osworld-verified"]
}

@@ -1,11 +1,5 @@
{
  "title": "Callbacks",
  "description": "Extending agents with callback hooks and built-in handlers",
  "pages": [
    "agent-lifecycle",
    "trajectories",
    "logging",
    "cost-saving",
    "pii-anonymization"
  ]
  "title": "Callbacks",
  "description": "Extending agents with callback hooks and built-in handlers",
  "pages": ["agent-lifecycle", "trajectories", "logging", "cost-saving", "pii-anonymization"]
}

@@ -1,20 +1,20 @@
{
  "title": "Agent SDK",
  "description": "Build computer-using agents with the Agent SDK",
  "pages": [
    "agent-loops",
    "supported-agents",
    "supported-model-providers",
    "chat-history",
    "message-format",
    "customizing-computeragent",
    "callbacks",
    "custom-tools",
    "custom-computer-handlers",
    "prompt-caching",
    "usage-tracking",
    "benchmarks",
    "migration-guide",
    "integrations"
  ]
}

@@ -1,10 +1,5 @@
{
  "title": "Supported Agents",
  "description": "Models and configurations supported by the Agent SDK",
  "pages": [
    "computer-use-agents",
    "grounding-models",
    "composed-agents",
    "human-in-the-loop"
  ]
  "title": "Supported Agents",
  "description": "Models and configurations supported by the Agent SDK",
  "pages": ["computer-use-agents", "grounding-models", "composed-agents", "human-in-the-loop"]
}

@@ -1,11 +1,5 @@
{
  "title": "Computer SDK",
  "description": "Build computer-using agents with the Computer SDK",
  "pages": [
    "computers",
    "cloud-vm-management",
    "commands",
    "computer-ui",
    "sandboxed-python"
  ]
  "title": "Computer SDK",
  "description": "Build computer-using agents with the Computer SDK",
  "pages": ["computers", "cloud-vm-management", "commands", "computer-ui", "sandboxed-python"]
}

@@ -1,7 +1,5 @@
{
  "title": "Example Use Cases",
  "description": "Real-world examples of building with Cua",
  "pages": [
    "form-filling"
  ]
  "title": "Example Use Cases",
  "description": "Real-world examples of building with Cua",
  "pages": ["form-filling"]
}

@@ -17,10 +17,12 @@ Lume follows the XDG Base Directory specification for the configuration file:
- Configuration is stored in `$XDG_CONFIG_HOME/lume/config.yaml` (defaults to `~/.config/lume/config.yaml`)

By default, other data is stored in:

- VM data: `~/.lume`
- Cache files: `~/.lume/cache`

The config file contains settings for:

- VM storage locations and the default location
- Cache directory location
- Whether caching is enabled
@@ -88,6 +90,7 @@ lume delete <name>
### How to Install macOS from an IPSW Image

#### Create a new macOS VM using the latest supported IPSW image:

Run the following command to create a new macOS virtual machine using the latest available IPSW image:

```bash
@@ -95,6 +98,7 @@ lume create <name> --os macos --ipsw latest
```

#### Create a new macOS VM using a specific IPSW image:

To create a macOS virtual machine from an older or specific IPSW file, first download the desired IPSW (UniversalMac) from a trusted source.

Then, use the downloaded IPSW path:

@@ -1,9 +1,3 @@
{
  "pages": [
    "installation",
    "prebuilt-images",
    "cli-reference",
    "http-api",
    "faq"
  ]
  "pages": ["installation", "prebuilt-images", "cli-reference", "http-api", "faq"]
}

@@ -1,8 +1,3 @@
{
  "pages": [
    "installation",
    "docker",
    "docker-compose",
    "building-lumier"
  ]
  "pages": ["installation", "docker", "docker-compose", "building-lumier"]
}

@@ -1,10 +1,10 @@
{
  "pages": [
    "installation",
    "configuration",
    "usage",
    "tools",
    "client-integrations",
    "llm-integrations"
  ]
}

@@ -1,19 +1,19 @@
{
  "title": "Home",
  "description": "Documentation Home",
  "root": true,
  "defaultOpen": true,
  "pages": [
    "index",
    "quickstart-devs",
    "quickstart-cli",
    "telemetry",
    "example-usecases",
    "---[BookCopy]Computer Playbook---",
    "...computer-sdk",
    "---[BookCopy]Agent Playbook---",
    "...agent-sdk",
    "---[CodeXml]API Reference---",
    "...libraries"
  ]
}

@@ -43,4 +43,4 @@
    "sharp"
  ]
}
}
}

@@ -1,9 +1,4 @@
import {
  defineConfig,
  defineDocs,
  frontmatterSchema,
  metaSchema,
} from 'fumadocs-mdx/config';
import { defineConfig, defineDocs, frontmatterSchema, metaSchema } from 'fumadocs-mdx/config';
import { z } from 'zod';

// You can customise Zod schemas for frontmatter and `meta.json` here

@@ -1,18 +1,9 @@
import { getApiVersions, source } from '@/lib/source';
import { getMDXComponents } from '@/mdx-components';
import { buttonVariants } from 'fumadocs-ui/components/ui/button';
import {
  Popover,
  PopoverContent,
  PopoverTrigger,
} from 'fumadocs-ui/components/ui/popover';
import { Popover, PopoverContent, PopoverTrigger } from 'fumadocs-ui/components/ui/popover';
import { createRelativeLink } from 'fumadocs-ui/mdx';
import {
  DocsBody,
  DocsDescription,
  DocsPage,
  DocsTitle,
} from 'fumadocs-ui/page';
import { DocsBody, DocsDescription, DocsPage, DocsTitle } from 'fumadocs-ui/page';
import { cn } from 'fumadocs-ui/utils/cn';
import { ChevronDown, CodeXml, ExternalLink } from 'lucide-react';
import type { Metadata } from 'next';
@@ -20,9 +11,7 @@ import Link from 'next/link';
import { notFound, redirect } from 'next/navigation';
import { PageFeedback } from '@/components/page-feedback';

export default async function Page(props: {
  params: Promise<{ slug?: string[] }>;
}) {
export default async function Page(props: { params: Promise<{ slug?: string[] }> }) {
  const params = await props.params;
  const slug = params.slug || [];
  const page = source.getPage(slug);
@@ -66,7 +55,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 448 512">
viewBox="0 0 448 512"
>
<title>Windows</title>
<path d="M0 93.7l183.6-25.3v177.4H0V93.7zm0 324.6l183.6 25.3V268.4H0v149.9zm203.8 28L448 480V268.4H203.8v177.9zm0-380.6v180.1H448V32L203.8 65.7z" />
</svg>
@@ -76,7 +66,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 384 512">
viewBox="0 0 384 512"
>
<title>macOS</title>
<path d="M318.7 268.7c-.2-36.7 16.4-64.4 50-84.8-18.8-26.9-47.2-41.7-84.7-44.6-35.5-2.8-74.3 20.7-88.5 20.7-15 0-49.4-19.7-76.4-19.7C63.3 141.2 4 184.8 4 273.5q0 39.3 14.4 81.2c12.8 36.7 59 126.7 107.2 125.2 25.2-.6 43-17.9 75.8-17.9 31.8 0 48.3 17.9 76.4 17.9 48.6-.7 90.4-82.5 102.6-119.3-65.2-30.7-61.7-90-61.7-91.9zm-56.6-164.2c27.3-32.4 24.8-61.9 24-72.5-24.1 1.4-52 16.4-67.9 34.9-17.5 19.8-27.8 44.3-25.6 71.9 26.1 2 49.9-11.4 69.5-34.3z" />
</svg>
@@ -86,7 +77,8 @@ export default async function Page(props: {
xmlns="http://www.w3.org/2000/svg"
fill="currentColor"
className="h-5"
viewBox="0 0 448 512">
viewBox="0 0 448 512"
>
<title>Linux</title>
<path d="M220.8 123.3c1 .5 1.8 1.7 3 1.7 1.1 0 2.8-.4 2.9-1.5 .2-1.4-1.9-2.3-3.2-2.9-1.7-.7-3.9-1-5.5-.1-.4 .2-.8 .7-.6 1.1 .3 1.3 2.3 1.1 3.4 1.7zm-21.9 1.7c1.2 0 2-1.2 3-1.7 1.1-.6 3.1-.4 3.5-1.6 .2-.4-.2-.9-.6-1.1-1.6-.9-3.8-.6-5.5 .1-1.3 .6-3.4 1.5-3.2 2.9 .1 1 1.8 1.5 2.8 1.4zM420 403.8c-3.6-4-5.3-11.6-7.2-19.7-1.8-8.1-3.9-16.8-10.5-22.4-1.3-1.1-2.6-2.1-4-2.9-1.3-.8-2.7-1.5-4.1-2 9.2-27.3 5.6-54.5-3.7-79.1-11.4-30.1-31.3-56.4-46.5-74.4-17.1-21.5-33.7-41.9-33.4-72C311.1 85.4 315.7 .1 234.8 0 132.4-.2 158 103.4 156.9 135.2c-1.7 23.4-6.4 41.8-22.5 64.7-18.9 22.5-45.5 58.8-58.1 96.7-6 17.9-8.8 36.1-6.2 53.3-6.5 5.8-11.4 14.7-16.6 20.2-4.2 4.3-10.3 5.9-17 8.3s-14 6-18.5 14.5c-2.1 3.9-2.8 8.1-2.8 12.4 0 3.9 .6 7.9 1.2 11.8 1.2 8.1 2.5 15.7 .8 20.8-5.2 14.4-5.9 24.4-2.2 31.7 3.8 7.3 11.4 10.5 20.1 12.3 17.3 3.6 40.8 2.7 59.3 12.5 19.8 10.4 39.9 14.1 55.9 10.4 11.6-2.6 21.1-9.6 25.9-20.2 12.5-.1 26.3-5.4 48.3-6.6 14.9-1.2 33.6 5.3 55.1 4.1 .6 2.3 1.4 4.6 2.5 6.7v.1c8.3 16.7 23.8 24.3 40.3 23 16.6-1.3 34.1-11 48.3-27.9 13.6-16.4 36-23.2 50.9-32.2 7.4-4.5 13.4-10.1 13.9-18.3 .4-8.2-4.4-17.3-15.5-29.7zM223.7 87.3c9.8-22.2 34.2-21.8 44-.4 6.5 14.2 3.6 30.9-4.3 40.4-1.6-.8-5.9-2.6-12.6-4.9 1.1-1.2 3.1-2.7 3.9-4.6 4.8-11.8-.2-27-9.1-27.3-7.3-.5-13.9 10.8-11.8 23-4.1-2-9.4-3.5-13-4.4-1-6.9-.3-14.6 2.9-21.8zM183 75.8c10.1 0 20.8 14.2 19.1 33.5-3.5 1-7.1 2.5-10.2 4.6 1.2-8.9-3.3-20.1-9.6-19.6-8.4 .7-9.8 21.2-1.8 28.1 1 .8 1.9-.2-5.9 5.5-15.6-14.6-10.5-52.1 8.4-52.1zm-13.6 60.7c6.2-4.6 13.6-10 14.1-10.5 4.7-4.4 13.5-14.2 27.9-14.2 7.1 0 15.6 2.3 25.9 8.9 6.3 4.1 11.3 4.4 22.6 9.3 8.4 3.5 13.7 9.7 10.5 18.2-2.6 7.1-11 14.4-22.7 18.1-11.1 3.6-19.8 16-38.2 14.9-3.9-.2-7-1-9.6-2.1-8-3.5-12.2-10.4-20-15-8.6-4.8-13.2-10.4-14.7-15.3-1.4-4.9 0-9 4.2-12.3zm3.3 334c-2.7 35.1-43.9 34.4-75.3 18-29.9-15.8-68.6-6.5-76.5-21.9-2.4-4.7-2.4-12.7 2.6-26.4v-.2c2.4-7.6 .6-16-.6-23.9-1.2-7.8-1.8-15 .9-20 3.5-6.7 8.5-9.1 14.8-11.3 10.3-3.7 11.8-3.4 19.6-9.9 5.5-5.7 9.5-12.9 14.3-18 5.1-5.5 10-8.1 17.7-6.9 8.1 1.2 15.1 6.8 21.9 16l19.6 35.6c9.5 19.9 43.1 48.4 41 68.9zm-1.4-25.9c-4.1-6.6-9.6-13.6-14.4-19.6 7.1 0 14.2-2.2 16.7-8.9 2.3-6.2 0-14.9-7.4-24.9-13.5-18.2-38.3-32.5-38.3-32.5-13.5-8.4-21.1-18.7-24.6-29.9s-3-23.3-.3-35.2c5.2-22.9 18.6-45.2 27.2-59.2 2.3-1.7 .8 3.2-8.7 20.8-8.5 16.1-24.4 53.3-2.6 82.4 .6-20.7 5.5-41.8 13.8-61.5 12-27.4 37.3-74.9 39.3-112.7 1.1 .8 4.6 3.2 6.2 4.1 4.6 2.7 8.1 6.7 12.6 10.3 12.4 10 28.5 9.2 42.4 1.2 6.2-3.5 11.2-7.5 15.9-9 9.9-3.1 17.8-8.6 22.3-15 7.7 30.4 25.7 74.3 37.2 95.7 6.1 11.4 18.3 35.5 23.6 64.6 3.3-.1 7 .4 10.9 1.4 13.8-35.7-11.7-74.2-23.3-84.9-4.7-4.6-4.9-6.6-2.6-6.5 12.6 11.2 29.2 33.7 35.2 59 2.8 11.6 3.3 23.7 .4 35.7 16.4 6.8 35.9 17.9 30.7 34.8-2.2-.1-3.2 0-4.2 0 3.2-10.1-3.9-17.6-22.8-26.1-19.6-8.6-36-8.6-38.3 12.5-12.1 4.2-18.3 14.7-21.4 27.3-2.8 11.2-3.6 24.7-4.4 39.9-.5 7.7-3.6 18-6.8 29-32.1 22.9-76.7 32.9-114.3 7.2zm257.4-11.5c-.9 16.8-41.2 19.9-63.2 46.5-13.2 15.7-29.4 24.4-43.6 25.5s-26.5-4.8-33.7-19.3c-4.7-11.1-2.4-23.1 1.1-36.3 3.7-14.2 9.2-28.8 9.9-40.6 .8-15.2 1.7-28.5 4.2-38.7 2.6-10.3 6.6-17.2 13.7-21.1 .3-.2 .7-.3 1-.5 .8 13.2 7.3 26.6 18.8 29.5 12.6 3.3 30.7-7.5 38.4-16.3 9-.3 15.7-.9 22.6 5.1 9.9 8.5 7.1 30.3 17.1 41.6 10.6 11.6 14 19.5 13.7 24.6zM173.3 148.7c2 1.9 4.7 4.5 8 7.1 6.6 5.2 15.8 10.6 27.3 10.6 11.6 0 22.5-5.9 31.8-10.8 4.9-2.6 10.9-7 14.8-10.4s5.9-6.3 3.1-6.6-2.6 2.6-6 5.1c-4.4 3.2-9.7 7.4-13.9 9.8-7.4 4.2-19.5 10.2-29.9 10.2s-18.7-4.8-24.9-9.7c-3.1-2.5-5.7-5-7.7-6.9-1.5-1.4-1.9-4.6-4.3-4.9-1.4-.1-1.8 3.7 1.7 6.5z" />
</svg>
@@ -96,10 +88,7 @@ export default async function Page(props: {

<div className="flex flex-row gap-2 items-left">
{pypi && (
<a
target="_blank"
href={`https://pypi.org/project/${pypi}/`}
rel="noreferrer">
<a target="_blank" href={`https://pypi.org/project/${pypi}/`} rel="noreferrer">
<img
src={`https://img.shields.io/pypi/v/${pypi}?color=blue`}
className="h-5"
@@ -108,10 +97,7 @@ export default async function Page(props: {
</a>
)}
{npm && (
<a
target="_blank"
href={`https://www.npmjs.com/package/${npm}`}
rel="noreferrer">
<a target="_blank" href={`https://www.npmjs.com/package/${npm}`} rel="noreferrer">
<img
src={`https://img.shields.io/npm/v/${npm}?color=bf4c4b`}
className="h-5"
@@ -138,7 +124,8 @@ export default async function Page(props: {
target="_blank"
className="inline-flex gap-2 w-fit items-center justify-center rounded-md text-sm font-medium transition-colors duration-100 disabled:pointer-events-none disabled:opacity-50 focus-visible:outline-none hover:bg-fd-accent hover:text-fd-accent-foreground p-1.5 [&_svg]:size-5 text-fd-muted-foreground md:[&_svg]:size-4.5"
aria-label="Source"
data-active="false">
data-active="false"
>
<svg role="img" viewBox="0 0 24 24" fill="currentColor">
<path d="M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12"></path>
</svg>
@@ -162,12 +149,13 @@ export default async function Page(props: {
href={link}
rel="noreferrer noopener"
target="_blank"
className="inline-flex gap-2 w-full items-center rounded-md p-2 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground">
className="inline-flex gap-2 w-full items-center rounded-md p-2 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground"
>
{link.includes('python')
? 'Python'
: link.includes('typescript')
? 'TypeScript'
: `Source ${index + 1}`}
? 'TypeScript'
: `Source ${index + 1}`}
<ExternalLink className="w-4 h-4 ml-auto" />
</a>
))}
@@ -190,10 +178,7 @@ export default async function Page(props: {
};

return (
<DocsPage
toc={page.data.toc}
tableOfContent={{ header: tocHeader() }}
full={page.data.full}>
<DocsPage toc={page.data.toc} tableOfContent={{ header: tocHeader() }} full={page.data.full}>
<div className="flex flex-row w-full items-start">
<div className="flex-1">
<div className="flex flex-row w-full">
@@ -209,15 +194,14 @@
size: 'sm',
className: 'gap-2',
})
)}>
)}
>
{(() => {
// Find the current version label
let currentLabel = 'Current';
if (apiVersionSlug.length > 0) {
const found = versionItems.find(
(item) =>
item.label !== 'Current' &&
apiVersionSlug[0] === item.label
(item) => item.label !== 'Current' && apiVersionSlug[0] === item.label
);
if (found) currentLabel = found.label;
}
@@ -238,10 +222,8 @@
: `/api/${apiSection}/${item.label}`;
// Highlight current version
const isCurrent =
(item.label === 'Current' &&
apiVersionSlug.length === 0) ||
(item.label !== 'Current' &&
apiVersionSlug[0] === item.label);
(item.label === 'Current' && apiVersionSlug.length === 0) ||
(item.label !== 'Current' && apiVersionSlug[0] === item.label);
return (
<Link
key={item.label}
@@ -249,7 +231,8 @@
className={cn(
'px-3 py-1 rounded hover:bg-fd-muted',
isCurrent && 'font-bold bg-fd-muted'
)}>
)}
>
API version: {item.label}
</Link>
);
@@ -259,9 +242,7 @@
)}
</div>
</div>
<DocsDescription className="text-md mt-1">
{page.data.description}
</DocsDescription>
<DocsDescription className="text-md mt-1">{page.data.description}</DocsDescription>
</div>
</div>
<DocsBody>
@@ -290,8 +271,7 @@ export async function generateMetadata(props: {

let title = `${page.data.title} | Cua Docs`;
if (page.url.includes('api')) title = `${page.data.title} | Cua API Docs`;
if (page.url.includes('guide'))
title = ` Guide: ${page.data.title} | Cua Docs`;
if (page.url.includes('guide')) title = ` Guide: ${page.data.title} | Cua Docs`;

return {
title,

@@ -24,9 +24,7 @@ export default function Layout({ children }: { children: ReactNode }) {
<PostHogPageView />
</Suspense>
<AnalyticsTracker />
<RootProvider search={{ options: { api: '/docs/api/search' } }}>
{children}
</RootProvider>
<RootProvider search={{ options: { api: '/docs/api/search' } }}>{children}</RootProvider>
<Footer />
<CookieConsent />
</PHProvider>

@@ -5,10 +5,7 @@ import { notFound } from 'next/navigation';

export const revalidate = false;

export async function GET(
_req: NextRequest,
{ params }: { params: Promise<{ slug?: string[] }> }
) {
export async function GET(_req: NextRequest, { params }: { params: Promise<{ slug?: string[] }> }) {
const { slug } = await params;
const page = source.getPage(slug);
if (!page) notFound();

@@ -55,14 +55,17 @@ export function EditableCodeBlock({
const [values, setValues] = useState<Record<string, string>>(defaultValues);

const updateValue = (key: string, value: string) => {
setValues(prev => ({ ...prev, [key]: value }));
setValues((prev) => ({ ...prev, [key]: value }));
};

return (
<EditableCodeContext.Provider value={{ values, updateValue }}>
<Base.CodeBlock title={title} className={cn('my-4', className)}>
<Base.Pre className={cn(`language-${lang}`, "px-3")}>
<code className={cn(`language-${lang}`)} style={{ display: 'block', whiteSpace: 'pre-wrap' }}>
<Base.Pre className={cn(`language-${lang}`, 'px-3')}>
<code
className={cn(`language-${lang}`)}
style={{ display: 'block', whiteSpace: 'pre-wrap' }}
>
{children}
</code>
</Base.Pre>
@@ -219,9 +222,7 @@ export function EditableValue({
value={value}
onChange={(e) => updateValue(placeholder, e.target.value)}
placeholder={placeholder}
className={cn(
type === 'password' && value && 'text-security-disc'
)}
className={cn(type === 'password' && value && 'text-security-disc')}
style={{
display: 'inline',
width: inputWidth,

@@ -34,7 +34,7 @@ interface IOUProps {
}

/**
 * A React component that visualizes and calculates the Intersection over Union (IOU)
 * of two rectangles on a canvas
 * @param props - The component props
 * @returns The rendered IOU visualization component
@@ -130,12 +130,7 @@ export default function IOU({ title, description, rect1, rect2 }: IOUProps) {
<h3 className="text-sm font-semibold ">{title}</h3>
<div className="flex items-start gap-6">
<div>
<canvas
ref={canvasRef}
width={200}
height={150}
className="border bg-white rounded-md"
/>
<canvas ref={canvasRef} width={200} height={150} className="border bg-white rounded-md" />
<div className="mt-2 text-sm">
<div className="font-mono mb-2">IOU = {actualIOU.toFixed(3)}</div>
<span className="">{description}</span>

@@ -28,10 +28,7 @@ export function Mermaid({ chart }: { chart: string }) {
theme: resolvedTheme === 'dark' ? 'dark' : 'default',
});

const { svg, bindFunctions } = await mermaid.render(
id,
chart.replaceAll('\\n', '\n'),
);
const { svg, bindFunctions } = await mermaid.render(id, chart.replaceAll('\\n', '\n'));

bindFunctions?.(container);
setSvg(svg);
@@ -44,4 +41,4 @@ export function Mermaid({ chart }: { chart: string }) {
}, [chart, id, resolvedTheme]);

return <div ref={containerRef} dangerouslySetInnerHTML={{ __html: svg }} />;
}

@@ -45,7 +45,7 @@ export function PageFeedback() {
<p className="text-sm text-fd-muted-foreground text-left">
{feedback === 'helpful'
? 'Thanks for your feedback!'
: 'Thanks for your feedback. We\'ll work on improving this page.'}
: "Thanks for your feedback. We'll work on improving this page."}
</p>
)}
</div>

@@ -34,9 +34,7 @@ export async function getApiVersions(
...versions.filter((v) => v.label === 'Current'),
...versions
.filter((v) => v.label !== 'Current')
.sort((a, b) =>
b.label.localeCompare(a.label, undefined, { numeric: true })
),
.sort((a, b) => b.label.localeCompare(a.label, undefined, { numeric: true })),
];
}

@@ -2,11 +2,7 @@
  "compilerOptions": {
    "baseUrl": ".",
    "target": "ESNext",
    "lib": [
      "dom",
      "dom.iterable",
      "esnext"
    ],
    "lib": ["dom", "dom.iterable", "esnext"],
    "allowJs": true,
    "skipLibCheck": true,
    "strict": true,
@@ -20,12 +16,8 @@
    "jsx": "preserve",
    "incremental": true,
    "paths": {
      "@/.source": [
        "./.source/index.ts"
      ],
      "@/*": [
        "./src/*"
      ]
      "@/.source": ["./.source/index.ts"],
      "@/*": ["./src/*"]
    },
    "plugins": [
      {
@@ -33,13 +25,6 @@
      }
    ]
  },
  "include": [
    "next-env.d.ts",
    "**/*.ts",
    "**/*.tsx",
    ".next/types/**/*.ts"
  ],
  "exclude": [
    "node_modules"
  ]
}
  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
  "exclude": ["node_modules"]
}

@@ -2,16 +2,15 @@

import asyncio
import logging
import traceback
import signal

from computer import Computer, VMProviderType
import traceback

# Import the unified agent class and types
from agent import ComputerAgent
from computer import Computer, VMProviderType

# Import utility functions
from utils import load_dotenv_files, handle_sigint
from utils import handle_sigint, load_dotenv_files

# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -40,25 +39,20 @@ async def run_agent_example():
    # Create ComputerAgent with new API
    agent = ComputerAgent(
        # Supported models:

        # == OpenAI CUA (computer-use-preview) ==
        model="openai/computer-use-preview",

        # == Anthropic CUA (Claude > 3.5) ==
        # model="anthropic/claude-opus-4-20250514",
        # model="anthropic/claude-sonnet-4-20250514",
        # model="anthropic/claude-3-7-sonnet-20250219",
        # model="anthropic/claude-3-5-sonnet-20241022",

        # == UI-TARS ==
        # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
        # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
        # model="ollama_chat/0000/ui-tars-1.5-7b",

        # == Omniparser + Any LLM ==
        # model="omniparser+anthropic/claude-opus-4-20250514",
        # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",

        tools=[computer],
        only_n_most_recent_images=3,
        verbosity=logging.DEBUG,
@@ -79,18 +73,18 @@ async def run_agent_example():

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Add agent outputs to history
                history += result.get("output", [])

                # Print output for debugging
                for item in result.get("output", []):
                    if item.get("type") == "message":
@@ -104,7 +98,7 @@ async def run_agent_example():
                        print(f"Computer Action: {action_type}({action})")
                    elif item.get("type") == "computer_call_output":
                        print("Computer Output: [Screenshot/Result]")

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")

    except Exception as e:

@@ -1,11 +1,13 @@
import asyncio
import os

from utils import load_dotenv_files

load_dotenv_files()

from computer.providers.cloud.provider import CloudProvider


async def main() -> None:
    api_key = os.getenv("CUA_API_KEY")
    if not api_key:
@@ -13,7 +15,7 @@ async def main() -> None:
    api_base = os.getenv("CUA_API_BASE")
    if api_base:
        print(f"Using API base: {api_base}")

    provider = CloudProvider(api_key=api_key, verbose=True)
    async with provider:

@@ -23,7 +25,7 @@ async def main() -> None:
        for vm in vms:
            print(
                f"name: {vm['name']}\n",
                f"status: {vm['status']}\n",  # pending, running, stopped, terminated, failed
                f"api_url: {vm.get('api_url')}\n",
                f"vnc_url: {vm.get('vnc_url')}\n",
            )
@@ -59,12 +61,13 @@ async def main() -> None:
        # # To probe a VM's status via its public hostname (if you know the name):
        # name = "m-linux-96lcxd2c2k"
        # info = await provider.get_vm(name)
        # print("get_vm info:\n",
        #     f"name: {info['name']}\n",
        #     f"status: {info['status']}\n",  # running
        #     f"api_url: {info.get('api_url')}\n",
        #     f"os_type: {info.get('os_type')}\n",
        # )


if __name__ == "__main__":
    asyncio.run(main())

@@ -19,6 +19,7 @@ This example demonstrates how to control a Cua Cloud Sandbox using the OpenAI `c

2. **Set up environment variables:**
   Create a `.env` file with the following variables:

   - `OPENAI_API_KEY` — your OpenAI API key
   - `CUA_API_KEY` — your Cua Cloud API key
   - `CUA_CONTAINER_NAME` — the name of your provisioned sandbox

@@ -1,63 +1,63 @@
import type { Computer } from "@trycua/computer";
import type OpenAI from "openai";
import type { Computer } from '@trycua/computer';
import type OpenAI from 'openai';

export async function executeAction(
  computer: Computer,
  action: OpenAI.Responses.ResponseComputerToolCall["action"],
  computer: Computer,
  action: OpenAI.Responses.ResponseComputerToolCall['action']
) {
  switch (action.type) {
    case "click": {
      const { x, y, button } = action;
      console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
      await computer.interface.moveCursor(x, y);
      if (button === "right") await computer.interface.rightClick();
      else await computer.interface.leftClick();
      break;
    }
    case "type":
      {
        const { text } = action;
        console.log(`Typing text: ${text}`);
        await computer.interface.typeText(text);
      }
      break;
    case "scroll": {
      const { x: locX, y: locY, scroll_x, scroll_y } = action;
      console.log(
        `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`,
      );
      await computer.interface.moveCursor(locX, locY);
      await computer.interface.scroll(scroll_x, scroll_y);
      break;
    }
    case "keypress": {
      const { keys } = action;
      for (const key of keys) {
        console.log(`Pressing key: ${key}.`);
        // Map common key names to CUA equivalents
        if (key.toLowerCase() === "enter") {
          await computer.interface.pressKey("return");
        } else if (key.toLowerCase() === "space") {
          await computer.interface.pressKey("space");
        } else {
          await computer.interface.pressKey(key);
        }
      }
      break;
    }
    case "wait": {
      console.log(`Waiting for 3 seconds.`);
      await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
      break;
    }
    case "screenshot": {
      console.log("Taking screenshot.");
      // This is handled automatically in the main loop, but we can take an extra one if requested
      const screenshot = await computer.interface.screenshot();
      return screenshot;
    }
    default:
      console.log(`Unrecognized action: ${action.type}`);
      break;
  }
  switch (action.type) {
    case 'click': {
      const { x, y, button } = action;
      console.log(`Executing click at (${x}, ${y}) with button '${button}'.`);
      await computer.interface.moveCursor(x, y);
      if (button === 'right') await computer.interface.rightClick();
      else await computer.interface.leftClick();
      break;
    }
    case 'type':
      {
        const { text } = action;
        console.log(`Typing text: ${text}`);
        await computer.interface.typeText(text);
      }
      break;
    case 'scroll': {
      const { x: locX, y: locY, scroll_x, scroll_y } = action;
      console.log(
        `Scrolling at (${locX}, ${locY}) with offsets (scroll_x=${scroll_x}, scroll_y=${scroll_y}).`
      );
      await computer.interface.moveCursor(locX, locY);
      await computer.interface.scroll(scroll_x, scroll_y);
      break;
    }
    case 'keypress': {
      const { keys } = action;
      for (const key of keys) {
        console.log(`Pressing key: ${key}.`);
        // Map common key names to CUA equivalents
        if (key.toLowerCase() === 'enter') {
          await computer.interface.pressKey('return');
        } else if (key.toLowerCase() === 'space') {
          await computer.interface.pressKey('space');
        } else {
          await computer.interface.pressKey(key);
        }
      }
      break;
    }
    case 'wait': {
      console.log(`Waiting for 3 seconds.`);
      await new Promise((resolve) => setTimeout(resolve, 3 * 1000));
      break;
    }
    case 'screenshot': {
      console.log('Taking screenshot.');
      // This is handled automatically in the main loop, but we can take an extra one if requested
      const screenshot = await computer.interface.screenshot();
      return screenshot;
    }
    default:
      console.log(`Unrecognized action: ${action.type}`);
      break;
  }
}

@@ -1,104 +1,103 @@
import { Computer, OSType } from "@trycua/computer";
import OpenAI from "openai";
import { executeAction } from "./helpers";
import { Computer, OSType } from '@trycua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';

import "dotenv/config";
import 'dotenv/config';

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

const COMPUTER_USE_PROMPT = "Open firefox and go to trycua.com";
const COMPUTER_USE_PROMPT = 'Open firefox and go to trycua.com';

// Initialize the Computer Connection
const computer = new Computer({
  apiKey: process.env.CUA_API_KEY!,
  name: process.env.CUA_CONTAINER_NAME!,
  osType: OSType.LINUX,
});

await computer.run();
// Take the initial screenshot
const screenshot = await computer.interface.screenshot();
const screenshotBase64 = screenshot.toString("base64");
const screenshotBase64 = screenshot.toString('base64');

// Setup openai config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
  model: "computer-use-preview",
  tools: [
    {
      type: "computer_use_preview",
      display_width: 1024,
      display_height: 768,
      environment: "linux", // we're using a linux vm
    },
  ],
  truncation: "auto",
  model: 'computer-use-preview',
  tools: [
    {
      type: 'computer_use_preview',
      display_width: 1024,
      display_height: 768,
      environment: 'linux', // we're using a linux vm
    },
  ],
  truncation: 'auto',
};

// Send initial screenshot to the openai computer use model
let res = await openai.responses.create({
  ...computerUseConfig,
  input: [
    {
      role: "user",
      content: [
        // what we want the ai to do
        { type: "input_text", text: COMPUTER_USE_PROMPT },
        // current screenshot of the vm
        {
          type: "input_image",
          image_url: `data:image/png;base64,${screenshotBase64}`,
          detail: "auto",
        },
      ],
    },
  ],
  ...computerUseConfig,
  input: [
    {
      role: 'user',
      content: [
        // what we want the ai to do
        { type: 'input_text', text: COMPUTER_USE_PROMPT },
        // current screenshot of the vm
        {
          type: 'input_image',
          image_url: `data:image/png;base64,${screenshotBase64}`,
          detail: 'auto',
        },
      ],
    },
  ],
});
// Loop until there are no more computer use actions.
while (true) {
  const computerCalls = res.output.filter((o) => o.type === "computer_call");
  if (computerCalls.length < 1) {
    console.log("No more computer calls. Loop complete.");
    break;
  }
  // Get the first call
  const call = computerCalls[0];
  const action = call.action;
  console.log("Received action from OpenAI Responses API:", action);
  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] =
    [];
  if (call.pending_safety_checks.length > 0) {
    console.log("Safety checks pending:", call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }
  const computerCalls = res.output.filter((o) => o.type === 'computer_call');
  if (computerCalls.length < 1) {
    console.log('No more computer calls. Loop complete.');
    break;
  }
  // Get the first call
  const call = computerCalls[0];
  const action = call.action;
  console.log('Received action from OpenAI Responses API:', action);
  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = [];
  if (call.pending_safety_checks.length > 0) {
    console.log('Safety checks pending:', call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }

  // Execute the action in the container
  await executeAction(computer, action);
  // Wait for changes to process within the container (1sec)
  await new Promise((resolve) => setTimeout(resolve, 1000));

  // Capture new screenshot
  const newScreenshot = await computer.interface.screenshot();
  const newScreenshotBase64 = newScreenshot.toString("base64");
  // Capture new screenshot
  const newScreenshot = await computer.interface.screenshot();
  const newScreenshotBase64 = newScreenshot.toString('base64');

  // Screenshot back as computer_call_output

  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: "computer_call_output",
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: "computer_screenshot",
          image_url: `data:image/png;base64,${newScreenshotBase64}`,
        },
      },
    ],
  });
  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: 'computer_call_output',
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: 'computer_screenshot',
          image_url: `data:image/png;base64,${newScreenshotBase64}`,
        },
      },
    ],
  });
}

process.exit();

@@ -1,17 +1,13 @@
{
  "compilerOptions": {
    "target": "esnext",
    "lib": [
      "es2023"
    ],
    "lib": ["es2023"],
    "moduleDetection": "force",
    "module": "preserve",
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "types": [
      "node"
    ],
    "types": ["node"],
    "allowSyntheticDefaultImports": true,
    "strict": true,
    "noUnusedLocals": true,
@@ -21,9 +17,7 @@
    "isolatedModules": true,
    "verbatimModuleSyntax": true,
    "skipLibCheck": true,
    "outDir": "build",
    "outDir": "build"
  },
  "include": [
    "src"
  ]
}
  "include": ["src"]
}

@@ -1,8 +1,8 @@
import os
import asyncio
from pathlib import Path
import os
import sys
import traceback
from pathlib import Path

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -20,8 +20,9 @@ for path in pythonpath.split(":"):
    print(f"Added to sys.path: {path}")

from computer.computer import Computer
from computer.providers.base import VMProviderType
from computer.logger import LogLevel
from computer.providers.base import VMProviderType


async def main():
    try:
@@ -29,17 +30,15 @@ async def main():

        # Create a local macOS computer
        computer = Computer(
            display="1024x768",
            memory="8GB",
            cpu="4",
            os_type="macos",
            name="macos",
            verbosity=LogLevel.VERBOSE,
            provider_type=VMProviderType.LUME,
            storage="/Users/<USER>/repos/trycua/computer/examples/storage",
            shared_directories=[
                "/Users/<USER>/repos/trycua/computer/examples/shared"
            ],
            shared_directories=["/Users/<USER>/repos/trycua/computer/examples/shared"],
            ephemeral=False,
        )

@@ -50,22 +49,22 @@ async def main():
        # name=os.getenv("CONTAINER_NAME"),
        # provider_type=VMProviderType.CLOUD,
        # )

        try:
            # Run the computer with default parameters
            await computer.run()

            screenshot = await computer.interface.screenshot()

            # Create output directory if it doesn't exist
            output_dir = Path("./output")
            output_dir.mkdir(exist_ok=True)

            screenshot_path = output_dir / "screenshot.png"
            with open(screenshot_path, "wb") as f:
                f.write(screenshot)
            print(f"Screenshot saved to: {screenshot_path.absolute()}")

            # await computer.interface.hotkey("command", "space")

            # res = await computer.interface.run_command("touch ./Downloads/empty_file")

@@ -1,8 +1,8 @@
|
||||
import os
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
# Load environment variables from .env file
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -21,12 +21,13 @@ for path in pythonpath.split(":"):
|
||||
print(f"Added to sys.path: {path}")
|
||||
|
||||
from computer.computer import Computer
|
||||
from computer.providers.base import VMProviderType
|
||||
from computer.logger import LogLevel
|
||||
from computer.providers.base import VMProviderType
|
||||
|
||||
# ANSI color codes
|
||||
RED = '\033[91m'
|
||||
RESET = '\033[0m'
|
||||
RED = "\033[91m"
|
||||
RESET = "\033[0m"
|
||||
|
||||
|
||||
async def main():
|
||||
try:
|
||||
@@ -39,15 +40,15 @@ async def main():
|
||||
name=os.getenv("CONTAINER_NAME") or "",
|
||||
provider_type=VMProviderType.CLOUD,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
# Run the computer with default parameters
|
||||
await computer.run()
|
||||
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
output_dir = Path("./output")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
# Keyboard Actions Examples
|
||||
print("\n=== Keyboard Actions ===")
|
||||
await computer.interface.type_text("Hello, World!")
|
||||
@@ -65,8 +66,10 @@ async def main():
|
||||
|
||||
@sandboxed("demo_venv")
|
||||
def greet_and_print(name):
|
||||
from mss import mss
|
||||
import os
|
||||
|
||||
from mss import mss
|
||||
|
||||
# get username
|
||||
username = os.getlogin()
|
||||
print(f"Hello from inside the container, {name}!")
|
||||
@@ -75,9 +78,9 @@ async def main():
|
||||
|
||||
# take a screenshot
|
||||
with mss() as sct:
|
||||
filename = sct.shot(mon=-1, output='C:/Users/azureuser/Desktop/fullscreen.png')
|
||||
filename = sct.shot(mon=-1, output="C:/Users/azureuser/Desktop/fullscreen.png")
|
||||
print(filename)
|
||||
|
||||
|
||||
return {"greeted": name, "username": username}
|
||||
|
||||
# Call with args and kwargs
|
||||
@@ -94,33 +97,32 @@ async def main():
|
||||
with open(screenshot_path, "wb") as f:
|
||||
f.write(screenshot)
|
||||
print(f"Screenshot saved to: {screenshot_path.absolute()}")
|
||||
|
||||
|
||||
# Clipboard Actions Examples
|
||||
print("\n=== Clipboard Actions ===")
|
||||
await computer.interface.set_clipboard("Test clipboard")
|
||||
content = await computer.interface.copy_to_clipboard()
|
||||
print(f"Clipboard content: {content}")
|
||||
|
||||
|
||||
# Simple REPL Loop
|
||||
print("\n=== Command REPL ===")
|
||||
print("Enter commands to run on the remote computer.")
|
||||
print("Type 'exit' or 'quit' to leave the REPL.\n")
|
||||
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Get command from user
|
||||
command = input("command> ").strip()
|
||||
|
||||
|
||||
# Check for exit commands
|
||||
if command.lower() in ['exit', 'quit', '']:
|
||||
if command.lower() in ['exit', 'quit']:
|
||||
if command.lower() in ["exit", "quit", ""]:
|
||||
if command.lower() in ["exit", "quit"]:
|
||||
print("Exiting REPL...")
|
||||
break
|
||||
|
||||
|
||||
# Run the command
|
||||
result = await computer.interface.run_command(command)
|
||||
|
||||
|
||||
print(result.stdout)
|
||||
if result.stderr:
|
||||
print(f"{RED}{result.stderr}{RESET}")
|
||||
@@ -130,7 +132,6 @@ async def main():
|
||||
except Exception as e:
|
||||
print(f"{RED}Error running command: {e}{RESET}")
|
||||
|
||||
|
||||
finally:
|
||||
# Important to clean up resources
|
||||
# await computer.stop()
|
||||
|
||||
@@ -23,9 +23,9 @@ if __name__ == "__main__":
|
||||
server_name="0.0.0.0",
|
||||
server_port=7860,
|
||||
)
|
||||
|
||||
|
||||
# Optional: Using the saved dataset
|
||||
# import datasets
|
||||
# from computer.ui.utils import convert_to_unsloth
|
||||
# ds = datasets.load_dataset("ddupont/highquality-cua-demonstrations")
|
||||
# ds = convert_to_unsloth(ds)
|
||||
# ds = convert_to_unsloth(ds)
|
||||
|
||||
@@ -1,8 +1,10 @@
import asyncio
from computer.providers.factory import VMProviderFactory
from computer import Computer, VMProviderType
import os

from computer import Computer, VMProviderType
from computer.providers.factory import VMProviderFactory


async def main():
# # Create docker provider
# provider = VMProviderFactory.create_provider(
@@ -39,5 +41,6 @@ async def main():
with open("screenshot_docker.png", "wb") as f:
f.write(screenshot)


if __name__ == "__main__":
asyncio.run(main())

@@ -6,7 +6,7 @@ hud_eval_examples.py — minimal HUD evaluation runner
- No Docker/local computer usage
"""

#imports
# imports
import asyncio
import logging
import os
@@ -14,13 +14,15 @@ import uuid
from pathlib import Path
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
from agent import ComputerAgent
from agent.integrations.hud import run_full_dataset
from dotenv import find_dotenv, load_dotenv

"""
Loading env
"""


def load_env_or_fail() -> None:
# Walk up from CWD / file dir to find nearest .env
env_path = find_dotenv(usecwd=False)
@@ -32,17 +34,19 @@ def load_env_or_fail() -> None:
if not os.getenv("HUD_API_KEY"):
raise EnvironmentError("❌ HUD_API_KEY is missing in the loaded environment")


"""
Build Agent Config
- customize agent behavior, tool integration, callbacks, resource management, and more
- https://docs.trycua.com/docs/agent-sdk/agent-loops#parameters
- https://docs.trycua.com/docs/agent-sdk/supported-model-providers
"""


def build_agent_config() -> dict:

instruction = "You are a computer-using agent graded by deterministic checkers."


return {
"model": "openai/computer-use-preview",
"trajectory_dir": str(Path("trajectories")),
@@ -51,21 +55,25 @@ def build_agent_config() -> dict:
"instruction": instruction,
}


"""
Hud Eval
"""


async def run_hud_eval() -> None:
#load env and agent config
# load env and agent config
load_env_or_fail()
agent_config = build_agent_config()

# Initialize to ensure config is valid (tools, verbosity, etc.)
_ = ComputerAgent(**agent_config)

job_name = f"osworld-test-{str(uuid.uuid4())[:4]}" #job name (each run of your task is a job on hud)
job_name = (
f"osworld-test-{str(uuid.uuid4())[:4]}"  # job name (each run of your task is a job on hud)
)
print(f"🚀 Running HUD eval: {job_name}")


"""
Customize your hud eval below, check the doc for additional params
- https://docs.trycua.com/docs/agent-sdk/integrations/hud#parameters-1

@@ -1,5 +1,6 @@
import asyncio
from pylume import PyLume, ImageRef, VMRunOpts, SharedDirectory, VMConfig, VMUpdateOpts

from pylume import ImageRef, PyLume, SharedDirectory, VMConfig, VMRunOpts, VMUpdateOpts


async def main():

@@ -1,6 +1,6 @@
from pathlib import Path
import os
import sys
from pathlib import Path

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -18,14 +18,16 @@ for path in pythonpath.split(":"):
print(f"Added to sys.path: {path}")

import asyncio

from computer.computer import Computer
from computer.helpers import sandboxed


async def main():
# Initialize the computer in a Cua Container
computer = Computer()
await computer.run()


# Install a package in a virtual environment in the container
await computer.venv_install("demo_venv", ["requests", "macos-pyxa"])

@@ -39,6 +41,7 @@ async def main():
def greet_and_print(name):
# get .html of the current Safari tab
import PyXA

safari = PyXA.Application("Safari")
current_doc = safari.current_document
html = current_doc.source()
@@ -50,5 +53,6 @@ async def main():
result = await greet_and_print("Cua")
print("Result from sandboxed function:", result)


if __name__ == "__main__":
asyncio.run(main())

@@ -9,17 +9,18 @@ This script shows how to:
"""

import argparse
import logging
import sys
from pathlib import Path
import time
from PIL import Image
from typing import Dict, Any, List, Optional
import numpy as np
import io
import base64
import glob
import io
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
from PIL import Image

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
@@ -42,8 +43,8 @@ if str(libs_path) not in sys.path:
sys.path.append(str(libs_path))
print(f"Added to sys.path: {libs_path}")

from som import OmniParser, ParseResult, IconElement, TextElement
from som.models import UIElement, ParserMetadata, BoundingBox
from som import IconElement, OmniParser, ParseResult, TextElement
from som.models import BoundingBox, ParserMetadata, UIElement

# Configure logging
logging.basicConfig(
@@ -361,7 +362,7 @@ def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False):

# Update timing totals
total_time += t.elapsed_time


# Write summary for this combination
avg_time = total_time / len(image_files)
f.write(

@@ -1,8 +1,8 @@
"""Utility functions for example scripts."""

import os
import sys
import signal
import sys
from pathlib import Path
from typing import Optional


@@ -4,11 +4,13 @@ Learn more at: https://learn.microsoft.com/en-us/windows/security/application-se
"""

import asyncio

from computer import Computer


async def main():
"""Test the Windows Sandbox provider."""


# Create a computer instance using Windows Sandbox
computer = Computer(
provider_type="winsandbox",
@@ -16,19 +18,19 @@ async def main():
memory="4GB",
# ephemeral=True, # Always true for Windows Sandbox
)


try:
print("Starting Windows Sandbox...")
await computer.run()


print("Windows Sandbox is ready!")
print(f"IP Address: {await computer.get_ip()}")


# Test basic functionality
print("Testing basic functionality...")
screenshot = await computer.interface.screenshot()
print(f"Screenshot taken: {len(screenshot)} bytes")


# Test running a command
print("Testing command execution...")
result = await computer.interface.run_command("echo Hello from Windows Sandbox!")
@@ -36,16 +38,18 @@ async def main():

print("Press any key to continue...")
input()


except Exception as e:
print(f"Error: {e}")
import traceback

traceback.print_exc()


finally:
print("Stopping Windows Sandbox...")
await computer.stop()
print("Windows Sandbox stopped.")


if __name__ == "__main__":
asyncio.run(main())

@@ -70,14 +70,17 @@ async with provider:
## Container Configuration

### Ports

- **6901**: VNC web interface (noVNC)
- **8080**: Computer-server API endpoint

### Environment Variables

- `VNC_PW`: VNC password (default: "password")
- `DISPLAY`: X11 display (set to ":0")

### Volumes

- `/home/kasm-user/storage`: Persistent storage mount point
- `/home/kasm-user/shared`: Shared folder mount point

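To make the configuration above concrete, here is a minimal sketch of opening the provider; the `"docker"` provider key and the bare `create_provider` call are assumptions inferred from the commented-out factory example earlier in this diff, not a confirmed signature.

```python
# Minimal sketch, not a confirmed API: the "docker" provider key and the
# argument shape below are assumptions inferred from this README.
import asyncio

from computer.providers.factory import VMProviderFactory


async def main():
    provider = VMProviderFactory.create_provider("docker")  # hypothetical argument
    async with provider:
        # The VNC web interface (noVNC) is documented on port 6901,
        # the computer-server API on port 8080.
        print("noVNC: http://localhost:6901")
        print("API:   http://localhost:8080")


if __name__ == "__main__":
    asyncio.run(main())
```
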
@@ -29,6 +29,7 @@ We're always looking for suggestions to make lume better. If you have an idea:
## Documentation

Documentation improvements are always welcome. You can:

- Fix typos or unclear explanations
- Add examples and use cases
- Improve API documentation
@@ -36,4 +37,4 @@ Documentation improvements are always welcome. You can:

For detailed instructions on setting up your development environment and submitting code contributions, please see our [Development.md](docs/Development.md) guide.

Feel free to join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get help with your contributions.
Feel free to join our [Discord community](https://discord.com/invite/mVnXXpdE85) to discuss ideas or get help with your contributions.

@@ -5,6 +5,7 @@ This guide will help you set up your development environment and understand the
## Environment Setup

Lume development requires:

- Swift 6 or higher
- Xcode 15 or higher
- macOS Sequoia 15.2 or higher
@@ -16,12 +17,13 @@ If you're working on Lume in the context of the Cua monorepo, we recommend using
# Open VS Code workspace from the root of the monorepo
code .vscode/lume.code-workspace
```

This workspace is preconfigured with Swift language support, build tasks, and debug configurations.

## Setting Up the Repository Locally

1. **Fork the Repository**: Create your own fork of lume
2. **Clone the Repository**:
2. **Clone the Repository**:
```bash
git clone https://github.com/trycua/lume.git
cd lume

@@ -8,13 +8,13 @@
</picture>
</div>

[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)
[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)

</h1>
</div>


**lume** is a lightweight Command Line Interface and local API server to create, run and manage macOS and Linux virtual machines (VMs) with near-native performance on Apple Silicon, using Apple's `Virtualization.Framework`.

### Run prebuilt macOS images in just 1 step
@@ -43,6 +43,7 @@ All prebuilt images use the default password `lume`. Change this immediately aft
</Callout>

**System Requirements**:

- Apple Silicon Mac (M1, M2, M3, etc.)
- macOS 13.0 or later
- At least 8GB of RAM (16GB recommended)

@@ -8,9 +8,10 @@
</picture>
</div>

[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)
[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)

</h1>
</div>

@@ -21,6 +22,7 @@ macOS and Linux virtual machines in a Docker container.
</div>

## What is Lumier?

**Lumier** is an interface for running macOS virtual machines with minimal setup. It uses Docker as a packaging system to deliver a pre-configured environment that connects to the `lume` virtualization service running on your host machine. With Lumier, you get:

- A ready-to-use macOS or Linux virtual machine in minutes
@@ -35,6 +37,7 @@ Before using Lumier, make sure you have:
1. **Docker for Apple Silicon** - download it [here](https://desktop.docker.com/mac/main/arm64/Docker.dmg) and follow the installation instructions.

2. **Lume** - This is the virtualization CLI that powers Lumier. Install it with this command:

```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
@@ -69,6 +72,7 @@ After running the command above, you can access your macOS VM through a web brow
This project was inspired by [dockur/windows](https://github.com/dockur/windows) and [dockur/macos](https://github.com/dockur/macos), which pioneered the approach of running Windows and macOS VMs in Docker containers.

Main differences with dockur/macos:

- Lumier is specifically designed for macOS virtualization
- Lumier supports Apple Silicon (M1/M2/M3/M4) while dockur/macos only supports Intel
- Lumier uses the Apple Virtualization Framework (Vz) through the `lume` CLI to create true virtual machines, while dockur relies on KVM.

@@ -8,10 +8,11 @@
</picture>
</div>

[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)
[](https://pypi.org/project/cua-computer/)
[](#)
[](#)
[](https://discord.com/invite/mVnXXpdE85)
[](https://pypi.org/project/cua-computer/)

</h1>
</div>

@@ -47,7 +48,7 @@ async def main():
name=os.getenv("CUA_CONTAINER_NAME"),
api_key=os.getenv("CUA_API_KEY")
) as computer:


# Create agent
agent = ComputerAgent(
model="anthropic/claude-3-5-sonnet-20241022",
@@ -56,10 +57,10 @@ async def main():
trajectory_dir="trajectories",
max_trajectory_budget=5.0 # $5 budget limit
)


# Run agent
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]


async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
@@ -84,4 +85,4 @@ if __name__ == "__main__":

## License

MIT License - see LICENSE file for details.
MIT License - see LICENSE file for details.

@@ -5,19 +5,13 @@ agent - Decorator-based Computer Use Agent with liteLLM integration
import logging
import sys

from .decorators import register_agent
from .agent import ComputerAgent
from .types import Messages, AgentResponse

# Import loops to register them
from . import loops
from .agent import ComputerAgent
from .decorators import register_agent
from .types import AgentResponse, Messages

__all__ = [
"register_agent",
"ComputerAgent",
"Messages",
"AgentResponse"
]
__all__ = ["register_agent", "ComputerAgent", "Messages", "AgentResponse"]

__version__ = "0.4.0"


@@ -5,8 +5,9 @@ Usage:
python -m agent.cli <model_string>
"""

import sys
import asyncio
import sys

from .cli import main

if __name__ == "__main__":

@@ -2,27 +2,30 @@ import asyncio
import functools
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Iterator, AsyncIterator, Dict, List, Any, Optional
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse

# Try to import HuggingFace dependencies
try:
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False

from .models import load_model as load_model_handler


class HuggingFaceLocalAdapter(CustomLLM):
"""HuggingFace Local Adapter for running vision-language models locally."""


def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
"""Initialize the adapter.


Args:
device: Device to load model on ("auto", "cuda", "cpu", etc.)
trust_remote_code: Whether to trust remote code
@@ -34,129 +37,120 @@ class HuggingFaceLocalAdapter(CustomLLM):
# Cache for model handlers keyed by model_name
self._handlers: Dict[str, Any] = {}
self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool


def _get_handler(self, model_name: str):
"""Get or create a model handler for the given model name."""
if model_name not in self._handlers:
self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code)
self._handlers[model_name] = load_model_handler(
model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
)
return self._handlers[model_name]


def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert OpenAI format messages to HuggingFace format.


Args:
messages: Messages in OpenAI format


Returns:
Messages in HuggingFace format
"""
converted_messages = []


for message in messages:
converted_message = {
"role": message["role"],
"content": []
}

converted_message = {"role": message["role"], "content": []}

content = message.get("content", [])
if isinstance(content, str):
# Simple text content
converted_message["content"].append({
"type": "text",
"text": content
})
converted_message["content"].append({"type": "text", "text": content})
elif isinstance(content, list):
# Multi-modal content
for item in content:
if item.get("type") == "text":
converted_message["content"].append({
"type": "text",
"text": item.get("text", "")
})
converted_message["content"].append(
{"type": "text", "text": item.get("text", "")}
)
elif item.get("type") == "image_url":
# Convert image_url format to image format
image_url = item.get("image_url", {}).get("url", "")
converted_message["content"].append({
"type": "image",
"image": image_url
})

converted_message["content"].append({"type": "image", "image": image_url})

converted_messages.append(converted_message)


return converted_messages


def _generate(self, **kwargs) -> str:
"""Generate response using the local HuggingFace model.


Args:
**kwargs: Keyword arguments containing messages and model info


Returns:
Generated text response
"""
if not HF_AVAILABLE:
raise ImportError(
"HuggingFace transformers dependencies not found. "
"Please install with: pip install \"cua-agent[uitars-hf]\""
'Please install with: pip install "cua-agent[uitars-hf]"'
)


# Extract messages and model from kwargs
messages = kwargs.get('messages', [])
model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B')
max_new_tokens = kwargs.get('max_tokens', 128)

messages = kwargs.get("messages", [])
model_name = kwargs.get("model", "ByteDance-Seed/UI-TARS-1.5-7B")
max_new_tokens = kwargs.get("max_tokens", 128)

# Warn about ignored kwargs
ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
if ignored_kwargs:
warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")


# Convert messages to HuggingFace format
hf_messages = self._convert_messages(messages)


# Delegate to model handler
handler = self._get_handler(model_name)
generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
return generated_text


def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.


Returns:
ModelResponse with generated text
"""
generated_text = self._generate(**kwargs)


return completion(
model=f"huggingface-local/{kwargs['model']}",
mock_response=generated_text,
)


async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.


Returns:
ModelResponse with generated text
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)


return await acompletion(
model=f"huggingface-local/{kwargs['model']}",
mock_response=generated_text,
)


def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.


Returns:
Iterator of GenericStreamingChunk
"""
generated_text = self._generate(**kwargs)


generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -165,22 +159,21 @@ class HuggingFaceLocalAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}


yield generic_streaming_chunk


async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.


Returns:
AsyncIterator of GenericStreamingChunk
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)


generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -189,5 +182,5 @@ class HuggingFaceLocalAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}

yield generic_streaming_chunk

yield generic_streaming_chunk

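A minimal usage sketch of the adapter above, registered through litellm's custom-provider mechanism; the import path and model name are assumptions, and the `huggingface-local` provider key is inferred from the model prefix used in the adapter's `completion()` method.

```python
# Minimal sketch: route a litellm completion through the local adapter.
import litellm

from agent.adapters.huggingfacelocal_adapter import HuggingFaceLocalAdapter  # path is an assumption

adapter = HuggingFaceLocalAdapter(device="auto")
litellm.custom_provider_map = [
    {"provider": "huggingface-local", "custom_handler": adapter}
]

response = litellm.completion(
    model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
    messages=[{"role": "user", "content": "Describe this screen."}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```
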
@@ -1,22 +1,23 @@
import os
import asyncio
import os
from typing import Any, AsyncIterator, Dict, Iterator, List

import requests
from typing import List, Dict, Any, Iterator, AsyncIterator
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse


class HumanAdapter(CustomLLM):
"""Human Adapter for human-in-the-loop completions.


This adapter sends completion requests to a human completion server
where humans can review and respond to AI requests.
"""


def __init__(self, base_url: str | None = None, timeout: float = 300.0, **kwargs):
"""Initialize the human adapter.


Args:
base_url: Base URL for the human completion server.
Defaults to HUMAN_BASE_URL environment variable or http://localhost:8002
@@ -24,60 +25,58 @@ class HumanAdapter(CustomLLM):
**kwargs: Additional arguments
"""
super().__init__()
self.base_url = base_url or os.getenv('HUMAN_BASE_URL', 'http://localhost:8002')
self.base_url = base_url or os.getenv("HUMAN_BASE_URL", "http://localhost:8002")
self.timeout = timeout


# Ensure base_url doesn't end with slash
self.base_url = self.base_url.rstrip('/')

self.base_url = self.base_url.rstrip("/")

def _queue_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
"""Queue a completion request and return the call ID.


Args:
messages: Messages in OpenAI format
model: Model name


Returns:
Call ID for tracking the request


Raises:
Exception: If queueing fails
"""
try:
response = requests.post(
f"{self.base_url}/queue",
json={"messages": messages, "model": model},
timeout=10
f"{self.base_url}/queue", json={"messages": messages, "model": model}, timeout=10
)
response.raise_for_status()
return response.json()["id"]
except requests.RequestException as e:
raise Exception(f"Failed to queue completion request: {e}")


def _wait_for_completion(self, call_id: str) -> Dict[str, Any]:
"""Wait for human to complete the call.


Args:
call_id: ID of the queued completion call


Returns:
Dict containing response and/or tool_calls


Raises:
TimeoutError: If timeout is exceeded
Exception: If completion fails
"""
import time


start_time = time.time()


while True:
try:
# Check status
status_response = requests.get(f"{self.base_url}/status/{call_id}")
status_response.raise_for_status()
status_data = status_response.json()


if status_data["status"] == "completed":
result = {}
if "response" in status_data and status_data["response"]:
@@ -88,38 +87,41 @@ class HumanAdapter(CustomLLM):
elif status_data["status"] == "failed":
error_msg = status_data.get("error", "Unknown error")
raise Exception(f"Completion failed: {error_msg}")


# Check timeout
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")

raise TimeoutError(
f"Timeout waiting for human response after {self.timeout} seconds"
)

# Wait before checking again
time.sleep(1.0)


except requests.RequestException as e:
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response: {e}")
# Continue trying if we haven't timed out
time.sleep(1.0)


async def _async_wait_for_completion(self, call_id: str) -> Dict[str, Any]:
"""Async version of wait_for_completion.


Args:
call_id: ID of the queued completion call


Returns:
Dict containing response and/or tool_calls


Raises:
TimeoutError: If timeout is exceeded
Exception: If completion fails
"""
import aiohttp
import time


import aiohttp

start_time = time.time()


async with aiohttp.ClientSession() as session:
while True:
try:
@@ -127,7 +129,7 @@ class HumanAdapter(CustomLLM):
async with session.get(f"{self.base_url}/status/{call_id}") as response:
response.raise_for_status()
status_data = await response.json()


if status_data["status"] == "completed":
result = {}
if "response" in status_data and status_data["response"]:
@@ -138,166 +140,158 @@ class HumanAdapter(CustomLLM):
elif status_data["status"] == "failed":
error_msg = status_data.get("error", "Unknown error")
raise Exception(f"Completion failed: {error_msg}")


# Check timeout
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response after {self.timeout} seconds")

raise TimeoutError(
f"Timeout waiting for human response after {self.timeout} seconds"
)

# Wait before checking again
await asyncio.sleep(1.0)


except Exception as e:
if time.time() - start_time > self.timeout:
raise TimeoutError(f"Timeout waiting for human response: {e}")
# Continue trying if we haven't timed out
await asyncio.sleep(1.0)


def _generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:
"""Generate a human response for the given messages.


Args:
messages: Messages in OpenAI format
model: Model name


Returns:
Dict containing response and/or tool_calls
"""
# Queue the completion request
call_id = self._queue_completion(messages, model)


# Wait for human response
response = self._wait_for_completion(call_id)


return response

async def _async_generate_response(self, messages: List[Dict[str, Any]], model: str) -> Dict[str, Any]:

async def _async_generate_response(
self, messages: List[Dict[str, Any]], model: str
) -> Dict[str, Any]:
"""Async version of _generate_response.


Args:
messages: Messages in OpenAI format
model: Model name


Returns:
Dict containing response and/or tool_calls
"""
# Queue the completion request (sync operation)
call_id = self._queue_completion(messages, model)


# Wait for human response (async)
response = await self._async_wait_for_completion(call_id)


return response


def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.


Returns:
ModelResponse with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')

messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")

# Generate human response
human_response_data = self._generate_response(messages, model)


# Create ModelResponse with proper structure
from litellm.types.utils import ModelResponse, Choices, Message
import uuid
import time

import uuid

from litellm.types.utils import Choices, Message, ModelResponse

# Create message content based on response type
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Tool calls response
message = Message(
role="assistant",
content=human_response_data.get("response", ""),
tool_calls=human_response_data["tool_calls"]
tool_calls=human_response_data["tool_calls"],
)
else:
# Text response
message = Message(
role="assistant",
content=human_response_data.get("response", "")
)

choice = Choices(
finish_reason="stop",
index=0,
message=message
)

message = Message(role="assistant", content=human_response_data.get("response", ""))

choice = Choices(finish_reason="stop", index=0, message=message)

result = ModelResponse(
id=f"human-{uuid.uuid4()}",
choices=[choice],
created=int(time.time()),
model=f"human/{model}",
object="chat.completion"
object="chat.completion",
)


return result


async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.


Returns:
ModelResponse with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')

messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")

# Generate human response
human_response_data = await self._async_generate_response(messages, model)


# Create ModelResponse with proper structure
from litellm.types.utils import ModelResponse, Choices, Message
import uuid
import time

import uuid

from litellm.types.utils import Choices, Message, ModelResponse

# Create message content based on response type
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Tool calls response
message = Message(
role="assistant",
content=human_response_data.get("response", ""),
tool_calls=human_response_data["tool_calls"]
tool_calls=human_response_data["tool_calls"],
)
else:
# Text response
message = Message(
role="assistant",
content=human_response_data.get("response", "")
)

choice = Choices(
finish_reason="stop",
index=0,
message=message
)

message = Message(role="assistant", content=human_response_data.get("response", ""))

choice = Choices(finish_reason="stop", index=0, message=message)

result = ModelResponse(
id=f"human-{uuid.uuid4()}",
choices=[choice],
created=int(time.time()),
model=f"human/{model}",
object="chat.completion"
object="chat.completion",
)


return result


def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.


Yields:
Streaming chunks with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')

messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")

# Generate human response
human_response_data = self._generate_response(messages, model)


import time


# Handle tool calls vs text response
if "tool_calls" in human_response_data and human_response_data["tool_calls"]:
# Stream tool calls as a single chunk
@@ -319,22 +313,26 @@ class HumanAdapter(CustomLLM):
"is_finished": True,
"text": response_text,
"tool_use": None,
"usage": {"completion_tokens": len(response_text.split()), "prompt_tokens": 0, "total_tokens": len(response_text.split())},
"usage": {
"completion_tokens": len(response_text.split()),
"prompt_tokens": 0,
"total_tokens": len(response_text.split()),
},
}
yield generic_chunk


async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.


Yields:
Streaming chunks with human-generated text or tool calls
"""
messages = kwargs.get('messages', [])
model = kwargs.get('model', 'human')

messages = kwargs.get("messages", [])
model = kwargs.get("model", "human")

# Generate human response
human_response = await self._async_generate_response(messages, model)


# Return as single streaming chunk
generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
@@ -342,7 +340,11 @@ class HumanAdapter(CustomLLM):
"is_finished": True,
"text": human_response,
"tool_use": None,
"usage": {"completion_tokens": len(human_response.split()), "prompt_tokens": 0, "total_tokens": len(human_response.split())},
"usage": {
"completion_tokens": len(human_response.split()),
"prompt_tokens": 0,
"total_tokens": len(human_response.split()),
},
}

yield generic_streaming_chunk

yield generic_streaming_chunk

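For orientation, a minimal sketch of driving the human-in-the-loop adapter directly; the server URL and timeout are the documented defaults, and the import path is an assumption.

```python
# Minimal sketch: the adapter queues the request at POST {base_url}/queue,
# then polls GET {base_url}/status/{id} until a human responds or the
# timeout elapses, as in the methods above.
from agent.adapters.human_adapter import HumanAdapter  # path is an assumption

adapter = HumanAdapter(base_url="http://localhost:8002", timeout=300.0)

result = adapter.completion(
    model="human",
    messages=[{"role": "user", "content": "Approve or edit this draft reply."}],
)
print(result.choices[0].message.content)
```
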
@@ -1,24 +1,26 @@
import asyncio
import functools
import warnings
import io
import base64
import functools
import io
import math
import re
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Iterator, AsyncIterator, Dict, List, Any, Optional, Tuple, cast
from PIL import Image
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple, cast

from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm import completion, acompletion
from litellm.types.utils import GenericStreamingChunk, ModelResponse
from PIL import Image

# Try to import MLX dependencies
try:
import mlx.core as mx
from mlx_vlm import load, generate
from mlx_vlm import generate, load
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
from transformers.tokenization_utils import PreTrainedTokenizer

MLX_AVAILABLE = True
except ImportError:
MLX_AVAILABLE = False
@@ -29,20 +31,28 @@ MIN_PIXELS = 100 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200


def round_by_factor(number: float, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor


def ceil_by_factor(number: float, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor


def floor_by_factor(number: float, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor


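The three rounding helpers snap a dimension to a multiple of `factor`; a quick illustration with the default image factor of 28:

```python
# Illustration only: a 1080-pixel dimension snapped to multiples of 28.
factor = 28
print(round_by_factor(1080, factor))  # 1092 (round(1080/28) = 39 -> 39*28)
print(ceil_by_factor(1080, factor))   # 1092 (ceil(1080/28) = 39)
print(floor_by_factor(1080, factor))  # 1064 (floor(1080/28) = 38 -> 38*28)
```
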
def smart_resize(
height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
height: int,
width: int,
factor: int = IMAGE_FACTOR,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
"""
Rescales the image so that the following conditions are met:
@@ -70,61 +80,62 @@ def smart_resize(

class MLXVLMAdapter(CustomLLM):
"""MLX VLM Adapter for running vision-language models locally using MLX."""


def __init__(self, **kwargs):
"""Initialize the adapter.


Args:
**kwargs: Additional arguments
"""
super().__init__()


self.models = {} # Cache for loaded models
self.processors = {} # Cache for loaded processors
self.configs = {} # Cache for loaded configs
self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool


def _load_model_and_processor(self, model_name: str):
"""Load model and processor if not already cached.


Args:
model_name: Name of the model to load


Returns:
Tuple of (model, processor, config)
"""
if not MLX_AVAILABLE:
raise ImportError("MLX VLM dependencies not available. Please install mlx-vlm.")


if model_name not in self.models:
# Load model and processor
model_obj, processor = load(
model_name,
processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
model_name, processor_kwargs={"min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS}
)
config = load_config(model_name)


# Cache them
self.models[model_name] = model_obj
self.processors[model_name] = processor
self.configs[model_name] = config


return self.models[model_name], self.processors[model_name], self.configs[model_name]

def _process_coordinates(self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]) -> str:

def _process_coordinates(
self, text: str, original_size: Tuple[int, int], model_size: Tuple[int, int]
) -> str:
"""Process coordinates in box tokens based on image resizing using smart_resize approach.


Args:
text: Text containing box tokens
original_size: Original image size (width, height)
model_size: Model processed image size (width, height)


Returns:
Text with processed coordinates
"""
# Find all box tokens
box_pattern = r"<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>"


def process_coords(match):
model_x, model_y = int(match.group(1)), int(match.group(2))
# Scale coordinates from model space to original image space
@@ -132,15 +143,20 @@ class MLXVLMAdapter(CustomLLM):
new_x = int(model_x * original_size[0] / model_size[0]) # Width
new_y = int(model_y * original_size[1] / model_size[1]) # Height
return f"<|box_start|>({new_x},{new_y})<|box_end|>"


return re.sub(box_pattern, process_coords, text)

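Concretely, `_process_coordinates` linearly rescales each `<|box_start|>(x,y)<|box_end|>` token between coordinate spaces; a worked example of the formula:

```python
# Illustration of the scaling above: a point emitted at (256, 256) in a
# 512x512 model space maps into a 1024x768 original image.
original_size = (1024, 768)  # (width, height)
model_size = (512, 512)
model_x, model_y = 256, 256
new_x = int(model_x * original_size[0] / model_size[0])  # 512
new_y = int(model_y * original_size[1] / model_size[1])  # 384
print(f"<|box_start|>({new_x},{new_y})<|box_end|>")  # <|box_start|>(512,384)<|box_end|>
```
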
def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Image.Image], Dict[int, Tuple[int, int]], Dict[int, Tuple[int, int]]]:

def _convert_messages(self, messages: List[Dict[str, Any]]) -> Tuple[
List[Dict[str, Any]],
List[Image.Image],
Dict[int, Tuple[int, int]],
Dict[int, Tuple[int, int]],
]:
"""Convert OpenAI format messages to MLX VLM format and extract images.


Args:
messages: Messages in OpenAI format


Returns:
Tuple of (processed_messages, images, original_sizes, model_sizes)
"""
@@ -149,13 +165,10 @@ class MLXVLMAdapter(CustomLLM):
original_sizes = {} # Track original sizes of images for coordinate mapping
model_sizes = {} # Track model processed sizes
image_index = 0


for message in messages:
processed_message = {
"role": message["role"],
"content": []
}

processed_message = {"role": message["role"], "content": []}

content = message.get("content", [])
if isinstance(content, str):
# Simple text content
@@ -165,164 +178,163 @@ class MLXVLMAdapter(CustomLLM):
processed_content = []
for item in content:
if item.get("type") == "text":
processed_content.append({
"type": "text",
"text": item.get("text", "")
})
processed_content.append({"type": "text", "text": item.get("text", "")})
elif item.get("type") == "image_url":
image_url = item.get("image_url", {}).get("url", "")
pil_image = None


if image_url.startswith("data:image/"):
# Extract base64 data
base64_data = image_url.split(',')[1]
base64_data = image_url.split(",")[1]
# Convert base64 to PIL Image
image_data = base64.b64decode(base64_data)
pil_image = Image.open(io.BytesIO(image_data))
else:
# Handle file path or URL
pil_image = Image.open(image_url)


# Store original image size for coordinate mapping
original_size = pil_image.size
original_sizes[image_index] = original_size


# Use smart_resize to determine model size
# Note: smart_resize expects (height, width) but PIL gives (width, height)
height, width = original_size[1], original_size[0]
new_height, new_width = smart_resize(height, width)
# Store model size in (width, height) format for consistent coordinate processing
model_sizes[image_index] = (new_width, new_height)


# Resize the image using the calculated dimensions from smart_resize
resized_image = pil_image.resize((new_width, new_height))
images.append(resized_image)


# Add image placeholder to content
processed_content.append({
"type": "image"
})

processed_content.append({"type": "image"})

image_index += 1


processed_message["content"] = processed_content


processed_messages.append(processed_message)


return processed_messages, images, original_sizes, model_sizes


def _generate(self, **kwargs) -> str:
"""Generate response using the local MLX VLM model.


Args:
**kwargs: Keyword arguments containing messages and model info


Returns:
Generated text response
"""
messages = kwargs.get('messages', [])
model_name = kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')
max_tokens = kwargs.get('max_tokens', 128)

messages = kwargs.get("messages", [])
model_name = kwargs.get("model", "mlx-community/UI-TARS-1.5-7B-4bit")
max_tokens = kwargs.get("max_tokens", 128)

# Warn about ignored kwargs
ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'}
ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
if ignored_kwargs:
warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")


# Load model and processor
model, processor, config = self._load_model_and_processor(model_name)


# Convert messages and extract images
processed_messages, images, original_sizes, model_sizes = self._convert_messages(messages)


# Process user text input with box coordinates after image processing
# Swap original_size and model_size arguments for inverse transformation
for msg_idx, msg in enumerate(processed_messages):
if msg.get("role") == "user" and isinstance(msg.get("content"), str):
content = msg.get("content", "")
if "<|box_start|>" in content and original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
if (
"<|box_start|>" in content
and original_sizes
and model_sizes
and 0 in original_sizes
and 0 in model_sizes
):
orig_size = original_sizes[0]
model_size = model_sizes[0]
# Swap arguments to perform inverse transformation for user input
processed_messages[msg_idx]["content"] = self._process_coordinates(content, model_size, orig_size)

processed_messages[msg_idx]["content"] = self._process_coordinates(
content, model_size, orig_size
)

try:
# Format prompt according to model requirements using the processor directly
prompt = processor.apply_chat_template(
processed_messages,
tokenize=False,
add_generation_prompt=True,
return_tensors='pt'
processed_messages, tokenize=False, add_generation_prompt=True, return_tensors="pt"
)
tokenizer = cast(PreTrainedTokenizer, processor)


# Generate response
text_content, usage = generate(
model,
tokenizer,
str(prompt),
images, # type: ignore
model,
tokenizer,
str(prompt),
images, # type: ignore
verbose=False,
max_tokens=max_tokens
max_tokens=max_tokens,
)


except Exception as e:
raise RuntimeError(f"Error generating response: {str(e)}") from e


# Process coordinates in the response back to original image space
if original_sizes and model_sizes and 0 in original_sizes and 0 in model_sizes:
# Get original image size and model size (using the first image)
orig_size = original_sizes[0]
model_size = model_sizes[0]


# Check if output contains box tokens that need processing
if "<|box_start|>" in text_content:
# Process coordinates from model space back to original image space
text_content = self._process_coordinates(text_content, orig_size, model_size)


return text_content


def completion(self, *args, **kwargs) -> ModelResponse:
"""Synchronous completion method.


Returns:
ModelResponse with generated text
"""
generated_text = self._generate(**kwargs)


result = completion(
model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
mock_response=generated_text,
)
return cast(ModelResponse, result)


async def acompletion(self, *args, **kwargs) -> ModelResponse:
"""Asynchronous completion method.


Returns:
ModelResponse with generated text
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)


result = await acompletion(
model=f"mlx/{kwargs.get('model', 'mlx-community/UI-TARS-1.5-7B-4bit')}",
mock_response=generated_text,
)
return cast(ModelResponse, result)


def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
"""Synchronous streaming method.


Returns:
Iterator of GenericStreamingChunk
"""
generated_text = self._generate(**kwargs)


generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -331,22 +343,21 @@ class MLXVLMAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}


yield generic_streaming_chunk


async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
"""Asynchronous streaming method.


Returns:
AsyncIterator of GenericStreamingChunk
"""
# Run _generate in thread pool to avoid blocking
loop = asyncio.get_event_loop()
generated_text = await loop.run_in_executor(
self._executor,
functools.partial(self._generate, **kwargs)
self._executor, functools.partial(self._generate, **kwargs)
)


generic_streaming_chunk: GenericStreamingChunk = {
"finish_reason": "stop",
"index": 0,
@@ -355,5 +366,5 @@ class MLXVLMAdapter(CustomLLM):
"tool_use": None,
"usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
}

yield generic_streaming_chunk

yield generic_streaming_chunk

@@ -2,32 +2,40 @@ from typing import Optional

try:
    from transformers import AutoConfig

    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False

from .generic import GenericHFModel
from .internvl import InternVLModel
from .opencua import OpenCUAModel
from .qwen2_5_vl import Qwen2_5_VLModel
from .internvl import InternVLModel


def load_model(model_name: str, device: str = "auto", trust_remote_code: bool = False):
    """Factory function to load and return the right model handler instance.

    - If the underlying transformers config class matches OpenCUA, return OpenCUAModel
    - Otherwise, return GenericHFModel
    """
    if not HF_AVAILABLE:
        raise ImportError(
            "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
            'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
        )
    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    cls = cfg.__class__.__name__
    print(f"cls: {cls}")
    if "OpenCUA" in cls:
        return OpenCUAModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
        return OpenCUAModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    elif "Qwen2_5_VL" in cls:
        return Qwen2_5_VLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
        return Qwen2_5_VLModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    elif "InternVL" in cls:
        return InternVLModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
        return InternVLModel(
            model_name=model_name, device=device, trust_remote_code=trust_remote_code
        )
    return GenericHFModel(model_name=model_name, device=device, trust_remote_code=trust_remote_code)
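`load_model` dispatches purely on the name of the `transformers` config class returned by `AutoConfig.from_pretrained`, falling back to the generic handler. A self-contained sketch of that dispatch idea (the handler classes here are hypothetical stand-ins):

```python
# Minimal sketch of config-name dispatch, assuming stand-in handler classes.
class GenericHandler: ...
class OpenCUAHandler: ...
class Qwen25VLHandler: ...


def pick_handler(config_class_name: str):
    # Substring checks mirror load_model's branching on cfg.__class__.__name__.
    if "OpenCUA" in config_class_name:
        return OpenCUAHandler
    if "Qwen2_5_VL" in config_class_name:
        return Qwen25VLHandler
    return GenericHandler


assert pick_handler("OpenCUAConfig") is OpenCUAHandler
assert pick_handler("Qwen2_5_VLConfig") is Qwen25VLHandler
assert pick_handler("LlavaConfig") is GenericHandler
```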
@@ -1,9 +1,10 @@
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional

# Hugging Face imports are local to avoid hard dependency at module import
try:
    import torch  # type: ignore
    from transformers import AutoModel, AutoProcessor  # type: ignore

    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class GenericHFModel:

    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
    """

    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        if not HF_AVAILABLE:
            raise ImportError(
                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
            )
        self.model_name = model_name
        self.device = device
@@ -64,7 +67,7 @@ class GenericHFModel:
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Trim prompt tokens from output
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        # Decode
        output_text = self.processor.batch_decode(
@@ -1,19 +1,22 @@
from __future__ import annotations
from typing import List, Dict, Any, Optional

from typing import Any, Dict, List, Optional

# Hugging Face imports are local to avoid hard dependency at module import
try:
    import torch  # type: ignore
    from transformers import AutoModel, AutoTokenizer  # type: ignore
    # Attempt to import InternVL's model dependencies
    import einops as _  # type: ignore
    import timm as _  # type: ignore
    from PIL import Image  # type: ignore
    import torchvision.transforms as T  # type: ignore
    from torchvision.transforms.functional import InterpolationMode  # type: ignore
    import base64  # type: ignore
    from io import BytesIO  # type: ignore

    # Attempt to import InternVL's model dependencies
    import einops as _  # type: ignore
    import requests  # type: ignore
    import timm as _  # type: ignore
    import torch  # type: ignore
    import torchvision.transforms as T  # type: ignore
    from PIL import Image  # type: ignore
    from torchvision.transforms.functional import InterpolationMode  # type: ignore
    from transformers import AutoModel, AutoTokenizer  # type: ignore

    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False
@@ -25,10 +28,12 @@ class InternVLModel:

    Provides preprocessing to support multi-turn conversations with multiple images.
    """

    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        if not HF_AVAILABLE:
            raise ImportError(
                "InternVL dependencies not found. Install with: pip install \"cua-agent[internvl-hf]\""
                'InternVL dependencies not found. Install with: pip install "cua-agent[internvl-hf]"'
            )
        self.model_name = model_name
        self.device = device
@@ -60,16 +65,25 @@ class InternVLModel:

    def _build_transform(self, input_size: int) -> T.Compose:
        MEAN, STD = self.IMAGENET_MEAN, self.IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        transform = T.Compose(
            [
                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
                T.ToTensor(),
                T.Normalize(mean=MEAN, std=STD),
            ]
        )
        return transform

    def _find_closest_aspect_ratio(self, aspect_ratio: float, target_ratios: List[tuple], width: int, height: int, image_size: int):
        best_ratio_diff = float('inf')
    def _find_closest_aspect_ratio(
        self,
        aspect_ratio: float,
        target_ratios: List[tuple],
        width: int,
        height: int,
        image_size: int,
    ):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
@@ -83,17 +97,29 @@ class InternVLModel:
                best_ratio = ratio
        return best_ratio

    def _dynamic_preprocess(self, image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448, use_thumbnail: bool = True) -> List[Image.Image]:
    def _dynamic_preprocess(
        self,
        image: Image.Image,
        min_num: int = 1,
        max_num: int = 12,
        image_size: int = 448,
        use_thumbnail: bool = True,
    ) -> List[Image.Image]:
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if i * j <= max_num and i * j >= min_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self._find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
@@ -106,7 +132,7 @@ class InternVLModel:
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
                ((i // (target_width // image_size)) + 1) * image_size,
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
@@ -122,20 +148,24 @@ class InternVLModel:
            # data URL base64
            header, b64data = src.split(",", 1)
            img_bytes = base64.b64decode(b64data)
            return Image.open(BytesIO(img_bytes)).convert('RGB')
            return Image.open(BytesIO(img_bytes)).convert("RGB")
        if src.startswith("http://") or src.startswith("https://"):
            resp = requests.get(src, timeout=10)
            resp.raise_for_status()
            return Image.open(BytesIO(resp.content)).convert('RGB')
            return Image.open(BytesIO(resp.content)).convert("RGB")
        # Assume local file path
        return Image.open(src).convert('RGB')
        return Image.open(src).convert("RGB")

    def _images_to_pixel_values(self, images: List[Image.Image], input_size: int = 448, max_num: int = 12):
    def _images_to_pixel_values(
        self, images: List[Image.Image], input_size: int = 448, max_num: int = 12
    ):
        transform = self._build_transform(input_size=input_size)
        pixel_values_list = []
        num_patches_list: List[int] = []
        for img in images:
            tiles = self._dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
            tiles = self._dynamic_preprocess(
                img, image_size=input_size, use_thumbnail=True, max_num=max_num
            )
            pv = [transform(tile) for tile in tiles]
            pv = torch.stack(pv)
            num_patches_list.append(pv.shape[0])
@@ -191,7 +221,9 @@ class InternVLModel:
                last_user_text_parts = parts_text or last_user_text_parts
            elif role == "assistant":
                # Only keep text content for history
                parts_text = [item.get("text", "") for item in content_items if item.get("type") == "text"]
                parts_text = [
                    item.get("text", "") for item in content_items if item.get("type") == "text"
                ]
                text = "\n".join(parts_text).strip()
                if text:
                    context_lines.append(f"Assistant: {text}")
@@ -200,7 +232,9 @@ class InternVLModel:
        pixel_values = None
        num_patches_list: List[int] = []
        if all_images:
            pixel_values, num_patches_list = self._images_to_pixel_values(all_images, input_size=448, max_num=12)
            pixel_values, num_patches_list = self._images_to_pixel_values(
                all_images, input_size=448, max_num=12
            )
        if pixel_values is not None:
            # Convert dtype/device as in docs
            pixel_values = pixel_values.to(torch.bfloat16)
@@ -246,7 +280,9 @@ class InternVLModel:
                    num_patches_list=num_patches_list,
                )
            else:
                response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
                response = self.model.chat(
                    self.tokenizer, pixel_values, question, generation_config
                )
        except Exception as e:
            # Fallback: return empty string to avoid crashing the adapter
            return ""
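`_dynamic_preprocess` tiles an arbitrary screenshot into a grid of fixed-size patches whose grid shape best matches the image's aspect ratio. A standalone sketch of just that grid-selection math (simplified: it omits the area tie-break that `_find_closest_aspect_ratio` also applies):

```python
# Enumerate (cols, rows) grids whose tile count stays within [min_num, max_num],
# then pick the grid whose aspect ratio is closest to the input image's.
def closest_grid(width: int, height: int, min_num: int = 1, max_num: int = 12) -> tuple:
    aspect = width / height
    candidates = sorted(
        {
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num
        },
        key=lambda r: r[0] * r[1],
    )
    return min(candidates, key=lambda r: abs(aspect - r[0] / r[1]))


# A 2:1 landscape screenshot maps to a wide grid such as (2, 1).
print(closest_grid(1920, 960))
```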
@@ -1,13 +1,18 @@
from typing import List, Dict, Any
import re
import base64
import re
from io import BytesIO
from typing import Any, Dict, List

try:
    import blobfile as _  # assert blobfile is installed
    import torch  # type: ignore
    from transformers import AutoTokenizer, AutoModel, AutoImageProcessor  # type: ignore
    from PIL import Image  # type: ignore
    import blobfile as _  # assert blobfile is installed
    from transformers import (  # type: ignore
        AutoImageProcessor,
        AutoModel,
        AutoTokenizer,
    )

    OPENCUA_AVAILABLE = True
except Exception:
    OPENCUA_AVAILABLE = False
@@ -16,10 +21,12 @@ except Exception:
class OpenCUAModel:
    """OpenCUA model handler using AutoTokenizer, AutoModel and AutoImageProcessor."""

    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        if not OPENCUA_AVAILABLE:
            raise ImportError(
                "OpenCUA requirements not found. Install with: pip install \"cua-agent[opencua-hf]\""
                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
            )
        self.model_name = model_name
        self.device = device
@@ -56,7 +63,11 @@ class OpenCUAModel:
        return ""

    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
        assert self.model is not None and self.tokenizer is not None and self.image_processor is not None
        assert (
            self.model is not None
            and self.tokenizer is not None
            and self.image_processor is not None
        )

        # Tokenize text side using chat template
        input_ids = self.tokenizer.apply_chat_template(
@@ -74,7 +85,11 @@ class OpenCUAModel:
        pixel_values = torch.tensor(image_info["pixel_values"]).to(
            dtype=torch.bfloat16, device=self.model.device
        )
        grid_thws = torch.tensor(image_info["image_grid_thw"]) if "image_grid_thw" in image_info else None
        grid_thws = (
            torch.tensor(image_info["image_grid_thw"])
            if "image_grid_thw" in image_info
            else None
        )

        gen_kwargs: Dict[str, Any] = {
            "max_new_tokens": max_new_tokens,
@@ -1,9 +1,10 @@
from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional

# Hugging Face imports are local to avoid hard dependency at module import
try:
    import torch  # type: ignore
    from transformers import AutoModelForImageTextToText, AutoProcessor  # type: ignore

    HF_AVAILABLE = True
except Exception:
    HF_AVAILABLE = False
@@ -14,10 +15,12 @@ class Qwen2_5_VLModel:

    Loads an AutoModelForImageTextToText and AutoProcessor and generates text.
    """

    def __init__(self, model_name: str, device: str = "auto", trust_remote_code: bool = False) -> None:
    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        if not HF_AVAILABLE:
            raise ImportError(
                "HuggingFace transformers dependencies not found. Install with: pip install \"cua-agent[uitars-hf]\""
                'HuggingFace transformers dependencies not found. Install with: pip install "cua-agent[uitars-hf]"'
            )
        self.model_name = model_name
        self.device = device
@@ -64,7 +67,7 @@ class Qwen2_5_VLModel:
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Trim prompt tokens from output
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        # Decode
        output_text = self.processor.batch_decode(
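The `generated_ids_trimmed` hunk in both `GenericHFModel` and `Qwen2_5_VLModel` relies on `generate()` returning prompt plus completion ids, so new tokens are recovered by slicing off the prompt length. A tiny sketch with made-up token ids:

```python
# generate() returns prompt + completion token ids, so the new tokens are
# recovered by slicing each output row at the corresponding prompt length.
input_ids = [[101, 7592, 2088, 102]]               # stand-in prompt ids
generated_ids = [[101, 7592, 2088, 102, 42, 43]]   # prompt + 2 new tokens

trimmed = [out[len(inp):] for inp, out in zip(input_ids, generated_ids)]
assert trimmed == [[42, 43]]
```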
@@ -3,76 +3,83 @@ ComputerAgent - Main agent class that selects and runs agent loops
"""

import asyncio
from pathlib import Path
from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple

from litellm.responses.utils import Usage

from .types import (
    Messages,
    AgentCapability,
    ToolError,
    IllegalArgumentError
)
from .responses import make_tool_error_item, replace_failed_computer_calls_with_function_calls
from .decorators import find_agent_config
import inspect
import json
from pathlib import Path
from typing import (
    Any,
    AsyncGenerator,
    Callable,
    Dict,
    List,
    Optional,
    Set,
    Tuple,
    Union,
    cast,
)

import litellm
import litellm.utils
import inspect
from litellm.responses.utils import Usage

from .adapters import (
    HuggingFaceLocalAdapter,
    HumanAdapter,
    MLXVLMAdapter,
)
from .callbacks import (
    ImageRetentionCallback,
    LoggingCallback,
    TrajectorySaverCallback,
    BudgetManagerCallback,
    TelemetryCallback,
    ImageRetentionCallback,
    LoggingCallback,
    OperatorNormalizerCallback,
    PromptInstructionsCallback,
    TelemetryCallback,
    TrajectorySaverCallback,
)
from .computers import (
    AsyncComputerHandler,
    is_agent_computer,
    make_computer_handler
from .computers import AsyncComputerHandler, is_agent_computer, make_computer_handler
from .decorators import find_agent_config
from .responses import (
    make_tool_error_item,
    replace_failed_computer_calls_with_function_calls,
)
from .types import AgentCapability, IllegalArgumentError, Messages, ToolError


def assert_callable_with(f, *args, **kwargs):
    """Check if function can be called with given arguments."""
    try:
        inspect.signature(f).bind(*args, **kwargs)
        return True
    except TypeError as e:
        sig = inspect.signature(f)
        raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
    """Check if function can be called with given arguments."""
    try:
        inspect.signature(f).bind(*args, **kwargs)
        return True
    except TypeError as e:
        sig = inspect.signature(f)
        raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
def get_json(obj: Any, max_depth: int = 10) -> Any:
    def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
        if seen is None:
            seen = set()

        # Use model_dump() if available
        if hasattr(o, 'model_dump'):
        if hasattr(o, "model_dump"):
            return o.model_dump()

        # Check depth limit
        if depth > max_depth:
            return f"<max_depth_exceeded:{max_depth}>"

        # Check for circular references using object id
        obj_id = id(o)
        if obj_id in seen:
            return f"<circular_reference:{type(o).__name__}>"

        # Handle Computer objects
        if hasattr(o, '__class__') and 'computer' in getattr(o, '__class__').__name__.lower():
        if hasattr(o, "__class__") and "computer" in o.__class__.__name__.lower():
            return f"<computer:{o.__class__.__name__}>"

        # Handle objects with __dict__
        if hasattr(o, '__dict__'):
        if hasattr(o, "__dict__"):
            seen.add(obj_id)
            try:
                result = {}
@@ -84,7 +91,7 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
                return result
            finally:
                seen.discard(obj_id)

        # Handle common types that might contain nested objects
        elif isinstance(o, dict):
            seen.add(obj_id)
@@ -96,7 +103,7 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
                }
            finally:
                seen.discard(obj_id)

        elif isinstance(o, (list, tuple, set)):
            seen.add(obj_id)
            try:
@@ -107,32 +114,33 @@ def get_json(obj: Any, max_depth: int = 10) -> Any:
                ]
            finally:
                seen.discard(obj_id)

        # For basic types that json.dumps can handle
        elif isinstance(o, (str, int, float, bool)) or o is None:
            return o

        # Fallback to string representation
        else:
            return str(o)

    def remove_nones(obj: Any) -> Any:
        if isinstance(obj, dict):
            return {k: remove_nones(v) for k, v in obj.items() if v is not None}
        elif isinstance(obj, list):
            return [remove_nones(item) for item in obj if item is not None]
        return obj

    # Serialize with circular reference and depth protection
    serialized = custom_serializer(obj)

    # Convert to JSON string and back to ensure JSON compatibility
    json_str = json.dumps(serialized)
    parsed = json.loads(json_str)

    # Final cleanup of any remaining None values
    return remove_nones(parsed)


def sanitize_message(msg: Any) -> Any:
    """Return a copy of the message with image_url omitted for computer_call_output messages."""
    if msg.get("type") == "computer_call_output":
@@ -143,19 +151,24 @@ def sanitize_message(msg: Any) -> Any:
        return sanitized
    return msg


def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
    call_ids = []
    for message in messages:
        if message.get("type") == "computer_call_output" or message.get("type") == "function_call_output":
        if (
            message.get("type") == "computer_call_output"
            or message.get("type") == "function_call_output"
        ):
            call_ids.append(message.get("call_id"))
    return call_ids
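`get_json` guards against self-referencing structures by tracking visited object ids; without that, serializing an agent kwargs dict that contains itself would recurse forever. A compact standalone analogue (dict-only and illustrative, not the repo implementation):

```python
from typing import Any, Optional, Set


def safe_json(o: Any, seen: Optional[Set[int]] = None) -> Any:
    # Track visited ids so self-referencing containers serialize to a
    # marker string instead of recursing without bound.
    seen = seen or set()
    if isinstance(o, dict):
        if id(o) in seen:
            return f"<circular_reference:{type(o).__name__}>"
        seen.add(id(o))
        try:
            return {k: safe_json(v, seen) for k, v in o.items()}
        finally:
            seen.discard(id(o))
    return o


d = {"name": "run-1"}
d["self"] = d  # introduce a cycle
assert safe_json(d) == {"name": "run-1", "self": "<circular_reference:dict>"}
```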
class ComputerAgent:
    """
    Main agent class that automatically selects the appropriate agent loop
    based on the model and executes tool calls.
    """

    def __init__(
        self,
        model: str,
@@ -172,11 +185,11 @@ class ComputerAgent:
        max_trajectory_budget: Optional[float | dict] = None,
        telemetry_enabled: Optional[bool] = True,
        trust_remote_code: Optional[bool] = False,
        **kwargs
        **kwargs,
    ):
        """
        Initialize ComputerAgent.

        Args:
            model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            tools: List of tools (computer objects, decorated functions, etc.)
@@ -193,11 +206,11 @@ class ComputerAgent:
            telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
            **kwargs: Additional arguments passed to the agent loop
        """
        """
        # If the loop is "human/human", we need to prefix a grounding model fallback
        if model in ["human/human", "human"]:
            model = "openai/computer-use-preview+human/human"

        self.model = model
        self.tools = tools or []
        self.custom_loop = custom_loop
@@ -236,34 +249,33 @@ class ComputerAgent:
        # Add image retention callback if only_n_most_recent_images is set
        if self.only_n_most_recent_images:
            self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))

        # Add trajectory saver callback if trajectory_dir is set
        if self.trajectory_dir:
            if isinstance(self.trajectory_dir, dict):
                self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
            elif isinstance(self.trajectory_dir, (str, Path)):
                self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))

        # Add budget manager if max_trajectory_budget is set
        if max_trajectory_budget:
            if isinstance(max_trajectory_budget, dict):
                self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
            else:
                self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))

        # == Enable local model providers w/ LiteLLM ==

        # Register local model providers
        hf_adapter = HuggingFaceLocalAdapter(
            device="auto",
            trust_remote_code=self.trust_remote_code or False
            device="auto", trust_remote_code=self.trust_remote_code or False
        )
        human_adapter = HumanAdapter()
        mlx_adapter = MLXVLMAdapter()
        litellm.custom_provider_map = [
            {"provider": "huggingface-local", "custom_handler": hf_adapter},
            {"provider": "human", "custom_handler": human_adapter},
            {"provider": "mlx", "custom_handler": mlx_adapter}
            {"provider": "mlx", "custom_handler": mlx_adapter},
        ]
        litellm.suppress_debug_info = True
@@ -280,16 +292,16 @@ class ComputerAgent:
        # Instantiate the agent config class
        self.agent_loop = config_info.agent_class()
        self.agent_config_info = config_info

        self.tool_schemas = []
        self.computer_handler = None

    async def _initialize_computers(self):
        """Initialize computer objects"""
        if not self.tool_schemas:
            # Process tools and create tool schemas
            self.tool_schemas = self._process_tools()

            # Find computer tool and create interface adapter
            computer_handler = None
            for schema in self.tool_schemas:
@@ -297,7 +309,7 @@ class ComputerAgent:
                    computer_handler = await make_computer_handler(schema["computer"])
                    break
            self.computer_handler = computer_handler

    def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
        """Process input messages and create schemas for the agent loop"""
        if isinstance(input, str):
@@ -307,69 +319,73 @@ class ComputerAgent:
    def _process_tools(self) -> List[Dict[str, Any]]:
        """Process tools and create schemas for the agent loop"""
        schemas = []

        for tool in self.tools:
            # Check if it's a computer object (has interface attribute)
            if is_agent_computer(tool):
                # This is a computer tool - will be handled by agent loop
                schemas.append({
                    "type": "computer",
                    "computer": tool
                })
                schemas.append({"type": "computer", "computer": tool})
            elif callable(tool):
                # Use litellm.utils.function_to_dict to extract schema from docstring
                try:
                    function_schema = litellm.utils.function_to_dict(tool)
                    schemas.append({
                        "type": "function",
                        "function": function_schema
                    })
                    schemas.append({"type": "function", "function": function_schema})
                except Exception as e:
                    print(f"Warning: Could not process tool {tool}: {e}")
            else:
                print(f"Warning: Unknown tool type: {tool}")

        return schemas

    def _get_tool(self, name: str) -> Optional[Callable]:
        """Get a tool by name"""
        for tool in self.tools:
            if hasattr(tool, '__name__') and tool.__name__ == name:
            if hasattr(tool, "__name__") and tool.__name__ == name:
                return tool
            elif hasattr(tool, 'func') and tool.func.__name__ == name:
            elif hasattr(tool, "func") and tool.func.__name__ == name:
                return tool
        return None
    # ============================================================================
    # AGENT RUN LOOP LIFECYCLE HOOKS
    # ============================================================================

    async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Initialize run tracking by calling callbacks."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_start'):
            if hasattr(callback, "on_run_start"):
                await callback.on_run_start(kwargs, old_items)

    async def _on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:

    async def _on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Finalize run tracking by calling callbacks."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_end'):
            if hasattr(callback, "on_run_end"):
                await callback.on_run_end(kwargs, old_items, new_items)

    async def _on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:

    async def _on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        """Check if run should continue by calling callbacks."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_continue'):
            if hasattr(callback, "on_run_continue"):
                should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
                if not should_continue:
                    return False
        return True

    async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare messages for the LLM call by applying callbacks."""
        result = messages
        for callback in self.callbacks:
            if hasattr(callback, 'on_llm_start'):
            if hasattr(callback, "on_llm_start"):
                result = await callback.on_llm_start(result)
        return result

@@ -377,82 +393,91 @@ class ComputerAgent:
        """Postprocess messages after the LLM call by applying callbacks."""
        result = messages
        for callback in self.callbacks:
            if hasattr(callback, 'on_llm_end'):
            if hasattr(callback, "on_llm_end"):
                result = await callback.on_llm_end(result)
        return result

    async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Called when responses are received."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_responses'):
            if hasattr(callback, "on_responses"):
                await callback.on_responses(get_json(kwargs), get_json(responses))

    async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """Called when a computer call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_computer_call_start'):
            if hasattr(callback, "on_computer_call_start"):
                await callback.on_computer_call_start(get_json(item))

    async def _on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:

    async def _on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """Called when a computer call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_computer_call_end'):
            if hasattr(callback, "on_computer_call_end"):
                await callback.on_computer_call_end(get_json(item), get_json(result))

    async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
        """Called when a function call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_function_call_start'):
            if hasattr(callback, "on_function_call_start"):
                await callback.on_function_call_start(get_json(item))

    async def _on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:

    async def _on_function_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """Called when a function call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_function_call_end'):
            if hasattr(callback, "on_function_call_end"):
                await callback.on_function_call_end(get_json(item), get_json(result))

    async def _on_text(self, item: Dict[str, Any]) -> None:
        """Called when a text message is encountered."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_text'):
            if hasattr(callback, "on_text"):
                await callback.on_text(get_json(item))

    async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """Called when an LLM API call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_api_start'):
            if hasattr(callback, "on_api_start"):
                await callback.on_api_start(get_json(kwargs))

    async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Called when an LLM API call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_api_end'):
            if hasattr(callback, "on_api_end"):
                await callback.on_api_end(get_json(kwargs), get_json(result))

    async def _on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_usage'):
            if hasattr(callback, "on_usage"):
                await callback.on_usage(get_json(usage))

    async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """Called when a screenshot is taken."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_screenshot'):
            if hasattr(callback, "on_screenshot"):
                await callback.on_screenshot(screenshot, name)
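All of the `_on_*` hooks above share one dispatch shape: iterate the registered callbacks and invoke a hook only on callbacks that define it, so handlers stay optional. A runnable sketch of that fan-out (the callback classes here are hypothetical):

```python
import asyncio


class PrintUsage:
    async def on_usage(self, usage: dict) -> None:
        print("cost:", usage.get("response_cost", 0.0))


class NoHooks:
    pass


async def fire(callbacks: list, hook: str, *args) -> None:
    # Invoke the hook only on callbacks that actually define it.
    for cb in callbacks:
        if hasattr(cb, hook):
            await getattr(cb, hook)(*args)


asyncio.run(fire([PrintUsage(), NoHooks()], "on_usage", {"response_cost": 0.01}))
```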
    # ============================================================================
    # AGENT OUTPUT PROCESSING
    # ============================================================================

    async def _handle_item(self, item: Any, computer: Optional[AsyncComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:

    async def _handle_item(
        self,
        item: Any,
        computer: Optional[AsyncComputerHandler] = None,
        ignore_call_ids: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """Handle each item; may cause a computer action + screenshot."""
        call_id = item.get("call_id")
        if ignore_call_ids and call_id and call_id in ignore_call_ids:
            return []

        item_type = item.get("type", None)

        if item_type == "message":
            await self._on_text(item)
            # # Print messages
@@ -461,7 +486,7 @@ class ComputerAgent:
            #     if content_item.get("text"):
            #         print(content_item.get("text"))
            return []

        try:
            if item_type == "computer_call":
                await self._on_computer_call_start(item)
@@ -472,14 +497,16 @@ class ComputerAgent:
                action = item.get("action")
                action_type = action.get("type")
                if action_type is None:
                    print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
                    print(
                        f"Action type cannot be `None`: action={action}, action_type={action_type}"
                    )
                    return []

                # Extract action arguments (all fields except 'type')
                action_args = {k: v for k, v in action.items() if k != "type"}

                # print(f"{action_type}({action_args})")

                # Execute the computer action
                computer_method = getattr(computer, action_type, None)
                if computer_method:
@@ -487,13 +514,13 @@ class ComputerAgent:
                    await computer_method(**action_args)
                else:
                    raise ToolError(f"Unknown computer action: {action_type}")

                # Take screenshot after action
                if self.screenshot_delay and self.screenshot_delay > 0:
                    await asyncio.sleep(self.screenshot_delay)
                screenshot_base64 = await computer.screenshot()
                await self._on_screenshot(screenshot_base64, "screenshot_after")

                # Handle safety checks
                pending_checks = item.get("pending_safety_checks", [])
                acknowledged_checks = []
@@ -505,7 +532,7 @@ class ComputerAgent:
                #         acknowledged_checks.append(check)
                #     else:
                #         raise ValueError(f"Safety check failed: {check_message}")

                # Create call output
                call_output = {
                    "type": "computer_call_output",
@@ -516,25 +543,25 @@ class ComputerAgent:
                        "image_url": f"data:image/png;base64,{screenshot_base64}",
                    },
                }

                # # Additional URL safety checks for browser environments
                # if await computer.get_environment() == "browser":
                #     current_url = await computer.get_current_url()
                #     call_output["output"]["current_url"] = current_url
                #     # TODO: implement a callback for URL safety checks
                #     # check_blocklisted_url(current_url)

                result = [call_output]
                await self._on_computer_call_end(item, result)
                return result

            if item_type == "function_call":
                await self._on_function_call_start(item)
                # Perform function call
                function = self._get_tool(item.get("name"))
                if not function:
                    raise ToolError(f"Function {item.get('name')} not found")

                args = json.loads(item.get("arguments"))

                # Validate arguments before execution
@@ -545,14 +572,14 @@ class ComputerAgent:
                    result = await function(**args)
                else:
                    result = await asyncio.to_thread(function, **args)

                # Create function call output
                call_output = {
                    "type": "function_call_output",
                    "call_id": item.get("call_id"),
                    "output": str(result),
                }

                result = [call_output]
                await self._on_function_call_end(item, result)
                return result
@@ -564,36 +591,35 @@ class ComputerAgent:
    # ============================================================================
    # MAIN AGENT LOOP
    # ============================================================================

    async def run(
        self,
        messages: Messages,
        stream: bool = False,
        **kwargs
        self, messages: Messages, stream: bool = False, **kwargs
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """
        Run the agent with the given messages using Computer protocol handler pattern.

        Args:
            messages: List of message dictionaries
            stream: Whether to stream the response
            **kwargs: Additional arguments

        Returns:
            AsyncGenerator that yields response chunks
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")

        capabilities = self.get_capabilities()
        if "step" not in capabilities:
            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")
            raise ValueError(
                f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions"
            )

        await self._initialize_computers()

        # Merge kwargs
        merged_kwargs = {**self.kwargs, **kwargs}

        old_items = self._process_input(messages)
        new_items = []

@@ -603,7 +629,7 @@ class ComputerAgent:
            "stream": stream,
            "model": self.model,
            "agent_loop": self.agent_config_info.agent_class.__name__,
            **merged_kwargs
            **merged_kwargs,
        }
        await self._on_run_start(run_kwargs, old_items)

@@ -620,7 +646,7 @@ class ComputerAgent:
            combined_messages = old_items + new_items
            combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
            preprocessed_messages = await self._on_llm_start(combined_messages)

            loop_kwargs = {
                "messages": preprocessed_messages,
                "model": self.model,
@@ -629,7 +655,7 @@ class ComputerAgent:
                "computer_handler": self.computer_handler,
                "max_retries": self.max_retries,
                "use_prompt_caching": self.use_prompt_caching,
                **merged_kwargs
                **merged_kwargs,
            }

            # Run agent loop iteration
@@ -641,13 +667,13 @@ class ComputerAgent:
                _on_screenshot=self._on_screenshot,
            )
            result = get_json(result)

            # Lifecycle hook: Postprocess messages after the LLM call
            # Use cases:
            # - PII deanonymization (if you want tool calls to see PII)
            result["output"] = await self._on_llm_end(result.get("output", []))
            await self._on_responses(loop_kwargs, result)

            # Yield agent response
            yield result

@@ -659,7 +685,9 @@ class ComputerAgent:

            # Handle computer actions
            for item in result.get("output"):
                partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
                partial_items = await self._handle_item(
                    item, self.computer_handler, ignore_call_ids=output_call_ids
                )
                new_items += partial_items

                # Yield partial response
@@ -669,54 +697,52 @@ class ComputerAgent:
                        prompt_tokens=0,
                        completion_tokens=0,
                        total_tokens=0,
                    )
                ),
            }

        await self._on_run_end(loop_kwargs, old_items, new_items)

    async def predict_click(
        self,
        instruction: str,
        image_b64: Optional[str] = None
        self, instruction: str, image_b64: Optional[str] = None
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.

        Args:
            instruction: Instruction for where to click
            image_b64: Base64 encoded image (optional, will take screenshot if not provided)

        Returns:
            None or tuple with (x, y) coordinates
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")

        capabilities = self.get_capabilities()
        if "click" not in capabilities:
            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
        if hasattr(self.agent_loop, 'predict_click'):
            raise ValueError(
                f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions"
            )
        if hasattr(self.agent_loop, "predict_click"):
            if not image_b64:
                if not self.computer_handler:
                    raise ValueError("Computer tool or image_b64 is required for predict_click")
                image_b64 = await self.computer_handler.screenshot()
            return await self.agent_loop.predict_click(
                model=self.model,
                image_b64=image_b64,
                instruction=instruction
                model=self.model, image_b64=image_b64, instruction=instruction
            )
        return None

    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by the current agent config.

        Returns:
            List of capability strings (e.g., ["step", "click"])
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")

        if hasattr(self.agent_loop, 'get_capabilities'):

        if hasattr(self.agent_loop, "get_capabilities"):
            return self.agent_loop.get_capabilities()
        return ["step"]  # Default capability
        return ["step"]  # Default capability
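Since `run()` is an async generator that yields response chunks with an `output` list, a caller iterates it with `async for`. A hedged usage sketch (the model string, empty tool list, and `from agent import ...` path are assumptions and may differ by version):

```python
import asyncio

from agent import ComputerAgent  # assumed import path


async def main() -> None:
    agent = ComputerAgent(model="openai/computer-use-preview", tools=[])
    # Each yielded chunk carries an "output" list of message/call items.
    async for chunk in agent.run("Take a screenshot and describe the desktop"):
        for item in chunk.get("output", []):
            print(item.get("type"))


asyncio.run(main())
```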
@@ -3,17 +3,17 @@ Callback system for ComputerAgent preprocessing and postprocessing hooks.
"""

from .base import AsyncCallbackHandler
from .budget_manager import BudgetManagerCallback
from .image_retention import ImageRetentionCallback
from .logging import LoggingCallback
from .trajectory_saver import TrajectorySaverCallback
from .budget_manager import BudgetManagerCallback
from .telemetry import TelemetryCallback
from .operator_validator import OperatorNormalizerCallback
from .prompt_instructions import PromptInstructionsCallback
from .telemetry import TelemetryCallback
from .trajectory_saver import TrajectorySaverCallback

__all__ = [
    "AsyncCallbackHandler",
    "ImageRetentionCallback",
    "ImageRetentionCallback",
    "LoggingCallback",
    "TrajectorySaverCallback",
    "BudgetManagerCallback",
@@ -3,7 +3,7 @@ Base callback handler interface for ComputerAgent preprocessing and postprocessi
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union


class AsyncCallbackHandler(ABC):
@@ -16,42 +16,52 @@ class AsyncCallbackHandler(ABC):
        """Called at the start of an agent run loop."""
        pass

    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called at the end of an agent run loop."""
        pass

    async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:

    async def on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        """Called during agent run loop to determine if execution should continue.

        Args:
            kwargs: Run arguments
            old_items: Original messages
            new_items: New messages generated during run

        Returns:
            True to continue execution, False to stop
        """
        return True

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Called before messages are sent to the agent loop.

        Args:
            messages: List of message dictionaries to preprocess

        Returns:
            List of preprocessed message dictionaries
        """
        return messages

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Called after the agent loop returns output.

        Args:
            output: List of output message dictionaries to postprocess

        Returns:
            List of postprocessed output dictionaries
        """
@@ -60,63 +70,67 @@ class AsyncCallbackHandler(ABC):
    async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """
        Called when a computer call is about to start.

        Args:
            item: The computer call item dictionary
        """
        pass

    async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:

    async def on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """
        Called when a computer call has completed.

        Args:
            item: The computer call item dictionary
            result: The result of the computer call
        """
        pass

    async def on_function_call_start(self, item: Dict[str, Any]) -> None:
        """
        Called when a function call is about to start.

        Args:
            item: The function call item dictionary
        """
        pass

    async def on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:

    async def on_function_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """
        Called when a function call has completed.

        Args:
            item: The function call item dictionary
            result: The result of the function call
        """
        pass

    async def on_text(self, item: Dict[str, Any]) -> None:
        """
        Called when a text message is encountered.

        Args:
            item: The message item dictionary
        """
        pass

    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """
        Called when an API call is about to start.

        Args:
            kwargs: The kwargs being passed to the API call
        """
        pass

    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """
        Called when an API call has completed.

        Args:
            kwargs: The kwargs that were passed to the API call
            result: The result of the API call
@@ -126,7 +140,7 @@ class AsyncCallbackHandler(ABC):
    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """
        Called when usage information is received.

        Args:
            usage: The usage information
        """
@@ -135,7 +149,7 @@ class AsyncCallbackHandler(ABC):
    async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """
        Called when a screenshot is taken.

        Args:
            screenshot: The screenshot image
            name: The name of the screenshot
@@ -145,9 +159,9 @@ class AsyncCallbackHandler(ABC):
    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """
        Called when responses are received.

        Args:
            kwargs: The kwargs being passed to the agent loop
            responses: The responses received
        """
        pass
        pass
@@ -1,17 +1,23 @@
from typing import Dict, List, Any
from typing import Any, Dict, List

from .base import AsyncCallbackHandler


class BudgetExceededError(Exception):
    """Exception raised when budget is exceeded."""

    pass


class BudgetManagerCallback(AsyncCallbackHandler):
    """Budget manager callback that tracks usage costs and can stop execution when budget is exceeded."""

    def __init__(self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False):

    def __init__(
        self, max_budget: float, reset_after_each_run: bool = True, raise_error: bool = False
    ):
        """
        Initialize BudgetManagerCallback.

        Args:
            max_budget: Maximum budget allowed
            reset_after_each_run: Whether to reset budget after each run
@@ -21,24 +27,30 @@ class BudgetManagerCallback(AsyncCallbackHandler):
        self.reset_after_each_run = reset_after_each_run
        self.raise_error = raise_error
        self.total_cost = 0.0

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Reset budget if configured to do so."""
        if self.reset_after_each_run:
            self.total_cost = 0.0

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Track usage costs."""
        if "response_cost" in usage:
            self.total_cost += usage["response_cost"]

    async def on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:

    async def on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        """Check if budget allows continuation."""
        if self.total_cost >= self.max_budget:
            if self.raise_error:
                raise BudgetExceededError(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
                raise BudgetExceededError(
                    f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}"
                )
            else:
                print(f"Budget exceeded: ${self.total_cost} >= ${self.max_budget}")
            return False
        return True
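`BudgetManagerCallback` combines two hooks: `on_usage` accumulates `response_cost`, and `on_run_continue` vetoes the next iteration once the total reaches `max_budget`. A standalone behavioral analogue (not the repo class):

```python
import asyncio


class TinyBudget:
    def __init__(self, max_budget: float) -> None:
        self.max_budget = max_budget
        self.total_cost = 0.0

    async def on_usage(self, usage: dict) -> None:
        # Accumulate per-response cost as usage events arrive.
        self.total_cost += usage.get("response_cost", 0.0)

    async def on_run_continue(self, *_) -> bool:
        # Veto further iterations once the budget is spent.
        return self.total_cost < self.max_budget


async def main() -> None:
    b = TinyBudget(max_budget=0.05)
    await b.on_usage({"response_cost": 0.03})
    print(await b.on_run_continue())  # True: under budget
    await b.on_usage({"response_cost": 0.03})
    print(await b.on_run_continue())  # False: 0.06 >= 0.05


asyncio.run(main())
```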
@@ -2,7 +2,8 @@
Image retention callback handler that limits the number of recent images in message history.
"""

from typing import List, Dict, Any, Optional
from typing import Any, Dict, List, Optional

from .base import AsyncCallbackHandler


@@ -11,40 +12,40 @@ class ImageRetentionCallback(AsyncCallbackHandler):
    Callback handler that applies image retention policy to limit the number
    of recent images in message history to prevent context window overflow.
    """

    def __init__(self, only_n_most_recent_images: Optional[int] = None):
        """
        Initialize the image retention callback.

        Args:
            only_n_most_recent_images: If set, only keep the N most recent images in message history
        """
        self.only_n_most_recent_images = only_n_most_recent_images

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Apply image retention policy to messages before sending to agent loop.

        Args:
            messages: List of message dictionaries

        Returns:
            List of messages with image retention policy applied
        """
        if self.only_n_most_recent_images is None:
            return messages

        return self._apply_image_retention(messages)

    def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Apply image retention policy to keep only the N most recent images.

        Removes computer_call_output items with image_url and their corresponding computer_call items,
        keeping only the most recent N image pairs based on only_n_most_recent_images setting.

        Args:
            messages: List of message dictionaries

        Returns:
            Filtered list of messages with image retention applied
        """
@@ -78,7 +79,11 @@ class ImageRetentionCallback(AsyncCallbackHandler):
                # Remove the immediately preceding computer_call with matching call_id (if present)
                call_id = messages[idx].get("call_id")
                prev_idx = idx - 1
                if prev_idx >= 0 and messages[prev_idx].get("type") == "computer_call" and messages[prev_idx].get("call_id") == call_id:
                if (
                    prev_idx >= 0
                    and messages[prev_idx].get("type") == "computer_call"
                    and messages[prev_idx].get("call_id") == call_id
                ):
                    to_remove.add(prev_idx)
                    # Check a single reasoning immediately before that computer_call
                    r_idx = prev_idx - 1
@@ -87,4 +92,4 @@ class ImageRetentionCallback(AsyncCallbackHandler):

        # Construct filtered list
        filtered = [m for i, m in enumerate(messages) if i not in to_remove]
        return filtered
        return filtered
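`_apply_image_retention` walks the message list, keeps only the last N screenshot outputs, and also drops each removed output's paired `computer_call` (matched by `call_id`). A simplified sketch of that pairing rule (it omits the reasoning-item removal the real callback also performs):

```python
def retain_last_n(messages: list, n: int) -> list:
    # Indices of screenshot-bearing outputs, oldest first.
    image_idxs = [
        i for i, m in enumerate(messages)
        if m.get("type") == "computer_call_output" and "image_url" in m.get("output", {})
    ]
    to_remove = set()
    for idx in image_idxs[:-n] if n else image_idxs:
        to_remove.add(idx)
        prev = idx - 1
        # Drop the paired computer_call immediately before the removed output.
        if (
            prev >= 0
            and messages[prev].get("type") == "computer_call"
            and messages[prev].get("call_id") == messages[idx].get("call_id")
        ):
            to_remove.add(prev)
    return [m for i, m in enumerate(messages) if i not in to_remove]


msgs = [
    {"type": "computer_call", "call_id": "a"},
    {"type": "computer_call_output", "call_id": "a", "output": {"image_url": "..."}},
    {"type": "computer_call", "call_id": "b"},
    {"type": "computer_call_output", "call_id": "b", "output": {"image_url": "..."}},
]
assert retain_last_n(msgs, 1) == msgs[2:]
```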
@@ -4,17 +4,18 @@ Logging callback for ComputerAgent that provides configurable logging of agent l

import json
import logging
from typing import Dict, List, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union

from .base import AsyncCallbackHandler


def sanitize_image_urls(data: Any) -> Any:
    """
    Recursively search for 'image_url' keys and set their values to '[omitted]'.

    Args:
        data: Any data structure (dict, list, or primitive type)

    Returns:
        A deep copy of the data with all 'image_url' values replaced with '[omitted]'
    """
@@ -28,11 +29,11 @@ def sanitize_image_urls(data: Any) -> Any:
            # Recursively sanitize the value
            sanitized[key] = sanitize_image_urls(value)
        return sanitized

    elif isinstance(data, list):
        # Recursively sanitize each item in the list
        return [sanitize_image_urls(item) for item in data]

    else:
        # For primitive types (str, int, bool, None, etc.), return as-is
        return data
@@ -41,37 +42,36 @@ def sanitize_image_urls(data: Any) -> Any:
class LoggingCallback(AsyncCallbackHandler):
    """
    Callback handler that logs agent lifecycle events with configurable verbosity.

    Logging levels:
    - DEBUG: All events including API calls, message preprocessing, and detailed outputs
    - INFO: Major lifecycle events (start/end, messages, outputs)
    - INFO: Major lifecycle events (start/end, messages, outputs)
    - WARNING: Only warnings and errors
    - ERROR: Only errors
    """

    def __init__(self, logger: Optional[logging.Logger] = None, level: int = logging.INFO):
        """
        Initialize the logging callback.

        Args:
            logger: Logger instance to use. If None, creates a logger named 'agent.ComputerAgent'
            level: Logging level (logging.DEBUG, logging.INFO, etc.)
        """
        self.logger = logger or logging.getLogger('agent.ComputerAgent')
        self.logger = logger or logging.getLogger("agent.ComputerAgent")
        self.level = level

        # Set up logger if it doesn't have handlers
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(level)

    def _update_usage(self, usage: Dict[str, Any]) -> None:
        """Update total usage statistics."""

        def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
            for key, value in source.items():
                if isinstance(value, dict):
@@ -82,18 +82,25 @@ class LoggingCallback(AsyncCallbackHandler):
                    if key not in target:
                        target[key] = 0
                    target[key] += value

        add_dicts(self.total_usage, usage)

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Called before the run starts."""
        self.total_usage = {}

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        self._update_usage(usage)

    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called after the run ends."""

        def format_dict(d, indent=0):
            lines = []
            prefix = f" - {' ' * indent}"
@@ -106,10 +113,10 @@ class LoggingCallback(AsyncCallbackHandler):
                else:
                    lines.append(f"{prefix}{key}: {value}")
            return lines

        formatted_output = "\n".join(format_dict(self.total_usage))
        self.logger.info(f"Total usage:\n{formatted_output}")

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Called before LLM processing starts."""
        if self.logger.isEnabledFor(logging.INFO):
@@ -118,27 +125,27 @@ class LoggingCallback(AsyncCallbackHandler):
            sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
            self.logger.debug(f"LLM input messages: {json.dumps(sanitized_messages, indent=2)}")
        return messages

    async def on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Called after LLM processing ends."""
        if self.logger.isEnabledFor(logging.DEBUG):
            sanitized_messages = [sanitize_image_urls(msg) for msg in messages]
            self.logger.debug(f"LLM output: {json.dumps(sanitized_messages, indent=2)}")
return messages
|
||||
|
||||
|
||||
async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
|
||||
"""Called when a computer call starts."""
|
||||
action = item.get("action", {})
|
||||
action_type = action.get("type", "unknown")
|
||||
action_args = {k: v for k, v in action.items() if k != "type"}
|
||||
|
||||
|
||||
# INFO level logging for the action
|
||||
self.logger.info(f"Computer: {action_type}({action_args})")
|
||||
|
||||
|
||||
# DEBUG level logging for full details
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
self.logger.debug(f"Computer call started: {json.dumps(action, indent=2)}")
|
||||
|
||||
|
||||
async def on_computer_call_end(self, item: Dict[str, Any], result: Any) -> None:
|
||||
"""Called when a computer call ends."""
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
@@ -147,48 +154,52 @@ class LoggingCallback(AsyncCallbackHandler):
|
||||
if result:
|
||||
sanitized_result = sanitize_image_urls(result)
|
||||
self.logger.debug(f"Computer call result: {json.dumps(sanitized_result, indent=2)}")
|
||||
|
||||
|
||||
async def on_function_call_start(self, item: Dict[str, Any]) -> None:
|
||||
"""Called when a function call starts."""
|
||||
name = item.get("name", "unknown")
|
||||
arguments = item.get("arguments", "{}")
|
||||
|
||||
|
||||
# INFO level logging for the function call
|
||||
self.logger.info(f"Function: {name}({arguments})")
|
||||
|
||||
|
||||
# DEBUG level logging for full details
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
self.logger.debug(f"Function call started: {name}")
|
||||
|
||||
|
||||
async def on_function_call_end(self, item: Dict[str, Any], result: Any) -> None:
|
||||
"""Called when a function call ends."""
|
||||
# INFO level logging for function output (similar to function_call_output)
|
||||
if result:
|
||||
# Handle both list and direct result formats
|
||||
if isinstance(result, list) and len(result) > 0:
|
||||
output = result[0].get("output", str(result)) if isinstance(result[0], dict) else str(result[0])
|
||||
output = (
|
||||
result[0].get("output", str(result))
|
||||
if isinstance(result[0], dict)
|
||||
else str(result[0])
|
||||
)
|
||||
else:
|
||||
output = str(result)
|
||||
|
||||
|
||||
# Truncate long outputs
|
||||
if len(output) > 100:
|
||||
output = output[:100] + "..."
|
||||
|
||||
|
||||
self.logger.info(f"Output: {output}")
|
||||
|
||||
|
||||
# DEBUG level logging for full details
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
name = item.get("name", "unknown")
|
||||
self.logger.debug(f"Function call completed: {name}")
|
||||
if result:
|
||||
self.logger.debug(f"Function call result: {json.dumps(result, indent=2)}")
|
||||
|
||||
|
||||
async def on_text(self, item: Dict[str, Any]) -> None:
|
||||
"""Called when a text message is encountered."""
|
||||
# Get the role to determine if it's Agent or User
|
||||
role = item.get("role", "unknown")
|
||||
content_items = item.get("content", [])
|
||||
|
||||
|
||||
# Process content items to build display text
|
||||
text_parts = []
|
||||
for content_item in content_items:
|
||||
@@ -206,10 +217,10 @@ class LoggingCallback(AsyncCallbackHandler):
|
||||
else:
|
||||
# Non-text content, show as [type]
|
||||
text_parts.append(f"[{content_type}]")
|
||||
|
||||
|
||||
# Join all text parts
|
||||
display_text = ''.join(text_parts) if text_parts else "[empty]"
|
||||
|
||||
display_text = "".join(text_parts) if text_parts else "[empty]"
|
||||
|
||||
# Log with appropriate level and format
|
||||
if role == "assistant":
|
||||
self.logger.info(f"Agent: {display_text}")
|
||||
@@ -219,7 +230,7 @@ class LoggingCallback(AsyncCallbackHandler):
|
||||
# Fallback for unknown roles, use debug level
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
self.logger.debug(f"Text message ({role}): {display_text}")
|
||||
|
||||
|
||||
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
|
||||
"""Called when an API call is about to start."""
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
@@ -232,16 +243,18 @@ class LoggingCallback(AsyncCallbackHandler):
|
||||
elif "input" in kwargs:
|
||||
sanitized_input = sanitize_image_urls(kwargs["input"])
|
||||
self.logger.debug(f"API call input: {json.dumps(sanitized_input, indent=2)}")
|
||||
|
||||
|
||||
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
|
||||
"""Called when an API call has completed."""
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
model = kwargs.get("model", "unknown")
|
||||
self.logger.debug(f"API call completed for model: {model}")
|
||||
self.logger.debug(f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}")
|
||||
self.logger.debug(
|
||||
f"API call result: {json.dumps(sanitize_image_urls(result), indent=2)}"
|
||||
)
|
||||
|
||||
async def on_screenshot(self, item: Union[str, bytes], name: str = "screenshot") -> None:
|
||||
"""Called when a screenshot is taken."""
|
||||
if self.logger.isEnabledFor(logging.DEBUG):
|
||||
image_size = len(item) / 1024
|
||||
self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
|
||||
self.logger.debug(f"Screenshot captured: {name} {image_size:.2f} KB")
|
||||
|
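The sanitize_image_urls helper above is easy to exercise in isolation. A compact, self-contained equivalent with a sample payload (the payload shape is an assumption):

```python
# Compact equivalent of the helper above: blank out any 'image_url' value,
# recursing through dicts and lists, returning primitives as-is.
from typing import Any


def sanitize_image_urls(data: Any) -> Any:
    if isinstance(data, dict):
        return {
            k: "[omitted]" if k == "image_url" else sanitize_image_urls(v)
            for k, v in data.items()
        }
    if isinstance(data, list):
        return [sanitize_image_urls(item) for item in data]
    return data


payload = {
    "output": {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},
    "history": [{"image_url": "https://example.com/shot.png"}],
}
print(sanitize_image_urls(payload))
# {'output': {'type': 'input_image', 'image_url': '[omitted]'},
#  'history': [{'image_url': '[omitted]'}]}
```

This is what keeps multi-kilobyte base64 screenshots out of the debug logs while preserving the rest of each message.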
@@ -9,6 +9,7 @@ Ensures agent output actions conform to expected schemas by fixing common issues
This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts).
The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible.
"""

from __future__ import annotations

from typing import Any, Dict, List
@@ -48,6 +49,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
            action["type"] = "type"

        action_type = action.get("type")

        def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]):
            """Keep only the provided keys on action; delete everything else.
            Always ensures required 'type' is present if listed in keys_to_keep.
@@ -55,6 +57,7 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
            for key in list(action.keys()):
                if key not in keys_to_keep:
                    del action[key]

        # rename "coordinate" to "x", "y"
        if "coordinate" in action:
            action["x"] = action["coordinate"][0]
@@ -100,7 +103,6 @@ class OperatorNormalizerCallback(AsyncCallbackHandler):
        keep = required_keys_by_type.get(action_type or "")
        if keep:
            _keep_keys(action, keep)

        # # Second pass: if an assistant message is immediately followed by a computer_call,
        # # replace the assistant message itself with a reasoning message with summary text.
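The two normalization steps visible in these hunks (renaming a coordinate pair and pruning unexpected keys) can be sketched standalone; the key list below is illustrative, not the repository's actual required_keys_by_type table:

```python
# Sketch of the normalization above: split "coordinate" into "x"/"y",
# then keep only an allow-list of keys on the action dict.
action = {"type": "click", "coordinate": [120, 45], "extra": "ignored"}

if "coordinate" in action:
    action["x"], action["y"] = action["coordinate"]
    del action["coordinate"]

keys_to_keep = ["type", "x", "y"]  # assumed allow-list for a click action
for key in list(action.keys()):
    if key not in keys_to_keep:
        del action[key]

print(action)  # {'type': 'click', 'x': 120, 'y': 45}
```

Iterating over `list(action.keys())` matters: deleting from a dict while iterating its live view would raise a RuntimeError.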
@@ -2,38 +2,41 @@
PII anonymization callback handler using Microsoft Presidio for text and image redaction.
"""

from typing import List, Dict, Any, Optional, Tuple
from .base import AsyncCallbackHandler
import base64
import io
import logging
from typing import Any, Dict, List, Optional, Tuple

from .base import AsyncCallbackHandler

try:
    # TODO: Add Presidio dependencies
    from PIL import Image

    PRESIDIO_AVAILABLE = True
except ImportError:
    PRESIDIO_AVAILABLE = False

logger = logging.getLogger(__name__)


class PIIAnonymizationCallback(AsyncCallbackHandler):
    """
    Callback handler that anonymizes PII in text and images using Microsoft Presidio.

    This handler:
    1. Anonymizes PII in messages before sending to the agent loop
    2. Deanonymizes PII in tool calls and message outputs after the agent loop
    3. Redacts PII from images in computer_call_output messages
    """

    def __init__(
        self,
        # TODO: Any extra kwargs if needed
    ):
        """
        Initialize the PII anonymization callback.

        Args:
            anonymize_text: Whether to anonymize text content
            anonymize_images: Whether to redact images
@@ -46,16 +49,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
                "Presidio is not available. Install with: "
                "pip install cua-agent[pii-anonymization]"
            )

        # TODO: Implement __init__

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Anonymize PII in messages before sending to agent loop.

        Args:
            messages: List of message dictionaries

        Returns:
            List of messages with PII anonymized
        """
@@ -63,16 +66,16 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
        for msg in messages:
            anonymized_msg = await self._anonymize_message(msg)
            anonymized_messages.append(anonymized_msg)

        return anonymized_messages

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deanonymize PII in tool calls and message outputs after agent loop.

        Args:
            output: List of output dictionaries

        Returns:
            List of output with PII deanonymized for tool calls
        """
@@ -84,13 +87,13 @@ class PIIAnonymizationCallback(AsyncCallbackHandler):
                deanonymized_output.append(deanonymized_item)
            else:
                deanonymized_output.append(item)

        return deanonymized_output

    async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        # TODO: Implement _anonymize_message
        return message

    async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        # TODO: Implement _deanonymize_item
        return item
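The `_anonymize_message` body is still a TODO in this commit. For orientation only, one plausible text-anonymization core using Presidio might look like the sketch below; this is an assumption about how the TODO could be filled, not the project's implementation, and it requires the presidio-analyzer and presidio-anonymizer packages plus a spaCy English model to be installed:

```python
# Hypothetical sketch of the TODO above using Presidio's documented API.
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()


def anonymize_text(text: str) -> str:
    # Detect PII entities, then replace each with a type placeholder
    results = analyzer.analyze(text=text, language="en")
    return anonymizer.anonymize(text=text, analyzer_results=results).text


print(anonymize_text("Contact John Smith at john@example.com"))
# e.g. "Contact <PERSON> at <EMAIL_ADDRESS>"
```

Image redaction (item 3 in the class docstring) would additionally need Presidio's image-redactor package, which is presumably why the commit gates everything behind PRESIDIO_AVAILABLE.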
@@ -2,17 +2,17 @@
Telemetry callback handler for Computer-Use Agent (cua-agent)
"""

import platform
import time
import uuid
from typing import List, Dict, Any, Optional, Union
from typing import Any, Dict, List, Optional, Union

from .base import AsyncCallbackHandler
from core.telemetry import (
    record_event,
    is_telemetry_enabled,
    record_event,
)

import platform
from .base import AsyncCallbackHandler

SYSTEM_INFO = {
    "os": platform.system().lower(),
@@ -20,32 +20,29 @@ SYSTEM_INFO = {
    "python_version": platform.python_version(),
}


class TelemetryCallback(AsyncCallbackHandler):
    """
    Telemetry callback handler for Computer-Use Agent (cua-agent)

    Tracks agent usage, performance metrics, and optionally trajectory data.
    """

    def __init__(
        self,
        agent,
        log_trajectory: bool = False
    ):

    def __init__(self, agent, log_trajectory: bool = False):
        """
        Initialize telemetry callback.

        Args:
            agent: The ComputerAgent instance
            log_trajectory: Whether to log full trajectory items (opt-in)
        """
        self.agent = agent
        self.log_trajectory = log_trajectory

        # Generate session/run IDs
        self.session_id = str(uuid.uuid4())
        self.run_id = None

        # Track timing and metrics
        self.run_start_time = None
        self.step_count = 0
@@ -54,126 +51,133 @@ class TelemetryCallback(AsyncCallbackHandler):
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
            "response_cost": 0.0
            "response_cost": 0.0,
        }

        # Record agent initialization
        if is_telemetry_enabled():
            self._record_agent_initialization()

    def _record_agent_initialization(self) -> None:
        """Record agent type/model and session initialization."""
        agent_info = {
            "session_id": self.session_id,
            "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown',
            "model": getattr(self.agent, 'model', 'unknown'),
            **SYSTEM_INFO
            "agent_type": (
                self.agent.agent_loop.__name__ if hasattr(self.agent, "agent_loop") else "unknown"
            ),
            "model": getattr(self.agent, "model", "unknown"),
            **SYSTEM_INFO,
        }

        record_event("agent_session_start", agent_info)

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Called at the start of an agent run loop."""
        if not is_telemetry_enabled():
            return

        self.run_id = str(uuid.uuid4())
        self.run_start_time = time.time()
        self.step_count = 0

        # Calculate input context size
        input_context_size = self._calculate_context_size(old_items)

        run_data = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "start_time": self.run_start_time,
            "input_context_size": input_context_size,
            "num_existing_messages": len(old_items)
            "num_existing_messages": len(old_items),
        }

        # Log trajectory if opted in
        if self.log_trajectory:
            trajectory = self._extract_trajectory(old_items)
            if trajectory:
                run_data["uploaded_trajectory"] = trajectory

        record_event("agent_run_start", run_data)

    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called at the end of an agent run loop."""
        if not is_telemetry_enabled() or not self.run_start_time:
            return

        run_duration = time.time() - self.run_start_time

        run_data = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "end_time": time.time(),
            "duration_seconds": run_duration,
            "num_steps": self.step_count,
            "total_usage": self.total_usage.copy()
            "total_usage": self.total_usage.copy(),
        }

        # Log trajectory if opted in
        if self.log_trajectory:
            trajectory = self._extract_trajectory(new_items)
            if trajectory:
                run_data["uploaded_trajectory"] = trajectory

        record_event("agent_run_end", run_data)

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        if not is_telemetry_enabled():
            return

        # Accumulate usage stats
        self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
        self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
        self.total_usage["total_tokens"] += usage.get("total_tokens", 0)
        self.total_usage["response_cost"] += usage.get("response_cost", 0.0)

        # Record individual usage event
        usage_data = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "step": self.step_count,
            **usage
            **usage,
        }

        record_event("agent_usage", usage_data)

    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Called when responses are received."""
        if not is_telemetry_enabled():
            return

        self.step_count += 1
        step_duration = None

        if self.step_start_time:
            step_duration = time.time() - self.step_start_time

        self.step_start_time = time.time()

        step_data = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "step": self.step_count,
            "timestamp": self.step_start_time
            "timestamp": self.step_start_time,
        }

        if step_duration is not None:
            step_data["duration_seconds"] = step_duration

        record_event("agent_step", step_data)

    def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int:
        """Calculate approximate context size in tokens/characters."""
        total_size = 0

        for item in items:
            if item.get("type") == "message" and "content" in item:
                content = item["content"]
@@ -185,25 +189,27 @@ class TelemetryCallback(AsyncCallbackHandler):
                        total_size += len(part["text"])
            elif "content" in item and isinstance(item["content"], str):
                total_size += len(item["content"])

        return total_size

    def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Extract trajectory items that should be logged."""
        trajectory = []

        for item in items:
            # Include user messages, assistant messages, reasoning, computer calls, and computer outputs
            if (
                item.get("role") == "user" or  # User inputs
                (item.get("type") == "message" and item.get("role") == "assistant") or  # Model outputs
                item.get("type") == "reasoning" or  # Reasoning traces
                item.get("type") == "computer_call" or  # Computer actions
                item.get("type") == "computer_call_output"  # Computer outputs
                item.get("role") == "user"  # User inputs
                or (
                    item.get("type") == "message" and item.get("role") == "assistant"
                )  # Model outputs
                or item.get("type") == "reasoning"  # Reasoning traces
                or item.get("type") == "computer_call"  # Computer actions
                or item.get("type") == "computer_call_output"  # Computer outputs
            ):
                # Create a copy of the item with timestamp
                trajectory_item = item.copy()
                trajectory_item["logged_at"] = time.time()
                trajectory.append(trajectory_item)

        return trajectory
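Note that _calculate_context_size above counts characters, not tokens, despite the docstring's "tokens/characters" wording. A quick self-contained check of that heuristic (the sample items are assumptions matching the shapes used in the hunks):

```python
# The context-size heuristic above sums text lengths in characters:
# list-of-parts message content plus plain string content fields.
items = [
    {"type": "message", "content": [{"text": "open the browser"}]},   # 16 chars
    {"type": "computer_call", "action": {"type": "click", "x": 1, "y": 2}},
    {"content": "plain string content"},                               # 20 chars
]

total_size = 0
for item in items:
    if item.get("type") == "message" and isinstance(item.get("content"), list):
        for part in item["content"]:
            total_size += len(part.get("text", ""))
    elif isinstance(item.get("content"), str):
        total_size += len(item["content"])

print(total_size)  # 36
```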
@@ -2,26 +2,28 @@
Trajectory saving callback handler for ComputerAgent.
"""

import os
import json
import uuid
from datetime import datetime
import base64
from pathlib import Path
from typing import List, Dict, Any, Optional, Union, override
from PIL import Image, ImageDraw
import io
import json
import os
import uuid
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, override

from PIL import Image, ImageDraw

from .base import AsyncCallbackHandler


def sanitize_image_urls(data: Any) -> Any:
    """
    Recursively search for 'image_url' keys and set their values to '[omitted]'.

    Args:
        data: Any data structure (dict, list, or primitive type)

    Returns:
        A deep copy of the data with all 'image_url' values replaced with '[omitted]'
    """
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
            # Recursively sanitize the value
            sanitized[key] = sanitize_image_urls(value)
        return sanitized

    elif isinstance(data, list):
        # Recursively sanitize each item in the list
        return [sanitize_image_urls(item) for item in data]

    else:
        # For primitive types (str, int, bool, None, etc.), return as-is
        return data


def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
def extract_computer_call_outputs(
    items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
) -> List[Dict[str, Any]]:
    """
    Save any base64-encoded screenshots from computer_call_output entries to files and
    replace their image_url with the saved file path when a call_id is present.
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
        updated.append(msg)
    return updated


class TrajectorySaverCallback(AsyncCallbackHandler):
    """
    Callback handler that saves agent trajectories to disk.

    Saves each run as a separate trajectory with unique ID, and each turn
    within the trajectory gets its own folder with screenshots and responses.
    """

    def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):

    def __init__(
        self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
    ):
        """
        Initialize trajectory saver.

        Args:
            trajectory_dir: Base directory to save trajectories
            reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -129,7 +136,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        self.reset_on_run = reset_on_run
        # Optional directory to store extracted screenshots from metadata/new_items
        self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None

        # Ensure trajectory directory exists
        self.trajectory_dir.mkdir(parents=True, exist_ok=True)

@@ -137,7 +144,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        """Get the directory for the current turn."""
        if not self.trajectory_id:
            raise ValueError("Trajectory not initialized - call _on_run_start first")

        # format: trajectory_id/turn_000
        turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
        turn_dir.mkdir(parents=True, exist_ok=True)
@@ -166,6 +173,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):

    def _update_usage(self, usage: Dict[str, Any]) -> None:
        """Update total usage statistics."""

        def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
            for key, value in source.items():
                if isinstance(value, dict):
@@ -176,20 +184,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
                    if key not in target:
                        target[key] = 0
                    target[key] += value

        add_dicts(self.total_usage, usage)

    @override
    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Initialize trajectory tracking for a new run."""
        model = kwargs.get("model", "unknown")

        # Only reset trajectory state if reset_on_run is True or no trajectory exists
        if self.reset_on_run or not self.trajectory_id:
            model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
            if "+" in model:
                model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
            # strip non-alphanumeric characters from model_name_short
            model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
            model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")

            # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
            now = datetime.now()
@@ -198,11 +207,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
            self.current_artifact = 0
            self.model = model
            self.total_usage = {}

            # Create trajectory directory
            trajectory_path = self.trajectory_dir / self.trajectory_id
            trajectory_path.mkdir(parents=True, exist_ok=True)

            # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
            kwargs_to_save = kwargs.copy()
            try:
@@ -219,7 +228,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
                "status": "running",
                "kwargs": kwargs_to_save,
            }

            with open(trajectory_path / "metadata.json", "w") as f:
                json.dump(metadata, f, indent=2)
        else:
@@ -227,22 +236,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
            self.model = model

    @override
    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Finalize run tracking by updating metadata with completion status, usage, and new items."""
        if not self.trajectory_id:
            return

        # Update metadata with completion status, total usage, and new items
        trajectory_path = self.trajectory_dir / self.trajectory_id
        metadata_path = trajectory_path / "metadata.json"

        # Read existing metadata
        if metadata_path.exists():
            with open(metadata_path, "r") as f:
                metadata = json.load(f)
        else:
            metadata = {}

        # Update metadata with completion info
        # Optionally extract screenshots from new_items before persisting
        new_items_to_save = new_items
@@ -251,32 +265,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        except Exception:
            pass

        metadata.update({
            "status": "completed",
            "completed_at": str(uuid.uuid1().time),
            "total_usage": self.total_usage,
            "new_items": new_items_to_save,
            "total_turns": self.current_turn
        })

        metadata.update(
            {
                "status": "completed",
                "completed_at": str(uuid.uuid1().time),
                "total_usage": self.total_usage,
                "new_items": new_items_to_save,
                "total_turns": self.current_turn,
            }
        )

        # Save updated metadata
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

    @override
    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        if not self.trajectory_id:
            return

        self._save_artifact("api_start", { "kwargs": kwargs })
        self._save_artifact("api_start", {"kwargs": kwargs})

    @override
    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Save API call result."""
        if not self.trajectory_id:
            return

        self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
        self._save_artifact("api_result", {"kwargs": kwargs, "result": result})

    @override
    async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -295,77 +311,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
        """Save responses to the current turn directory and update usage statistics."""
        if not self.trajectory_id:
            return

        # Save responses
        turn_dir = self._get_turn_dir()
        response_data = {
            "timestamp": str(uuid.uuid1().time),
            "model": self.model,
            "kwargs": kwargs,
            "response": responses
            "response": responses,
        }

        self._save_artifact("agent_response", response_data)

        # Increment turn counter
        self.current_turn += 1

    def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
        """
        Draw a red dot and crosshair at the specified coordinates on the image.

        Args:
            image_bytes: The original image as bytes
            x: X coordinate for the crosshair
            y: Y coordinate for the crosshair

        Returns:
            Modified image as bytes with red dot and crosshair
        """
        # Open the image
        image = Image.open(io.BytesIO(image_bytes))
        draw = ImageDraw.Draw(image)

        # Draw crosshair lines (red, 2px thick)
        crosshair_size = 20
        line_width = 2
        color = "red"

        # Horizontal line
        draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
        # Vertical line
        draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)

        # Draw center dot (filled circle)
        dot_radius = 3
        draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
        draw.ellipse(
            [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
        )

        # Convert back to bytes
        output = io.BytesIO()
        image.save(output, format='PNG')
        image.save(output, format="PNG")
        return output.getvalue()

    @override
    async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
    async def on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """
        Called when a computer call has completed.
        Saves screenshots and computer call output.
        """
        if not self.trajectory_id:
            return

        self._save_artifact("computer_call_result", { "item": item, "result": result })
        self._save_artifact("computer_call_result", {"item": item, "result": result})

        # Check if action has x/y coordinates and there's a screenshot in the result
        action = item.get("action", {})
        if "x" in action and "y" in action:
            # Look for screenshot in the result
            for result_item in result:
                if (result_item.get("type") == "computer_call_output" and
                    result_item.get("output", {}).get("type") == "input_image"):
                if (
                    result_item.get("type") == "computer_call_output"
                    and result_item.get("output", {}).get("type") == "input_image"
                ):

                    image_url = result_item["output"]["image_url"]

                    # Extract base64 image data
                    if image_url.startswith("data:image/"):
                        # Format: data:image/png;base64,<base64_data>
@@ -373,26 +395,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
                    else:
                        # Assume it's just base64 data
                        base64_data = image_url

                    try:
                        # Decode the image
                        image_bytes = base64.b64decode(base64_data)

                        # Draw crosshair at the action coordinates
                        annotated_image = self._draw_crosshair_on_image(
                            image_bytes,
                            int(action["x"]),
                            int(action["y"])
                            image_bytes, int(action["x"]), int(action["y"])
                        )

                        # Save as screenshot_action
                        self._save_artifact("screenshot_action", annotated_image)

                    except Exception as e:
                        # If annotation fails, just log and continue
                        print(f"Failed to annotate screenshot: {e}")

                    break  # Only process the first screenshot found

        # Increment turn counter
        self.current_turn += 1
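The crosshair annotation above is plain Pillow drawing and can be tried standalone; the solid-color source image below is just a stand-in for a real screenshot:

```python
# Standalone demo of the crosshair-plus-dot annotation (requires Pillow).
import io

from PIL import Image, ImageDraw

# Fake "screenshot" as bytes
img = Image.new("RGB", (200, 120), "white")
buf = io.BytesIO()
img.save(buf, format="PNG")

# Re-draw the same pattern the callback uses, at action coordinates (100, 60)
image = Image.open(io.BytesIO(buf.getvalue()))
draw = ImageDraw.Draw(image)
x, y, size, radius = 100, 60, 20, 3
draw.line([(x - size, y), (x + size, y)], fill="red", width=2)   # horizontal
draw.line([(x, y - size), (x, y + size)], fill="red", width=2)   # vertical
draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)], fill="red")
image.save("screenshot_action.png", format="PNG")
```

Saving the annotated copy as a separate `screenshot_action` artifact keeps the original screenshot untouched while making click locations easy to audit in the trajectory folder.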
@@ -3,7 +3,7 @@ CLI chat interface for agent - Computer Use Agent

Usage:
    python -m agent.cli <model_string>

Examples:
    python -m agent.cli openai/computer-use-preview
    python -m agent.cli anthropic/claude-3-5-sonnet-20241022
@@ -11,19 +11,22 @@ Examples:
"""

try:
    import asyncio
    import argparse
    import os
    import sys
    import json
    from typing import List, Dict, Any
    import dotenv
    import asyncio
    import base64
    import time
    import json
    import os
    import platform
    import sys
    import time
    from pathlib import Path
    from typing import Any, Dict, List

    import dotenv

    try:
        from PIL import Image, ImageDraw

        PIL_AVAILABLE = True
    except Exception:
        PIL_AVAILABLE = False
@@ -31,36 +34,44 @@ try:
except ImportError:
    if __name__ == "__main__":
        raise ImportError(
            "CLI dependencies not found. "
            "Please install with: pip install \"cua-agent[cli]\""
            "CLI dependencies not found. " 'Please install with: pip install "cua-agent[cli]"'
        )

# Load environment variables
dotenv.load_dotenv()


# Color codes for terminal output
class Colors:
    RESET = '\033[0m'
    BOLD = '\033[1m'
    DIM = '\033[2m'

    # Text colors
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    GRAY = '\033[90m'

    # Background colors
    BG_RED = '\033[41m'
    BG_GREEN = '\033[42m'
    BG_YELLOW = '\033[43m'
    BG_BLUE = '\033[44m'
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Text colors
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"
    GRAY = "\033[90m"

    # Background colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"


def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
def print_colored(
    text: str,
    color: str = "",
    bold: bool = False,
    dim: bool = False,
    end: str = "\n",
    right: str = "",
):
    """Print colored text to terminal with optional right-aligned text."""
    prefix = ""
    if bold:
@@ -69,24 +80,25 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
        prefix += Colors.DIM
    if color:
        prefix += color

    if right:
        # Get terminal width (default to 80 if unable to determine)
        try:
            import shutil

            terminal_width = shutil.get_terminal_size().columns
        except:
            terminal_width = 80

        # Add right margin
        terminal_width -= 1

        # Calculate padding needed
        # Account for ANSI escape codes not taking visual space
        visible_left_len = len(text)
        visible_right_len = len(right)
        padding = terminal_width - visible_left_len - visible_right_len

        if padding > 0:
            output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
        else:
@@ -94,7 +106,7 @@ def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = Fa
            output = f"{prefix}{text} {right}{Colors.RESET}"
    else:
        output = f"{prefix}{text}{Colors.RESET}"

    print(output, end=end)
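Example calls against the print_colored helper defined above; the padding math right-aligns the `right` string against the current terminal width while the ANSI prefixes take no visual space:

```python
# Usage of the helpers above (Colors and print_colored from this file).
print_colored("Agent: task finished", color=Colors.GREEN, bold=True)
print_colored("🛠️ click((120, 45))", dim=True, right="💸 $0.02")
```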
@@ -113,29 +125,34 @@ def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
        args_str = f"('{details['text']}')"
    elif action_type == "scroll" and "x" in details and "y" in details:
        args_str = f"({details['x']}, {details['y']})"

    if total_cost > 0:
        print_colored(f"🛠️ {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
    else:
        print_colored(f"🛠️ {action_type}{args_str}", dim=True)


def print_welcome(model: str, agent_loop: str, container_name: str):
    """Print welcome message."""
    print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
    print_colored("Type 'exit' to quit.", dim=True)


async def ainput(prompt: str = ""):
    return await asyncio.to_thread(input, prompt)

async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):

async def chat_loop(
    agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True
):
    """Main chat loop with the agent."""
    print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)

    history = []

    if initial_prompt:
        history.append({"role": "user", "content": initial_prompt})

    total_cost = 0

    while True:
@@ -143,28 +160,28 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
        # Get user input with prompt
        print_colored("> ", end="")
        user_input = await ainput()

        if user_input.lower() in ['exit', 'quit', 'q']:
        if user_input.lower() in ["exit", "quit", "q"]:
            print_colored("\n👋 Goodbye!")
            break

        if not user_input:
            continue

        # Add user message to history
        history.append({"role": "user", "content": user_input})

        # Stream responses from the agent with spinner
        with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
            spinner.hide()

            async for result in agent.run(history):
                # Add agent responses to history
                history.extend(result.get("output", []))

                if show_usage:
                    total_cost += result.get("usage", {}).get("response_cost", 0)

                # Process and display the output
                for item in result.get("output", []):
                    if item.get("type") == "message" and item.get("role") == "assistant":
@@ -176,7 +193,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                        if text:
                            spinner.hide()
                            print_colored(text)

                    elif item.get("type") == "computer_call":
                        # Display computer action
                        action = item.get("action", {})
@@ -186,7 +203,7 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                        print_action(action_type, action, total_cost)
                        spinner.text = f"Performing {action_type}..."
                        spinner.show()

                    elif item.get("type") == "function_call":
                        # Display function call
                        function_name = item.get("name", "")
@@ -194,18 +211,18 @@ async def chat_loop(agent, model: str, container_name: str, initial_prompt: str
                        print_colored(f"🔧 Calling function: {function_name}", dim=True)
                        spinner.text = f"Calling {function_name}..."
                        spinner.show()

                    elif item.get("type") == "function_call_output":
                        # Display function output (dimmed)
                        output = item.get("output", "")
                        if output and len(output.strip()) > 0:
                            spinner.hide()
                            print_colored(f"📤 {output}", dim=True)

            spinner.hide()
    if show_usage and total_cost > 0:
        print_colored(f"Total cost: ${total_cost:.2f}", dim=True)


async def main():
    """Main CLI function."""
@@ -218,90 +235,74 @@ Examples:
  python -m agent.cli anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
  python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
        """
        """,
    )

    parser.add_argument(
        "model",
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
        help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')",
    )

    parser.add_argument(
        "--provider",
        choices=["cloud", "lume", "winsandbox", "docker"],
        default="cloud",
        help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
        help="Computer provider to use: cloud (default), lume, winsandbox, or docker",
    )

    parser.add_argument(
        "--images",
        type=int,
        default=3,
        help="Number of recent images to keep in context (default: 3)"
    )

    parser.add_argument(
        "--trajectory",
        action="store_true",
        help="Save trajectory for debugging"
    )

    parser.add_argument(
        "--budget",
        type=float,
        help="Maximum budget for the session (in dollars)"
    )

    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging"
        help="Number of recent images to keep in context (default: 3)",
    )

    parser.add_argument("--trajectory", action="store_true", help="Save trajectory for debugging")

    parser.add_argument("--budget", type=float, help="Maximum budget for the session (in dollars)")

    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")

    parser.add_argument(
        "-p", "--prompt",
        "-p",
        "--prompt",
        type=str,
        help="Initial prompt to send to the agent. Leave blank for interactive mode."
        help="Initial prompt to send to the agent. Leave blank for interactive mode.",
    )

    parser.add_argument(
        "--prompt-file",
        type=Path,
        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
        help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt.",
    )

    parser.add_argument(
        "--predict-click",
        dest="predict_click",
        type=str,
        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
        help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it.",
    )

    parser.add_argument("-c", "--cache", action="store_true", help="Tell the API to enable caching")

    parser.add_argument(
        "-u", "--usage", action="store_true", help="Show total cost of the agent runs"
    )

    parser.add_argument(
        "-c", "--cache",
        action="store_true",
        help="Tell the API to enable caching"
    )

    parser.add_argument(
        "-u", "--usage",
        action="store_true",
        help="Show total cost of the agent runs"
    )

    parser.add_argument(
        "-r", "--max-retries",
        "-r",
        "--max-retries",
        type=int,
        default=3,
        help="Maximum number of retries for the LLM API calls"
        help="Maximum number of retries for the LLM API calls",
    )

    args = parser.parse_args()

    # Check for required environment variables
    container_name = os.getenv("CUA_CONTAINER_NAME")
    cua_api_key = os.getenv("CUA_API_KEY")

    # Prompt for missing environment variables (container name always required)
    if not container_name:
        if args.provider == "cloud":
@@ -321,13 +322,13 @@ Examples:
        if not cua_api_key:
            print_colored("❌ API key is required for cloud provider.")
            sys.exit(1)

    # Check for provider-specific API keys based on model
    provider_api_keys = {
        "openai/": "OPENAI_API_KEY",
        "anthropic/": "ANTHROPIC_API_KEY",
    }

    # Find matching provider and check for API key
    for prefix, env_var in provider_api_keys.items():
        if prefix in args.model:
@@ -340,7 +341,7 @@ Examples:
            # Set the environment variable for the session
            os.environ[env_var] = api_key
            break

    # Import here to avoid import errors if dependencies are missing
    try:
        from agent import ComputerAgent
@@ -349,7 +350,7 @@ Examples:
        print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
        print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
        sys.exit(1)

    # Resolve provider -> os_type, provider_type, api key requirement
    provider_map = {
        "cloud": ("linux", "cloud", True),
@@ -365,42 +366,46 @@ Examples:
        "name": container_name,
    }
    if needs_api_key:
        computer_kwargs["api_key"] = cua_api_key  # type: ignore

    # Create computer instance
    async with Computer(**computer_kwargs) as computer:  # type: ignore

        # Create agent
        agent_kwargs = {
            "model": args.model,
            "tools": [computer],
            "trust_remote_code": True,  # needed for some local models (e.g., InternVL, OpenCUA)
            "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
            "max_retries": args.max_retries
            "max_retries": args.max_retries,
        }

        if args.images > 0:
            agent_kwargs["only_n_most_recent_images"] = args.images

        if args.trajectory:
            agent_kwargs["trajectory_dir"] = "trajectories"

        if args.budget:
            agent_kwargs["max_trajectory_budget"] = {
                "max_budget": args.budget,
                "raise_error": True,
                "reset_after_each_run": False
                "reset_after_each_run": False,
            }

        if args.cache:
            agent_kwargs["use_prompt_caching"] = True

        agent = ComputerAgent(**agent_kwargs)

        # If predict-click mode is requested, run once and exit
        if args.predict_click:
            if not PIL_AVAILABLE:
                print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
                print_colored(
                    "❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow",
                    Colors.RED,
                    bold=True,
                )
                sys.exit(1)

            instruction = args.predict_click
@@ -435,6 +440,7 @@ Examples:

            try:
                from io import BytesIO

                with Image.open(BytesIO(img_bytes)) as img:
                    img = img.convert("RGB")
                    draw = ImageDraw.Draw(img)
@@ -457,9 +463,9 @@ Examples:
                    if system == "windows":
                        os.startfile(str(out_path))  # type: ignore[attr-defined]
                    elif system == "darwin":
                        os.system(f"open \"{out_path}\"")
                        os.system(f'open "{out_path}"')
                    else:
                        os.system(f"xdg-open \"{out_path}\"")
                        os.system(f'xdg-open "{out_path}"')
                except Exception:
                    pass
            except Exception as e:
@@ -482,9 +488,8 @@ Examples:
        await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except (KeyboardInterrupt, EOFError) as _:
        print_colored("\n\n👋 Goodbye!")
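For readers who want the same wiring without the CLI, a sketch of the programmatic equivalent, mirroring the agent_kwargs dict built in main() above; the exact constructor keywords are taken from that dict, but treating them as the full public API is an assumption:

```python
# Sketch mirroring the CLI's agent construction above; a Computer instance
# (as created in main()) would normally be passed in tools.
from agent import ComputerAgent  # import path used by the CLI above

agent = ComputerAgent(
    model="anthropic/claude-3-5-sonnet-20241022",
    tools=[],  # placeholder; the CLI passes [computer] here
    only_n_most_recent_images=3,
    trajectory_dir="trajectories",
    max_retries=3,
)
```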
@@ -6,27 +6,32 @@ computer interface types, supporting both the ComputerHandler protocol and the
Computer library interface.
"""

from computer import Computer as cuaComputer

from .base import AsyncComputerHandler
from .cua import cuaComputerHandler
from .custom import CustomComputerHandler
from computer import Computer as cuaComputer


def is_agent_computer(computer):
    """Check if the given computer is a ComputerHandler or CUA Computer."""
    return isinstance(computer, AsyncComputerHandler) or \
        isinstance(computer, cuaComputer) or \
        (isinstance(computer, dict)) #and "screenshot" in computer)
    return (
        isinstance(computer, AsyncComputerHandler)
        or isinstance(computer, cuaComputer)
        or (isinstance(computer, dict))
    )  # and "screenshot" in computer)


async def make_computer_handler(computer):
    """
    Create a computer handler from a computer interface.

    Args:
        computer: Either a ComputerHandler instance, Computer instance, or dict of functions

    Returns:
        ComputerHandler: A computer handler instance

    Raises:
        ValueError: If the computer type is not supported
    """
@@ -38,4 +43,4 @@ async def make_computer_handler(computer):
        return computer_handler
    if isinstance(computer, dict):
        return CustomComputerHandler(computer)
    raise ValueError(f"Unsupported computer type: {type(computer)}")
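Per the dict branch above, a plain mapping of callables is wrapped in CustomComputerHandler. A usage sketch (the screenshot stub is an assumption about the minimum a custom handler needs):

```python
# Usage sketch for make_computer_handler with a dict of functions.
import asyncio


async def demo():
    async def screenshot() -> str:
        return ""  # a base64-encoded PNG would go here

    handler = await make_computer_handler({"screenshot": screenshot})
    print(type(handler).__name__)  # CustomComputerHandler


asyncio.run(demo())
```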
@@ -2,69 +2,78 @@
Base computer interface protocol for agent interactions.
"""

from typing import Protocol, Literal, List, Dict, Any, Union, Optional, runtime_checkable
from typing import (
    Any,
    Dict,
    List,
    Literal,
    Optional,
    Protocol,
    Union,
    runtime_checkable,
)


@runtime_checkable
class AsyncComputerHandler(Protocol):
    """Protocol defining the interface for computer interactions."""

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        ...

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        ...

    async def screenshot(self) -> str:
        """Take a screenshot and return as base64 string."""
        ...

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        ...

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        ...

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        ...

    async def type(self, text: str) -> None:
        """Type text."""
        ...

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        ...

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        ...

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press key combination."""
        ...

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along specified path."""
        ...

    async def get_current_url(self) -> str:
        """Get current URL (for browser environments)."""
        ...

    # ==== Anthropic Action Space ====

    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        ...

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        ...
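Because AsyncComputerHandler is a structural Protocol, any class providing these coroutine methods satisfies it without inheriting from it. A partial test double, sketched (only a few of the protocol's methods are shown, so a full runtime isinstance check against the protocol would require implementing the rest):

```python
# Minimal in-memory stand-in for a few AsyncComputerHandler methods,
# handy for unit tests that don't need a real VM.
class FakeComputer:
    async def get_environment(self):
        return "linux"

    async def get_dimensions(self):
        return (1920, 1080)

    async def screenshot(self):
        return ""  # would be a base64-encoded PNG in a real handler

    async def click(self, x, y, button="left"):
        print(f"click({x}, {y}, {button})")
```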
||||
@@ -3,24 +3,27 @@ Computer handler implementation for OpenAI computer-use-preview protocol.
"""

import base64
from typing import Dict, List, Any, Literal, Union, Optional
from .base import AsyncComputerHandler
from typing import Any, Dict, List, Literal, Optional, Union

from computer import Computer

from .base import AsyncComputerHandler

class cuaComputerHandler(AsyncComputerHandler):
"""Computer handler that implements the Computer protocol using the computer interface."""

def __init__(self, cua_computer: Computer):
"""Initialize with a computer interface (from tool schema)."""
self.cua_computer = cua_computer
self.interface = None

async def _initialize(self):
if hasattr(self.cua_computer, '_initialized') and not self.cua_computer._initialized:
if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
await self.cua_computer.run()
self.interface = self.cua_computer.interface

# ==== Computer-Use-Preview Action Space ====

# ==== Computer-Use-Preview Action Space ====

async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
@@ -32,13 +35,13 @@ class cuaComputerHandler(AsyncComputerHandler):
assert self.interface is not None
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]

async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode('utf-8')

return base64.b64encode(screenshot_bytes).decode("utf-8")

async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
assert self.interface is not None
@@ -49,34 +52,35 @@ class cuaComputerHandler(AsyncComputerHandler):
else:
# Default to left click for unknown buttons
await self.interface.left_click(x, y)

async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
assert self.interface is not None
await self.interface.double_click(x, y)

async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
assert self.interface is not None
await self.interface.move_cursor(x, y)
await self.interface.scroll(scroll_x, scroll_y)

async def type(self, text: str) -> None:
"""Type text."""
assert self.interface is not None
await self.interface.type_text(text)

async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
assert self.interface is not None
import asyncio

await asyncio.sleep(ms / 1000.0)

async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
assert self.interface is not None
await self.interface.move_cursor(x, y)

async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
assert self.interface is not None
@@ -87,38 +91,38 @@ class cuaComputerHandler(AsyncComputerHandler):
else:
# Handle key combinations
await self.interface.hotkey(*keys)

async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
assert self.interface is not None
if not path:
return

# Start drag from first point
start = path[0]
await self.interface.mouse_down(start["x"], start["y"])

# Move through path
for point in path[1:]:
await self.interface.move_cursor(point["x"], point["y"])

# End drag at last point
end = path[-1]
await self.interface.mouse_up(end["x"], end["y"])

async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
# This would need to be implemented based on the specific browser interface
# For now, return empty string
return ""

# ==== Anthropic Computer Action Space ====
# ==== Anthropic Computer Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
assert self.interface is not None
await self.interface.mouse_down(x, y, button="left")

async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
assert self.interface is not None
await self.interface.mouse_up(x, y, button="left")
await self.interface.mouse_up(x, y, button="left")
@@ -3,47 +3,49 @@ Custom computer handler implementation that accepts a dictionary of functions.
"""

import base64
from typing import Dict, List, Any, Literal, Union, Optional, Callable
from PIL import Image
import io
from typing import Any, Callable, Dict, List, Literal, Optional, Union

from PIL import Image

from .base import AsyncComputerHandler

class CustomComputerHandler(AsyncComputerHandler):
"""Computer handler that implements the Computer protocol using a dictionary of custom functions."""

def __init__(self, functions: Dict[str, Callable]):
"""
Initialize with a dictionary of functions.

Args:
functions: Dictionary where keys are method names and values are callable functions.
Only 'screenshot' is required, all others are optional.

Raises:
ValueError: If required 'screenshot' function is not provided.
"""
if 'screenshot' not in functions:
if "screenshot" not in functions:
raise ValueError("'screenshot' function is required in functions dictionary")

self.functions = functions
self._last_screenshot_size: Optional[tuple[int, int]] = None

async def _call_function(self, func, *args, **kwargs):
"""
Call a function, handling both async and sync functions.

Args:
func: The function to call
*args: Positional arguments to pass to the function
**kwargs: Keyword arguments to pass to the function

Returns:
The result of the function call
"""
import asyncio
import inspect

if callable(func):
if inspect.iscoroutinefunction(func):
return await func(*args, **kwargs)
@@ -51,14 +53,14 @@ class CustomComputerHandler(AsyncComputerHandler):
return func(*args, **kwargs)
else:
return func

async def _get_value(self, attribute: str):
"""
Get value for an attribute, checking both 'get_{attribute}' and '{attribute}' keys.

Args:
attribute: The attribute name to look for

Returns:
The value from the functions dict, called if callable, returned directly if not
"""
@@ -66,20 +68,20 @@ class CustomComputerHandler(AsyncComputerHandler):
get_key = f"get_{attribute}"
if get_key in self.functions:
return await self._call_function(self.functions[get_key])

# Check for '{attribute}'

# Check for '{attribute}'
if attribute in self.functions:
return await self._call_function(self.functions[attribute])

return None

def _to_b64_str(self, img: Union[bytes, Image.Image, str]) -> str:
"""
Convert image to base64 string.

Args:
img: Image as bytes, PIL Image, or base64 string

Returns:
str: Base64 encoded image string
"""
@@ -88,43 +90,43 @@ class CustomComputerHandler(AsyncComputerHandler):
return img
elif isinstance(img, bytes):
# Raw bytes
return base64.b64encode(img).decode('utf-8')
return base64.b64encode(img).decode("utf-8")
elif isinstance(img, Image.Image):
# PIL Image
buffer = io.BytesIO()
img.save(buffer, format='PNG')
return base64.b64encode(buffer.getvalue()).decode('utf-8')
img.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode("utf-8")
else:
raise ValueError(f"Unsupported image type: {type(img)}")

# ==== Computer-Use-Preview Action Space ====

# ==== Computer-Use-Preview Action Space ====

async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
result = await self._get_value('environment')
result = await self._get_value("environment")
if result is None:
return "linux"
assert result in ["windows", "mac", "linux", "browser"]
return result # type: ignore
return result # type: ignore

async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
result = await self._get_value('dimensions')
result = await self._get_value("dimensions")
if result is not None:
return result # type: ignore

return result # type: ignore

# Fallback: use last screenshot size if available
if not self._last_screenshot_size:
await self.screenshot()
assert self._last_screenshot_size is not None, "Failed to get screenshot size"

return self._last_screenshot_size

async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
result = await self._call_function(self.functions['screenshot'])
b64_str = self._to_b64_str(result) # type: ignore

result = await self._call_function(self.functions["screenshot"])
b64_str = self._to_b64_str(result) # type: ignore

# Try to extract dimensions for fallback use
try:
if isinstance(result, Image.Image):
@@ -136,74 +138,75 @@ class CustomComputerHandler(AsyncComputerHandler):
except Exception:
# If we can't get dimensions, that's okay
pass

return b64_str

async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
if 'click' in self.functions:
await self._call_function(self.functions['click'], x, y, button)
if "click" in self.functions:
await self._call_function(self.functions["click"], x, y, button)
# No-op if not implemented

async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
if 'double_click' in self.functions:
await self._call_function(self.functions['double_click'], x, y)
if "double_click" in self.functions:
await self._call_function(self.functions["double_click"], x, y)
# No-op if not implemented

async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
if 'scroll' in self.functions:
await self._call_function(self.functions['scroll'], x, y, scroll_x, scroll_y)
if "scroll" in self.functions:
await self._call_function(self.functions["scroll"], x, y, scroll_x, scroll_y)
# No-op if not implemented

async def type(self, text: str) -> None:
"""Type text."""
if 'type' in self.functions:
await self._call_function(self.functions['type'], text)
if "type" in self.functions:
await self._call_function(self.functions["type"], text)
# No-op if not implemented

async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
if 'wait' in self.functions:
await self._call_function(self.functions['wait'], ms)
if "wait" in self.functions:
await self._call_function(self.functions["wait"], ms)
else:
# Default implementation
import asyncio

await asyncio.sleep(ms / 1000.0)

async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
if 'move' in self.functions:
await self._call_function(self.functions['move'], x, y)
if "move" in self.functions:
await self._call_function(self.functions["move"], x, y)
# No-op if not implemented

async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
if 'keypress' in self.functions:
await self._call_function(self.functions['keypress'], keys)
if "keypress" in self.functions:
await self._call_function(self.functions["keypress"], keys)
# No-op if not implemented

async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
if 'drag' in self.functions:
await self._call_function(self.functions['drag'], path)
if "drag" in self.functions:
await self._call_function(self.functions["drag"], path)
# No-op if not implemented

async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
if 'get_current_url' in self.functions:
return await self._get_value('current_url') # type: ignore
if "get_current_url" in self.functions:
return await self._get_value("current_url") # type: ignore
return "" # Default fallback

async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
if 'left_mouse_down' in self.functions:
await self._call_function(self.functions['left_mouse_down'], x, y)
if "left_mouse_down" in self.functions:
await self._call_function(self.functions["left_mouse_down"], x, y)
# No-op if not implemented

async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
if 'left_mouse_up' in self.functions:
await self._call_function(self.functions['left_mouse_up'], x, y)
if "left_mouse_up" in self.functions:
await self._call_function(self.functions["left_mouse_up"], x, y)
# No-op if not implemented
@@ -3,47 +3,56 @@ Decorators for agent - agent_loop decorator
"""

from typing import List, Optional

from .types import AgentConfigInfo

# Global registry
_agent_configs: List[AgentConfigInfo] = []

def register_agent(models: str, priority: int = 0):
"""
Decorator to register an AsyncAgentConfig class.

Args:
models: Regex pattern to match supported models
priority: Priority for agent selection (higher = more priority)
"""

def decorator(agent_class: type):
# Validate that the class implements AsyncAgentConfig protocol
if not hasattr(agent_class, 'predict_step'):
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_step method")
if not hasattr(agent_class, 'predict_click'):
raise ValueError(f"Agent class {agent_class.__name__} must implement predict_click method")
if not hasattr(agent_class, 'get_capabilities'):
raise ValueError(f"Agent class {agent_class.__name__} must implement get_capabilities method")

if not hasattr(agent_class, "predict_step"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement predict_step method"
)
if not hasattr(agent_class, "predict_click"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement predict_click method"
)
if not hasattr(agent_class, "get_capabilities"):
raise ValueError(
f"Agent class {agent_class.__name__} must implement get_capabilities method"
)

# Register the agent config
config_info = AgentConfigInfo(
agent_class=agent_class,
models_regex=models,
priority=priority
agent_class=agent_class, models_regex=models, priority=priority
)
_agent_configs.append(config_info)

# Sort by priority (highest first)
_agent_configs.sort(key=lambda x: x.priority, reverse=True)

return agent_class

return decorator

def get_agent_configs() -> List[AgentConfigInfo]:
"""Get all registered agent configs"""
return _agent_configs.copy()

def find_agent_config(model: str) -> Optional[AgentConfigInfo]:
"""Find the best matching agent config for a model"""
for config_info in _agent_configs:
@@ -12,7 +12,7 @@ Components:
Usage:
# Run the server and UI
python -m agent.human_tool

# Or run components separately
python -m agent.human_tool.server # API server only
python -m agent.human_tool.ui # UI only
@@ -21,9 +21,4 @@ Usage:
from .server import CompletionQueue, completion_queue
from .ui import HumanCompletionUI, create_ui

__all__ = [
"CompletionQueue",
"completion_queue",
"HumanCompletionUI",
"create_ui"
]
__all__ = ["CompletionQueue", "completion_queue", "HumanCompletionUI", "create_ui"]

@@ -8,6 +8,7 @@ with a Gradio UI for human interaction.

import gradio as gr
from fastapi import FastAPI

from .server import app as fastapi_app
from .ui import create_ui

@@ -18,6 +19,7 @@ gradio_demo = create_ui()
CUSTOM_PATH = "/gradio"
app = gr.mount_gradio_app(fastapi_app, gradio_demo, path=CUSTOM_PATH)

# Add a redirect from root to Gradio UI
@fastapi_app.get("/")
async def redirect_to_ui():
@@ -25,14 +27,16 @@ async def redirect_to_ui():
return {
"message": "Human Completion Server is running",
"ui_url": "/gradio",
"api_docs": "/docs"
"api_docs": "/docs",
}

if __name__ == "__main__":
import uvicorn

print("🚀 Starting Human-in-the-Loop Completion Server...")
print("📊 API Server: http://localhost:8002")
print("🎨 Gradio UI: http://localhost:8002/gradio")
print("📚 API Docs: http://localhost:8002/docs")

uvicorn.run(app, host="0.0.0.0", port=8002)
@@ -1,9 +1,9 @@
import asyncio
import uuid
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from enum import Enum
from typing import Any, Dict, List, Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
@@ -49,7 +49,7 @@ class CompletionQueue:
self._queue: Dict[str, CompletionCall] = {}
self._pending_order: List[str] = []
self._lock = asyncio.Lock()

async def add_completion(self, messages: List[Dict[str, Any]], model: str) -> str:
"""Add a completion call to the queue."""
async with self._lock:
@@ -59,42 +59,47 @@ class CompletionQueue:
messages=messages,
model=model,
status=CompletionStatus.PENDING,
created_at=datetime.now()
created_at=datetime.now(),
)
self._queue[call_id] = completion_call
self._pending_order.append(call_id)
return call_id

async def get_pending_calls(self) -> List[Dict[str, Any]]:
"""Get all pending completion calls."""
async with self._lock:
pending_calls = []
for call_id in self._pending_order:
if call_id in self._queue and self._queue[call_id].status == CompletionStatus.PENDING:
if (
call_id in self._queue
and self._queue[call_id].status == CompletionStatus.PENDING
):
call = self._queue[call_id]
pending_calls.append({
"id": call.id,
"model": call.model,
"created_at": call.created_at.isoformat(),
"messages": call.messages
})
pending_calls.append(
{
"id": call.id,
"model": call.model,
"created_at": call.created_at.isoformat(),
"messages": call.messages,
}
)
return pending_calls

async def get_call_status(self, call_id: str) -> Optional[Dict[str, Any]]:
"""Get the status of a specific completion call."""
async with self._lock:
if call_id not in self._queue:
return None

call = self._queue[call_id]
result = {
"id": call.id,
"status": call.status.value,
"created_at": call.created_at.isoformat(),
"model": call.model,
"messages": call.messages
"messages": call.messages,
}

if call.completed_at:
result["completed_at"] = call.completed_at.isoformat()
if call.response:
@@ -103,69 +108,74 @@ class CompletionQueue:
result["tool_calls"] = call.tool_calls
if call.error:
result["error"] = call.error

return result

async def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:

async def complete_call(
self,
call_id: str,
response: Optional[str] = None,
tool_calls: Optional[List[Dict[str, Any]]] = None,
) -> bool:
"""Mark a completion call as completed with a response or tool calls."""
async with self._lock:
if call_id not in self._queue:
return False

call = self._queue[call_id]
if call.status != CompletionStatus.PENDING:
return False

call.status = CompletionStatus.COMPLETED
call.completed_at = datetime.now()
call.response = response
call.tool_calls = tool_calls

# Remove from pending order
if call_id in self._pending_order:
self._pending_order.remove(call_id)

return True

async def fail_call(self, call_id: str, error: str) -> bool:
"""Mark a completion call as failed with an error."""
async with self._lock:
if call_id not in self._queue:
return False

call = self._queue[call_id]
if call.status != CompletionStatus.PENDING:
return False

call.status = CompletionStatus.FAILED
call.completed_at = datetime.now()
call.error = error

# Remove from pending order
if call_id in self._pending_order:
self._pending_order.remove(call_id)

return True

async def wait_for_completion(self, call_id: str, timeout: float = 300.0) -> Optional[str]:
"""Wait for a completion call to be completed and return the response."""
start_time = asyncio.get_event_loop().time()

while True:
status = await self.get_call_status(call_id)
if not status:
return None

if status["status"] == CompletionStatus.COMPLETED.value:
return status.get("response")
elif status["status"] == CompletionStatus.FAILED.value:
raise Exception(f"Completion failed: {status.get('error', 'Unknown error')}")

# Check timeout
if asyncio.get_event_loop().time() - start_time > timeout:
await self.fail_call(call_id, "Timeout waiting for human response")
raise TimeoutError("Timeout waiting for human response")

# Wait a bit before checking again
await asyncio.sleep(0.5)

@@ -204,9 +214,7 @@ async def get_status(call_id: str):
async def complete_call(call_id: str, response: CompletionResponse):
"""Complete a call with a human response."""
success = await completion_queue.complete_call(
call_id,
response=response.response,
tool_calls=response.tool_calls
call_id, response=response.response, tool_calls=response.tool_calls
)
if success:
return {"status": "success", "message": "Call completed"}
@@ -219,7 +227,9 @@ async def fail_call(call_id: str, error: Dict[str, str]):
"""Mark a call as failed."""
success = await completion_queue.fail_call(call_id, error.get("error", "Unknown error"))
if not success:
raise HTTPException(status_code=404, detail="Completion call not found or already completed")
raise HTTPException(
status_code=404, detail="Completion call not found or already completed"
)
return {"status": "failed"}

@@ -231,4 +241,5 @@ async def root():

if __name__ == "__main__":
import uvicorn

uvicorn.run(app, host="0.0.0.0", port=8002)
@@ -1,14 +1,17 @@
import gradio as gr
import json
import time
from typing import List, Dict, Any, Optional
from datetime import datetime
import requests
from .server import completion_queue
import base64
import io
import json
import time
from datetime import datetime
from typing import Any, Dict, List, Optional

import gradio as gr
import requests
from PIL import Image

from .server import completion_queue

class HumanCompletionUI:
def __init__(self, server_url: str = "http://localhost:8002"):
self.server_url = server_url
@@ -20,7 +23,7 @@ class HumanCompletionUI:
self.current_button: str = "left"
self.current_scroll_x: int = 0
self.current_scroll_y: int = -120

def format_messages_for_chatbot(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Format messages for display in gr.Chatbot with type='messages'."""
formatted = []
@@ -28,7 +31,7 @@ class HumanCompletionUI:
role = msg.get("role", "user")
content = msg.get("content", "")
tool_calls = msg.get("tool_calls", [])

# Handle different content formats
if isinstance(content, list):
# Multi-modal content - can include text and images
@@ -55,7 +58,7 @@ class HumanCompletionUI:
else:
# For URL images, create gr.Image with URL
formatted_content.append(gr.Image(value=image_url))

# Determine final content format
if len(formatted_content) == 1:
content = formatted_content[0]
@@ -63,28 +66,28 @@ class HumanCompletionUI:
content = formatted_content
else:
content = "[Empty content]"

# Ensure role is valid for Gradio Chatbot
if role not in ["user", "assistant"]:
role = "assistant" if role == "system" else "user"

# Invert roles for better display in human UI context
# (what the AI says becomes "user", what human should respond becomes "assistant")
if role == "user":
role = "assistant"
else:
role = "user"

# Add the main message if it has content
if content and str(content).strip():
formatted.append({"role": role, "content": content})

# Handle tool calls - create separate messages for each tool call
if tool_calls:
for tool_call in tool_calls:
function_name = tool_call.get("function", {}).get("name", "unknown")
arguments_str = tool_call.get("function", {}).get("arguments", "{}")

try:
# Parse arguments to format them nicely
arguments = json.loads(arguments_str)
@@ -92,18 +95,20 @@ class HumanCompletionUI:
except json.JSONDecodeError:
# If parsing fails, use the raw string
formatted_args = arguments_str

# Create a formatted message for the tool call
tool_call_content = f"```json\n{formatted_args}\n```"

formatted.append({
"role": role,
"content": tool_call_content,
"metadata": {"title": f"🛠️ Used {function_name}"}
})

formatted.append(
{
"role": role,
"content": tool_call_content,
"metadata": {"title": f"🛠️ Used {function_name}"},
}
)

return formatted

def get_pending_calls(self) -> List[Dict[str, Any]]:
"""Get pending calls from the server."""
try:
@@ -113,38 +118,39 @@ class HumanCompletionUI:
except Exception as e:
print(f"Error fetching pending calls: {e}")
return []

def complete_call_with_response(self, call_id: str, response: str) -> bool:
"""Complete a call with a text response."""
try:
response_data = {"response": response}
response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False

def complete_call_with_tool_calls(self, call_id: str, tool_calls: List[Dict[str, Any]]) -> bool:
"""Complete a call with tool calls."""
try:
response_data = {"tool_calls": tool_calls}
response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False

def complete_call(self, call_id: str, response: Optional[str] = None, tool_calls: Optional[List[Dict[str, Any]]] = None) -> bool:

def complete_call(
self,
call_id: str,
response: Optional[str] = None,
tool_calls: Optional[List[Dict[str, Any]]] = None,
) -> bool:
"""Complete a call with either a response or tool calls."""
try:
response_data = {}
@@ -152,25 +158,23 @@ class HumanCompletionUI:
response_data["response"] = response
if tool_calls:
response_data["tool_calls"] = tool_calls

response_obj = requests.post(
f"{self.server_url}/complete/{call_id}",
json=response_data,
timeout=10
f"{self.server_url}/complete/{call_id}", json=response_data, timeout=10
)
response_obj.raise_for_status()
return True
except requests.RequestException as e:
print(f"Error completing call: {e}")
return False

def get_last_image_from_messages(self, messages: List[Dict[str, Any]]) -> Optional[Any]:
"""Extract the last image from the messages for display above conversation."""
last_image = None

for msg in reversed(messages): # Start from the last message
content = msg.get("content", "")

if isinstance(content, list):
for item in reversed(content): # Get the last image in the message
if item.get("type") == "image_url":
@@ -189,13 +193,13 @@ class HumanCompletionUI:
else:
# For URL images, return the URL
return image_url

return last_image

def refresh_pending_calls(self):
"""Refresh the list of pending calls."""
pending_calls = self.get_pending_calls()

if not pending_calls:
return (
gr.update(choices=["latest"], value="latest"), # dropdown
@@ -205,27 +209,27 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)

# Sort pending calls by created_at to get oldest first
sorted_calls = sorted(pending_calls, key=lambda x: x.get("created_at", ""))

# Create choices for dropdown
choices = [("latest", "latest")] # Add "latest" option first

for call in sorted_calls:
call_id = call["id"]
model = call.get("model", "unknown")
created_at = call.get("created_at", "")
# Format timestamp
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
dt = datetime.fromisoformat(created_at.replace("Z", "+00:00"))
time_str = dt.strftime("%H:%M:%S")
except:
time_str = created_at

choice_label = f"{call_id[:8]}... ({model}) - {time_str}"
choices.append((choice_label, call_id))

# Default to "latest" which shows the oldest pending conversation
selected_call_id = "latest"
if selected_call_id == "latest" and sorted_calls:
@@ -239,7 +243,7 @@ class HumanCompletionUI:
conversation = []
self.current_call_id = None
self.last_image = None

return (
gr.update(choices=choices, value="latest"),
gr.update(value=self.last_image),
@@ -248,7 +252,7 @@ class HumanCompletionUI:
gr.update(visible=True), # click_actions_group visible when there is a call
gr.update(visible=True), # actions_group visible when there is a call
)

def on_call_selected(self, selected_choice):
"""Handle when a call is selected from the dropdown."""
if not selected_choice:
@@ -259,7 +263,7 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)

pending_calls = self.get_pending_calls()
if not pending_calls:
return (
@@ -269,7 +273,7 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)

# Handle "latest" option
if selected_choice == "latest":
# Sort calls by created_at to get oldest first
@@ -284,17 +288,17 @@ class HumanCompletionUI:
if call_id_short in selected_choice:
call_id = call["id"]
break

if not call_id:
return (
gr.update(value=None), # no image
gr.update(value=[]), # empty chatbot
gr.update(interactive=False)
gr.update(interactive=False),
)

# Find the selected call
selected_call = next((c for c in pending_calls if c["id"] == call_id), None)

if not selected_call:
return (
gr.update(value=None), # no image
@@ -303,12 +307,12 @@ class HumanCompletionUI:
gr.update(visible=False), # click_actions_group hidden
gr.update(visible=False), # actions_group hidden
)

conversation = self.format_messages_for_chatbot(selected_call.get("messages", []))
self.current_call_id = call_id
# Get the last image from messages
self.last_image = self.get_last_image_from_messages(selected_call.get("messages", []))

return (
gr.update(value=self.last_image),
gr.update(value=conversation),
@@ -316,110 +320,111 @@ class HumanCompletionUI:
gr.update(visible=True), # click_actions_group visible
gr.update(visible=True), # actions_group visible
)

def submit_response(self, response_text: str):
"""Submit a text response to the current call."""
if not self.current_call_id:
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ No call selected") # status
gr.update(value="❌ No call selected"), # status
)

if not response_text.strip():
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ Response cannot be empty") # status
gr.update(value="❌ Response cannot be empty"), # status
)

success = self.complete_call_with_response(self.current_call_id, response_text)

if success:
status_msg = "✅ Response submitted successfully!"
return (
gr.update(value=""), # clear response text
gr.update(value=status_msg) # status
gr.update(value=status_msg), # status
)
else:
return (
gr.update(value=response_text), # keep response text
gr.update(value="❌ Failed to submit response") # status
gr.update(value="❌ Failed to submit response"), # status
)

def submit_action(self, action_type: str, **kwargs) -> str:
"""Submit a computer action as a tool call."""
if not self.current_call_id:
return "❌ No call selected"

import uuid

# Create tool call structure
action_data = {"type": action_type, **kwargs}
tool_call = {
"id": f"call_{uuid.uuid4().hex[:24]}",
"type": "function",
"function": {
"name": "computer",
"arguments": json.dumps(action_data)
}
"function": {"name": "computer", "arguments": json.dumps(action_data)},
}

success = self.complete_call_with_tool_calls(self.current_call_id, [tool_call])

if success:
return f"✅ {action_type.capitalize()} action submitted as tool call"
else:
return f"❌ Failed to submit {action_type} action"

def submit_click_action(self, x: int, y: int, action_type: str = "click", button: str = "left") -> str:

def submit_click_action(
self, x: int, y: int, action_type: str = "click", button: str = "left"
) -> str:
"""Submit a coordinate-based action."""
if action_type == "click":
return self.submit_action(action_type, x=x, y=y, button=button)
else:
return self.submit_action(action_type, x=x, y=y)

def submit_type_action(self, text: str) -> str:
"""Submit a type action."""
return self.submit_action("type", text=text)

def submit_hotkey_action(self, keys: str) -> str:
"""Submit a hotkey action."""
return self.submit_action("keypress", keys=keys)

def submit_wait_action(self) -> str:
"""Submit a wait action with no kwargs."""
return self.submit_action("wait")

def submit_description_click(self, description: str, action_type: str = "click", button: str = "left") -> str:

def submit_description_click(
self, description: str, action_type: str = "click", button: str = "left"
) -> str:
"""Submit a description-based action."""
if action_type == "click":
return self.submit_action(action_type, element_description=description, button=button)
else:
return self.submit_action(action_type, element_description=description)

def wait_for_pending_calls(self, max_seconds: float = 10.0, check_interval: float = 0.2):
"""Wait for pending calls to appear or until max_seconds elapsed.

This method loops and checks for pending calls at regular intervals,
returning as soon as a pending call is found or the maximum wait time is reached.

Args:
max_seconds: Maximum number of seconds to wait
check_interval: How often to check for pending calls (in seconds)
"""
import time

start_time = time.time()

while time.time() - start_time < max_seconds:
# Check if there are any pending calls
pending_calls = self.get_pending_calls()
if pending_calls:
# Found pending calls, return immediately
return self.refresh_pending_calls()

# Wait before checking again
time.sleep(check_interval)

# Max wait time reached, return current state
return self.refresh_pending_calls()
@@ -427,79 +432,73 @@ class HumanCompletionUI:
def create_ui():
"""Create the Gradio interface."""
ui_handler = HumanCompletionUI()

with gr.Blocks(title="Human-in-the-Loop Agent Tool", fill_width=True) as demo:
gr.Markdown("# 🤖 Human-in-the-Loop Agent Tool")
gr.Markdown("Review AI conversation requests and provide human responses.")

with gr.Row():
with gr.Column(scale=2):
with gr.Group():
screenshot_image = gr.Image(
label="Interactive Screenshot",
interactive=False,
height=600
label="Interactive Screenshot", interactive=False, height=600
)

# Action type selection for image clicks (wrapped for visibility control)
with gr.Group(visible=False) as click_actions_group:
with gr.Row():
action_type_radio = gr.Dropdown(
label="Interactive Action",
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down", "scroll"],
choices=[
"click",
"double_click",
"move",
"left_mouse_up",
"left_mouse_down",
"scroll",
],
value="click",
scale=2
scale=2,
)
action_button_radio = gr.Dropdown(
label="Button",
choices=["left", "right", "wheel", "back", "forward"],
value="left",
visible=True,
scale=1
scale=1,
)
scroll_x_input = gr.Number(
label="scroll_x",
value=0,
visible=False,
scale=1
label="scroll_x", value=0, visible=False, scale=1
)
scroll_y_input = gr.Number(
label="scroll_y",
value=-120,
visible=False,
scale=1
label="scroll_y", value=-120, visible=False, scale=1
)

conversation_chatbot = gr.Chatbot(
label="Conversation",
type="messages",
height=500,
show_copy_button=True
label="Conversation", type="messages", height=500, show_copy_button=True
)

with gr.Column(scale=1):
with gr.Group():
call_dropdown = gr.Dropdown(
label="Select a pending conversation request",
choices=["latest"],
interactive=True,
value="latest"
value="latest",
)
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
status_display = gr.Textbox(
label="Status",
interactive=False,
value="Ready to receive requests..."
label="Status", interactive=False, value="Ready to receive requests..."
)

with gr.Group():
response_text = gr.Textbox(
label="Message",
lines=3,
placeholder="Enter your message here..."
label="Message", lines=3, placeholder="Enter your message here..."
)
submit_btn = gr.Button("📤 Submit Message", variant="primary", interactive=False)

submit_btn = gr.Button(
"📤 Submit Message", variant="primary", interactive=False
)

# Action Accordions (wrapped for visibility control)
with gr.Group(visible=False) as actions_group:
with gr.Tabs():
@@ -507,58 +506,73 @@ def create_ui():
with gr.Group():
description_text = gr.Textbox(
label="Element Description",
placeholder="e.g., 'Privacy and security option in left sidebar'"
placeholder="e.g., 'Privacy and security option in left sidebar'",
)
with gr.Row():
description_action_type = gr.Dropdown(
label="Action",
choices=["click", "double_click", "move", "left_mouse_up", "left_mouse_down"],
value="click"
choices=[
"click",
"double_click",
"move",
"left_mouse_up",
"left_mouse_down",
],
value="click",
)
description_button = gr.Dropdown(
label="Button",
choices=["left", "right", "wheel", "back", "forward"],
value="left"
value="left",
)
description_submit_btn = gr.Button("Submit Click Action")

with gr.Tab("📝 Type Action"):
with gr.Group():
type_text = gr.Textbox(
label="Text to Type",
placeholder="Enter text to type..."
label="Text to Type", placeholder="Enter text to type..."
)
type_submit_btn = gr.Button("Submit Type")

with gr.Tab("⌨️ Keypress Action"):
with gr.Group():
keypress_text = gr.Textbox(
label="Keys",
placeholder="e.g., ctrl+c, alt+tab"
label="Keys", placeholder="e.g., ctrl+c, alt+tab"
)
keypress_submit_btn = gr.Button("Submit Keypress")

with gr.Tab("🧰 Misc Actions"):
with gr.Group():
misc_action_dropdown = gr.Dropdown(
label="Action",
choices=["wait"],
value="wait"
label="Action", choices=["wait"], value="wait"
)
misc_submit_btn = gr.Button("Submit Action")

# Event handlers
refresh_btn.click(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

call_dropdown.change(
fn=ui_handler.on_call_selected,
inputs=[call_dropdown],
outputs=[screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

def handle_image_click(evt: gr.SelectData):
if evt.index is not None:
x, y = evt.index
@@ -568,31 +582,44 @@ def create_ui():
sx_i = int(ui_handler.current_scroll_x or 0)
sy_i = int(ui_handler.current_scroll_y or 0)
# Submit a scroll action with x,y position and scroll deltas
result = ui_handler.submit_action("scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i)
result = ui_handler.submit_action(
"scroll", x=x, y=y, scroll_x=sx_i, scroll_y=sy_i
)
else:
result = ui_handler.submit_click_action(x, y, action_type, button)
ui_handler.wait_for_pending_calls()
return result
return "No coordinates selected"

screenshot_image.select(
fn=handle_image_click,
outputs=[status_display]
).then(
screenshot_image.select(fn=handle_image_click, outputs=[status_display]).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

# Response submission
submit_btn.click(
fn=ui_handler.submit_response,
inputs=[response_text],
outputs=[response_text, status_display]
outputs=[response_text, status_display],
).then(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

# Toggle visibility of controls based on action type
def toggle_action_controls(action_type):
# Button visible only for click
@@ -603,59 +630,63 @@ def create_ui():
# Update state
ui_handler.current_action_type = action_type or "click"
return button_vis, scroll_x_vis, scroll_y_vis

action_type_radio.change(
fn=toggle_action_controls,
inputs=[action_type_radio],
outputs=[action_button_radio, scroll_x_input, scroll_y_input]
outputs=[action_button_radio, scroll_x_input, scroll_y_input],
)

# Keep other control values in ui_handler state
def on_button_change(val):
ui_handler.current_button = (val or "left")
action_button_radio.change(
fn=on_button_change,
inputs=[action_button_radio]
)
ui_handler.current_button = val or "left"

action_button_radio.change(fn=on_button_change, inputs=[action_button_radio])

def on_scroll_x_change(val):
try:
ui_handler.current_scroll_x = int(val) if val is not None else 0
except Exception:
ui_handler.current_scroll_x = 0
scroll_x_input.change(
fn=on_scroll_x_change,
inputs=[scroll_x_input]
)

scroll_x_input.change(fn=on_scroll_x_change, inputs=[scroll_x_input])

def on_scroll_y_change(val):
try:
ui_handler.current_scroll_y = int(val) if val is not None else 0
except Exception:
ui_handler.current_scroll_y = 0
scroll_y_input.change(
fn=on_scroll_y_change,
inputs=[scroll_y_input]
)

scroll_y_input.change(fn=on_scroll_y_change, inputs=[scroll_y_input])

type_submit_btn.click(
fn=ui_handler.submit_type_action,
inputs=[type_text],
outputs=[status_display]
fn=ui_handler.submit_type_action, inputs=[type_text], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

keypress_submit_btn.click(
fn=ui_handler.submit_hotkey_action,
inputs=[keypress_text],
outputs=[status_display]
fn=ui_handler.submit_hotkey_action, inputs=[keypress_text], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

def handle_description_submit(description, action_type, button):
if description:
result = ui_handler.submit_description_click(description, action_type, button)
@@ -666,12 +697,19 @@ def create_ui():
description_submit_btn.click(
fn=handle_description_submit,
inputs=[description_text, description_action_type, description_button],
outputs=[status_display]
outputs=[status_display],
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

# Misc action handler
def handle_misc_submit(selected_action):
if selected_action == "wait":
@@ -681,20 +719,32 @@ def create_ui():
return f"Unsupported misc action: {selected_action}"

misc_submit_btn.click(
fn=handle_misc_submit,
inputs=[misc_action_dropdown],
outputs=[status_display]
fn=handle_misc_submit, inputs=[misc_action_dropdown], outputs=[status_display]
).then(
fn=ui_handler.wait_for_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

# Load initial data
demo.load(
fn=ui_handler.refresh_pending_calls,
outputs=[call_dropdown, screenshot_image, conversation_chatbot, submit_btn, click_actions_group, actions_group]
outputs=[
call_dropdown,
screenshot_image,
conversation_chatbot,
submit_btn,
click_actions_group,
actions_group,
],
)

return demo
@@ -8,21 +8,22 @@ Exports:
- run_full_dataset(dataset, ...)
- MCPComputerAgent
"""

import time
from typing import Any, Optional

from agent.computers import is_agent_computer
from datasets import load_dataset, Dataset
from hud.datasets import Task, run_dataset
from datasets import Dataset, load_dataset
from hud import trace
from hud.datasets import Task, run_dataset

from .agent import MCPComputerAgent

# ---------------------------------------------------------------------------
# Single-task runner
# ---------------------------------------------------------------------------

async def run_single_task(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -47,24 +48,20 @@ async def run_single_task(

# Load dataset and pick a sample
if isinstance(dataset, str):
dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
dataset = load_dataset(dataset, split="train") # type: ignore[arg-type]
elif isinstance(dataset, list):
dataset = dataset
else:
dataset = dataset["train"]

sample_task = dataset[task_id] # type: ignore[index]
task_prompt = sample_task.get("prompt", f"Task {sample_task.get('id', 0)}") # type: ignore[attr-defined]

# Filter any existing Computer tools
# The eval framework will add its own Computer tool per task
if tools:
tools = [
tool
for tool in tools
if not is_agent_computer(tool)
]

tools = [tool for tool in tools if not is_agent_computer(tool)]

with trace(name=task_prompt):
task = Task(**sample_task) # type: ignore[arg-type]

@@ -87,13 +84,14 @@ async def run_single_task(
)
print(f"Running: {task_prompt}")
result = await agent.run(task, max_steps=10)
print(f"✅ Reward: {getattr(result, 'reward')}")
print(f"✅ Reward: {result.reward}")

# ---------------------------------------------------------------------------
# Full-dataset runner
# ---------------------------------------------------------------------------

async def run_full_dataset(
dataset: str | Dataset | list[dict[str, Any]],
*,
@@ -121,9 +119,9 @@ async def run_full_dataset(

# Run with our MCP-based agent class.
if isinstance(dataset, str):
dataset_name = dataset.split('/')[-1]
dataset_name = dataset.split("/")[-1]
job_name = job_name or f"Evaluation {dataset_name}"
dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
dataset = load_dataset(dataset, split=split) # type: ignore[arg-type]
else:
dataset_name = "custom"
job_name = job_name or f"Evaluation {time.strftime('%H:%M %Y-%m-%d')}"
@@ -131,12 +129,8 @@ async def run_full_dataset(
# Filter any existing Computer tools
# The eval framework will add its own Computer tool per task
if tools:
tools = [
tool
for tool in tools
if not is_agent_computer(tool)
]

tools = [tool for tool in tools if not is_agent_computer(tool)]

# Execute evaluation
return await run_dataset(
name=job_name,
@@ -170,4 +164,4 @@ __all__ = [
"run_single_task",
"run_full_dataset",
"MCPComputerAgent",
]
]
@@ -9,26 +9,26 @@ Key differences from the OpenAI OperatorAgent variant:
- Planning is executed via `ComputerAgent.run(messages)`.
- The first yielded result per step is returned as the agent response.
"""

from __future__ import annotations

import base64
import io
import uuid
from pathlib import Path
from typing import Any, ClassVar, Optional

import hud
import mcp.types as types
from agent.agent import ComputerAgent as BaseComputerAgent
from agent.callbacks import PromptInstructionsCallback
from agent.callbacks.trajectory_saver import TrajectorySaverCallback
from agent.computers import is_agent_computer
from agent.responses import make_failed_tool_call_items
from hud.agents import MCPAgent
from hud.tools.computer.settings import computer_settings
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace

from agent.responses import make_failed_tool_call_items
from agent.computers import is_agent_computer
from PIL import Image
import mcp.types as types
import hud
import uuid
import base64
from pathlib import Path


class MCPComputerAgent(MCPAgent):
@@ -114,8 +114,10 @@ class MCPComputerAgent(MCPAgent):
        self.last_screenshot_b64 = None

        buffer = io.BytesIO()
        Image.new('RGB', (self.metadata["display_width"], self.metadata["display_height"])).save(buffer, format='PNG')
        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        Image.new("RGB", (self.metadata["display_width"], self.metadata["display_height"])).save(
            buffer, format="PNG"
        )
        self.last_screenshot_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Ensure a computer shim is present so width/height/environment are known
        computer_shim = {
@@ -128,12 +130,8 @@ class MCPComputerAgent(MCPAgent):
        }
        agent_tools: list[Any] = [computer_shim]
        if tools:
            agent_tools.extend([
                tool
                for tool in tools
                if not is_agent_computer(tool)
            ])
            agent_tools.extend([tool for tool in tools if not is_agent_computer(tool)])

        agent_kwargs = {
            "model": self.model,
            "trajectory_dir": trajectory_dir,
@@ -150,9 +148,7 @@ class MCPComputerAgent(MCPAgent):
            "telemetry_enabled": telemetry_enabled,
        }

        self.computer_agent = BaseComputerAgent(
            **agent_kwargs
        )
        self.computer_agent = BaseComputerAgent(**agent_kwargs)

    async def get_system_messages(self) -> list[Any]:
        """Create initial messages.
@@ -161,9 +157,7 @@ class MCPComputerAgent(MCPAgent):
        """
        return []

    async def format_blocks(
        self, blocks: list[types.ContentBlock]
    ) -> list[dict[str, Any]]:
    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
        """
        Format blocks for OpenAI input format.

@@ -200,42 +194,49 @@ class MCPComputerAgent(MCPAgent):

        # Call the ComputerAgent LLM API
        async for result in self.computer_agent.run(messages):  # type: ignore[arg-type]
            items = result['output']
            items = result["output"]
            if not items or tool_calls:
                break

            for item in items:
                if item['type'] in ['reasoning', 'message', 'computer_call', 'function_call', 'function_call_output']:
                if item["type"] in [
                    "reasoning",
                    "message",
                    "computer_call",
                    "function_call",
                    "function_call_output",
                ]:
                    agent_result.append(item)

                # Add messages to output text
                if item['type'] == 'reasoning':
                if item["type"] == "reasoning":
                    output_text.extend(
                        f"Reasoning: {summary['text']}"
                        for summary in item['summary']
                        f"Reasoning: {summary['text']}" for summary in item["summary"]
                    )
                elif item['type'] == 'message':
                    if isinstance(item['content'], list):
                elif item["type"] == "message":
                    if isinstance(item["content"], list):
                        output_text.extend(
                            item['text']
                            for item in item['content']
                            if item['type'] == 'output_text'
                            item["text"]
                            for item in item["content"]
                            if item["type"] == "output_text"
                        )
                    elif isinstance(item['content'], str):
                        output_text.append(item['content'])
                    elif isinstance(item["content"], str):
                        output_text.append(item["content"])

                # If we get a tool call, we're not done
                if item['type'] == 'computer_call':
                if item["type"] == "computer_call":
                    id = item["call_id"]
                    tool_calls.append(MCPToolCall(
                        name="openai_computer",
                        arguments=item["action"],
                        id=id,
                    ))
                    tool_calls.append(
                        MCPToolCall(
                            name="openai_computer",
                            arguments=item["action"],
                            id=id,
                        )
                    )
                    is_done = False
                    self.tool_call_inputs[id] = agent_result
                    break

            # if we have tool calls, we should exit the loop
            if tool_calls:
                break
@@ -247,7 +248,7 @@ class MCPComputerAgent(MCPAgent):
            tool_calls=tool_calls,
            done=is_done,
        )

    def _log_image(self, image_b64: str):
        callbacks = self.computer_agent.callbacks
        for callback in callbacks:
@@ -257,9 +258,7 @@ class MCPComputerAgent(MCPAgent):
                callback._save_artifact("screenshot_after", image_bytes)

    async def format_tool_results(
        self,
        tool_calls: list[MCPToolCall],
        tool_results: list[MCPToolResult]
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[dict[str, Any]]:
        """Extract latest screenshot from tool results in dict form.

@@ -274,45 +273,60 @@ class MCPComputerAgent(MCPAgent):
                previous_output = self.previous_output.copy() or []

                # First we need to remove any pending computer_calls from the end of previous_output
                while previous_output and previous_output[-1]['type'] == 'computer_call':
                while previous_output and previous_output[-1]["type"] == "computer_call":
                    previous_output.pop()
                messages.extend(previous_output)

                # If the call is a 'response', don't add the result
                if call.name == 'response':
                if call.name == "response":
                    continue
                # Otherwise, if we have a result, we should add it to the messages
                content = [
                    { "type": "input_text", "text": content.text } if isinstance(content, types.TextContent)
                    else { "type": "input_image", "image_url": f"data:image/png;base64,{content.data}" } if isinstance(content, types.ImageContent)
                    else { "type": "input_text", "text": "" }
                    (
                        {"type": "input_text", "text": content.text}
                        if isinstance(content, types.TextContent)
                        else (
                            {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{content.data}",
                            }
                            if isinstance(content, types.ImageContent)
                            else {"type": "input_text", "text": ""}
                        )
                    )
                    for content in result.content
                ]
                messages.append({
                    "role": "user",
                    "content": content,
                })
                messages.append(
                    {
                        "role": "user",
                        "content": content,
                    }
                )

                continue

            # Add the assistant's computer call
            messages.extend(self.tool_call_inputs[call.id])

            if result.isError:
                error_text = "".join([
                    content.text
                    for content in result.content
                    if isinstance(content, types.TextContent)
                ])
                error_text = "".join(
                    [
                        content.text
                        for content in result.content
                        if isinstance(content, types.TextContent)
                    ]
                )

                # Replace computer call with failed tool call
                messages.pop()
                messages.extend(make_failed_tool_call_items(
                    tool_name=call.name,
                    tool_kwargs=call.arguments or {},
                    error_message=error_text,
                    call_id=call.id,
                ))
                messages.extend(
                    make_failed_tool_call_items(
                        tool_name=call.name,
                        tool_kwargs=call.arguments or {},
                        error_message=error_text,
                        call_id=call.id,
                    )
                )
            else:
                # Get the latest screenshot
                screenshots = [
@@ -325,23 +339,27 @@ class MCPComputerAgent(MCPAgent):
                if screenshots:
                    self._log_image(screenshots[0])
                    self.last_screenshot_b64 = screenshots[0]
                    messages.append({
                        "type": "computer_call_output",
                        "call_id": call.id,
                        "output": {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{screenshots[0]}"
                        },
                    })
                    messages.append(
                        {
                            "type": "computer_call_output",
                            "call_id": call.id,
                            "output": {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{screenshots[0]}",
                            },
                        }
                    )
                else:
                    # Otherwise, replace computer call with failed tool call
                    messages.pop()
                    messages.extend(make_failed_tool_call_items(
                        tool_name=call.name,
                        tool_kwargs=call.arguments or {},
                        error_message="No screenshots returned.",
                        call_id=call.id,
                    ))
                    messages.extend(
                        make_failed_tool_call_items(
                            tool_name=call.name,
                            tool_kwargs=call.arguments or {},
                            error_message="No screenshots returned.",
                            call_id=call.id,
                        )
                    )

        return messages

@@ -7,30 +7,33 @@ OpenAI-like response blocks. We intentionally only support a single-step call
by consuming the first yielded result from `ComputerAgent.run()`.
"""

import traceback
import time
import traceback
import uuid
from typing import Any, Dict, List, Optional

from agent.agent import ComputerAgent as BaseComputerAgent
from agent.callbacks import PromptInstructionsCallback
from hud.tools.computer.settings import computer_settings
from PIL import Image
from hud.agents import OperatorAgent
from hud.tools.computer.settings import computer_settings

# OpenAI Responses typed models (required)
from openai.types.responses import (
    Response,
    ResponseComputerToolCall,
    ResponseInputParam,
    ResponseOutputItem,
    ResponseComputerToolCall,
    ResponseOutputMessage,
    ResponseOutputText,
    ResponseReasoningItem,
    ResponseUsage,
)
from PIL import Image

def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> List[ResponseOutputItem]:
def _map_agent_output_to_openai_blocks(
    output_items: List[Dict[str, Any]],
) -> List[ResponseOutputItem]:
    """Map our agent output items to OpenAI ResponseOutputItem typed models.

    Only a subset is supported: computer_call, assistant message (text), and reasoning.
@@ -40,14 +43,16 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
    for item in output_items or []:
        t = item.get("type")
        if t == "computer_call":
            comp = ResponseComputerToolCall.model_validate({
                "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
                "type": "computer_call",
                "call_id": item["call_id"],
                "action": item["action"],
                "pending_safety_checks": item.get("pending_safety_checks", []),
                "status": "completed",
            })
            comp = ResponseComputerToolCall.model_validate(
                {
                    "id": item.get("id") or f"cu_{uuid.uuid4().hex}",
                    "type": "computer_call",
                    "call_id": item["call_id"],
                    "action": item["action"],
                    "pending_safety_checks": item.get("pending_safety_checks", []),
                    "status": "completed",
                }
            )
            blocks.append(comp)
            # we will exit early here as the responses api only supports a single step
            break
@@ -55,31 +60,38 @@ def _map_agent_output_to_openai_blocks(output_items: List[Dict[str, Any]]) -> Li
            content_blocks: List[ResponseOutputText] = []
            for c in item.get("content", []) or []:
                content_blocks.append(
                    ResponseOutputText.model_validate({
                        "type": "output_text",
                        "text": c["text"],
                        "annotations": [],
                    })
                    ResponseOutputText.model_validate(
                        {
                            "type": "output_text",
                            "text": c["text"],
                            "annotations": [],
                        }
                    )
                )
            if content_blocks:
                msg = ResponseOutputMessage.model_validate({
                    "id": item.get("id") or f"msg_{uuid.uuid4()}",
                    "type": "message",
                    "role": "assistant",
                    "status": "completed",
                    "content": [ct.model_dump() for ct in content_blocks],
                })
                msg = ResponseOutputMessage.model_validate(
                    {
                        "id": item.get("id") or f"msg_{uuid.uuid4()}",
                        "type": "message",
                        "role": "assistant",
                        "status": "completed",
                        "content": [ct.model_dump() for ct in content_blocks],
                    }
                )
                blocks.append(msg)
        elif t == "reasoning":
            reasoning = ResponseReasoningItem.model_validate({
                "id": item.get("id") or f"rsn_{uuid.uuid4()}",
                "type": "reasoning",
                "summary": item["summary"],
            })
            reasoning = ResponseReasoningItem.model_validate(
                {
                    "id": item.get("id") or f"rsn_{uuid.uuid4()}",
                    "type": "reasoning",
                    "summary": item["summary"],
                }
            )
            blocks.append(reasoning)
        # Unhandled types are ignored
    return blocks


def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
    out: List[Dict[str, Any]] = []
    for it in list(items):
@@ -92,6 +104,7 @@ def _to_plain_dict_list(items: Any) -> List[Dict[str, Any]]:
        out.append(dict(it))  # may raise if not mapping
    return out

class FakeAsyncOpenAI:
    """Minimal fake OpenAI client with only `responses.create` implemented.

@@ -132,10 +145,12 @@ class FakeAsyncOpenAI:
        # Pre-pend instructions message
        effective_input = full_input
        if instructions:
            effective_input = [{
                "role": "user",
                "content": instructions,
            }] + full_input
            effective_input = [
                {
                    "role": "user",
                    "content": instructions,
                }
            ] + full_input

        # Run a single iteration of the ComputerAgent
        agent_result: Optional[Dict[str, Any]] = None
@@ -152,32 +167,43 @@ class FakeAsyncOpenAI:
        blocks_to_cache = full_input + output
        for b in blocks_to_cache:
            bid = getattr(b, "id", None) or f"tmp-{hash(repr(b))}"
            self.blocks_cache[bid] = b  # type: ignore[assignment]
            block_ids.append(bid)
        response_id = agent_result.get("id") or f"fake-{int(time.time()*1000)}"
        self.context_cache[response_id] = block_ids

        try:
            return Response.model_validate({
                "id": response_id,
                "created_at": time.time(),
                "object": "response",
                "model": model,
                "output": output,
                "parallel_tool_calls": False,
                "tool_choice": "auto",
                "tools": [],
                "previous_response_id": previous_response_id,
                "usage": ResponseUsage.model_validate({
                    "input_tokens": usage.get("input_tokens", 0),
                    "output_tokens": usage.get("output_tokens", 0),
                    "total_tokens": usage.get("total_tokens", 0),
                    "input_tokens_details": usage.get("input_tokens_details", { "cached_tokens": 0 }),
                    "output_tokens_details": usage.get("output_tokens_details", { "reasoning_tokens": 0 }),
                }),
            })
            return Response.model_validate(
                {
                    "id": response_id,
                    "created_at": time.time(),
                    "object": "response",
                    "model": model,
                    "output": output,
                    "parallel_tool_calls": False,
                    "tool_choice": "auto",
                    "tools": [],
                    "previous_response_id": previous_response_id,
                    "usage": ResponseUsage.model_validate(
                        {
                            "input_tokens": usage.get("input_tokens", 0),
                            "output_tokens": usage.get("output_tokens", 0),
                            "total_tokens": usage.get("total_tokens", 0),
                            "input_tokens_details": usage.get(
                                "input_tokens_details", {"cached_tokens": 0}
                            ),
                            "output_tokens_details": usage.get(
                                "output_tokens_details", {"reasoning_tokens": 0}
                            ),
                        }
                    ),
                }
            )
        except Exception as e:
            print(f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ", e)
            print(
                f"Error while validating agent response (attempt {attempt + 1}/{max_retries}): ",
                e,
            )
            if attempt == max_retries - 1:
                print(traceback.format_exc())
                raise e
@@ -221,9 +247,15 @@ class ProxyOperatorAgent(OperatorAgent):
        allowed_tools = allowed_tools or ["openai_computer"]

        computer_shim = {
            'screenshot': lambda: Image.new('RGB', (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)),
            'environment': 'linux',
            'dimensions': (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT)
            "screenshot": lambda: Image.new(
                "RGB",
                (computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT),
            ),
            "environment": "linux",
            "dimensions": (
                computer_settings.OPENAI_COMPUTER_WIDTH,
                computer_settings.OPENAI_COMPUTER_HEIGHT,
            ),
        }
        # Build tools ensuring the computer_shim is included
        agent_tools: list[Any] = [computer_shim]
@@ -258,6 +290,7 @@ class ProxyOperatorAgent(OperatorAgent):
            **kwargs,
        )


__all__ = [
    "FakeAsyncOpenAI",
    "ProxyOperatorAgent",