mirror of
https://github.com/trycua/computer.git
synced 2026-01-04 12:30:08 -06:00
Merge branch 'main' into feat/fara-browser-use
This commit is contained in:
29
.github/workflows/docker-publish-cua-linux.yml
vendored
Normal file
29
.github/workflows/docker-publish-cua-linux.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: Build and Publish CUA Linux Container
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- "docker-cua-linux-v*.*.*"
|
||||
paths:
|
||||
- "libs/qemu-docker/linux/**"
|
||||
- ".github/workflows/docker-publish-cua-linux.yml"
|
||||
- ".github/workflows/docker-reusable-publish.yml"
|
||||
pull_request:
|
||||
paths:
|
||||
- "libs/qemu-docker/linux/**"
|
||||
- ".github/workflows/docker-publish-cua-linux.yml"
|
||||
- ".github/workflows/docker-reusable-publish.yml"
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
uses: ./.github/workflows/docker-reusable-publish.yml
|
||||
with:
|
||||
image_name: cua-linux
|
||||
context_dir: libs/qemu-docker/linux
|
||||
dockerfile_path: Dockerfile
|
||||
tag_prefix: docker-cua-linux-v
|
||||
docker_hub_org: trycua
|
||||
secrets:
|
||||
DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}
|
||||
29
.github/workflows/docker-publish-cua-windows.yml
vendored
Normal file
29
.github/workflows/docker-publish-cua-windows.yml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: Build and Publish CUA Windows Container
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- "docker-cua-windows-v*.*.*"
|
||||
paths:
|
||||
- "libs/qemu-docker/windows/**"
|
||||
- ".github/workflows/docker-publish-cua-windows.yml"
|
||||
- ".github/workflows/docker-reusable-publish.yml"
|
||||
pull_request:
|
||||
paths:
|
||||
- "libs/qemu-docker/windows/**"
|
||||
- ".github/workflows/docker-publish-cua-windows.yml"
|
||||
- ".github/workflows/docker-reusable-publish.yml"
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
uses: ./.github/workflows/docker-reusable-publish.yml
|
||||
with:
|
||||
image_name: cua-windows
|
||||
context_dir: libs/qemu-docker/windows
|
||||
dockerfile_path: Dockerfile
|
||||
tag_prefix: docker-cua-windows-v
|
||||
docker_hub_org: trycua
|
||||
secrets:
|
||||
DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}
|
||||
188
.github/workflows/docker-reusable-publish.yml
vendored
188
.github/workflows/docker-reusable-publish.yml
vendored
@@ -39,20 +39,19 @@ jobs:
|
||||
- linux/amd64
|
||||
- linux/arm64
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Prepare platform tag
|
||||
id: platform
|
||||
run: |
|
||||
# Convert platform (e.g., linux/amd64) to a valid tag suffix (e.g., linux-amd64)
|
||||
PLATFORM_TAG=$(echo "${{ matrix.platform }}" | sed 's/\//-/g')
|
||||
echo "tag=${PLATFORM_TAG}" >> $GITHUB_OUTPUT
|
||||
TAG=$(echo "${{ matrix.platform }}" | sed 's/\//-/g')
|
||||
echo "tag=${TAG}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ inputs.docker_hub_org }}
|
||||
@@ -67,7 +66,22 @@ jobs:
|
||||
tags: |
|
||||
type=raw,value=${{ github.sha }}
|
||||
|
||||
- name: Extract metadata (main branch)
|
||||
- name: Build & push digest (PR)
|
||||
if: github.event_name == 'pull_request'
|
||||
id: build-pr
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./${{ inputs.context_dir }}
|
||||
file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
|
||||
push: true
|
||||
platforms: ${{ matrix.platform }}
|
||||
outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
|
||||
labels: ${{ steps.meta-pr.outputs.labels }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
|
||||
cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
|
||||
|
||||
- name: Extract metadata (main)
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
id: meta-main
|
||||
uses: docker/metadata-action@v5
|
||||
@@ -76,7 +90,22 @@ jobs:
|
||||
tags: |
|
||||
type=raw,value=latest
|
||||
|
||||
- name: Extract metadata (semantic version tag)
|
||||
- name: Build & push digest (main)
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
id: build-main
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./${{ inputs.context_dir }}
|
||||
file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
|
||||
push: true
|
||||
platforms: ${{ matrix.platform }}
|
||||
outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
|
||||
labels: ${{ steps.meta-main.outputs.labels }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
|
||||
cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
|
||||
|
||||
- name: Extract metadata (semver)
|
||||
if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
|
||||
id: meta-semver
|
||||
uses: docker/metadata-action@v5
|
||||
@@ -88,68 +117,111 @@ jobs:
|
||||
type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
|
||||
type=raw,value=latest
|
||||
|
||||
- name: Build and push Docker image (PR)
|
||||
if: github.event_name == 'pull_request'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./${{ inputs.context_dir }}
|
||||
file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
|
||||
push: true
|
||||
tags: ${{ steps.meta-pr.outputs.tags }}
|
||||
labels: ${{ steps.meta-pr.outputs.labels }}
|
||||
platforms: ${{ matrix.platform }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
|
||||
cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
|
||||
|
||||
- name: Build and push Docker image (main branch)
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./${{ inputs.context_dir }}
|
||||
file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
|
||||
push: true
|
||||
tags: ${{ steps.meta-main.outputs.tags }}
|
||||
labels: ${{ steps.meta-main.outputs.labels }}
|
||||
platforms: ${{ matrix.platform }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
|
||||
cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
|
||||
|
||||
- name: Build and push Docker image (semantic version tag)
|
||||
- name: Build & push digest (semver)
|
||||
if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
|
||||
id: build-semver
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./${{ inputs.context_dir }}
|
||||
file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
|
||||
push: true
|
||||
tags: ${{ steps.meta-semver.outputs.tags }}
|
||||
labels: ${{ steps.meta-semver.outputs.labels }}
|
||||
platforms: ${{ matrix.platform }}
|
||||
outputs: type=registry,name=${{ inputs.docker_hub_org }}/${{ inputs.image_name }},push-by-digest=true
|
||||
labels: ${{ steps.meta-semver.outputs.labels }}
|
||||
cache-from: |
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
|
||||
type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
|
||||
cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max
|
||||
|
||||
- name: Image digest
|
||||
if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main' || startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
|
||||
- name: Export digest
|
||||
id: export-digest
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "pull_request" ]; then
|
||||
echo "Image pushed with digest ${{ steps.meta-pr.outputs.digest }}"
|
||||
elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
|
||||
echo "Image pushed with digest ${{ steps.meta-semver.outputs.digest }}"
|
||||
else
|
||||
echo "Image pushed with digest ${{ steps.meta-main.outputs.digest }}"
|
||||
fi
|
||||
mkdir -p /tmp/digests
|
||||
digest="${{ steps.build-pr.outputs.digest || steps.build-main.outputs.digest || steps.build-semver.outputs.digest }}"
|
||||
echo "$digest" > "/tmp/digests/${{ steps.platform.outputs.tag }}.txt"
|
||||
|
||||
- name: print image tags
|
||||
- name: Upload digest artifact (unique per platform)
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: digests-${{ steps.platform.outputs.tag }}
|
||||
path: /tmp/digests/*.txt
|
||||
retention-days: 1
|
||||
|
||||
publish-manifest-list:
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- build-and-push
|
||||
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ inputs.docker_hub_org }}
|
||||
password: ${{ secrets.DOCKER_HUB_TOKEN }}
|
||||
|
||||
- name: Extract final metadata (PR)
|
||||
if: github.event_name == 'pull_request'
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
|
||||
tags: |
|
||||
type=ref,event=pr
|
||||
type=sha
|
||||
|
||||
- name: Extract final metadata (main)
|
||||
if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
|
||||
tags: |
|
||||
type=raw,value=latest
|
||||
|
||||
- name: Extract final metadata (semver)
|
||||
if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
|
||||
tags: |
|
||||
type=semver,pattern={{version}},prefix=${{ inputs.tag_prefix }}
|
||||
type=semver,pattern={{major}}.{{minor}},prefix=${{ inputs.tag_prefix }}
|
||||
type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
|
||||
type=raw,value=latest
|
||||
|
||||
- name: Download all digest artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: digests-*
|
||||
path: /tmp/digests
|
||||
merge-multiple: true
|
||||
|
||||
- name: Create & push multi-arch manifest
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "pull_request" ]; then
|
||||
echo "Image tags: ${{ steps.meta-pr.outputs.tags }}"
|
||||
elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
|
||||
echo "Image tags: ${{ steps.meta-semver.outputs.tags }}"
|
||||
else
|
||||
echo "Image tags: ${{ steps.meta-main.outputs.tags }}"
|
||||
fi
|
||||
IMAGE="${{ inputs.docker_hub_org }}/${{ inputs.image_name }}"
|
||||
|
||||
DIGEST_ARGS=""
|
||||
for f in $(find /tmp/digests -type f -name "*.txt"); do
|
||||
d=$(cat "$f")
|
||||
DIGEST_ARGS="$DIGEST_ARGS ${IMAGE}@${d}"
|
||||
done
|
||||
|
||||
echo "Using digests:"
|
||||
echo "$DIGEST_ARGS"
|
||||
|
||||
# Create manifest for each tag produced by metadata-action
|
||||
echo "${DOCKER_METADATA_OUTPUT_JSON}" | jq -r '.tags[]' | while read FULL_TAG; do
|
||||
echo "Creating manifest: $FULL_TAG"
|
||||
docker buildx imagetools create --tag "$FULL_TAG" $DIGEST_ARGS
|
||||
done
|
||||
|
||||
- name: Inspect pushed manifests
|
||||
run: |
|
||||
IMAGE="${{ inputs.docker_hub_org }}/${{ inputs.image_name }}"
|
||||
echo "Inspecting manifests:"
|
||||
|
||||
echo "${DOCKER_METADATA_OUTPUT_JSON}" | jq -r '.tags[]' | while read FULL_TAG; do
|
||||
echo ""
|
||||
echo "Inspecting: $FULL_TAG"
|
||||
docker buildx imagetools inspect "$FULL_TAG"
|
||||
done
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
**/image/setup.iso
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
70
README.md
70
README.md
@@ -1,14 +1,22 @@
|
||||
<div align="center">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" alt="Cua logo" height="150" srcset="img/logo_white.png">
|
||||
<source media="(prefers-color-scheme: light)" alt="Cua logo" height="150" srcset="img/logo_black.png">
|
||||
<img alt="Cua logo" height="150" src="img/logo_black.png">
|
||||
</picture>
|
||||
<a href="https://cua.ai" target="_blank" rel="noopener noreferrer">
|
||||
<picture>
|
||||
<source media="(prefers-color-scheme: dark)" alt="Cua logo" width="150" srcset="img/logo_white.png">
|
||||
<source media="(prefers-color-scheme: light)" alt="Cua logo" width="150" srcset="img/logo_black.png">
|
||||
<img alt="Cua logo" width="500" src="img/logo_black.png">
|
||||
</picture>
|
||||
</a>
|
||||
|
||||
[](#)
|
||||
[](https://discord.com/invite/mVnXXpdE85)
|
||||
<br>
|
||||
<p align="center">Build and deploy AI agents that can reason, plan, and act on any computer</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://cua.ai" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/cua.ai-0ea5e9" alt="cua.ai"></a>
|
||||
<a href="https://discord.com/invite/cua-ai" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Discord-Join%20Server-10b981?logo=discord&logoColor=white" alt="Discord"></a>
|
||||
<a href="https://x.com/trycua" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/twitter/follow/trycua?style=social" alt="Twitter"></a>
|
||||
<a href="https://cua.ai/docs" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/Docs-0ea5e9.svg" alt="Documentation"></a>
|
||||
<br>
|
||||
<a href="https://trendshift.io/repositories/13685" target="_blank"><img src="https://trendshift.io/api/badge/repositories/13685" alt="trycua%2Fcua | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -211,21 +219,22 @@ These are the valid model configurations for `ComputerAgent(model="...")`:
|
||||
|
||||
The following table shows which capabilities are supported by each model:
|
||||
|
||||
| Model | Computer-Use | Grounding | Tools | VLM |
|
||||
| -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: |
|
||||
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ |
|
||||
| [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ |
|
||||
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [UI-TARS-2](https://cua.ai/dashboard/vlm-router) | 🖥️ | 🎯 | 🛠️ | 👁️ |
|
||||
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | |
|
||||
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | |
|
||||
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | |
|
||||
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | |
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | |
|
||||
| Model | Computer-Use | Grounding | Tools | VLM | Cloud |
|
||||
| -------------------------------------------------------------------------------------------------------------------------------- | :----------: | :-------: | :---: | :-: | :---: |
|
||||
| [Claude Sonnet/Haiku](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ | ☁️ |
|
||||
| [Claude Opus](https://docs.claude.com/en/docs/agents-and-tools/tool-use/computer-use-tool#how-to-implement-computer-use) | 🖥️ | 🎯 | 🛠️ | 👁️ | ☁️ |
|
||||
| [OpenAI CU Preview](https://platform.openai.com/docs/models/computer-use-preview) | 🖥️ | 🎯 | | 👁️ | |
|
||||
| [Qwen3 VL](https://huggingface.co/collections/Qwen/qwen3-vl) | 🖥️ | 🎯 | 🛠️ | 👁️ | ☁️ |
|
||||
| [GLM-V](https://huggingface.co/THUDM/glm-4v-9b) | 🖥️ | 🎯 | 🛠️ | 👁️ | |
|
||||
| [Gemini CU Preview](https://ai.google.dev/gemini-api/docs/computer-use) | 🖥️ | 🎯 | | 👁️ | |
|
||||
| [InternVL](https://huggingface.co/OpenGVLab/InternVL3_5-1B) | 🖥️ | 🎯 | 🛠️ | 👁️ | |
|
||||
| [UI-TARS](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) | 🖥️ | 🎯 | 🛠️ | 👁️ | |
|
||||
| [UI-TARS-2](https://cua.ai/dashboard/vlm-router) | 🖥️ | 🎯 | 🛠️ | 👁️ | ☁️ |
|
||||
| [OpenCUA](https://huggingface.co/xlangai/OpenCUA-7B) | | 🎯 | | | |
|
||||
| [GTA](https://huggingface.co/HelloKKMe/GTA1-7B) | | 🎯 | | | |
|
||||
| [Holo](https://huggingface.co/Hcompany/Holo1.5-3B) | | 🎯 | | | |
|
||||
| [Moondream](https://huggingface.co/moondream/moondream3-preview) | | 🎯 | | | |
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser) | | 🎯 | | | |
|
||||
|
||||
**Legend:**
|
||||
|
||||
@@ -233,6 +242,7 @@ The following table shows which capabilities are supported by each model:
|
||||
- 🎯 **Grounding**: UI element detection and click coordinate prediction
|
||||
- 🛠️ **Tools**: Support for function calling beyond screen interaction
|
||||
- 👁️ **VLM**: Vision-language understanding
|
||||
- ☁️ **Cloud**: Supported on Cua VLM
|
||||
|
||||
**Composition Examples:**
|
||||
|
||||
@@ -373,6 +383,20 @@ Learn more in the [SOM documentation](./libs/python/som/README.md).
|
||||
|
||||
## 2025
|
||||
|
||||
### December 2025
|
||||
|
||||
- **Cloud VLM Platform**: Support for Claude Opus, Qwen3 VL 235B, and UI-TARS-2 on Cua VLM cloud infrastructure
|
||||
- **QEMU Container Support**: Native Linux and Windows container execution via QEMU virtualization
|
||||
|
||||
### November 2025
|
||||
|
||||
- **Generic VLM Provider**: Expanded support for custom VLM providers and model configurations
|
||||
- **NeurIPS 2025**: Coverage of computer-use agent research papers and developments ([Blog Post](https://cua.ai/blog/neurips-2025-cua-papers))
|
||||
|
||||
### October 2025
|
||||
|
||||
- **Agent SDK Improvements**: Enhanced model support and configuration options
|
||||
|
||||
### September 2025
|
||||
|
||||
- **Hack the North Competition**: First benchmark-driven hackathon track with guaranteed YC interview prize. Winner achieved 68.3% on OSWorld-Tiny ([Blog Post](https://www.cua.ai/blog/hack-the-north))
|
||||
|
||||
@@ -255,8 +255,8 @@ If there's a feature you need, let us know in [Discord](https://discord.gg/cua-a
|
||||
|
||||
## Need Help?
|
||||
|
||||
- **Documentation**: [https://cua.ai/docs/libraries/cua-cli/commands](https://cua.ai/docs/libraries/cua-cli/commands)
|
||||
- **Installation Guide**: [https://cua.ai/docs/libraries/cua-cli/installation](https://cua.ai/docs/libraries/cua-cli/installation)
|
||||
- **Documentation**: [https://cua.ai/docs/cli-playbook/commands](https://cua.ai/docs/cli-playbook/commands)
|
||||
- **Installation Guide**: [https://cua.ai/docs/cli-playbook](https://cua.ai/docs/cli-playbook)
|
||||
- **Discord Community**: [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
|
||||
|
||||
---
|
||||
|
||||
@@ -4,11 +4,7 @@ description: Supported computer-using agent loops and models
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">
|
||||
Jupyter Notebook
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
An agent can be thought of as a loop - it generates actions, executes them, and repeats until done:
|
||||
|
||||
@@ -3,14 +3,7 @@ title: Customize ComputerAgent
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a
|
||||
href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb"
|
||||
target="_blank"
|
||||
>
|
||||
Jupyter Notebook
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
|
||||
|
||||
@@ -4,11 +4,7 @@ description: Use ComputerAgent with HUD for benchmarking and evaluation
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">
|
||||
Jupyter Notebook
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.
|
||||
|
||||
17
docs/content/docs/agent-sdk/mcp-server/index.mdx
Normal file
17
docs/content/docs/agent-sdk/mcp-server/index.mdx
Normal file
@@ -0,0 +1,17 @@
|
||||
---
|
||||
title: MCP Server
|
||||
description: Run Cua agents through Claude Desktop and other MCP clients
|
||||
---
|
||||
|
||||
The MCP Server exposes Cua agents as tools for [Model Context Protocol](https://modelcontextprotocol.io/) clients like Claude Desktop. This lets you ask Claude to perform computer tasks directly from the chat interface.
|
||||
|
||||
```bash
|
||||
pip install cua-mcp-server
|
||||
```
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Claude Desktop integration** - Use Cua agents directly in Claude's chat
|
||||
- **Multi-client support** - Concurrent sessions with automatic resource management
|
||||
- **Progress reporting** - Real-time updates during task execution
|
||||
- **VM safety** - Runs in sandboxed VMs by default
|
||||
@@ -14,6 +14,7 @@
|
||||
"usage-tracking",
|
||||
"telemetry",
|
||||
"benchmarks",
|
||||
"integrations"
|
||||
"integrations",
|
||||
"mcp-server"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@ async for _ in agent.run("Take a screenshot, analyze the UI, and click on the mo
|
||||
pass
|
||||
```
|
||||
|
||||
### GTA1 + Claude 3.5 Sonnet
|
||||
### GTA1 + Claude 4.5 Sonnet
|
||||
|
||||
Combine state-of-the-art grounding with powerful reasoning:
|
||||
|
||||
|
||||
@@ -25,16 +25,31 @@ model="cua/anthropic/claude-haiku-4.5" # Claude Haiku 4.5 (faster)
|
||||
|
||||
### Anthropic Claude (Computer Use API - BYOK)
|
||||
|
||||
Direct access to Anthropic's Claude models using your own Anthropic API key (BYOK - Bring Your Own Key).
|
||||
Access Anthropic's computer use models directly or through Azure AI Foundry using your own API key (BYOK).
|
||||
|
||||
#### Via Anthropic API
|
||||
|
||||
```python
|
||||
model="anthropic/claude-3-7-sonnet-20250219"
|
||||
model="anthropic/claude-opus-4-20250514"
|
||||
model="anthropic/claude-sonnet-4-20250514"
|
||||
model="anthropic/claude-haiku-4-5-20251001" # Claude Haiku 4.5 (fastest, cost-effective)
|
||||
model="anthropic/claude-sonnet-4-5-20250929" # Claude Sonnet 4.5 (recommended)
|
||||
model="anthropic/claude-opus-4-5-20251101" # Claude Opus 4.5 (most advanced)
|
||||
```
|
||||
|
||||
**Setup:** Set `ANTHROPIC_API_KEY` environment variable with your Anthropic API key.
|
||||
|
||||
#### Via Azure AI Foundry
|
||||
|
||||
```python
|
||||
model="anthropic/claude-haiku-4-5"
|
||||
model="anthropic/claude-sonnet-4-5"
|
||||
model="anthropic/claude-opus-4-5"
|
||||
```
|
||||
|
||||
**Setup:**
|
||||
|
||||
- Set `ANTHROPIC_API_KEY` environment variable with your Azure AI Foundry key
|
||||
- Set `ANTHROPIC_API_BASE` to your Azure endpoint (e.g., `https://<your-resource>.services.ai.azure.com/anthropic`)
|
||||
|
||||
### OpenAI Computer Use Preview (BYOK)
|
||||
|
||||
Direct access to OpenAI's computer use models using your own OpenAI API key (BYOK).
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
title: Commands
|
||||
title: Command Reference
|
||||
description: Complete reference for all CUA CLI commands
|
||||
---
|
||||
|
||||
68
docs/content/docs/cli-playbook/index.mdx
Normal file
68
docs/content/docs/cli-playbook/index.mdx
Normal file
@@ -0,0 +1,68 @@
|
||||
---
|
||||
title: Getting Started
|
||||
description: Install and set up the CUA CLI
|
||||
---
|
||||
|
||||
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
|
||||
import { Callout } from 'fumadocs-ui/components/callout';
|
||||
|
||||
The Cua CLI is a command-line tool for managing your Cua cloud sandboxes. Create, start, stop, and connect to sandboxes directly from your terminal.
|
||||
|
||||
## Installation
|
||||
|
||||
<Tabs items={['macOS / Linux', 'Windows']}>
|
||||
<Tab value="macOS / Linux">
|
||||
```bash
|
||||
curl -LsSf https://cua.ai/cli/install.sh | sh
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="Windows">
|
||||
```powershell
|
||||
powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
This installs [Bun](https://bun.sh) and the CUA CLI. Verify with:
|
||||
|
||||
```bash
|
||||
cua --help
|
||||
```
|
||||
|
||||
## Authentication
|
||||
|
||||
Log in to your CUA account:
|
||||
|
||||
```bash
|
||||
# Browser-based login
|
||||
cua auth login
|
||||
|
||||
# Or with API key
|
||||
cua auth login --api-key sk-your-api-key-here
|
||||
```
|
||||
|
||||
Generate a `.env` file for your project:
|
||||
|
||||
```bash
|
||||
cua auth env
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Create a sandbox
|
||||
cua create --os linux --size small --region north-america
|
||||
|
||||
# List sandboxes
|
||||
cua list
|
||||
|
||||
# Open VNC in browser
|
||||
cua vnc my-sandbox
|
||||
|
||||
# Stop a sandbox
|
||||
cua stop my-sandbox
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Command Reference](/cli-playbook/commands) - Full list of available commands
|
||||
5
docs/content/docs/cli-playbook/meta.json
Normal file
5
docs/content/docs/cli-playbook/meta.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"title": "Cloud CLI",
|
||||
"description": "Command-line interface for CUA Cloud",
|
||||
"pages": ["index", "commands"]
|
||||
}
|
||||
@@ -5,7 +5,7 @@ description: Computer commands and interface methods
|
||||
|
||||
This page describes the set of supported **commands** you can use to control a Cua Computer directly via the Python SDK.
|
||||
|
||||
These commands map to the same actions available in the [Computer Server API Commands Reference](../libraries/computer-server/Commands), and provide low-level, async access to system operations from your agent or automation code.
|
||||
These commands map to the same actions available in the [Computer Server API Commands Reference](/computer-sdk/computer-server/Commands), and provide low-level, async access to system operations from your agent or automation code.
|
||||
|
||||
## Shell Actions
|
||||
|
||||
|
||||
15
docs/content/docs/computer-sdk/computer-server/index.mdx
Normal file
15
docs/content/docs/computer-sdk/computer-server/index.mdx
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
title: Computer Server
|
||||
description: HTTP/WebSocket server for remote computer control
|
||||
---
|
||||
|
||||
The Computer Server is an HTTP and WebSocket server that runs inside each Cua sandbox (VM or container). It exposes APIs for remote computer control - allowing the Computer SDK and agents to execute actions like clicking, typing, taking screenshots, and running commands on the sandboxed environment.
|
||||
|
||||
When you use `Computer(provider_type="cloud")` or any other provider, the Computer SDK communicates with this server running inside the sandbox to execute your automation commands.
|
||||
|
||||
## Key Features
|
||||
|
||||
- **REST API** - Execute commands, take screenshots, manage files
|
||||
- **WebSocket API** - Real-time streaming for continuous interaction
|
||||
- **Cross-platform** - Runs on Linux, macOS, and Windows sandboxes
|
||||
- **Secure** - Isolated inside the sandbox environment
|
||||
4
docs/content/docs/computer-sdk/computer-server/meta.json
Normal file
4
docs/content/docs/computer-sdk/computer-server/meta.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"title": "Computer Server",
|
||||
"pages": ["index", "Commands", "REST-API", "WebSocket-API"]
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
---
|
||||
title: Computer UI (Deprecated)
|
||||
title: Computer UI
|
||||
---
|
||||
|
||||
<Callout type="warn" title="Deprecated">
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
"tracing-api",
|
||||
"sandboxed-python",
|
||||
"custom-computer-handlers",
|
||||
"computer-ui"
|
||||
"computer-ui",
|
||||
"computer-server"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -4,14 +4,7 @@ slug: sandboxed-python
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a
|
||||
href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py"
|
||||
target="_blank"
|
||||
>
|
||||
Python example
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
---
|
||||
title: Computer Tracing API
|
||||
title: Tracing
|
||||
description: Record computer interactions for debugging, training, and analysis
|
||||
---
|
||||
|
||||
# Computer Tracing API
|
||||
# Tracing
|
||||
|
||||
The Computer tracing API provides a powerful way to record computer interactions for debugging, training, analysis, and compliance purposes. Inspired by Playwright's tracing functionality, it offers flexible recording options and standardized output formats.
|
||||
|
||||
|
||||
@@ -75,11 +75,12 @@ pip install -r requirements.txt
|
||||
Create a `.env` file with your API keys:
|
||||
|
||||
```text
|
||||
ANTHROPIC_API_KEY=your-anthropic-api-key
|
||||
ANTHROPIC_API_KEY=your-anthropic-api-key # optional, BYOK. By default, this cookbook uses the CUA VLM Router
|
||||
CUA_API_KEY=sk_cua-api01...
|
||||
CUA_CONTAINER_NAME=m-linux-...
|
||||
```
|
||||
|
||||
Finally, set up your sandbox. Refer to the [quickstart guide](https://cua.ai/docs/get-started/quickstart) for how to set up the computer environment.
|
||||
</Step>
|
||||
|
||||
<Step>
|
||||
|
||||
@@ -19,8 +19,6 @@ import { Code, Terminal } from 'lucide-react';
|
||||
</Card>
|
||||
</div> */}
|
||||
|
||||
---
|
||||
|
||||
## Set Up Your Computer Environment
|
||||
|
||||
Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
|
||||
@@ -43,7 +41,7 @@ You can run your Cua computer in the cloud (recommended for easiest setup), loca
|
||||
**Option 1: Via Website**
|
||||
|
||||
1. Navigate to **Dashboard > Sandboxes > Create Sandbox**
|
||||
2. Create a **Small** sandbox, choosing **Linux**, **Windows**, or **macOS**
|
||||
2. Create a sandbox, choosing **Linux**, **Windows**, or **macOS**
|
||||
3. Note your sandbox name
|
||||
|
||||
**Option 2: Via CLI**
|
||||
@@ -149,6 +147,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
|
||||
```python
|
||||
import os
|
||||
from computer import Computer
|
||||
import asyncio
|
||||
|
||||
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
|
||||
|
||||
@@ -157,42 +156,117 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
|
||||
provider_type="cloud",
|
||||
name="your-sandbox-name" # from CLI or website
|
||||
)
|
||||
await computer.run() # Connect to the sandbox
|
||||
|
||||
async def main():
|
||||
await computer.run() # Connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="Linux on Docker">
|
||||
```python
|
||||
from computer import Computer
|
||||
import asyncio
|
||||
|
||||
computer = Computer(
|
||||
os_type="linux",
|
||||
provider_type="docker",
|
||||
image="trycua/cua-xfce:latest" # or "trycua/cua-ubuntu:latest"
|
||||
)
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
|
||||
async def main():
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="macOS Sandbox">
|
||||
```python
|
||||
from computer import Computer
|
||||
import asyncio
|
||||
|
||||
computer = Computer(
|
||||
os_type="macos",
|
||||
provider_type="lume",
|
||||
name="macos-sequoia-cua:latest"
|
||||
)
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
|
||||
async def main():
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="Windows Sandbox">
|
||||
```python
|
||||
from computer import Computer
|
||||
import asyncio
|
||||
|
||||
computer = Computer(
|
||||
os_type="windows",
|
||||
provider_type="windows_sandbox"
|
||||
)
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
|
||||
async def main():
|
||||
await computer.run() # Launch & connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="Your host desktop">
|
||||
@@ -205,26 +279,32 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
|
||||
Then, use the `Computer` object to connect:
|
||||
```python
|
||||
from computer import Computer
|
||||
import asyncio
|
||||
|
||||
computer = Computer(use_host_computer_server=True)
|
||||
await computer.run() # Connect to the host desktop
|
||||
|
||||
async def main():
|
||||
await computer.run() # Connect to the host desktop
|
||||
# Alternative: If your computer server is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the host desktop
|
||||
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop everything, use stop() instead:
|
||||
# await computer.stop() # Fully stop and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
Once connected, you can perform interactions:
|
||||
```python
|
||||
try:
|
||||
# Take a screenshot of the computer's current display
|
||||
screenshot = await computer.interface.screenshot()
|
||||
# Simulate a left-click at coordinates (100, 100)
|
||||
await computer.interface.left_click(100, 100)
|
||||
# Type "Hello!" into the active application
|
||||
await computer.interface.type_text("Hello!")
|
||||
finally:
|
||||
await computer.close()
|
||||
```
|
||||
|
||||
</Tab>
|
||||
<Tab value="TypeScript">
|
||||
<Callout type="warn" title="TypeScript SDK Deprecated">
|
||||
@@ -318,7 +398,7 @@ Connect to your Cua computer and perform basic interactions, such as taking scre
|
||||
// Type "Hello!" into the active application
|
||||
await computer.interface.typeText("Hello!");
|
||||
} finally {
|
||||
await computer.close();
|
||||
await computer.disconnect();
|
||||
}
|
||||
```
|
||||
|
||||
@@ -351,22 +431,42 @@ Choose how you want to access vision-language models for your agent:
|
||||
**Use the agent with CUA models:**
|
||||
```python
|
||||
import os
|
||||
import asyncio
|
||||
from computer import Computer
|
||||
from agent import ComputerAgent
|
||||
|
||||
os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="cua/anthropic/claude-sonnet-4.5", # CUA-routed model
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0
|
||||
computer = Computer(
|
||||
os_type="linux", # or "windows" or "macos"
|
||||
provider_type="cloud",
|
||||
name="your-sandbox-name" # from CLI or website
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
||||
async def main():
|
||||
await computer.run() # Connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
try:
|
||||
agent = ComputerAgent(
|
||||
model="cua/anthropic/claude-sonnet-4.5", # CUA-routed model
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
||||
|
||||
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Available CUA models:**
|
||||
@@ -375,6 +475,13 @@ Choose how you want to access vision-language models for your agent:
|
||||
- `cua/anthropic/claude-haiku-4.5` - Claude Haiku 4.5 (faster, cost-effective)
|
||||
- `cua/qwen/qwen3-vl-235b` - Qwen3 VL 235B (large-scale vision-language tasks)
|
||||
|
||||
**Available composed models:**
|
||||
- `huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929` - GTA1 grounding + Claude Sonnet 4.5 planning
|
||||
- `huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5` - GTA1 grounding + GPT-5 planning
|
||||
- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o` - UI-TARS grounding + GPT-4o planning
|
||||
- `moondream3+openai/gpt-4o` - Moondream3 grounding + GPT-4o planning
|
||||
|
||||
|
||||
**Benefits:**
|
||||
- Single API key for multiple providers
|
||||
- Cost tracking and optimization
|
||||
@@ -388,6 +495,8 @@ Choose how you want to access vision-language models for your agent:
|
||||
**Use the agent with your provider:**
|
||||
```python
|
||||
import os
|
||||
import asyncio
|
||||
from computer import Computer
|
||||
from agent import ComputerAgent
|
||||
|
||||
# Set your provider API key
|
||||
@@ -395,18 +504,36 @@ Choose how you want to access vision-language models for your agent:
|
||||
# OR
|
||||
os.environ["OPENAI_API_KEY"] = "sk-..." # For OpenAI
|
||||
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-sonnet-4-5-20250929", # Direct provider model
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0
|
||||
computer = Computer(
|
||||
os_type="linux", # or "windows" or "macos"
|
||||
provider_type="cloud",
|
||||
name="your-sandbox-name" # from CLI or website
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
||||
async def main():
|
||||
await computer.run() # Connect to the sandbox
|
||||
# Alternative: If your VM is not running, use start() instead:
|
||||
# await computer.start() # Start and connect to the sandbox
|
||||
|
||||
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
try:
|
||||
agent = ComputerAgent(
|
||||
model="anthropic/claude-sonnet-4-5-20250929", # Direct provider model
|
||||
tools=[computer],
|
||||
max_trajectory_budget=5.0
|
||||
)
|
||||
|
||||
messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
|
||||
|
||||
async for result in agent.run(messages):
|
||||
for item in result["output"]:
|
||||
if item["type"] == "message":
|
||||
print(item["content"][0]["text"])
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
# Alternative: If you want to fully stop the VM, use stop() instead:
|
||||
# await computer.stop() # Fully stop VM and disconnect
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Supported providers:**
|
||||
|
||||
@@ -4,55 +4,46 @@ title: Introduction
|
||||
|
||||
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';
|
||||
|
||||
<div className="rounded-lg border bg-card text-card-foreground shadow-sm px-4 py-2 mb-6">
|
||||
Cua is an open-source framework for building **Computer-Use Agents** - AI systems that see,
|
||||
understand, and interact with desktop applications through vision and action, just like humans do.
|
||||
<div className="not-prose -mt-2 mb-6">
|
||||
<p className="text-fd-primary font-semibold text-sm mb-1">Welcome</p>
|
||||
<h1 className="text-3xl font-bold tracking-tight md:text-4xl">Welcome to Cua</h1>
|
||||
</div>
|
||||
|
||||
## Why Cua?
|
||||
**Cua** is an open-source framework for building, deploying and evaluating Computer-Use Agents - AI systems that autonomously interact with computer interfaces by understanding visual elements and executing actions. Cua provides SDKs for easy integration with 100+ vision-language models (VLMs), supporting everything from simple task automation to complex multi-step workflows across Windows, Linux, and macOS environments.
|
||||
|
||||
Cua gives you everything you need to automate any desktop application without brittle selectors or APIs.
|
||||
|
||||
Some highlights include:
|
||||
|
||||
- **Model flexibility** - Connect to 100+ LLM providers through liteLLM's standard interface. Use models from Anthropic, OpenAI, Google, and more - or run them locally with Ollama, Hugging Face, or MLX.
|
||||
- **Composed agents** - Mix and match grounding models with planning models for optimal performance. Use specialized models like GTA, OpenCUA, or OmniParser for UI element detection paired with powerful reasoning models like Claude or GPT-4.
|
||||
- **Cross-platform sandboxes** - Run agents safely in isolated environments. Choose from Docker containers, macOS VMs with Lume, Windows Sandbox, or deploy to Cua Cloud with production-ready infrastructure.
|
||||
- **Computer SDK** - Control any application with a PyAutoGUI-like API. Click, type, scroll, take screenshots, manage windows, read/write files - everything you need for desktop automation.
|
||||
- **Agent SDK** - Build autonomous agents with trajectory tracing, prompt caching, cost tracking, and budget controls. Test agents on industry-standard benchmarks like OSWorld-Verified with one line of code.
|
||||
- **Human-in-the-loop** - Pause agent execution and await user input or approval before continuing. Use the `human/human` model string to let humans control the agent directly.
|
||||
- **Production essentials** - Ship reliable agents with built-in PII anonymization, cost tracking, trajectory logging, and integration with observability platforms like Laminar and HUD.
|
||||
|
||||
## What can you build?
|
||||
|
||||
- RPA automation that works with any application - even legacy software without APIs.
|
||||
- Form-filling agents that handle complex multi-step web workflows.
|
||||
- Testing automation that adapts to UI changes without brittle selectors.
|
||||
- Data extraction from desktop applications and document processing.
|
||||
- Cross-application workflows that combine multiple tools and services.
|
||||
- Research agents that browse, read, and synthesize information from the web.
|
||||
|
||||
Explore real-world examples in our [blog posts](https://cua.ai/blog).
|
||||
|
||||
## Get started
|
||||
|
||||
Follow the [Quickstart guide](/docs/get-started/quickstart) for step-by-step setup with Python or TypeScript.
|
||||
|
||||
If you're new to computer-use agents, check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
|
||||
|
||||
<div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8">
|
||||
<Card icon={<Rocket />} href="/get-started/quickstart" title="Quickstart">
|
||||
Get up and running in 3 steps with Python or TypeScript.
|
||||
</Card>
|
||||
<Card icon={<Zap />} href="/agent-sdk/agent-loops" title="Agent Loops">
|
||||
Learn how agents work and how to build your own.
|
||||
</Card>
|
||||
<Card icon={<BookOpen />} href="/computer-sdk/computers" title="Computer SDK">
|
||||
Control desktop applications with the Computer SDK.
|
||||
</Card>
|
||||
<Card icon={<Monitor />} href="/example-usecases/form-filling" title="Example Use Cases">
|
||||
See Cua in action with real-world examples.
|
||||
</Card>
|
||||
<div className="not-prose relative rounded-xl overflow-hidden my-8 w-full">
|
||||
<img src="/docs/img/hero.png" alt="Cua" className="w-full h-auto rounded-xl" />
|
||||
</div>
|
||||
|
||||
We can't wait to see what you build with Cua ✨
|
||||
## What is a Computer-Use Agent?
|
||||
|
||||
Computer-Use Agents (CUAs) are AI systems that can autonomously interact with computer interfaces through visual understanding and action execution. They work by capturing screenshots, feeding them to a vision-language model (VLM), and letting the model determine the next action to take - such as clicking, typing, or scrolling - in a continuous loop until the task is complete.
|
||||
|
||||
## What is a Computer-Use Sandbox?
|
||||
|
||||
Computer-Use Sandboxes are isolated, controlled environments where AI agents can safely interact with computer interfaces. They provide a secure execution space in which agents can perform actions (such as clicking, typing, and running code), test automation workflows, and learn from interactions without affecting production systems.
|
||||
|
||||
## Key Features
|
||||
|
||||
With the **Computer SDK**, you can:
|
||||
- Automate **Windows, Linux, and macOS** sandboxes with a consistent, pyautogui-like API
|
||||
- Create & manage sandboxes locally or using **Cua Cloud**
|
||||
|
||||
With the **Agent SDK**, you can:
|
||||
- Run computer-use models with a consistent schema
|
||||
- Benchmark on **OSWorld-Verified**, **SheetBench-V2**, and **ScreenSpot**
|
||||
- Combine UI grounding models with any LLM using **composed agents**
|
||||
- Use **100+ models** via API or local inference (Claude, GPT-4, Gemini, Ollama, MLX)
|
||||
|
||||
## Get Started
|
||||
|
||||
Follow the [Quickstart guide](/get-started/quickstart) for step-by-step setup with Python or TypeScript.
|
||||
|
||||
Check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
|
||||
|
||||
<div className="grid grid-cols-2 md:grid-cols-4 gap-2 mt-4 text-sm">
|
||||
<Card icon={<Rocket className="w-4 h-4" />} href="/get-started/quickstart" title="Quickstart" />
|
||||
<Card icon={<Zap className="w-4 h-4" />} href="/agent-sdk/agent-loops" title="Agent Loops" />
|
||||
<Card icon={<BookOpen className="w-4 h-4" />} href="/computer-sdk/computers" title="Computer SDK" />
|
||||
<Card icon={<Monitor className="w-4 h-4" />} href="/example-usecases/form-filling" title="Examples" />
|
||||
</div>
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
---
|
||||
title: Agent
|
||||
description: Reference for the current version of the Agent library.
|
||||
pypi: cua-agent
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/agent
|
||||
---
|
||||
|
||||
The Agent library provides the ComputerAgent class and tools for building AI agents that automate workflows on Cua Computers.
|
||||
|
||||
## Agent Loops
|
||||
|
||||
See the [Agent Loops](../agent-sdk/agent-loops) documentation for how agents process information and take actions.
|
||||
|
||||
## Chat History
|
||||
|
||||
See the [Chat History](../agent-sdk/chat-history) documentation for managing conversational context and turn-by-turn interactions.
|
||||
|
||||
## Callbacks
|
||||
|
||||
See the [Callbacks](../agent-sdk/callbacks) documentation for extending and customizing agent behavior with custom hooks.
|
||||
@@ -1,24 +0,0 @@
|
||||
---
|
||||
title: Computer Server
|
||||
description: Reference for the current version of the Computer Server library.
|
||||
pypi: cua-computer-server
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/computer-server
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a
|
||||
href="https://github.com/trycua/cua/blob/main/notebooks/computer_server_nb.ipynb"
|
||||
target="_blank"
|
||||
>
|
||||
Jupyter Notebook
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
The Computer Server API reference documentation is currently under development.
|
||||
|
||||
## Overview
|
||||
|
||||
The Computer Server provides WebSocket and REST API endpoints for remote computer control and automation.
|
||||
@@ -1,23 +0,0 @@
|
||||
---
|
||||
title: Computer
|
||||
description: Reference for the current version of the Computer library.
|
||||
pypi: cua-computer
|
||||
npm: '@trycua/computer'
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/computer
|
||||
- https://github.com/trycua/cua/tree/main/libs/typescript/computer
|
||||
---
|
||||
|
||||
The Computer library provides a Computer class for controlling and automating containers running the Computer Server.
|
||||
|
||||
## Connecting to Computers
|
||||
|
||||
See the [Cua Computers](../computer-sdk/computers) documentation for how to connect to different computer types (cloud, local, or host desktop).
|
||||
|
||||
## Computer Commands
|
||||
|
||||
See the [Commands](../computer-sdk/commands) documentation for all supported commands and interface methods (Shell, Mouse, Keyboard, File System, etc.).
|
||||
|
||||
## Sandboxed Python Functions
|
||||
|
||||
See the [Sandboxed Python](../computer-sdk/sandboxed-python) documentation for running Python functions securely in isolated environments on a remote Cua Computer.
|
||||
@@ -1,13 +0,0 @@
|
||||
---
|
||||
title: Core
|
||||
description: Reference for the current version of the Core library.
|
||||
pypi: cua-core
|
||||
npm: '@trycua/core'
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/core
|
||||
- https://github.com/trycua/cua/tree/main/libs/typescript/core
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
The Core library provides foundational utilities and shared functionality across the CUA ecosystem.
|
||||
@@ -1,58 +0,0 @@
|
||||
---
|
||||
title: Cua CLI
|
||||
description: Command-line interface for managing Cua cloud sandboxes and authentication
|
||||
---
|
||||
|
||||
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
|
||||
|
||||
The Cua CLI is a command-line tool that provides an intuitive interface for managing your Cua cloud sandboxes and authentication. It offers a streamlined workflow for creating, managing, and connecting to cloud sandboxes.
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Authentication Management**: Secure login with browser-based OAuth flow
|
||||
- **Sandbox Lifecycle**: Create, start, stop, restart, and delete cloud sandboxes
|
||||
- **Quick Access**: Direct links to VNC and playground interfaces
|
||||
- **Cross-Platform**: Works on macOS, Linux, and Windows
|
||||
- **Environment Integration**: Automatic `.env` file generation
|
||||
|
||||
## Quick Example
|
||||
|
||||
```bash
|
||||
# Install the CLI (installs Bun + CUA CLI)
|
||||
curl -LsSf https://cua.ai/cli/install.sh | sh
|
||||
|
||||
# Login to your CUA account
|
||||
cua auth login
|
||||
|
||||
# Create a new Linux sandbox
|
||||
cua sb create --os linux --size small --region north-america
|
||||
|
||||
# List your sandboxes
|
||||
cua sb list
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
### Development Workflow
|
||||
|
||||
- Quickly spin up cloud sandboxes for testing
|
||||
- Manage multiple sandboxes across different regions
|
||||
- Integrate with CI/CD pipelines
|
||||
|
||||
### Team Collaboration
|
||||
|
||||
- Share sandbox configurations and access
|
||||
- Standardize development environments
|
||||
- Quick onboarding for new team members
|
||||
|
||||
### Automation
|
||||
|
||||
- Script sandbox provisioning and management
|
||||
- Integrate with deployment workflows
|
||||
- Automate environment setup
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Install the CLI](/libraries/cua-cli/installation)
|
||||
- [Learn about available commands](/libraries/cua-cli/commands)
|
||||
- [Get started with the quickstart guide](/get-started/quickstart#cli-quickstart)
|
||||
@@ -1,130 +0,0 @@
|
||||
---
|
||||
title: Installation
|
||||
description: Install the CUA CLI on your system
|
||||
---
|
||||
|
||||
import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
|
||||
import { Callout } from 'fumadocs-ui/components/callout';
|
||||
|
||||
## Quick Install
|
||||
|
||||
The fastest way to install the CUA CLI is using our installation scripts:
|
||||
|
||||
<Tabs items={['macOS / Linux', 'Windows']}>
|
||||
<Tab value="macOS / Linux">```bash curl -LsSf https://cua.ai/cli/install.sh | sh ```</Tab>
|
||||
<Tab value="Windows">
|
||||
```powershell powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
These scripts will automatically:
|
||||
|
||||
1. Install [Bun](https://bun.sh) (a fast JavaScript runtime)
|
||||
2. Install the CUA CLI via `bun add -g @trycua/cli`
|
||||
|
||||
<Callout type="info">
|
||||
The installation scripts will automatically detect your system and install the appropriate binary
|
||||
to your PATH.
|
||||
</Callout>
|
||||
|
||||
## Alternative: Install with Bun
|
||||
|
||||
You can also install the CLI directly using Bun:
|
||||
|
||||
```bash
|
||||
# Install Bun if you don't have it
|
||||
curl -fsSL https://bun.sh/install | bash
|
||||
|
||||
# Install CUA CLI
|
||||
bun add -g @trycua/cli
|
||||
```
|
||||
|
||||
<Callout type="info">
|
||||
Using Bun provides faster installation and better performance compared to npm. If you don't have
|
||||
Bun installed, the first command will install it for you.
|
||||
</Callout>
|
||||
|
||||
## Verify Installation
|
||||
|
||||
After installation, verify the CLI is working:
|
||||
|
||||
```bash
|
||||
cua --help
|
||||
```
|
||||
|
||||
You should see the CLI help output with available commands.
|
||||
|
||||
## First Time Setup
|
||||
|
||||
After installation, you'll need to authenticate with your CUA account:
|
||||
|
||||
```bash
|
||||
# Login with browser-based OAuth flow
|
||||
cua auth login
|
||||
|
||||
# Or provide your API key directly
|
||||
cua auth login --api-key sk-your-api-key-here
|
||||
```
|
||||
|
||||
## Updating
|
||||
|
||||
To update to the latest version:
|
||||
|
||||
<Tabs items={['Script Install', 'npm Install']}>
|
||||
<Tab value="Script Install">
|
||||
Re-run the installation script: ```bash # macOS/Linux curl -LsSf https://cua.ai/cli/install.sh |
|
||||
sh # Windows powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
|
||||
```
|
||||
</Tab>
|
||||
<Tab value="npm Install">```bash npm update -g @trycua/cli ```</Tab>
|
||||
</Tabs>
|
||||
|
||||
## Uninstalling
|
||||
|
||||
<Tabs items={['Script Install', 'npm Install']}>
|
||||
<Tab value="Script Install">
|
||||
Remove the binary from your PATH: ```bash # macOS/Linux rm $(which cua) # Windows # Remove from
|
||||
your PATH or delete the executable ```
|
||||
</Tab>
|
||||
<Tab value="npm Install">```bash npm uninstall -g @trycua/cli ```</Tab>
|
||||
</Tabs>
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Command Not Found
|
||||
|
||||
If you get a "command not found" error after installation:
|
||||
|
||||
1. **Check your PATH**: Make sure the installation directory is in your PATH
|
||||
2. **Restart your terminal**: Close and reopen your terminal/command prompt
|
||||
3. **Manual PATH setup**: Add the installation directory to your PATH manually
|
||||
|
||||
### Permission Issues
|
||||
|
||||
If you encounter permission issues during installation:
|
||||
|
||||
<Tabs items={['macOS / Linux', 'Windows']}>
|
||||
<Tab value="macOS / Linux">
|
||||
Try running with sudo (not recommended for the curl method): ```bash # If using npm sudo npm
|
||||
install -g @trycua/cli ```
|
||||
</Tab>
|
||||
<Tab value="Windows">
|
||||
Run PowerShell as Administrator: ```powershell # Right-click PowerShell and "Run as
|
||||
Administrator" powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
|
||||
```
|
||||
</Tab>
|
||||
</Tabs>
|
||||
|
||||
### Network Issues
|
||||
|
||||
If the installation script fails due to network issues:
|
||||
|
||||
1. **Check your internet connection**
|
||||
2. **Try the npm installation method instead**
|
||||
3. **Check if your firewall is blocking the download**
|
||||
|
||||
## Next Steps
|
||||
|
||||
- [Learn about CLI commands](/libraries/cua-cli/commands)
|
||||
- [Follow the quickstart guide](/get-started/quickstart#cli-quickstart)
|
||||
@@ -1,5 +0,0 @@
|
||||
{
|
||||
"title": "CLI",
|
||||
"description": "Command-line interface for CUA",
|
||||
"pages": ["index", "installation", "commands"]
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
---
|
||||
title: MCP Server
|
||||
description: Reference for the current version of the MCP Server library.
|
||||
pypi: cua-mcp-server
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/mcp-server
|
||||
---
|
||||
|
||||
**cua-mcp-server** is an MCP server for the Computer-Use Agent (CUA), allowing you to run CUA through Claude Desktop or other MCP clients.
|
||||
|
||||
## Features
|
||||
|
||||
- **Multi-Client Support**: Concurrent sessions with automatic resource management
|
||||
- **Progress Reporting**: Real-time progress updates during task execution
|
||||
- **Error Handling**: Robust error recovery with screenshot capture
|
||||
- **Concurrent Execution**: Run multiple tasks in parallel for improved performance
|
||||
- **Session Management**: Automatic cleanup and resource pooling
|
||||
- **LiteLLM Integration**: Support for multiple model providers
|
||||
- **VM Safety**: Default VM execution with optional host system control
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. **Install**: `pip install cua-mcp-server`
|
||||
2. **Configure**: Add to your MCP client configuration
|
||||
3. **Use**: Ask Claude to perform computer tasks
|
||||
|
||||
See the [Installation](/docs/libraries/mcp-server/installation) guide for detailed setup instructions.
|
||||
@@ -1,78 +0,0 @@
|
||||
---
|
||||
title: Configuration
|
||||
---
|
||||
|
||||
### Detection Parameters
|
||||
|
||||
#### Box Threshold (0.3)
|
||||
|
||||
Controls the confidence threshold for accepting detections:
|
||||
|
||||
<img
|
||||
src="/docs/img/som_box_threshold.png"
|
||||
alt="Illustration of confidence thresholds in object detection, with a high-confidence detection accepted and a low-confidence detection rejected."
|
||||
width="500px"
|
||||
/>
|
||||
- Higher values (0.3) yield more precise but fewer detections
|
||||
- Lower values (0.01) catch more potential icons but increase false positives
|
||||
- Default is 0.3 for optimal precision/recall balance
|
||||
|
||||
#### IOU Threshold (0.1)
|
||||
|
||||
Controls how overlapping detections are merged:
|
||||
|
||||
<img
|
||||
src="/docs/img/som_iou_threshold.png"
|
||||
alt="Diagram showing Intersection over Union (IOU) with low overlap between two boxes kept separate and high overlap leading to merging."
|
||||
width="500px"
|
||||
/>
|
||||
- Lower values (0.1) more aggressively remove overlapping boxes
|
||||
- Higher values (0.5) allow more overlapping detections
|
||||
- Default is 0.1 to handle densely packed UI elements
|
||||
|
||||
### OCR Configuration
|
||||
|
||||
- **Engine**: EasyOCR
|
||||
- Primary choice for all platforms
|
||||
- Fast initialization and processing
|
||||
- Built-in English language support
|
||||
- GPU acceleration when available
|
||||
|
||||
- **Settings**:
|
||||
- Timeout: 5 seconds
|
||||
- Confidence threshold: 0.5
|
||||
- Paragraph mode: Disabled
|
||||
- Language: English only
|
||||
|
||||
## Performance
|
||||
|
||||
### Hardware Acceleration
|
||||
|
||||
#### MPS (Metal Performance Shaders)
|
||||
|
||||
- Multi-scale detection (640px, 1280px, 1920px)
|
||||
- Test-time augmentation enabled
|
||||
- Half-precision (FP16)
|
||||
- Average detection time: ~0.4s
|
||||
- Best for production use when available
|
||||
|
||||
#### CPU
|
||||
|
||||
- Single-scale detection (1280px)
|
||||
- Full-precision (FP32)
|
||||
- Average detection time: ~1.3s
|
||||
- Reliable fallback option
|
||||
|
||||
### Example Output Structure
|
||||
|
||||
```
|
||||
examples/output/
|
||||
├── {timestamp}_no_ocr/
|
||||
│ ├── annotated_images/
|
||||
│ │ └── screenshot_analyzed.png
|
||||
│ ├── screen_details.txt
|
||||
│ └── summary.json
|
||||
└── {timestamp}_ocr/
|
||||
├── annotated_images/
|
||||
│ └── screenshot_analyzed.png
|
||||
├── screen_details.txt
|
||||
└── summary.json
|
||||
```
|
||||
@@ -1,66 +0,0 @@
|
||||
---
|
||||
title: Set-of-Mark
|
||||
description: Reference for the current version of the Set-of-Mark library.
|
||||
pypi: cua-som
|
||||
github:
|
||||
- https://github.com/trycua/cua/tree/main/libs/python/som
|
||||
---
|
||||
|
||||
<Callout>
|
||||
A corresponding{' '}
|
||||
<a href="https://github.com/trycua/cua/blob/main/examples/som_examples.py" target="_blank">
|
||||
Python example
|
||||
</a>{' '}
|
||||
is available for this documentation.
|
||||
</Callout>
|
||||
|
||||
## Overview
|
||||
|
||||
The SOM library provides visual element detection and interaction capabilities. It is based on the [Set-of-Mark](https://arxiv.org/abs/2310.11441) research paper and the [OmniParser](https://github.com/microsoft/OmniParser) model.
|
||||
|
||||
## API Documentation
|
||||
|
||||
### OmniParser Class
|
||||
|
||||
```python
|
||||
class OmniParser:
|
||||
def __init__(self, device: str = "auto"):
|
||||
"""Initialize the parser with automatic device detection"""
|
||||
|
||||
def parse(
|
||||
self,
|
||||
image: PIL.Image,
|
||||
box_threshold: float = 0.3,
|
||||
iou_threshold: float = 0.1,
|
||||
use_ocr: bool = True,
|
||||
ocr_engine: str = "easyocr"
|
||||
) -> ParseResult:
|
||||
"""Parse UI elements from an image"""
|
||||
```
|
||||
|
||||
### ParseResult Object
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class ParseResult:
|
||||
elements: List[UIElement] # Detected elements
|
||||
visualized_image: PIL.Image # Annotated image
|
||||
processing_time: float # Time in seconds
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert to JSON-serializable dictionary"""
|
||||
|
||||
def filter_by_type(self, elem_type: str) -> List[UIElement]:
|
||||
"""Filter elements by type ('icon' or 'text')"""
|
||||
```
|
||||
|
||||
### UIElement
|
||||
|
||||
```python
|
||||
class UIElement(BaseModel):
|
||||
id: Optional[int] = Field(None) # Element ID (1-indexed)
|
||||
type: Literal["icon", "text"] # Element type
|
||||
bbox: BoundingBox # Bounding box coordinates { x1, y1, x2, y2 }
|
||||
interactivity: bool = Field(default=False) # Whether the element is interactive
|
||||
confidence: float = Field(default=1.0) # Detection confidence
|
||||
```
|
||||
5
docs/content/docs/macos-vm-cli-playbook/meta.json
Normal file
5
docs/content/docs/macos-vm-cli-playbook/meta.json
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"title": "macOS VM CLI",
|
||||
"description": "CLI tools for macOS virtualization",
|
||||
"pages": ["lume", "lumier"]
|
||||
}
|
||||
@@ -10,9 +10,11 @@
|
||||
"...example-usecases",
|
||||
"---[BookCopy]Computer Playbook---",
|
||||
"...computer-sdk",
|
||||
"---[BookCopy]Agent Playbook---",
|
||||
"---[Bot]Agent Playbook---",
|
||||
"...agent-sdk",
|
||||
"---[CodeXml]API Reference---",
|
||||
"...libraries"
|
||||
"---[Terminal]Cloud CLI Playbook---",
|
||||
"...cli-playbook",
|
||||
"---[Terminal]macOS VM CLI Playbook---",
|
||||
"...macos-vm-cli-playbook"
|
||||
]
|
||||
}
|
||||
|
||||
BIN
docs/public/img/bg-dark.jpg
Normal file
BIN
docs/public/img/bg-dark.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 277 KiB |
BIN
docs/public/img/bg-light.jpg
Normal file
BIN
docs/public/img/bg-light.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 418 KiB |
BIN
docs/public/img/hero.png
Normal file
BIN
docs/public/img/hero.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.2 MiB |
@@ -200,7 +200,7 @@ export default async function Page(props: { params: Promise<{ slug?: string[] }>
|
||||
<div className="flex flex-row w-full items-start">
|
||||
<div className="flex-1">
|
||||
<div className="flex flex-row w-full">
|
||||
<DocsTitle>{page.data.title}</DocsTitle>
|
||||
{slug.length > 0 && <DocsTitle>{page.data.title}</DocsTitle>}
|
||||
|
||||
<div className="ml-auto flex items-center gap-2">
|
||||
{apiSection && versionItems.length > 1 && (
|
||||
|
||||
@@ -2,6 +2,34 @@
|
||||
@import 'fumadocs-ui/css/neutral.css';
|
||||
@import 'fumadocs-ui/css/preset.css';
|
||||
|
||||
/* Custom Sky + Emerald theme */
|
||||
@theme {
|
||||
--color-fd-primary: hsl(199, 89%, 48%); /* sky-500 */
|
||||
--color-fd-primary-foreground: hsl(0, 0%, 100%);
|
||||
--color-fd-ring: hsl(199, 89%, 48%); /* sky-500 */
|
||||
--color-fd-muted: hsl(160, 84%, 95%); /* emerald-50 */
|
||||
--color-fd-accent: hsl(152, 76%, 92%); /* emerald-100 */
|
||||
--font-sans: var(--font-geist-sans);
|
||||
--font-mono: var(--font-geist-mono);
|
||||
}
|
||||
|
||||
.dark {
|
||||
--color-fd-primary: hsl(199, 89%, 48%); /* sky-500 */
|
||||
--color-fd-primary-foreground: hsl(0, 0%, 100%);
|
||||
--color-fd-ring: hsl(199, 89%, 48%); /* sky-500 */
|
||||
--color-fd-muted: hsl(199, 89%, 14%); /* sky-950 */
|
||||
--color-fd-accent: hsl(199, 89%, 20%); /* sky dark */
|
||||
}
|
||||
|
||||
.dark body {
|
||||
background-image: linear-gradient(
|
||||
rgba(14, 165, 233, 0.1),
|
||||
transparent 20rem,
|
||||
transparent
|
||||
);
|
||||
background-repeat: no-repeat;
|
||||
}
|
||||
|
||||
/* Fix TOC overflow on production builds */
|
||||
#nd-toc {
|
||||
overflow-y: auto;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import './global.css';
|
||||
import { RootProvider } from 'fumadocs-ui/provider';
|
||||
import { Inter } from 'next/font/google';
|
||||
import { Geist, Geist_Mono } from 'next/font/google';
|
||||
import type { ReactNode } from 'react';
|
||||
import { PHProvider, PostHogPageView } from '@/providers/posthog-provider';
|
||||
import { AnalyticsTracker } from '@/components/analytics-tracker';
|
||||
@@ -8,13 +8,23 @@ import { CookieConsent } from '@/components/cookie-consent';
|
||||
import { Footer } from '@/components/footer';
|
||||
import { Suspense } from 'react';
|
||||
|
||||
const inter = Inter({
|
||||
const geist = Geist({
|
||||
subsets: ['latin'],
|
||||
variable: '--font-geist-sans',
|
||||
});
|
||||
|
||||
const geistMono = Geist_Mono({
|
||||
subsets: ['latin'],
|
||||
variable: '--font-geist-mono',
|
||||
});
|
||||
|
||||
export default function Layout({ children }: { children: ReactNode }) {
|
||||
return (
|
||||
<html lang="en" className={inter.className} suppressHydrationWarning>
|
||||
<html
|
||||
lang="en"
|
||||
className={`${geist.variable} ${geistMono.variable} font-sans`}
|
||||
suppressHydrationWarning
|
||||
>
|
||||
<head>
|
||||
<link rel="icon" href="/docs/favicon.ico" sizes="any" />
|
||||
</head>
|
||||
|
||||
@@ -141,20 +141,20 @@ export function Footer() {
|
||||
</p>
|
||||
<div className="flex gap-4">
|
||||
<a
|
||||
href="https://cua.ai/privacy"
|
||||
href="https://cua.ai/privacy-policy"
|
||||
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
|
||||
>
|
||||
Privacy Policy
|
||||
</a>
|
||||
<a
|
||||
href="https://cua.ai/terms"
|
||||
href="https://cua.ai/cookie-policy"
|
||||
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
|
||||
>
|
||||
Terms of Service
|
||||
Cookie Policy
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -55,11 +55,11 @@ To get set up with Lume for development, read [these instructions](Development.m
|
||||
|
||||
## Docs
|
||||
|
||||
- [Installation](https://cua.ai/docs/libraries/lume/installation)
|
||||
- [Prebuilt Images](https://cua.ai/docs/libraries/lume/prebuilt-images)
|
||||
- [CLI Reference](https://cua.ai/docs/libraries/lume/cli-reference)
|
||||
- [HTTP API](https://cua.ai/docs/libraries/lume/http-api)
|
||||
- [FAQ](https://cua.ai/docs/libraries/lume/faq)
|
||||
- [Installation](https://cua.ai/docs/macos-vm-cli-playbook/lume/installation)
|
||||
- [Prebuilt Images](https://cua.ai/docs/macos-vm-cli-playbook/lume/prebuilt-images)
|
||||
- [CLI Reference](https://cua.ai/docs/macos-vm-cli-playbook/lume/cli-reference)
|
||||
- [HTTP API](https://cua.ai/docs/macos-vm-cli-playbook/lume/http-api)
|
||||
- [FAQ](https://cua.ai/docs/macos-vm-cli-playbook/lume/faq)
|
||||
|
||||
## Contributing
|
||||
|
||||
|
||||
@@ -133,44 +133,93 @@ detect_platform() {
|
||||
create_temp_dir() {
  # Create a scratch directory for the download and remember it in the global
  # TEMP_DIR. Abort early if mktemp fails: continuing with an empty TEMP_DIR
  # would make later "$TEMP_DIR/..." paths resolve against the filesystem root.
  if ! TEMP_DIR=$(mktemp -d) || [ -z "$TEMP_DIR" ]; then
    echo "${RED}Error: Failed to create a temporary directory.${NORMAL}" >&2
    exit 1
  fi
  echo "Using temporary directory: $TEMP_DIR"

  # Make sure we clean up on exit. Single quotes defer expansion of TEMP_DIR
  # until the trap actually fires.
  trap 'rm -rf "$TEMP_DIR"' EXIT
}
|
||||
|
||||
# Get the latest lume release tag
|
||||
get_latest_lume_tag() {
  # Print the first tag whose name starts with "lume-" from the GitHub tags
  # listing of $GITHUB_REPO. Progress and errors go to stderr so that stdout
  # carries only the tag name (callers capture it with $(...)).
  echo "Finding latest Lume release..." >&2

  local per_page=100
  local max_pages=10 # Safety limit (1000 tags max)
  local LUME_TAG=""
  local page

  for ((page = 1; page <= max_pages; page++)); do
    echo "Checking page $page..." >&2

    # Declared and assigned in one statement on purpose: this keeps curl's
    # exit status masked exactly as before, so only the content checks below
    # decide whether the request is treated as failed.
    local response=$(curl -s "https://api.github.com/repos/$GITHUB_REPO/tags?per_page=$per_page&page=$page")

    # An empty body, or one without any "name" field, means there is nothing
    # (more) to look at.
    if [ -z "$response" ] || ! grep -q '"name":' <<< "$response"; then
      if [ "$page" -eq 1 ]; then
        echo "${RED}Error: Failed to fetch tags from GitHub API.${NORMAL}" >&2
      else
        echo "${RED}Error: No lume tags found after checking $((page - 1)) pages.${NORMAL}" >&2
      fi
      exit 1
    fi

    # First matching line on this page; the tag name is the 4th '"'-delimited field.
    LUME_TAG=$(grep -m 1 '"name": "lume-' <<< "$response" | cut -d '"' -f 4)

    if [ -n "$LUME_TAG" ]; then
      echo "Found latest Lume release: ${BOLD}$LUME_TAG${NORMAL}" >&2
      echo "$LUME_TAG"
      return 0
    fi
  done

  echo "${RED}Error: Could not find any lume tags after checking $max_pages pages.${NORMAL}" >&2
  exit 1
}
|
||||
|
||||
# Download the latest release
|
||||
download_release() {
|
||||
echo "Downloading latest Lume release..."
|
||||
|
||||
# Use the direct download link with the non-versioned symlink
|
||||
DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume.tar.gz"
|
||||
LUME_TAG=$(get_latest_lume_tag)
|
||||
|
||||
if [ -z "$LUME_TAG" ]; then
|
||||
echo "${RED}Error: Could not determine latest Lume release tag.${NORMAL}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Downloading Lume release $LUME_TAG..."
|
||||
|
||||
# Use the direct download link with the lume release tag
|
||||
DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/download/$LUME_TAG/lume.tar.gz"
|
||||
echo "Downloading from: $DOWNLOAD_URL"
|
||||
|
||||
|
||||
# Download the tarball
|
||||
if command -v curl &> /dev/null; then
|
||||
curl -L --progress-bar "$DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz"
|
||||
|
||||
|
||||
# Verify the download was successful
|
||||
if [ ! -s "$TEMP_DIR/lume.tar.gz" ]; then
|
||||
echo "${RED}Error: Failed to download Lume.${NORMAL}"
|
||||
echo "The download URL may be incorrect or the file may not exist."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# Verify the file is a valid archive
|
||||
if ! tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then
|
||||
echo "${RED}Error: The downloaded file is not a valid tar.gz archive.${NORMAL}"
|
||||
echo "Let's try the alternative URL..."
|
||||
|
||||
# Try alternative URL
|
||||
ALT_DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/latest/download/lume-$PLATFORM.tar.gz"
|
||||
|
||||
# Try alternative URL with platform-specific name
|
||||
ALT_DOWNLOAD_URL="https://github.com/$GITHUB_REPO/releases/download/$LUME_TAG/lume-darwin.tar.gz"
|
||||
echo "Downloading from alternative URL: $ALT_DOWNLOAD_URL"
|
||||
curl -L --progress-bar "$ALT_DOWNLOAD_URL" -o "$TEMP_DIR/lume.tar.gz"
|
||||
|
||||
|
||||
# Check again
|
||||
if ! tar -tzf "$TEMP_DIR/lume.tar.gz" > /dev/null 2>&1; then
|
||||
echo "${RED}Error: Could not download a valid Lume archive.${NORMAL}"
|
||||
echo "Please try installing Lume manually from: https://github.com/$GITHUB_REPO/releases/latest"
|
||||
echo "Please try installing Lume manually from: https://github.com/$GITHUB_REPO/releases/tag/$LUME_TAG"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -58,14 +58,14 @@ docker run -it --rm \
|
||||
|
||||
After running the command above, you can access your macOS VM through a web browser (e.g., http://localhost:8006).
|
||||
|
||||
> **Note:** With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See [the documentation](https://cua.ai/docs/libraries/lumier/docker) for how to save your VM state.
|
||||
> **Note:** With the basic setup above, your VM will be reset when you stop the container (ephemeral mode). This means any changes you make inside the macOS VM will be lost. See [the documentation](https://cua.ai/docs/macos-vm-cli-playbook/lumier/docker) for how to save your VM state.
|
||||
|
||||
## Docs
|
||||
|
||||
- [Installation](https://cua.ai/docs/libraries/lumier/installation)
|
||||
- [Docker](https://cua.ai/docs/libraries/lumier/docker)
|
||||
- [Docker Compose](https://cua.ai/docs/libraries/lumier/docker-compose)
|
||||
- [Building Lumier](https://cua.ai/docs/libraries/lumier/building-lumier)
|
||||
- [Installation](https://cua.ai/docs/macos-vm-cli-playbook/lumier/installation)
|
||||
- [Docker](https://cua.ai/docs/macos-vm-cli-playbook/lumier/docker)
|
||||
- [Docker Compose](https://cua.ai/docs/macos-vm-cli-playbook/lumier/docker-compose)
|
||||
- [Building Lumier](https://cua.ai/docs/macos-vm-cli-playbook/lumier/building-lumier)
|
||||
|
||||
## Credits
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.5.1
|
||||
current_version = 0.5.2
|
||||
commit = True
|
||||
tag = True
|
||||
tag_name = agent-v{new_version}
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
|
||||
|
||||
[project]
|
||||
name = "cua-agent"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
description = "CUA (Computer Use) Agent for AI-driven computer interaction"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
|
||||
26
libs/python/bench-ui/README.md
Normal file
26
libs/python/bench-ui/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# CUA Bench UI
|
||||
|
||||
A lightweight web UI window controller for CUA bench environments, built on pywebview.
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from bench_ui import launch_window, get_element_rect, execute_javascript
|
||||
|
||||
# Launch a window with inline HTML content
|
||||
pid = launch_window(html="<html><body><h1>Hello</h1></body></html>")
|
||||
|
||||
# Get element rect in screen space
|
||||
rect = get_element_rect(pid, "h1", space="screen")
|
||||
print(rect)
|
||||
|
||||
# Execute arbitrary JavaScript
|
||||
text = execute_javascript(pid, "document.querySelector('h1')?.textContent")
|
||||
print(text)
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install cua-bench-ui
|
||||
```
|
||||
3
libs/python/bench-ui/bench_ui/__init__.py
Normal file
3
libs/python/bench-ui/bench_ui/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .api import execute_javascript, get_element_rect, launch_window
|
||||
|
||||
__all__ = ["launch_window", "get_element_rect", "execute_javascript"]
|
||||
181
libs/python/bench-ui/bench_ui/api.py
Normal file
181
libs/python/bench-ui/bench_ui/api.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
from urllib import request
|
||||
from urllib.error import HTTPError, URLError
|
||||
|
||||
import psutil
|
||||
|
||||
# Map child PID -> listening port
|
||||
_pid_to_port: Dict[int, int] = {}
|
||||
|
||||
|
||||
def _post_json(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """POST *payload* as JSON to *url* and return the decoded JSON reply.

    Failures are reported as a dict with an ``"error"`` key instead of being
    raised, so polling callers can simply retry:

    * HTTP error status: the decoded error body if it is JSON, otherwise
      ``{"error": "http_error", "status": <code>}``.
    * ``URLError`` (e.g. connection refused): ``{"error": "url_error", ...}``.
    * Any other ``OSError`` (e.g. a timeout or reset while reading the body):
      ``{"error": "connection_error", ...}``.
    * A successful response whose body is not valid UTF-8 JSON:
      ``{"error": "invalid_response", ...}``.
    """
    data = json.dumps(payload).encode("utf-8")
    req = request.Request(
        url, data=data, headers={"Content-Type": "application/json"}, method="POST"
    )
    try:
        with request.urlopen(req, timeout=5) as resp:
            raw = resp.read()
    except HTTPError as e:
        # The server encodes its own errors as JSON bodies; prefer those.
        try:
            body = (e.read() or b"").decode("utf-8", errors="ignore")
            return json.loads(body)
        except Exception:
            # Deliberately broad: whatever goes wrong while reading the error
            # body, fall back to reporting just the status code.
            return {"error": "http_error", "status": getattr(e, "code", None)}
    except URLError as e:
        return {"error": "url_error", "reason": str(e.reason)}
    except OSError as e:
        # Not wrapped in URLError: failures that happen after the connection
        # was established (read timeout, connection reset, ...).
        return {"error": "connection_error", "reason": str(e)}

    try:
        return json.loads(raw.decode("utf-8"))
    except ValueError as e:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return {"error": "invalid_response", "reason": str(e)}
|
||||
|
||||
|
||||
def _detect_port_for_pid(pid: int) -> int:
    """Return a TCP port that process *pid* is listening on locally.

    Walks the connections reported by ``psutil.net_connections(kind="tcp")``
    and returns the port of the first one that belongs to *pid*, is in the
    ``LISTEN`` state and is bound to a loopback or wildcard address.

    Raises:
        RuntimeError: if the module-level ``psutil`` name is ``None`` or no
            matching listening socket is found.
    """
    if psutil is None:
        raise RuntimeError("psutil is required for PID->port detection. Please install psutil.")

    local_hosts = ("127.0.0.1", "::1", "0.0.0.0", "::")

    for conn in psutil.net_connections(kind="tcp"):
        if getattr(conn, "pid", None) != pid:
            continue

        local_addr = getattr(conn, "laddr", None)
        state = str(getattr(conn, "status", ""))

        # Skip entries without a usable (host, port) local address.
        has_address = bool(local_addr) and isinstance(local_addr, tuple) and len(local_addr) >= 2
        if not has_address:
            continue

        host, listen_port = local_addr[0], int(local_addr[1])
        if state.upper() == "LISTEN" and host in local_hosts:
            return listen_port

    raise RuntimeError(f"Could not detect listening port for pid {pid}")
|
||||
|
||||
|
||||
def launch_window(
    url: Optional[str] = None,
    *,
    html: Optional[str] = None,
    folder: Optional[str] = None,
    title: str = "Window",
    x: Optional[int] = None,
    y: Optional[int] = None,
    width: int = 600,
    height: int = 400,
    icon: Optional[str] = None,
    use_inner_size: bool = False,
    title_bar_style: str = "default",
) -> int:
    """Create a pywebview window in a child process and return its PID.

    Preferred input is a URL via the positional `url` parameter.
    To load inline HTML instead, pass `html=...`.
    To serve a static folder, pass `folder=...` (path to directory).

    Spawns `python -m bench_ui.child` with a JSON config passed via a temp file.
    The child reports its startup info as a JSON line: {"pid": <pid>, "port": <port>}.
    Because the child's stderr is merged into stdout, any lines printed before
    that JSON line (warnings, log output) are skipped rather than treated as
    the startup info. The pid->port mapping is cached for subsequent control
    calls like get_element_rect.

    Raises:
        ValueError: if none of `url`, `html` or `folder` is given.
        RuntimeError: if the child closes its output without reporting a port;
            the message includes the tail of what the child printed.
    """
    if not url and not html and not folder:
        raise ValueError("launch_window requires either a url, html, or folder")

    config = {
        "url": url,
        "html": html,
        "folder": folder,
        "title": title,
        "x": x,
        "y": y,
        "width": width,
        "height": height,
        "icon": icon,
        "use_inner_size": use_inner_size,
        "title_bar_style": title_bar_style,
    }

    # delete=False: the file must outlive this handle so the child can read it.
    with tempfile.NamedTemporaryFile("w", delete=False, suffix=".json") as f:
        json.dump(config, f)
        cfg_path = f.name

    try:
        # Launch child process
        proc = subprocess.Popen(
            [sys.executable, "-m", "bench_ui.child", cfg_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        if proc.stdout is None:
            # Not expected with stdout=PIPE; an explicit check also survives `python -O`.
            raise RuntimeError("Failed to capture output of bench_ui.child")

        # Read lines until the startup info shows up or the child closes stdout.
        info: Optional[dict] = None
        skipped: list = []
        for raw_line in iter(proc.stdout.readline, ""):
            line = raw_line.strip()
            if not line:
                continue
            try:
                candidate = json.loads(line)
            except ValueError:
                candidate = None
            if isinstance(candidate, dict) and "port" in candidate:
                info = candidate
                break
            skipped.append(line)
            del skipped[:-20]  # keep only the tail for the error message

        if info is None:
            details = "\n".join(skipped) or "<no output>"
            raise RuntimeError(
                f"bench_ui.child exited without reporting its port. Output:\n{details}"
            )

        pid = int(info["pid"]) if "pid" in info else proc.pid
        port = int(info["port"])  # required
        _pid_to_port[pid] = port
        # NOTE(review): the child's stdout pipe is left open and is not read
        # after this point; a very chatty child could eventually fill it.
        return pid
    finally:
        # Best-effort cleanup of the config file; the child has either read it
        # already (it reports its port only after loading the config) or failed.
        try:
            os.unlink(cfg_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def get_element_rect(
    pid: int, selector: str, *, space: str = "window"
) -> Optional[Dict[str, Any]]:
    """Ask the child process to compute an element's client rect via injected JS.

    Args:
        pid: PID of the window process, as returned by launch_window. If the
            port for this PID is not cached, it is looked up with
            _detect_port_for_pid and cached.
        selector: CSS selector passed to ``document.querySelector`` in the window.
        space: ``"window"`` (default) or ``"screen"``; forwarded to the child.

    Returns:
        A dict like {"x": float, "y": float, "width": float, "height": float},
        or None if the window kept answering successfully but the selector
        matched nothing during the whole retry period.

    Raises:
        RuntimeError: if the last attempt still ended in an error response
            (window not ready, connection problems, ...).
    """
    if pid not in _pid_to_port:
        _pid_to_port[pid] = _detect_port_for_pid(pid)
    url = f"http://127.0.0.1:{_pid_to_port[pid]}/rect"
    payload = {"selector": selector, "space": space}

    last: Dict[str, Any] = {}
    for _ in range(30):  # 30 attempts, 0.1s apart (plus request time)
        last = _post_json(url, payload) or {}
        if isinstance(last, dict) and last.get("rect") is not None:
            return last["rect"]
        # Error reply, or the element does not exist (yet): wait and retry.
        time.sleep(0.1)

    # A clean {"rect": null} reply means the page answered but the selector
    # matched nothing -- report that as None, as documented, not as a failure.
    if isinstance(last, dict) and "rect" in last and not last.get("error"):
        return None
    raise RuntimeError(f"Failed to get element rect: {last}")
|
||||
|
||||
|
||||
def execute_javascript(pid: int, javascript: str):
    """Run *javascript* in the window owned by *pid* and return its result.

    The code is posted to the child's ``/eval`` endpoint. Any reply without a
    ``"result"`` key (window not ready yet, transport error, ...) triggers a
    retry, up to 30 attempts 0.1s apart.

    Raises:
        RuntimeError: if no attempt produced a result; the message contains
            the last reply received.
    """
    try:
        port = _pid_to_port[pid]
    except KeyError:
        # Unknown PID: discover its control port once and remember it.
        port = _pid_to_port[pid] = _detect_port_for_pid(pid)

    endpoint = f"http://127.0.0.1:{port}/eval"
    reply: Dict[str, Any] = {}
    attempts_left = 30  # ~3s total
    while attempts_left > 0:
        reply = _post_json(endpoint, {"javascript": javascript}) or {}
        if isinstance(reply, dict) and "result" in reply:
            return reply["result"]
        # Every non-result outcome is handled the same way: pause, then retry.
        time.sleep(0.1)
        attempts_left -= 1
    raise RuntimeError(f"Failed to execute JavaScript: {reply}")
|
||||
221
libs/python/bench-ui/bench_ui/child.py
Normal file
221
libs/python/bench-ui/bench_ui/child.py
Normal file
@@ -0,0 +1,221 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import webview
|
||||
from aiohttp import web
|
||||
|
||||
|
||||
def _get_free_port() -> int:
    """Return a TCP port number that was free on 127.0.0.1 at the time of the call.

    Binds a throwaway socket to port 0 so the OS picks a port, reads the
    chosen number back and closes the socket again. The port is not reserved
    after this function returns.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("127.0.0.1", 0))
        chosen_port = probe.getsockname()[1]
    finally:
        probe.close()
    return chosen_port
|
||||
|
||||
|
||||
def _start_http_server(
    window: webview.Window,
    port: int,
    ready_event: threading.Event,
    html_content: str | None = None,
    folder_path: str | None = None,
):
    """Start the control HTTP server for *window* on a daemon thread.

    An aiohttp application is bound to ``127.0.0.1:port`` and run on its own
    event loop in a background thread. Routes:

    * ``POST /rect`` -- body ``{"selector": str, "space": "window"|"screen"}``;
      replies ``{"rect": ...}`` with the element's bounding rect (``null`` when
      the selector matches nothing).
    * ``POST /eval`` -- body ``{"javascript": str}`` (``"code"`` is accepted as
      a fallback key); replies ``{"result": ...}``.
    * ``/`` -- the static files under *folder_path* when given, otherwise
      *html_content* as a page, or a small JSON status object if
      *html_content* is None.

    Both POST handlers wait up to two seconds for *ready_event* and answer
    409 ``window_not_ready`` if it is still unset. That wait is a blocking
    ``threading.Event.wait`` executed inside the handler.
    """

    def _error(code: str, status: int):
        return web.json_response({"error": code}, status=status)

    def _window_ready() -> bool:
        # Give a page that is still loading a short chance to finish.
        if not ready_event.is_set():
            ready_event.wait(timeout=2.0)
        return ready_event.is_set()

    def _rect_script(selector: str, space) -> str:
        # json.dumps turns the selector into a safely quoted JS string literal.
        prologue = (
            "(function(){"
            f"const s = {json.dumps(selector)};"
            "const el = document.querySelector(s);"
            "if(!el){return null;}"
            "const r = el.getBoundingClientRect();"
        )
        if space == "screen":
            # Approximate screen coordinates from the window metrics: offset by
            # the window position plus the outer/inner height difference.
            epilogue = (
                "const sx = (window.screenX ?? window.screenLeft ?? 0);"
                "const syRaw = (window.screenY ?? window.screenTop ?? 0);"
                "const frameH = (window.outerHeight - window.innerHeight) || 0;"
                "const sy = syRaw + frameH;"
                "return {x:sx + r.left, y:sy + r.top, width:r.width, height:r.height};"
                "})()"
            )
        else:
            epilogue = "return {x:r.left,y:r.top,width:r.width,height:r.height};" "})()"
        return prologue + epilogue

    async def rect_handler(request: web.Request):
        try:
            body = await request.json()
        except Exception:
            return _error("invalid_json", 400)

        selector = body.get("selector")
        space = body.get("space", "window")
        if not isinstance(selector, str):
            return _error("selector_required", 400)
        if not _window_ready():
            return _error("window_not_ready", 409)

        script = _rect_script(selector, space)
        try:
            # Called from the server thread; the original author notes that
            # evaluate_js is thread-safe in pywebview.
            rect = window.evaluate_js(script)
        except Exception as exc:
            return web.json_response({"error": str(exc)}, status=500)
        return web.json_response({"rect": rect})

    async def eval_handler(request: web.Request):
        try:
            body = await request.json()
        except Exception:
            return _error("invalid_json", 400)

        source = body.get("javascript") or body.get("code")
        if not isinstance(source, str):
            return _error("javascript_required", 400)
        if not _window_ready():
            return _error("window_not_ready", 409)

        try:
            outcome = window.evaluate_js(source)
        except Exception as exc:
            return web.json_response({"error": str(exc)}, status=500)
        return web.json_response({"result": outcome})

    async def index_handler(request: web.Request):
        if html_content is not None:
            return web.Response(text=html_content, content_type="text/html")
        return web.json_response({"status": "ok", "message": "bench-ui control server"})

    app = web.Application()

    # The root either exposes the static folder or the inline page/status.
    if folder_path:
        app.router.add_static("/", folder_path, show_index=True)
    else:
        app.router.add_get("/", index_handler)
    app.router.add_post("/rect", rect_handler)
    app.router.add_post("/eval", eval_handler)

    loop = asyncio.new_event_loop()

    def _serve_forever():
        # Runs in the background thread: bring the site up, then keep the loop alive.
        asyncio.set_event_loop(loop)
        runner = web.AppRunner(app)
        loop.run_until_complete(runner.setup())
        loop.run_until_complete(web.TCPSite(runner, "127.0.0.1", port).start())
        loop.run_forever()

    threading.Thread(target=_serve_forever, daemon=True).start()
|
||||
|
||||
|
||||
def main():
    """Entry point of the window child process (``python -m bench_ui.child <config.json>``).

    Reads the JSON config file named on the command line, creates the
    pywebview window, starts the control HTTP server, prints one JSON line
    ``{"pid": ..., "port": ...}`` on stdout for the parent, and then runs the
    GUI loop until the window is closed. Exits with status 2 when the config
    path argument is missing.
    """
    if len(sys.argv) < 2:
        print("Usage: python -m bench_ui.child <config.json>", file=sys.stderr)
        sys.exit(2)

    cfg = json.loads(Path(sys.argv[1]).read_text(encoding="utf-8"))

    html: Optional[str] = cfg.get("html") or ""
    url: Optional[str] = cfg.get("url")
    folder: Optional[str] = cfg.get("folder")
    title: str = cfg.get("title", "Window")
    x: Optional[int] = cfg.get("x")
    y: Optional[int] = cfg.get("y")
    width: int = int(cfg.get("width", 600))
    height: int = int(cfg.get("height", 400))
    # NOTE(review): the next three settings are read from the config but are
    # not passed on to the window below.
    icon: Optional[str] = cfg.get("icon")
    use_inner_size: bool = bool(cfg.get("use_inner_size", False))
    title_bar_style: str = cfg.get("title_bar_style", "default")

    # Choose port early so we can point the window to it when serving inline HTML or folder
    port = _get_free_port()

    # Decide what the window loads and what, if anything, the control server
    # has to serve at its root.
    if url:
        target_url, html_for_server, folder_for_server = url, None, None
    elif folder:
        # Static folder is served at the control server root; open its index.html.
        target_url = f"http://127.0.0.1:{port}/index.html"
        html_for_server, folder_for_server = None, folder
    else:
        # Inline HTML is served at the control server root.
        target_url = f"http://127.0.0.1:{port}/"
        html_for_server, folder_for_server = html, None

    window = webview.create_window(
        title,
        url=target_url,
        width=width,
        height=height,
        x=x,
        y=y,
        confirm_close=False,
        text_select=True,
        background_color="#FFFFFF",
    )

    # Set once the page has loaded; the control server uses it to decide
    # whether JS can be evaluated yet.
    window_ready = threading.Event()

    def _mark_loaded():
        window_ready.set()

    window.events.loaded += _mark_loaded  # type: ignore[attr-defined]

    # Start HTTP server for control (and optionally serve inline HTML or static folder)
    _start_http_server(
        window,
        port,
        window_ready,
        html_content=html_for_server,
        folder_path=folder_for_server,
    )

    # Startup info for the parent; flushed so it is readable immediately.
    print(json.dumps({"pid": os.getpid(), "port": port}), flush=True)

    # Start GUI (blocking). Debug mode is enabled via CUA_BENCH_UI_DEBUG=true/1.
    debug_enabled = os.environ.get("CUA_BENCH_UI_DEBUG", "false").lower() in ("true", "1")
    webview.start(debug=debug_enabled)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
40
libs/python/bench-ui/examples/folder_example.py
Normal file
40
libs/python/bench-ui/examples/folder_example.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
import time
|
||||
from bench_ui import launch_window, get_element_rect, execute_javascript
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
def main():
    """Open the static-folder demo window and print a few facts about its page.

    Launches a bench-ui window that serves the ``gui`` directory located next
    to this script, waits briefly, then prints the ``#testButton`` rect in
    window space, the button's ``disabled`` flag and the document title.
    """
    # NOTE(review): presumably inherited by the window process, which reads
    # this variable when starting pywebview -- confirm against launch_window.
    os.environ["CUA_BENCH_UI_DEBUG"] = "1"

    # Directory holding index.html, styles.css and logo.svg for this example
    static_dir = Path(__file__).parent / "gui"

    window_options = {
        "folder": str(static_dir),
        "title": "Static Folder Example",
        "width": 800,
        "height": 600,
    }
    window_pid = launch_window(**window_options)
    print(f"Launched window with PID: {window_pid}")
    print(f"Serving folder: {static_dir}")

    # Fixed pause so the page has a chance to render before it is queried
    time.sleep(1.5)

    button_rect = get_element_rect(window_pid, "#testButton", space="window")
    print("Button rect (window space):", button_rect)

    # The page's click handler disables the button, so `disabled` doubles as
    # a "was clicked" flag; the second probe simply reads the page title.
    probes = (
        ("Button clicked:", "document.getElementById('testButton').disabled"),
        ("Page title:", "document.title"),
    )
    for label, script in probes:
        print(label, execute_javascript(window_pid, script))


if __name__ == "__main__":
    main()
|
||||
42
libs/python/bench-ui/examples/gui/index.html
Normal file
42
libs/python/bench-ui/examples/gui/index.html
Normal file
@@ -0,0 +1,42 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Static Folder Example</title>
|
||||
<link rel="stylesheet" href="styles.css">
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>Static Folder Example</h1>
|
||||
<p>This page is served from a static folder using bench-ui!</p>
|
||||
|
||||
<div class="image-container">
|
||||
<img src="logo.svg" alt="Example SVG Logo" class="logo">
|
||||
</div>
|
||||
|
||||
<div class="info">
|
||||
<p>This example demonstrates:</p>
|
||||
<ul>
|
||||
<li>Serving a static folder with bench-ui</li>
|
||||
<li>Loading external CSS files (styles.css)</li>
|
||||
<li>Loading SVG images (logo.svg)</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<button id="testButton" class="btn">Click Me!</button>
|
||||
<p id="status"></p>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
document.getElementById('testButton').addEventListener('click', function () {
|
||||
document.getElementById('status').textContent = 'Button clicked! ✓';
|
||||
this.disabled = true;
|
||||
this.textContent = 'Clicked!';
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
24
libs/python/bench-ui/examples/gui/logo.svg
Normal file
24
libs/python/bench-ui/examples/gui/logo.svg
Normal file
@@ -0,0 +1,24 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 200 200">
|
||||
<defs>
|
||||
<linearGradient id="grad1" x1="0%" y1="0%" x2="100%" y2="100%">
|
||||
<stop offset="0%" style="stop-color:#667eea;stop-opacity:1" />
|
||||
<stop offset="100%" style="stop-color:#764ba2;stop-opacity:1" />
|
||||
</linearGradient>
|
||||
</defs>
|
||||
|
||||
<!-- Background circle -->
|
||||
<circle cx="100" cy="100" r="95" fill="url(#grad1)" />
|
||||
|
||||
<!-- Window icon -->
|
||||
<rect x="50" y="50" width="100" height="100" rx="8" fill="white" opacity="0.9" />
|
||||
|
||||
<!-- Window panes -->
|
||||
<line x1="100" y1="50" x2="100" y2="150" stroke="url(#grad1)" stroke-width="4" />
|
||||
<line x1="50" y1="100" x2="150" y2="100" stroke="url(#grad1)" stroke-width="4" />
|
||||
|
||||
<!-- Decorative dots -->
|
||||
<circle cx="75" cy="75" r="8" fill="url(#grad1)" />
|
||||
<circle cx="125" cy="75" r="8" fill="url(#grad1)" />
|
||||
<circle cx="75" cy="125" r="8" fill="url(#grad1)" />
|
||||
<circle cx="125" cy="125" r="8" fill="url(#grad1)" />
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 963 B |
92
libs/python/bench-ui/examples/gui/styles.css
Normal file
92
libs/python/bench-ui/examples/gui/styles.css
Normal file
@@ -0,0 +1,92 @@
|
||||
body {
|
||||
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.container {
|
||||
background: white;
|
||||
border-radius: 12px;
|
||||
padding: 40px;
|
||||
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
|
||||
max-width: 600px;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
margin-top: 0;
|
||||
font-size: 2em;
|
||||
}
|
||||
|
||||
p {
|
||||
color: #666;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.image-container {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
margin: 30px 0;
|
||||
}
|
||||
|
||||
.logo {
|
||||
width: 150px;
|
||||
height: 150px;
|
||||
}
|
||||
|
||||
.info {
|
||||
background: #f8f9fa;
|
||||
border-left: 4px solid #667eea;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.info ul {
|
||||
margin: 10px 0;
|
||||
padding-left: 20px;
|
||||
}
|
||||
|
||||
.info li {
|
||||
color: #555;
|
||||
margin: 8px 0;
|
||||
}
|
||||
|
||||
.btn {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 12px 30px;
|
||||
font-size: 16px;
|
||||
border-radius: 6px;
|
||||
cursor: pointer;
|
||||
transition: transform 0.2s, box-shadow 0.2s;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.btn:hover:not(:disabled) {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
|
||||
}
|
||||
|
||||
.btn:active:not(:disabled) {
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
.btn:disabled {
|
||||
opacity: 0.6;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
#status {
|
||||
margin-top: 15px;
|
||||
font-weight: 600;
|
||||
color: #28a745;
|
||||
font-size: 18px;
|
||||
}
|
||||
BIN
libs/python/bench-ui/examples/output_overlay.png
Normal file
BIN
libs/python/bench-ui/examples/output_overlay.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 743 KiB |
80
libs/python/bench-ui/examples/simple_example.py
Normal file
80
libs/python/bench-ui/examples/simple_example.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from bench_ui import execute_javascript, get_element_rect, launch_window
|
||||
|
||||
HTML = """
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>Bench UI Example</title>
|
||||
<style>
|
||||
body { font-family: system-ui, sans-serif; margin: 24px; }
|
||||
#target { width: 220px; height: 120px; background: #4f46e5; color: white; display: flex; align-items: center; justify-content: center; border-radius: 8px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Bench UI Example</h1>
|
||||
<div id="target">Hello from pywebview</div>
|
||||
|
||||
|
||||
<h1>Click the button</h1>
|
||||
<button id="submit" class="btn" data-instruction="the button">Submit</button>
|
||||
<script>
|
||||
window.__submitted = false;
|
||||
document.getElementById('submit').addEventListener('click', function() {
|
||||
window.__submitted = true;
|
||||
this.textContent = 'Submitted!';
|
||||
this.disabled = true;
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
def main():
    """Run the inline-HTML demo: launch a window, locate an element, annotate a screenshot.

    Launches a bench-ui window from the module-level ``HTML`` string, prints the
    screen-space rect of ``#target``, tries to save a full-screen screenshot
    with that rect outlined in red next to this script, and finally prints the
    page's ``window.__submitted`` flag.
    """
    os.environ["CUA_BENCH_UI_DEBUG"] = "1"

    window_pid = launch_window(
        html=HTML,
        title="Bench UI Example",
        width=800,
        height=600,
    )
    print(f"Launched window with PID: {window_pid}")

    # Fixed pause so the page has a chance to render before it is queried
    time.sleep(1.0)

    target_rect = get_element_rect(window_pid, "#target", space="screen")
    print("Element rect (screen space):", target_rect)

    # Screenshot annotation is optional: any failure (Pillow missing, no
    # display, unusable rect) is reported and the example carries on.
    try:
        from PIL import ImageDraw, ImageGrab

        screenshot = ImageGrab.grab()  # full screen
        canvas = ImageDraw.Draw(screenshot)
        left, top = target_rect["x"], target_rect["y"]
        rect_w, rect_h = target_rect["width"], target_rect["height"]
        canvas.rectangle(
            (left, top, left + rect_w, top + rect_h),
            outline=(255, 0, 0),
            width=3,
        )
        destination = Path(__file__).parent / "output_overlay.png"
        screenshot.save(destination)
        print(f"Saved overlay screenshot to: {destination}")
    except Exception as exc:
        print(f"Failed to capture/annotate screenshot: {exc}")

    # Read back the flag that the page's click handler sets
    submitted = execute_javascript(window_pid, "window.__submitted")
    print("text:", submitted)


if __name__ == "__main__":
    main()
|
||||
25
libs/python/bench-ui/pyproject.toml
Normal file
25
libs/python/bench-ui/pyproject.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[build-system]
|
||||
requires = ["pdm-backend"]
|
||||
build-backend = "pdm.backend"
|
||||
|
||||
[project]
|
||||
name = "cua-bench-ui"
|
||||
version = "0.7.0"
|
||||
description = "Lightweight webUI window controller for CUA bench using pywebview"
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
{ name = "TryCua", email = "gh@trycua.com" }
|
||||
]
|
||||
dependencies = [
|
||||
"pywebview>=5.3",
|
||||
"aiohttp>=3.9.0",
|
||||
"psutil>=5.9",
|
||||
]
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[tool.pdm]
|
||||
distribution = true
|
||||
|
||||
[tool.pdm.build]
|
||||
includes = ["bench_ui/"]
|
||||
source-includes = ["README.md"]
|
||||
50
libs/python/bench-ui/tests/test_port_detection.py
Normal file
50
libs/python/bench-ui/tests/test_port_detection.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import time
|
||||
|
||||
import psutil
|
||||
import pytest
|
||||
from bench_ui import execute_javascript, launch_window
|
||||
from bench_ui.api import _pid_to_port
|
||||
|
||||
HTML = """
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>Bench UI Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="t">hello-world</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
def test_execute_js_after_clearing_port_mapping():
    """JS execution still works after the cached pid -> port entry is removed.

    Launches a window, deletes its entry from ``_pid_to_port`` and checks that
    ``execute_javascript`` can still reach the page and read ``#t``.
    """
    # Skip if pywebview backend is unavailable on this machine. Only the
    # skip side effect is needed, so the returned module is not bound.
    pytest.importorskip("webview")

    pid = launch_window(html=HTML, title="Bench UI Test", width=400, height=300)
    try:
        # Give a brief moment for window to render and server to start
        time.sleep(1.0)

        # Sanity: mapping should exist initially
        assert pid in _pid_to_port

        # Clear the cached mapping to simulate a fresh process lookup
        del _pid_to_port[pid]

        # Now execute JS; this should succeed by detecting the port via psutil
        result = execute_javascript(pid, "document.querySelector('#t')?.textContent")
        assert result == "hello-world"
    finally:
        # Best-effort cleanup of the child process. Deliberately broad: a
        # cleanup failure must not replace the test's own outcome.
        try:
            p = psutil.Process(pid)
            p.terminate()
            try:
                p.wait(timeout=3)
            except psutil.TimeoutExpired:
                # Did not exit after terminate(); force it
                p.kill()
        except Exception:
            pass
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.1.30
|
||||
current_version = 0.1.31
|
||||
commit = True
|
||||
tag = True
|
||||
tag_name = computer-server-v{new_version}
|
||||
|
||||
@@ -40,7 +40,7 @@ Refer to this notebook for a step-by-step guide on how to use the Computer-Use S
|
||||
|
||||
## Docs
|
||||
|
||||
- [Commands](https://cua.ai/docs/libraries/computer-server/Commands)
|
||||
- [REST-API](https://cua.ai/docs/libraries/computer-server/REST-API)
|
||||
- [WebSocket-API](https://cua.ai/docs/libraries/computer-server/WebSocket-API)
|
||||
- [Index](https://cua.ai/docs/libraries/computer-server)
|
||||
- [Commands](https://cua.ai/docs/computer-sdk/computer-server/Commands)
|
||||
- [REST-API](https://cua.ai/docs/computer-sdk/computer-server/REST-API)
|
||||
- [WebSocket-API](https://cua.ai/docs/computer-sdk/computer-server/WebSocket-API)
|
||||
- [Index](https://cua.ai/docs/computer-sdk/computer-server)
|
||||
|
||||
@@ -24,8 +24,8 @@ from fastapi import (
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
from .handlers.factory import HandlerFactory
|
||||
from .browser import get_browser_manager
|
||||
from .handlers.factory import HandlerFactory
|
||||
|
||||
# Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
|
||||
AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
|
||||
@@ -805,7 +805,7 @@ async def playwright_exec_endpoint(
|
||||
try:
|
||||
browser_manager = get_browser_manager()
|
||||
result = await browser_manager.execute_command(command, params)
|
||||
|
||||
|
||||
if result.get("success"):
|
||||
return JSONResponse(content=result)
|
||||
else:
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
|
||||
|
||||
[project]
|
||||
name = "cua-computer-server"
|
||||
version = "0.1.30"
|
||||
version = "0.1.31"
|
||||
|
||||
description = "Server component for the Computer-Use Interface (CUI) framework powering Cua"
|
||||
authors = [
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[bumpversion]
|
||||
current_version = 0.4.17
|
||||
current_version = 0.4.18
|
||||
commit = True
|
||||
tag = True
|
||||
tag_name = computer-v{new_version}
|
||||
|
||||
@@ -7,7 +7,28 @@ import platform
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union, cast
|
||||
from functools import wraps
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Awaitable,
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
TypeVar,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
|
||||
try:
|
||||
from typing import ParamSpec
|
||||
except Exception: # pragma: no cover
|
||||
from typing_extensions import ParamSpec # type: ignore
|
||||
|
||||
P = ParamSpec("P")
|
||||
R = TypeVar("R")
|
||||
|
||||
from core.telemetry import is_telemetry_enabled, record_event
|
||||
from PIL import Image
|
||||
@@ -66,8 +87,9 @@ class Computer:
|
||||
verbosity: Union[int, LogLevel] = logging.INFO,
|
||||
telemetry_enabled: bool = True,
|
||||
provider_type: Union[str, VMProviderType] = VMProviderType.LUME,
|
||||
port: Optional[int] = 7777,
|
||||
provider_port: Optional[int] = 7777,
|
||||
noVNC_port: Optional[int] = 8006,
|
||||
api_port: Optional[int] = None,
|
||||
host: str = os.environ.get("PYLUME_HOST", "localhost"),
|
||||
storage: Optional[str] = None,
|
||||
ephemeral: bool = False,
|
||||
@@ -118,14 +140,19 @@ class Computer:
|
||||
|
||||
# Store original parameters
|
||||
self.image = image
|
||||
self.port = port
|
||||
self.provider_port = provider_port
|
||||
self.noVNC_port = noVNC_port
|
||||
self.api_port = api_port
|
||||
self.host = host
|
||||
self.os_type = os_type
|
||||
self.provider_type = provider_type
|
||||
self.ephemeral = ephemeral
|
||||
self.api_key = api_key if self.provider_type == VMProviderType.CLOUD else None
|
||||
|
||||
# Set default API port if not specified
|
||||
if self.api_port is None:
|
||||
self.api_port = 8443 if self.api_key else 8000
|
||||
|
||||
self.api_key = api_key
|
||||
self.experiments = experiments or []
|
||||
|
||||
if "app-use" in self.experiments:
|
||||
@@ -273,7 +300,7 @@ class Computer:
|
||||
interface = cast(
|
||||
BaseComputerInterface,
|
||||
InterfaceFactory.create_interface_for_os(
|
||||
os=self.os_type, ip_address=ip_address # type: ignore[arg-type]
|
||||
os=self.os_type, ip_address=ip_address, api_port=self.api_port # type: ignore[arg-type]
|
||||
),
|
||||
)
|
||||
self._interface = interface
|
||||
@@ -300,7 +327,7 @@ class Computer:
|
||||
storage = "ephemeral" if self.ephemeral else self.storage
|
||||
verbose = self.verbosity >= LogLevel.DEBUG
|
||||
ephemeral = self.ephemeral
|
||||
port = self.port if self.port is not None else 7777
|
||||
port = self.provider_port if self.provider_port is not None else 7777
|
||||
host = self.host if self.host else "localhost"
|
||||
image = self.image
|
||||
shared_path = self.shared_path
|
||||
@@ -365,6 +392,7 @@ class Computer:
|
||||
verbose=verbose,
|
||||
ephemeral=ephemeral,
|
||||
noVNC_port=noVNC_port,
|
||||
api_port=self.api_port,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported provider type: {self.provider_type}")
|
||||
@@ -513,13 +541,14 @@ class Computer:
|
||||
ip_address=ip_address,
|
||||
api_key=self.api_key,
|
||||
vm_name=self.config.name,
|
||||
api_port=self.api_port,
|
||||
),
|
||||
)
|
||||
else:
|
||||
interface = cast(
|
||||
BaseComputerInterface,
|
||||
InterfaceFactory.create_interface_for_os(
|
||||
os=self.os_type, ip_address=ip_address
|
||||
os=self.os_type, ip_address=ip_address, api_port=self.api_port
|
||||
),
|
||||
)
|
||||
|
||||
@@ -533,15 +562,13 @@ class Computer:
|
||||
# Use a single timeout for the entire connection process
|
||||
# The VM should already be ready at this point, so we're just establishing the connection
|
||||
await self._interface.wait_for_ready(timeout=30)
|
||||
self.logger.info("WebSocket interface connected successfully")
|
||||
self.logger.info("Sandbox interface connected successfully")
|
||||
except TimeoutError as e:
|
||||
self.logger.error(f"Failed to connect to WebSocket interface at {ip_address}")
|
||||
port = getattr(self._interface, "_api_port", 8000) # Default to 8000 if not set
|
||||
self.logger.error(f"Failed to connect to sandbox interface at {ip_address}:{port}")
|
||||
raise TimeoutError(
|
||||
f"Could not connect to WebSocket interface at {ip_address}:8000/ws: {str(e)}"
|
||||
f"Could not connect to sandbox interface at {ip_address}:{port}: {str(e)}"
|
||||
)
|
||||
# self.logger.warning(
|
||||
# f"Could not connect to WebSocket interface at {ip_address}:8000/ws: {str(e)}, expect missing functionality"
|
||||
# )
|
||||
|
||||
# Create an event to keep the VM running in background if needed
|
||||
if not self.use_host_computer_server:
|
||||
@@ -688,6 +715,7 @@ class Computer:
|
||||
ip_address=ip_address,
|
||||
api_key=self.api_key,
|
||||
vm_name=self.config.name,
|
||||
api_port=self.api_port,
|
||||
),
|
||||
)
|
||||
else:
|
||||
@@ -696,6 +724,7 @@ class Computer:
|
||||
InterfaceFactory.create_interface_for_os(
|
||||
os=self.os_type,
|
||||
ip_address=ip_address,
|
||||
api_port=self.api_port,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -1013,7 +1042,7 @@ class Computer:
|
||||
else:
|
||||
# POSIX (macOS/Linux)
|
||||
venv_path = f"$HOME/.venvs/{venv_name}"
|
||||
create_cmd = f'mkdir -p "$HOME/.venvs" && python3 -m venv "{venv_path}"'
|
||||
create_cmd = f'mkdir -p "$HOME/.venvs" && python -m venv "{venv_path}"'
|
||||
# Check if venv exists, if not create it
|
||||
check_cmd = f'test -d "{venv_path}" || ({create_cmd})'
|
||||
_ = await self.interface.run_command(check_cmd)
|
||||
@@ -1024,7 +1053,25 @@ class Computer:
|
||||
if requirements_str
|
||||
else "echo No requirements to install"
|
||||
)
|
||||
return await self.interface.run_command(install_cmd)
|
||||
return await self.interface.run_command(install_cmd)
|
||||
|
||||
async def pip_install(self, requirements: list[str]):
    """Install packages using the system Python/pip (no venv).

    Each requirement is passed to pip as a single, quoted command-line
    argument so that specifiers containing shell metacharacters (for example
    ``numpy>=1.0``, where ``>`` would otherwise be an output redirection)
    reach pip intact.

    Args:
        requirements: List of package requirements to install globally/user site.
            ``None`` or an empty list results in a no-op ``echo`` command.

    Returns:
        Whatever ``self.interface.run_command`` returns for the command that
        was run (other methods in this class read ``.stdout`` / ``.stderr``
        from that result).
    """
    import shlex

    requirements = requirements or []
    if not requirements:
        return await self.interface.run_command("echo No requirements to install")

    if self.os_type == "windows":
        # POSIX single-quoting is not understood on Windows; use double
        # quotes, matching how this file quotes other Windows commands.
        quoted = ['"' + req.replace('"', '\\"') + '"' for req in requirements]
    else:
        quoted = [shlex.quote(req) for req in requirements]

    # Use python -m pip for cross-platform consistency
    reqs = " ".join(quoted)
    install_cmd = f"python -m pip install {reqs}"
    return await self.interface.run_command(install_cmd)
|
||||
|
||||
async def venv_cmd(self, venv_name: str, command: str):
|
||||
"""Execute a shell command in a virtual environment.
|
||||
@@ -1074,19 +1121,11 @@ class Computer:
|
||||
The result of the function execution, or raises any exception that occurred
|
||||
"""
|
||||
import base64
|
||||
import inspect
|
||||
import json
|
||||
import textwrap
|
||||
|
||||
try:
|
||||
# Get function source code using inspect.getsource
|
||||
source = inspect.getsource(python_func)
|
||||
# Remove common leading whitespace (dedent)
|
||||
func_source = textwrap.dedent(source).strip()
|
||||
|
||||
# Remove decorators
|
||||
while func_source.lstrip().startswith("@"):
|
||||
func_source = func_source.split("\n", 1)[1].strip()
|
||||
func_source = helpers.generate_source_code(python_func)
|
||||
|
||||
# Get function name for execution
|
||||
func_name = python_func.__name__
|
||||
@@ -1101,19 +1140,23 @@ class Computer:
|
||||
raise Exception(f"Failed to reconstruct function source: {e}")
|
||||
|
||||
# Create Python code that will define and execute the function
|
||||
args_b64 = base64.b64encode(args_json.encode("utf-8")).decode("ascii")
|
||||
kwargs_b64 = base64.b64encode(kwargs_json.encode("utf-8")).decode("ascii")
|
||||
|
||||
python_code = f'''
|
||||
import json
|
||||
import traceback
|
||||
import base64
|
||||
|
||||
try:
|
||||
# Define the function from source
|
||||
{textwrap.indent(func_source, " ")}
|
||||
|
||||
# Deserialize args and kwargs from JSON
|
||||
args_json = """{args_json}"""
|
||||
kwargs_json = """{kwargs_json}"""
|
||||
args = json.loads(args_json)
|
||||
kwargs = json.loads(kwargs_json)
|
||||
# Deserialize args and kwargs from base64 JSON
|
||||
_args_b64 = """{args_b64}"""
|
||||
_kwargs_b64 = """{kwargs_b64}"""
|
||||
args = json.loads(base64.b64decode(_args_b64).decode('utf-8'))
|
||||
kwargs = json.loads(base64.b64decode(_kwargs_b64).decode('utf-8'))
|
||||
|
||||
# Execute the function
|
||||
result = {func_name}(*args, **kwargs)
|
||||
@@ -1177,10 +1220,21 @@ print(f"<<<VENV_EXEC_START>>>{{output_json}}<<<VENV_EXEC_END>>>")
|
||||
if output_payload["success"]:
|
||||
return output_payload["result"]
|
||||
else:
|
||||
import builtins
|
||||
|
||||
# Recreate and raise the original exception
|
||||
error_info = output_payload["error"]
|
||||
error_class = eval(error_info["type"])
|
||||
raise error_class(error_info["message"])
|
||||
error_info = output_payload.get("error", {}) or {}
|
||||
err_type = error_info.get("type") or "Exception"
|
||||
err_msg = error_info.get("message") or ""
|
||||
err_tb = error_info.get("traceback") or ""
|
||||
|
||||
exc_cls = getattr(builtins, err_type, None)
|
||||
if isinstance(exc_cls, type) and issubclass(exc_cls, BaseException):
|
||||
# Built-in exception: rethrow with remote traceback appended
|
||||
raise exc_cls(f"{err_msg}\n\nRemote traceback:\n{err_tb}")
|
||||
else:
|
||||
# Non built-in: raise a safe local error carrying full remote context
|
||||
raise RuntimeError(f"{err_type}: {err_msg}\n\nRemote traceback:\n{err_tb}")
|
||||
else:
|
||||
raise Exception("Invalid output format: markers found but no content between them")
|
||||
else:
|
||||
@@ -1188,3 +1242,345 @@ print(f"<<<VENV_EXEC_START>>>{{output_json}}<<<VENV_EXEC_END>>>")
|
||||
raise Exception(
|
||||
f"No output payload found. stdout: {result.stdout}, stderr: {result.stderr}"
|
||||
)
|
||||
|
||||
async def venv_exec_background(
    self, venv_name: str, python_func, *args, requirements: Optional[List[str]] = None, **kwargs
) -> int:
    """Run the Python function in the venv in the background and return the PID.

    Uses a short launcher Python that spawns a detached child and exits immediately.
    The child installs ``requirements`` one by one with pip (failures are
    ignored), then calls the function. On Windows the child's output is
    discarded; elsewhere it is appended to ``/tmp/cua_bg_<epoch>.log``.

    Args:
        venv_name: Name of the virtual environment under ``~/.venvs``
            (``%USERPROFILE%\\.venvs`` on Windows) to activate.
        python_func: Function whose source is extracted and run remotely.
        *args: Positional arguments, serialized as JSON (``default=str``).
        requirements: Optional packages to pip-install inside the venv first.
        **kwargs: Keyword arguments, serialized as JSON (``default=str``).

    Returns:
        PID printed by the launcher (the last line of its stdout).

    Raises:
        Exception: If the function source cannot be obtained or serialized.
        RuntimeError: If the launcher printed nothing to stdout.
        ValueError: If the launcher's last stdout line is not an integer.
    """
    import base64
    import json
    import textwrap
    import time as _time

    try:
        func_source = helpers.generate_source_code(python_func)
        func_name = python_func.__name__
        args_json = json.dumps(args, default=str)
        kwargs_json = json.dumps(kwargs, default=str)
    except OSError as e:
        raise Exception(f"Cannot retrieve source code for function {python_func.__name__}: {e}")
    except Exception as e:
        raise Exception(f"Failed to reconstruct function source: {e}")

    reqs_list = requirements or []
    reqs_json = json.dumps(reqs_list)

    # Base64 keeps arbitrary argument text from breaking the generated source
    args_b64 = base64.b64encode(args_json.encode("utf-8")).decode("ascii")
    kwargs_b64 = base64.b64encode(kwargs_json.encode("utf-8")).decode("ascii")

    # The whole template must be ONE f-string: if the tail were a plain string
    # the child would receive the literal text "{func_name}(...)" and fail with
    # a NameError instead of calling the function.
    payload_code = f'''
import json
import traceback
import base64

try:
    # Define the function from source
{textwrap.indent(func_source, "    ")}

    # Deserialize args and kwargs from base64 JSON
    _args_b64 = """{args_b64}"""
    _kwargs_b64 = """{kwargs_b64}"""
    args = json.loads(base64.b64decode(_args_b64).decode('utf-8'))
    kwargs = json.loads(base64.b64decode(_kwargs_b64).decode('utf-8'))

    # Ensure requirements inside the active venv
    for pkg in json.loads({reqs_json!r}):
        if pkg:
            import subprocess, sys
            subprocess.run([sys.executable, '-m', 'pip', 'install', pkg], check=False)
    _ = {func_name}(*args, **kwargs)
except Exception:
    import sys
    sys.stderr.write(traceback.format_exc())
'''
    payload_b64 = base64.b64encode(payload_code.encode("utf-8")).decode("ascii")

    def _pid_from(result) -> int:
        # The launcher prints the child's PID last; empty stdout means the
        # launcher itself never ran (e.g. venv activation failed).
        lines = (result.stdout or "").strip().splitlines()
        if not lines:
            raise RuntimeError(
                f"Background launcher printed no PID. stdout: {result.stdout}, stderr: {result.stderr}"
            )
        return int(lines[-1].strip())

    if self.os_type == "windows":
        # Launcher spawns detached child and prints its PID
        launcher_code = f"""
import base64, subprocess, os, sys
DETACHED_PROCESS = 0x00000008
CREATE_NEW_PROCESS_GROUP = 0x00000200
creationflags = DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
code = base64.b64decode("{payload_b64}").decode("utf-8")
p = subprocess.Popen(["python", "-c", code], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, creationflags=creationflags)
print(p.pid)
"""
        launcher_b64 = base64.b64encode(launcher_code.encode("utf-8")).decode("ascii")
        venv_path = f"%USERPROFILE%\\.venvs\\{venv_name}"
        cmd = (
            'cmd /c "'
            f'call "{venv_path}\\Scripts\\activate.bat" && '
            f"python -c \"import base64; exec(base64.b64decode('{launcher_b64}').decode('utf-8'))\""
            '"'
        )
        result = await self.interface.run_command(cmd)
        return _pid_from(result)
    else:
        log = f"/tmp/cua_bg_{int(_time.time())}.log"
        launcher_code = f"""
import base64, subprocess, os, sys
code = base64.b64decode("{payload_b64}").decode("utf-8")
with open("{log}", "ab", buffering=0) as f:
    p = subprocess.Popen(["python", "-c", code], stdout=f, stderr=subprocess.STDOUT, preexec_fn=getattr(os, "setsid", None))
print(p.pid)
"""
        launcher_b64 = base64.b64encode(launcher_code.encode("utf-8")).decode("ascii")
        venv_path = f"$HOME/.venvs/{venv_name}"
        shell = (
            f'. "{venv_path}/bin/activate" && '
            f"python -c \"import base64; exec(base64.b64decode('{launcher_b64}').decode('utf-8'))\""
        )
        result = await self.interface.run_command(shell)
        return _pid_from(result)
|
||||
|
||||
async def python_exec(self, python_func, *args, **kwargs):
    """Execute a Python function using the system Python (no venv).

    Uses source extraction and base64 transport, mirroring venv_exec but
    without virtual environment activation. The remote script prints a JSON
    payload between ``<<<VENV_EXEC_START>>>`` and ``<<<VENV_EXEC_END>>>``;
    anything the command wrote to stdout before the start marker is echoed
    locally.

    Args:
        python_func: Function whose source is extracted and run remotely.
        *args: Positional arguments, serialized as JSON (``default=str``).
        **kwargs: Keyword arguments, serialized as JSON (``default=str``).

    Returns:
        The function result as decoded from the JSON payload.

    Raises:
        Exception: If the source cannot be obtained, or the output payload is
            missing, empty or not valid JSON.
        BaseException: A built-in exception of the remote error's type, with
            the remote traceback appended to its message. ``RuntimeError`` is
            raised instead when the type is not a built-in or cannot be
            constructed from a single message.
    """
    import base64
    import json
    import textwrap

    try:
        func_source = helpers.generate_source_code(python_func)
        func_name = python_func.__name__
        args_json = json.dumps(args, default=str)
        kwargs_json = json.dumps(kwargs, default=str)
    except OSError as e:
        raise Exception(f"Cannot retrieve source code for function {python_func.__name__}: {e}")
    except Exception as e:
        raise Exception(f"Failed to reconstruct function source: {e}")

    # Base64 keeps arbitrary argument text from breaking the generated source
    args_b64 = base64.b64encode(args_json.encode("utf-8")).decode("ascii")
    kwargs_b64 = base64.b64encode(kwargs_json.encode("utf-8")).decode("ascii")

    python_code = f'''
import json
import traceback
import base64

try:
    # Define the function from source
{textwrap.indent(func_source, "    ")}

    # Deserialize args and kwargs from base64 JSON
    _args_b64 = """{args_b64}"""
    _kwargs_b64 = """{kwargs_b64}"""
    args = json.loads(base64.b64decode(_args_b64).decode('utf-8'))
    kwargs = json.loads(base64.b64decode(_kwargs_b64).decode('utf-8'))

    # Execute the function
    result = {func_name}(*args, **kwargs)

    # Create success output payload
    output_payload = {{
        "success": True,
        "result": result,
        "error": None
    }}

except Exception as e:
    # Create error output payload
    output_payload = {{
        "success": False,
        "result": None,
        "error": {{
            "type": type(e).__name__,
            "message": str(e),
            "traceback": traceback.format_exc()
        }}
    }}

# Serialize the output payload as JSON
import json
output_json = json.dumps(output_payload, default=str)

# Print the JSON output with markers
print(f"<<<VENV_EXEC_START>>>{{output_json}}<<<VENV_EXEC_END>>>")
'''

    encoded_code = base64.b64encode(python_code.encode("utf-8")).decode("ascii")
    python_command = (
        f"python -c \"import base64; exec(base64.b64decode('{encoded_code}').decode('utf-8'))\""
    )
    result = await self.interface.run_command(python_command)

    start_marker = "<<<VENV_EXEC_START>>>"
    end_marker = "<<<VENV_EXEC_END>>>"

    stdout = result.stdout or ""
    marker_pos = stdout.find(start_marker)

    # Echo whatever preceded the payload. Only slice when the marker exists:
    # find() returns -1 otherwise, which would silently drop the last character.
    if marker_pos != -1:
        print(stdout[:marker_pos])

    if marker_pos == -1 or end_marker not in stdout:
        raise Exception(
            f"No output payload found. stdout: {result.stdout}, stderr: {result.stderr}"
        )

    start_idx = marker_pos + len(start_marker)
    end_idx = stdout.find(end_marker)
    if start_idx >= end_idx:
        raise Exception("Invalid output format: markers found but no content between them")

    try:
        output_payload = json.loads(stdout[start_idx:end_idx])
    except Exception as e:
        raise Exception(f"Failed to decode output payload: {e}")

    if output_payload["success"]:
        return output_payload["result"]

    import builtins

    error_info = output_payload.get("error", {}) or {}
    err_type = error_info.get("type") or "Exception"
    err_msg = error_info.get("message") or ""
    err_tb = error_info.get("traceback") or ""

    exc_cls = getattr(builtins, err_type, None)
    if isinstance(exc_cls, type) and issubclass(exc_cls, BaseException):
        try:
            # Built-in exception: rethrow with remote traceback appended
            rebuilt = exc_cls(f"{err_msg}\n\nRemote traceback:\n{err_tb}")
        except TypeError:
            # Some built-ins (e.g. UnicodeDecodeError) cannot be built from a
            # single message; fall through to the generic error below.
            rebuilt = None
        if rebuilt is not None:
            raise rebuilt

    # Non built-in: raise a safe local error carrying full remote context
    raise RuntimeError(f"{err_type}: {err_msg}\n\nRemote traceback:\n{err_tb}")
|
||||
|
||||
async def python_exec_background(
    self, python_func, *args, requirements: Optional[List[str]] = None, **kwargs
) -> int:
    """Run a Python function with the system interpreter in the background and return PID.

    Uses a short launcher Python that spawns a detached child and exits immediately.

    The function source (with its collected dependencies) is produced by
    ``helpers.generate_source_code`` and shipped to the remote side base64-encoded,
    so no shell quoting of the user code is required.

    Args:
        python_func: Function to run remotely; its source must be retrievable.
        *args: Positional arguments, serialised with ``json.dumps(default=str)``
            (values that are not JSON-serialisable arrive as strings).
        requirements: Accepted for signature compatibility with the decorator in
            ``python_command``; this method does not currently use it.
            NOTE(review): callers appear to expect installation inside the child
            process - confirm whether that still needs implementing.
        **kwargs: Keyword arguments, serialised the same way as ``args``.

    Returns:
        The PID printed by the launcher for the detached child process.

    Raises:
        Exception: If the function source cannot be retrieved or reconstructed.
        RuntimeError: If the launcher printed nothing on stdout.
        ValueError: If the last stdout line of the launcher is not an integer.
    """
    import base64
    import json
    import textwrap
    import time as _time

    try:
        func_source = helpers.generate_source_code(python_func)
        func_name = python_func.__name__
        args_json = json.dumps(args, default=str)
        kwargs_json = json.dumps(kwargs, default=str)
    except OSError as e:
        raise Exception(
            f"Cannot retrieve source code for function {python_func.__name__}: {e}"
        ) from e
    except Exception as e:
        raise Exception(f"Failed to reconstruct function source: {e}") from e

    # Create Python code that will define and execute the function
    args_b64 = base64.b64encode(args_json.encode("utf-8")).decode("ascii")
    kwargs_b64 = base64.b64encode(kwargs_json.encode("utf-8")).decode("ascii")

    # The function source is indented so that it sits inside the ``try`` block;
    # any failure in the child is written to its stderr.
    payload_code = f'''
import json
import traceback
import base64

try:
    # Define the function from source
{textwrap.indent(func_source, "    ")}

    # Deserialize args and kwargs from base64 JSON
    _args_b64 = """{args_b64}"""
    _kwargs_b64 = """{kwargs_b64}"""
    args = json.loads(base64.b64decode(_args_b64).decode('utf-8'))
    kwargs = json.loads(base64.b64decode(_kwargs_b64).decode('utf-8'))

    _ = {func_name}(*args, **kwargs)
except Exception:
    import sys
    sys.stderr.write(traceback.format_exc())
'''
    payload_b64 = base64.b64encode(payload_code.encode("utf-8")).decode("ascii")

    if self.os_type == "windows":
        # Detach via creation flags; the child's output is discarded.
        launcher_code = f"""
import base64, subprocess, os, sys
DETACHED_PROCESS = 0x00000008
CREATE_NEW_PROCESS_GROUP = 0x00000200
creationflags = DETACHED_PROCESS | CREATE_NEW_PROCESS_GROUP
code = base64.b64decode("{payload_b64}").decode("utf-8")
p = subprocess.Popen(["python", "-c", code], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, creationflags=creationflags)
print(p.pid)
"""
    else:
        # Detach via a new session; the child's stdout/stderr are appended to a log
        # file whose name carries the launch time in whole seconds.
        log = f"/tmp/cua_bg_{int(_time.time())}.log"
        launcher_code = f"""
import base64, subprocess, os, sys
code = base64.b64decode("{payload_b64}").decode("utf-8")
with open("{log}", "ab", buffering=0) as f:
    p = subprocess.Popen(["python", "-c", code], stdout=f, stderr=subprocess.STDOUT, preexec_fn=getattr(os, "setsid", None))
print(p.pid)
"""

    launcher_b64 = base64.b64encode(launcher_code.encode("utf-8")).decode("ascii")
    cmd = f"python -c \"import base64; exec(base64.b64decode('{launcher_b64}').decode('utf-8'))\""
    result = await self.interface.run_command(cmd)

    # The PID is the last line the launcher prints; an empty stdout means the
    # launcher itself failed, so surface its output instead of an IndexError.
    pid_lines = (result.stdout or "").strip().splitlines()
    if not pid_lines:
        raise RuntimeError(
            f"Background launcher did not report a PID. stdout: {result.stdout}, stderr: {result.stderr}"
        )
    return int(pid_lines[-1].strip())
|
||||
|
||||
def python_command(
    self,
    requirements: Optional[List[str]] = None,
    *,
    venv_name: str = "default",
    use_system_python: bool = False,
    background: bool = False,
) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]]:
    """Build a decorator that runs the decorated function remotely on this Computer.

    Counterpart of `computer.helpers.sandboxed()` that is bound to this instance
    and can install packages before the function runs.

    Args:
        requirements: Packages to install before execution (copied into a list
            when the decorator is created).
        venv_name: Virtual environment used when ``use_system_python`` is False.
        use_system_python: Run with the system Python/pip rather than a venv.
        background: Launch detached and return the child PID instead of the
            function's result.

    Returns:
        A decorator producing an async callable with the same parameters as the
        wrapped function.
    """

    packages = list(requirements or [])

    def decorator(func: Callable[P, R]) -> Callable[P, Awaitable[R]]:
        @wraps(func)
        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            if background:
                # Background runs never block on installs here; the package list
                # is handed to the background launcher instead.
                if use_system_python:
                    return await self.python_exec_background(func, *args, requirements=packages, **kwargs)  # type: ignore[return-value]
                return await self.venv_exec_background(venv_name, func, *args, requirements=packages, **kwargs)  # type: ignore[return-value]

            if use_system_python:
                # Foreground, system interpreter: install only when asked to.
                if packages:
                    await self.pip_install(packages)
                return await self.python_exec(func, *args, **kwargs)

            # Foreground, venv: always go through venv_install, then execute.
            await self.venv_install(venv_name, packages)
            return await self.venv_exec(venv_name, func, *args, **kwargs)

        return wrapper

    return decorator
|
||||
|
||||
@@ -2,18 +2,46 @@
|
||||
Helper functions and decorators for the Computer module.
|
||||
"""
|
||||
|
||||
import ast
|
||||
import asyncio
|
||||
import builtins
|
||||
import importlib.util
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, Optional, TypeVar, cast
|
||||
from inspect import getsource
|
||||
from textwrap import dedent
|
||||
from types import FunctionType, ModuleType
|
||||
from typing import Any, Awaitable, Callable, Dict, List, Set, TypedDict, TypeVar
|
||||
|
||||
try:
|
||||
# Python 3.10+ has ParamSpec in typing
|
||||
from typing import ParamSpec
|
||||
except ImportError: # pragma: no cover
|
||||
# Fallback for environments without ParamSpec in typing
|
||||
from typing_extensions import ParamSpec # type: ignore
|
||||
|
||||
# Generic placeholders used by the decorator signatures in this module:
# P captures the wrapped function's parameters, R its return type.
P = ParamSpec("P")
R = TypeVar("R")
|
||||
|
||||
|
||||
class DependencyInfo(TypedDict):
    """Dependencies collected for a function by the dependency analysis."""

    # Import statements needed by the function and its helpers.
    import_statements: List[str]
    # (name, object) pairs for helper functions, classes and constants.
    definitions: List[tuple[str, Any]]
|
||||
|
||||
|
||||
# Global reference to the default computer instance; None until one is set.
_default_computer = None

# Global cache for function dependency analysis, keyed by the function object.
_function_dependency_map: Dict[FunctionType, DependencyInfo] = {}

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def set_default_computer(computer):
|
||||
def set_default_computer(computer: Any) -> None:
|
||||
"""
|
||||
Set the default computer instance to be used by the remote decorator.
|
||||
|
||||
@@ -24,19 +52,26 @@ def set_default_computer(computer):
|
||||
_default_computer = computer
|
||||
|
||||
|
||||
def sandboxed(venv_name: str = "default", computer: str = "default", max_retries: int = 3):
|
||||
def sandboxed(
|
||||
venv_name: str = "default",
|
||||
computer: str = "default",
|
||||
max_retries: int = 3,
|
||||
) -> Callable[[Callable[P, R]], Callable[P, Awaitable[R]]]:
|
||||
"""
|
||||
Decorator that wraps a function to be executed remotely via computer.venv_exec
|
||||
|
||||
The function is automatically analyzed for dependencies (imports, helper functions,
|
||||
constants, etc.) and reconstructed with all necessary code in the remote sandbox.
|
||||
|
||||
Args:
|
||||
venv_name: Name of the virtual environment to execute in
|
||||
computer: The computer instance to use, or "default" to use the globally set default
|
||||
max_retries: Maximum number of retries for the remote execution
|
||||
"""
|
||||
|
||||
def decorator(func):
|
||||
def decorator(func: Callable[P, R]) -> Callable[P, Awaitable[R]]:
|
||||
@wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
||||
# Determine which computer instance to use
|
||||
comp = computer if computer != "default" else _default_computer
|
||||
|
||||
@@ -54,6 +89,402 @@ def sandboxed(venv_name: str = "default", computer: str = "default", max_retries
|
||||
if i == max_retries - 1:
|
||||
raise e
|
||||
|
||||
# Should be unreachable because we either returned or raised
|
||||
raise RuntimeError("sandboxed wrapper reached unreachable code path")
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
def _extract_import_statement(name: str, module: ModuleType) -> str:
    """Return an ``import`` statement that binds ``module`` to ``name``.

    A plain ``import <module>`` is produced when ``name`` equals the top-level
    package of the module's ``__name__``; otherwise the module is imported
    under ``name`` as an alias.
    """
    qualified_name = module.__name__
    top_level_package = qualified_name.partition(".")[0]

    if name != top_level_package:
        return f"import {qualified_name} as {name}"
    return f"import {qualified_name}"
|
||||
|
||||
|
||||
def _is_third_party_module(module_name: str) -> bool:
    """Report whether ``module_name`` resolves to an installed third-party package.

    Standard-library names (per ``sys.stdlib_module_names`` when available) are
    never third-party. Otherwise the module counts as third-party only if its
    import spec has an origin inside a ``site-packages`` or ``dist-packages``
    directory. Modules that cannot be located yield ``False``.
    """
    if module_name in getattr(sys, "stdlib_module_names", ()):
        return False

    try:
        spec = importlib.util.find_spec(module_name)
    except (ImportError, ValueError):
        # ModuleNotFoundError is an ImportError; ValueError covers malformed names.
        return False

    origin = spec.origin if spec is not None else None
    if not origin:
        return False
    return any(marker in origin for marker in ("site-packages", "dist-packages"))
|
||||
|
||||
|
||||
def _is_project_import(module_name: str) -> bool:
    """Report whether ``module_name`` belongs to the current project.

    Relative-import placeholders (names starting with
    ``__relative_import_level_``) always count as project imports. Any other
    module must already be loaded in ``sys.modules``, have a ``__file__`` that
    is outside ``site-packages``/``dist-packages``, and that path must start
    with the current working directory.
    """
    if module_name.startswith("__relative_import_level_"):
        return True

    loaded = sys.modules.get(module_name)
    module_file = getattr(loaded, "__file__", None)
    if not module_file:
        return False

    if "site-packages" in module_file or "dist-packages" in module_file:
        return False

    return module_file.startswith(os.getcwd())
|
||||
|
||||
|
||||
def _categorize_module(module_name: str) -> str:
|
||||
"""Categorize a module as stdlib, third-party, or project."""
|
||||
if module_name.startswith("__relative_import_level_"):
|
||||
return "project"
|
||||
elif module_name in (
|
||||
set(sys.stdlib_module_names) if hasattr(sys, "stdlib_module_names") else set()
|
||||
):
|
||||
return "stdlib"
|
||||
elif _is_third_party_module(module_name):
|
||||
return "third_party"
|
||||
elif _is_project_import(module_name):
|
||||
return "project"
|
||||
else:
|
||||
return "unknown"
|
||||
|
||||
|
||||
class _DependencyVisitor(ast.NodeVisitor):
    """Collect imports, bound names and referenced names inside one function.

    Only nodes inside the function called ``function_name`` are recorded. All
    nested scopes share one flat set of local names.
    """

    def __init__(self, function_name: str) -> None:
        self.function_name = function_name
        # Top-level package names imported inside the function.
        self.internal_imports: Set[str] = set()
        # Unparsed import statements found inside the function, in visit order.
        self.internal_import_statements: List[str] = []
        # Every name read (Load context) inside the function.
        self.name_references: Set[str] = set()
        # Every name bound inside the function (parameters, assignments, ...).
        self.local_names: Set[str] = set()
        # True only while the body of the target function is being walked.
        self.inside_function = False

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        entering_target = node.name == self.function_name and not self.inside_function

        if not entering_target:
            # A nested def binds its own name locally; its body is still walked
            # so the target can be found when it is nested in another function.
            if self.inside_function:
                self.local_names.add(node.name)
            for statement in node.body:
                self.visit(statement)
            return

        self.inside_function = True

        signature = node.args
        for parameter in signature.args + signature.posonlyargs + signature.kwonlyargs:
            self.local_names.add(parameter.arg)
        for variadic in (signature.vararg, signature.kwarg):
            if variadic:
                self.local_names.add(variadic.arg)

        for statement in node.body:
            self.visit(statement)

        self.inside_function = False

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        # Async functions are handled exactly like regular ones.
        self.visit_FunctionDef(node)  # type: ignore

    def visit_Import(self, node: ast.Import) -> None:
        if self.inside_function:
            for alias in node.names:
                root_package = alias.name.split(".")[0]
                self.internal_imports.add(root_package)
                # ``import a.b`` binds ``a``; ``import a.b as c`` binds ``c``.
                self.local_names.add(alias.asname or root_package)
            self.internal_import_statements.append(ast.unparse(node))
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        if self.inside_function:
            if node.level == 0 and node.module:
                self.internal_imports.add(node.module.split(".")[0])
            elif node.level > 0:
                # Relative imports are recorded under a placeholder name.
                self.internal_imports.add(f"__relative_import_level_{node.level}__")

            for alias in node.names:
                self.local_names.add(alias.asname or alias.name)
            self.internal_import_statements.append(ast.unparse(node))

        self.generic_visit(node)

    def visit_Name(self, node: ast.Name) -> None:
        if self.inside_function:
            context = node.ctx
            if isinstance(context, ast.Load):
                self.name_references.add(node.id)
            elif isinstance(context, ast.Store):
                self.local_names.add(node.id)
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        if self.inside_function:
            self.local_names.add(node.name)
        self.generic_visit(node)

    def visit_For(self, node: ast.For) -> None:
        target = node.target
        if self.inside_function and isinstance(target, ast.Name):
            self.local_names.add(target.id)
        self.generic_visit(node)

    def visit_comprehension(self, node: ast.comprehension) -> None:
        target = node.target
        if self.inside_function and isinstance(target, ast.Name):
            self.local_names.add(target.id)
        self.generic_visit(node)

    def visit_ExceptHandler(self, node: ast.ExceptHandler) -> None:
        # ``except E as err`` binds ``err``.
        if self.inside_function and node.name:
            self.local_names.add(node.name)
        self.generic_visit(node)

    def visit_With(self, node: ast.With) -> None:
        if self.inside_function:
            for item in node.items:
                bound = item.optional_vars
                if bound and isinstance(bound, ast.Name):
                    self.local_names.add(bound.id)
        self.generic_visit(node)
|
||||
|
||||
|
||||
def _closure_value(owner: Any, ref_name: str) -> tuple[bool, Any]:
    """Return ``(found, value)`` for ``ref_name`` among ``owner``'s closure cells."""
    cells = getattr(owner, "__closure__", None)
    code = getattr(owner, "__code__", None)
    if not cells or code is None:
        return False, None

    # co_freevars and __closure__ are parallel sequences.
    for var_name, cell in zip(code.co_freevars, cells):
        if var_name == ref_name:
            try:
                return True, cell.cell_contents
            except (ValueError, AttributeError):
                # Cell is empty or doesn't have contents
                return False, None
    return False, None


def _resolve_reference(owner: Any, ref_name: str) -> tuple[bool, Any]:
    """Return ``(found, value)`` for ``ref_name``, checking globals before closure cells."""
    owner_globals = getattr(owner, "__globals__", None)
    if owner_globals and ref_name in owner_globals:
        return True, owner_globals[ref_name]
    return _closure_value(owner, ref_name)


def _traverse_and_collect_dependencies(func: FunctionType) -> DependencyInfo:
    """
    Traverse a function and collect its dependencies.

    Names the function reads but neither binds itself nor finds among the
    builtins are resolved through its globals and closure cells, then analysed
    recursively (up to 20 levels deep).

    Names are processed in sorted order so that the result is the same in every
    process. A found reference is kept even when its value is ``None``, and two
    different names bound to the same object are both kept.

    Args:
        func: The function whose source is analysed.

    Returns a dict with:
    - import_statements: List of import statements needed
    - definitions: List of (name, obj) tuples for helper functions/classes/constants

    Raises:
        OSError, TypeError: If the source of ``func`` itself cannot be retrieved.
    """
    source = dedent(getsource(func))
    tree = ast.parse(source)

    visitor = _DependencyVisitor(func.__name__)
    visitor.visit(tree)

    builtin_names = set(dir(builtins))
    external_refs = (visitor.name_references - visitor.local_names) - builtin_names

    # Include all internal import statements
    import_statements: List[str] = list(visitor.internal_import_statements)
    definitions: List[tuple[str, Any]] = []
    # Keyed on (name, id) rather than id alone: distinct names may share one
    # object (small ints, aliased modules) and each name still needs a binding.
    visited: Set[tuple[str, int]] = set()

    # Analyze external references recursively
    def analyze_object(obj: Any, name: str, depth: int = 0) -> None:
        if depth > 20:
            return

        visit_key = (name, id(obj))
        if visit_key in visited:
            return
        visited.add(visit_key)

        # Handle modules
        if inspect.ismodule(obj):
            import_statements.append(_extract_import_statement(name, obj))
            return

        # Handle functions and classes
        if (
            inspect.isfunction(obj)
            or inspect.isclass(obj)
            or inspect.isbuiltin(obj)
            or inspect.ismethod(obj)
        ):
            obj_module = getattr(obj, "__module__", None)
            if obj_module:
                base_module = obj_module.split(".")[0]
                module_category = _categorize_module(base_module)

                # If from stdlib/third-party, just add import
                if module_category in ("stdlib", "third_party"):
                    obj_name = getattr(obj, "__name__", name)

                    # Check if object is accessible by 'name' (in globals or closures)
                    is_accessible = name in func.__globals__ and func.__globals__[name] is obj
                    if not is_accessible:
                        found, value = _closure_value(func, name)
                        is_accessible = found and value is obj

                    if is_accessible and name == obj_name:
                        # Direct import: from requests import get, from math import sqrt
                        import_statements.append(f"from {base_module} import {name}")
                    else:
                        # Module import: import requests
                        import_statements.append(f"import {base_module}")
                    return

            try:
                obj_tree = ast.parse(dedent(getsource(obj)))
                obj_visitor = _DependencyVisitor(obj.__name__)
                obj_visitor.visit(obj_tree)

                obj_external_refs = (
                    obj_visitor.name_references - obj_visitor.local_names
                ) - builtin_names

                # Add internal imports from this object
                import_statements.extend(obj_visitor.internal_import_statements)

                # Recursively analyze its dependencies (only objects that carry globals)
                if getattr(obj, "__globals__", None):
                    for ref_name in sorted(obj_external_refs):
                        found, ref_obj = _resolve_reference(obj, ref_name)
                        if found:
                            analyze_object(ref_obj, ref_name, depth + 1)

                # Add this object to definitions. Stdlib/third-party objects
                # already returned above, so everything reaching this point is
                # project-level (or has no module) and must be shipped as source.
                definitions.append((name, obj))

            except (OSError, TypeError):
                # Source is unavailable (e.g. builtins); nothing can be shipped.
                pass
            return

        # Plain constants are carried over by value.
        if isinstance(obj, (int, float, str, bool, list, dict, tuple, set, frozenset, type(None))):
            definitions.append((name, obj))

    # Analyze all external references, globals first and then closure variables
    # (sibling functions in the enclosing scope).
    for name in sorted(external_refs):
        found, obj = _resolve_reference(func, name)
        if found:
            analyze_object(obj, name)

    # Remove duplicate import statements, keeping first-seen order
    unique_imports = list(dict.fromkeys(import_statements))

    # Remove duplicate definitions; the first object recorded for a name wins
    definitions_by_name: Dict[str, Any] = {}
    for name, obj in definitions:
        definitions_by_name.setdefault(name, obj)
    unique_definitions = list(definitions_by_name.items())

    return {
        "import_statements": unique_imports,
        "definitions": unique_definitions,
    }
|
||||
|
||||
|
||||
def _source_without_decorators(obj: Any, node_types: Any) -> str:
    """Return the dedented source of ``obj``, minus decorators on its leading definition.

    The source is re-rendered with ``ast.unparse`` only when its first
    statement is an instance of ``node_types``; otherwise it is returned as
    retrieved.
    """
    text = dedent(getsource(obj))
    module = ast.parse(text)
    if module.body and isinstance(module.body[0], node_types):
        module.body[0].decorator_list = []
        text = ast.unparse(module)
    return text


def generate_source_code(func: FunctionType) -> str:
    """
    Assemble standalone source for ``func`` together with its dependencies.

    The dependency analysis is cached per function object. The result consists
    of the import statements, then each collected definition (functions and
    classes as decorator-free source, everything else as ``name = repr(value)``),
    then ``func`` itself without decorators, joined by blank lines.

    Args:
        func: The function to generate source code for

    Returns:
        Complete Python source code as a string
    """
    info = _function_dependency_map.get(func)
    if info is None:
        info = _traverse_and_collect_dependencies(func)
        _function_dependency_map[func] = info

    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
    sections = []

    # 1. Imports
    if info["import_statements"]:
        sections.append("\n".join(info["import_statements"]))

    # 2. Definitions; any whose source cannot be retrieved are left out
    for name, obj in info["definitions"]:
        try:
            if inspect.isfunction(obj):
                sections.append(_source_without_decorators(obj, function_nodes))
            elif inspect.isclass(obj):
                sections.append(_source_without_decorators(obj, ast.ClassDef))
            else:
                sections.append(f"{name} = {obj!r}")
        except (OSError, TypeError):
            continue

    # 3. The main function (without decorators)
    sections.append(_source_without_decorators(func, function_nodes))

    return "\n\n".join(sections)
|
||||
|
||||
@@ -12,6 +12,7 @@ class InterfaceFactory:
|
||||
def create_interface_for_os(
|
||||
os: Literal["macos", "linux", "windows"],
|
||||
ip_address: str,
|
||||
api_port: Optional[int] = None,
|
||||
api_key: Optional[str] = None,
|
||||
vm_name: Optional[str] = None,
|
||||
) -> BaseComputerInterface:
|
||||
@@ -20,6 +21,7 @@ class InterfaceFactory:
|
||||
Args:
|
||||
os: Operating system type ('macos', 'linux', or 'windows')
|
||||
ip_address: IP address of the computer to control
|
||||
api_port: Optional API port of the computer to control
|
||||
api_key: Optional API key for cloud authentication
|
||||
vm_name: Optional VM name for cloud authentication
|
||||
|
||||
@@ -35,10 +37,16 @@ class InterfaceFactory:
|
||||
from .windows import WindowsComputerInterface
|
||||
|
||||
if os == "macos":
|
||||
return MacOSComputerInterface(ip_address, api_key=api_key, vm_name=vm_name)
|
||||
return MacOSComputerInterface(
|
||||
ip_address, api_key=api_key, vm_name=vm_name, api_port=api_port
|
||||
)
|
||||
elif os == "linux":
|
||||
return LinuxComputerInterface(ip_address, api_key=api_key, vm_name=vm_name)
|
||||
return LinuxComputerInterface(
|
||||
ip_address, api_key=api_key, vm_name=vm_name, api_port=api_port
|
||||
)
|
||||
elif os == "windows":
|
||||
return WindowsComputerInterface(ip_address, api_key=api_key, vm_name=vm_name)
|
||||
return WindowsComputerInterface(
|
||||
ip_address, api_key=api_key, vm_name=vm_name, api_port=api_port
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported OS type: {os}")
|
||||
|
||||
@@ -30,6 +30,7 @@ class GenericComputerInterface(BaseComputerInterface):
|
||||
api_key: Optional[str] = None,
|
||||
vm_name: Optional[str] = None,
|
||||
logger_name: str = "computer.interface.generic",
|
||||
api_port: Optional[int] = None,
|
||||
):
|
||||
super().__init__(ip_address, username, password, api_key, vm_name)
|
||||
self._ws = None
|
||||
@@ -47,6 +48,9 @@ class GenericComputerInterface(BaseComputerInterface):
|
||||
# Set logger name for the interface
|
||||
self.logger = Logger(logger_name, LogLevel.NORMAL)
|
||||
|
||||
# Store custom ports
|
||||
self._api_port = api_port
|
||||
|
||||
# Optional default delay time between commands (in seconds)
|
||||
self.delay = 0.0
|
||||
|
||||
@@ -70,7 +74,12 @@ class GenericComputerInterface(BaseComputerInterface):
|
||||
WebSocket URI for the Computer API Server
|
||||
"""
|
||||
protocol = "wss" if self.api_key else "ws"
|
||||
port = "8443" if self.api_key else "8000"
|
||||
# Use custom API port if provided, otherwise use defaults based on API key
|
||||
port = (
|
||||
str(self._api_port)
|
||||
if self._api_port is not None
|
||||
else ("8443" if self.api_key else "8000")
|
||||
)
|
||||
return f"{protocol}://{self.ip_address}:{port}/ws"
|
||||
|
||||
@property
|
||||
@@ -81,7 +90,12 @@ class GenericComputerInterface(BaseComputerInterface):
|
||||
REST URI for the Computer API Server
|
||||
"""
|
||||
protocol = "https" if self.api_key else "http"
|
||||
port = "8443" if self.api_key else "8000"
|
||||
# Use custom API port if provided, otherwise use defaults based on API key
|
||||
port = (
|
||||
str(self._api_port)
|
||||
if self._api_port is not None
|
||||
else ("8443" if self.api_key else "8000")
|
||||
)
|
||||
return f"{protocol}://{self.ip_address}:{port}/cmd"
|
||||
|
||||
# Mouse actions
|
||||
|
||||
@@ -13,7 +13,8 @@ class LinuxComputerInterface(GenericComputerInterface):
|
||||
password: str = "lume",
|
||||
api_key: Optional[str] = None,
|
||||
vm_name: Optional[str] = None,
|
||||
api_port: Optional[int] = None,
|
||||
):
|
||||
super().__init__(
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.linux"
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.linux", api_port
|
||||
)
|
||||
|
||||
@@ -13,9 +13,10 @@ class MacOSComputerInterface(GenericComputerInterface):
|
||||
password: str = "lume",
|
||||
api_key: Optional[str] = None,
|
||||
vm_name: Optional[str] = None,
|
||||
api_port: Optional[int] = None,
|
||||
):
|
||||
super().__init__(
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.macos"
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.macos", api_port
|
||||
)
|
||||
|
||||
async def diorama_cmd(self, action: str, arguments: Optional[dict] = None) -> dict:
|
||||
|
||||
@@ -13,7 +13,8 @@ class WindowsComputerInterface(GenericComputerInterface):
|
||||
password: str = "lume",
|
||||
api_key: Optional[str] = None,
|
||||
vm_name: Optional[str] = None,
|
||||
api_port: Optional[int] = None,
|
||||
):
|
||||
super().__init__(
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.windows"
|
||||
ip_address, username, password, api_key, vm_name, "computer.interface.windows", api_port
|
||||
)
|
||||
|
||||
@@ -37,7 +37,6 @@ class DockerProvider(BaseVMProvider):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
port: Optional[int] = 8000,
|
||||
host: str = "localhost",
|
||||
storage: Optional[str] = None,
|
||||
shared_path: Optional[str] = None,
|
||||
@@ -45,11 +44,11 @@ class DockerProvider(BaseVMProvider):
|
||||
verbose: bool = False,
|
||||
ephemeral: bool = False,
|
||||
vnc_port: Optional[int] = 6901,
|
||||
api_port: Optional[int] = None,
|
||||
):
|
||||
"""Initialize the Docker VM Provider.
|
||||
|
||||
Args:
|
||||
port: Currently unused (VM provider port)
|
||||
host: Hostname for the API server (default: localhost)
|
||||
storage: Path for persistent VM storage
|
||||
shared_path: Path for shared folder between host and container
|
||||
@@ -60,9 +59,10 @@ class DockerProvider(BaseVMProvider):
|
||||
verbose: Enable verbose logging
|
||||
ephemeral: Use ephemeral (temporary) storage
|
||||
vnc_port: Port for VNC interface (default: 6901)
|
||||
api_port: Port for API server (default: 8000)
|
||||
"""
|
||||
self.host = host
|
||||
self.api_port = 8000
|
||||
self.api_port = api_port if api_port is not None else 8000
|
||||
self.vnc_port = vnc_port
|
||||
self.ephemeral = ephemeral
|
||||
|
||||
@@ -296,6 +296,7 @@ class DockerProvider(BaseVMProvider):
|
||||
if vnc_port:
|
||||
cmd.extend(["-p", f"{vnc_port}:6901"]) # VNC port
|
||||
if api_port:
|
||||
# Map the API port to container port 8000 (computer-server default)
|
||||
cmd.extend(["-p", f"{api_port}:8000"]) # computer-server API port
|
||||
|
||||
# Add volume mounts if storage is specified
|
||||
|
||||
@@ -14,7 +14,7 @@ class VMProviderFactory:
|
||||
@staticmethod
|
||||
def create_provider(
|
||||
provider_type: Union[str, VMProviderType],
|
||||
port: int = 7777,
|
||||
provider_port: int = 7777,
|
||||
host: str = "localhost",
|
||||
bin_path: Optional[str] = None,
|
||||
storage: Optional[str] = None,
|
||||
@@ -23,13 +23,14 @@ class VMProviderFactory:
|
||||
verbose: bool = False,
|
||||
ephemeral: bool = False,
|
||||
noVNC_port: Optional[int] = None,
|
||||
api_port: Optional[int] = None,
|
||||
**kwargs,
|
||||
) -> BaseVMProvider:
|
||||
"""Create a VM provider of the specified type.
|
||||
|
||||
Args:
|
||||
provider_type: Type of VM provider to create
|
||||
port: Port for the API server
|
||||
provider_port: Port for the provider's API server
|
||||
host: Hostname for the API server
|
||||
bin_path: Path to provider binary if needed
|
||||
storage: Path for persistent VM storage
|
||||
@@ -37,7 +38,8 @@ class VMProviderFactory:
|
||||
image: VM image to use (for Lumier provider)
|
||||
verbose: Enable verbose logging
|
||||
ephemeral: Use ephemeral (temporary) storage
|
||||
noVNC_port: Specific port for noVNC interface (for Lumier provider)
|
||||
noVNC_port: Specific port for noVNC interface (for Lumier and Docker provider)
|
||||
api_port: Specific port for Computer API server (for Docker provider)
|
||||
|
||||
Returns:
|
||||
An instance of the requested VM provider
|
||||
@@ -63,7 +65,11 @@ class VMProviderFactory:
|
||||
"Please install it with 'pip install cua-computer[lume]'"
|
||||
)
|
||||
return LumeProvider(
|
||||
port=port, host=host, storage=storage, verbose=verbose, ephemeral=ephemeral
|
||||
provider_port=provider_port,
|
||||
host=host,
|
||||
storage=storage,
|
||||
verbose=verbose,
|
||||
ephemeral=ephemeral,
|
||||
)
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import LumeProvider: {e}")
|
||||
@@ -81,7 +87,7 @@ class VMProviderFactory:
|
||||
"Please install Docker for Apple Silicon and Lume CLI before using this provider."
|
||||
)
|
||||
return LumierProvider(
|
||||
port=port,
|
||||
provider_port=provider_port,
|
||||
host=host,
|
||||
storage=storage,
|
||||
shared_path=shared_path,
|
||||
@@ -121,7 +127,6 @@ class VMProviderFactory:
|
||||
"Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'"
|
||||
)
|
||||
return WinSandboxProvider(
|
||||
port=port,
|
||||
host=host,
|
||||
storage=storage,
|
||||
verbose=verbose,
|
||||
@@ -144,7 +149,6 @@ class VMProviderFactory:
|
||||
"Please install Docker and ensure it is running."
|
||||
)
|
||||
return DockerProvider(
|
||||
port=port,
|
||||
host=host,
|
||||
storage=storage,
|
||||
shared_path=shared_path,
|
||||
@@ -152,6 +156,7 @@ class VMProviderFactory:
|
||||
verbose=verbose,
|
||||
ephemeral=ephemeral,
|
||||
vnc_port=noVNC_port,
|
||||
api_port=api_port,
|
||||
)
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import DockerProvider: {e}")
|
||||
|
||||
@@ -38,7 +38,7 @@ class LumeProvider(BaseVMProvider):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
port: int = 7777,
|
||||
provider_port: int = 7777,
|
||||
host: str = "localhost",
|
||||
storage: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
@@ -47,7 +47,7 @@ class LumeProvider(BaseVMProvider):
|
||||
"""Initialize the Lume provider.
|
||||
|
||||
Args:
|
||||
port: Port for the Lume API server (default: 7777)
|
||||
provider_port: Port for the Lume API server (default: 7777)
|
||||
host: Host to use for API connections (default: localhost)
|
||||
storage: Path to store VM data
|
||||
verbose: Enable verbose logging
|
||||
@@ -59,7 +59,7 @@ class LumeProvider(BaseVMProvider):
|
||||
)
|
||||
|
||||
self.host = host
|
||||
self.port = port # Default port for Lume API
|
||||
self.port = provider_port # Default port for Lume API
|
||||
self.storage = storage
|
||||
self.verbose = verbose
|
||||
self.ephemeral = ephemeral # If True, VMs will be deleted after stopping
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user