From 7fb5e75711685464e121f1b9361820227d3a7257 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 01/38] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 505a9a5f453d1b738005ba3a41082e7a8e6ea0bc Mon Sep 17 00:00:00 2001 From: Finn Date: Sat, 26 Apr 2025 20:58:21 -0400 Subject: [PATCH 02/38] docs: fix wait action --- notebooks/blog/build-your-own-operator-on-macos-1.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/notebooks/blog/build-your-own-operator-on-macos-1.ipynb b/notebooks/blog/build-your-own-operator-on-macos-1.ipynb index 22db332d..70c0e6ea 100644 --- a/notebooks/blog/build-your-own-operator-on-macos-1.ipynb +++ b/notebooks/blog/build-your-own-operator-on-macos-1.ipynb @@ -145,9 +145,8 @@ " await computer.interface.press_key(key)\n", " \n", " elif action_type == \"wait\":\n", - " wait_time = action.time\n", - " print(f\"Waiting for {wait_time} seconds\")\n", - " await asyncio.sleep(wait_time)\n", + " print(f\"Waiting for 2 seconds\")\n", + " await asyncio.sleep(2)\n", " \n", " elif action_type == \"screenshot\":\n", " print(\"Taking screenshot\")\n", From 967a732bbad03159c916d6cbed9560e79d89e264 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 22:43:34 -0700 Subject: [PATCH 03/38] Add Lumier --- .gitignore | 8 +- .vscode/lumier.code-workspace | 30 ++++ libs/lumier/.dockerignore | 24 ++++ libs/lumier/Dockerfile | 74 ++++++++++ libs/lumier/README.md | 175 +++++++++++++++++++++++ libs/lumier/install.sh | 176 +++++++++++++++++++++++ libs/lumier/lumier | 200 +++++++++++++++++++++++++++ libs/lumier/mount/server.py | 10 ++ libs/lumier/mount/setup.sh | 9 ++ libs/lumier/src/bin/entry.sh | 98 +++++++++++++ libs/lumier/src/bin/server.sh | 99 +++++++++++++ libs/lumier/src/bin/tunnel-script.sh | 44 ++++++ libs/lumier/src/bin/tunnel.sh | 96 +++++++++++++ libs/lumier/src/config/constants.sh | 25 ++++ libs/lumier/src/hooks/on-logon.sh | 8 ++ libs/lumier/src/lib/utils.sh | 106 ++++++++++++++ libs/lumier/src/lib/vm.sh | 175 +++++++++++++++++++++++ 17 files changed, 1355 insertions(+), 2 deletions(-) create mode 100644 .vscode/lumier.code-workspace create mode 100644 libs/lumier/.dockerignore create mode 100644 libs/lumier/Dockerfile create mode 100644 libs/lumier/README.md create mode 100755 libs/lumier/install.sh create mode 100755 libs/lumier/lumier create mode 100644 libs/lumier/mount/server.py create mode 100755 libs/lumier/mount/setup.sh create mode 100755 libs/lumier/src/bin/entry.sh create mode 100755 libs/lumier/src/bin/server.sh create mode 100755 libs/lumier/src/bin/tunnel-script.sh create mode 100755 libs/lumier/src/bin/tunnel.sh create mode 100644 libs/lumier/src/config/constants.sh create mode 100755 libs/lumier/src/hooks/on-logon.sh create mode 100755 
libs/lumier/src/lib/utils.sh create mode 100755 libs/lumier/src/lib/vm.sh diff --git a/.gitignore b/.gitignore index ce8445bf..8265a5a1 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ dist/ downloads/ eggs/ .eggs/ -lib/ +lib/* +!libs/lumier/src/lib/ lib64/ parts/ sdist/ @@ -242,4 +243,7 @@ trajectories/ .storage/ # Gradio settings -.gradio_settings.json \ No newline at end of file +.gradio_settings.json + +# Lumier Storage +storage/ \ No newline at end of file diff --git a/.vscode/lumier.code-workspace b/.vscode/lumier.code-workspace new file mode 100644 index 00000000..26e12846 --- /dev/null +++ b/.vscode/lumier.code-workspace @@ -0,0 +1,30 @@ +{ + "folders": [ + { + "name": "lumier", + "path": "../libs/lumier" + }, + { + "name": "lume", + "path": "../libs/lume" + } + ], + "settings": { + "files.exclude": { + "**/.git": true, + "**/.svn": true, + "**/.hg": true, + "**/CVS": true, + "**/.DS_Store": true + } + }, + "tasks": { + "version": "2.0.0", + "tasks": [ + ] + }, + "launch": { + "configurations": [ + ] + } +} \ No newline at end of file diff --git a/libs/lumier/.dockerignore b/libs/lumier/.dockerignore new file mode 100644 index 00000000..3e0f9c98 --- /dev/null +++ b/libs/lumier/.dockerignore @@ -0,0 +1,24 @@ +# Ignore macOS system files and trash +.DS_Store +.Trashes +**/.Trashes +**/.* + +# Ignore Python cache +__pycache__/ +*.pyc +*.pyo + +# Ignore virtual environments +.venv/ +venv/ + +# Ignore editor/project files +.vscode/ +.idea/ +*.swp + +# Ignore test artifacts +test-results/ + +# Ignore anything else you don't want in the Docker build context diff --git a/libs/lumier/Dockerfile b/libs/lumier/Dockerfile new file mode 100644 index 00000000..710eb80b --- /dev/null +++ b/libs/lumier/Dockerfile @@ -0,0 +1,74 @@ +# Base image using Debian for arm64 architecture (optimized for Apple Silicon) +FROM debian:bullseye-slim AS lumier-base + +# Set environment variables for Lume API server configuration +ENV LUME_API_HOST="host.docker.internal" +ENV LUME_API_PORT="8080" + +# Default VM configuration (can be overridden at runtime) +ENV VERSION="ghcr.io/trycua/macos-sequoia-vanilla:latest" +ENV RAM_SIZE="8192" +ENV CPU_CORES="4" +ENV DISK_SIZE="100" +ENV DISPLAY="1024x768" +ENV VM_NAME="lumier" +ENV HOST_DATA_PATH="" +ENV LUMIER_DEBUG="0" + +# Install necessary tools and noVNC dependencies +RUN apt-get update && \ + apt-get install -y \ + netcat-traditional \ + curl \ + sshpass \ + wget \ + unzip \ + git \ + python3 \ + python3-pip \ + python3-numpy \ + procps && \ + rm -rf /var/lib/apt/lists/* + +# Add a dummy environment variable to invalidate cache +ENV CACHEBUST=1 + +# Download and install noVNC without caching +RUN wget https://github.com/trycua/noVNC/archive/refs/heads/master.zip -O master1.zip && \ + unzip master1.zip && \ + mv noVNC-master /opt/noVNC && \ + rm master1.zip + +# Set environment variables for noVNC +ENV NOVNC_PATH="/opt/noVNC" + +# Create directory structure +RUN mkdir -p /run/bin /run/lib /run/config /run/hooks + +# Copy scripts to the container +COPY src/bin/tunnel.sh /run/bin/ +COPY src/bin/tunnel-script.sh /usr/local/bin/lume +COPY src/bin/tunnel-script.sh /usr/local/bin/sshpass +COPY src/config/constants.sh /run/config/ +COPY src/bin/entry.sh /run/bin/entry.sh + +# Copy library files if they exist +COPY src/lib/ /run/lib/ +COPY src/hooks/ /run/hooks/ + +# Make scripts executable +RUN chmod +x /usr/local/bin/lume \ + /usr/local/bin/sshpass \ + /run/bin/* \ + /run/hooks/* 2>/dev/null || true + +# Expose ports for noVNC and Lume API +EXPOSE 8080 +EXPOSE 
8006 + +# VOLUME setup +VOLUME [ "/storage" ] +VOLUME [ "/data" ] + +# Default entrypoint +ENTRYPOINT ["/run/bin/entry.sh"] \ No newline at end of file diff --git a/libs/lumier/README.md b/libs/lumier/README.md new file mode 100644 index 00000000..65803e39 --- /dev/null +++ b/libs/lumier/README.md @@ -0,0 +1,175 @@ +
+  [project logo image (alt text: "Shows my svg")]
+
+  [![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
+  [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+  [![Homebrew](https://img.shields.io/badge/Homebrew-FBB040?logo=homebrew&logoColor=fff)](#install)
+  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+
+ +**Lumier** provides a Docker-based interface for the `lume` CLI, allowing you to easily run macOS virtual machines inside a container with VNC access. It creates a secure tunnel to execute lume commands on your host machine while providing a containerized environment for your applications. + +## Requirements + +Before using Lumier, make sure you have: + +1. Install [lume](https://github.com/trycua/cua/blob/main/libs/lume/README.md) on your host machine +2. Docker installed on your host machine +3. `socat` installed for the tunnel (install with Homebrew: `brew install socat`) + +## Installation + +You can use Lumier directly from its directory or install it to your system: + +```bash +# Option 1: Install to your user's bin directory (recommended) +./install.sh + +# Option 2: Install to a custom directory +./install.sh --install-dir=/usr/local/bin # May require sudo + +# Option 3: View installation options +./install.sh --help +``` + +After installation, you can run `lumier` from anywhere in your terminal. + +If you get a "command not found" error, make sure the installation directory is in your PATH. The installer will warn you if it isn't and provide instructions to add it. + +## Usage + +There are two ways to use Lumier: with the provided script or directly with Docker. + +### Option 1: Using the Lumier Script + +Lumier provides a simple CLI interface to manage VMs in Docker with full Docker compatibility: + +```bash +# Show help and available commands +lumier help + +# Start the tunnel to connect to lume +lumier start + +# Check if the tunnel is running +lumier status + +# Stop the tunnel +lumier stop + +# Build the Docker image (optional, happens automatically on first run) +lumier build + +# Run a VM with default settings +lumier run -it --rm + +# Run a VM with custom settings using Docker's -e flag +lumier run -it --rm \ + --name lumier-vm \ + -p 8006:8006 \ + -v $(pwd)/storage:/storage \ + -v $(pwd)/shared:/data \ + -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ + -e CPU_CORES=4 \ + -e RAM_SIZE=8192 + +# Note: +# The lumier script now automatically detects the real host paths for ./storage and ./shared +# and passes them to the container as HOST_STORAGE_PATH and HOST_DATA_PATH. +# You do NOT need to specify these environment variables manually. +# The VM name is always set from the container name. +``` + +### Option 2: Using Docker Directly + +You can also use Docker commands directly without the lumier utility: + +```bash +# 1. Start the tunnel manually +cd libs/lumier +socat TCP-LISTEN:8080,reuseaddr,fork EXEC:"$PWD/src/bin/tunnel.sh" & +TUNNEL_PID=$! + +# 2. Build the Docker image +docker build -t lumier:latest . + +# 3. Run the container +docker run -it --rm \ + --name lumier-vm \ + -p 8006:8006 \ + -v $(pwd)/storage:/storage \ + -v $(pwd)/shared:/data \ + -e VM_NAME=lumier-vm \ + -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ + -e CPU_CORES=4 \ + -e RAM_SIZE=8192 \ + -e HOST_STORAGE_PATH=$(pwd)/storage \ + -e HOST_DATA_PATH=$(pwd)/shared \ + lumier:latest + +# 4. 
Stop the tunnel when you're done +kill $TUNNEL_PID + +# Alternatively, find and kill the tunnel process +# First, find the process +lsof -i TCP:8080 +# Then kill it by PID +kill +``` + +Note that when using Docker directly, you're responsible for: +- Starting and managing the tunnel +- Building the Docker image +- Providing the correct environment variables + +## Available Environment Variables + +These variables can be set using Docker's `-e` flag: + +- `VM_NAME`: Set the VM name (default: lumier) +- `VERSION`: Set the VM image (default: ghcr.io/trycua/macos-sequoia-vanilla:latest) +- `CPU_CORES`: Set the number of CPU cores (default: 4) +- `RAM_SIZE`: Set the memory size in MB (default: 8192) +- `DISPLAY`: Set the display resolution (default: 1024x768) +- `HOST_DATA_PATH`: Path on the host to share with the VM +- `LUMIER_DEBUG`: Enable debug mode (set to 1) + +## Project Structure + +The project is organized as follows: + +``` +lumier/ +├── Dockerfile # Main Docker image definition +├── README.md # This file +├── lumier # Main CLI script +├── install.sh # Installation script +├── src/ # Source code +│ ├── bin/ # Executable scripts +│ │ ├── entry.sh # Docker entrypoint +│ │ ├── server.sh # Tunnel server manager +│ │ └── tunnel.sh # Tunnel request handler +│ ├── config/ # Configuration +│ │ └── constants.sh # Shared constants +│ ├── hooks/ # Lifecycle hooks +│ │ └── on-logon.sh # Run after VM boots +│ └── lib/ # Shared library code +│ ├── utils.sh # Utility functions +│ └── vm.sh # VM management functions +└── mount/ # Default shared directory +``` + +## VNC Access + +When a VM is running, you can access it via VNC through: +http://localhost:8006/vnc.html + +The password is displayed in the console output when the VM starts. \ No newline at end of file diff --git a/libs/lumier/install.sh b/libs/lumier/install.sh new file mode 100755 index 00000000..bd9e3b6b --- /dev/null +++ b/libs/lumier/install.sh @@ -0,0 +1,176 @@ +#!/bin/bash +set -e + +# Lumier Installer +# This script installs Lumier to your system + +# Define colors for output +BOLD=$(tput bold) +NORMAL=$(tput sgr0) +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +BLUE=$(tput setaf 4) +YELLOW=$(tput setaf 3) + +# Default installation directory (user-specific, doesn't require sudo) +DEFAULT_INSTALL_DIR="$HOME/.local/bin" +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" + +# Script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Parse command line arguments +while [ "$#" -gt 0 ]; do + case "$1" in + --install-dir=*) + INSTALL_DIR="${1#*=}" + ;; + --help) + echo "${BOLD}${BLUE}Lumier Installer${NORMAL}" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" + echo " --help Display this help message" + echo "" + echo "Examples:" + echo " $0 # Install to $DEFAULT_INSTALL_DIR" + echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" + echo " INSTALL_DIR=/opt/lumier $0 # Install to /opt/lumier (legacy env var support)" + exit 0 + ;; + *) + echo "${RED}Unknown option: $1${NORMAL}" + echo "Use --help for usage information" + exit 1 + ;; + esac + shift +done + +echo "${BOLD}${BLUE}Lumier Installer${NORMAL}" +echo "This script will install Lumier to your system." 
+ +# Check if we're running with appropriate permissions +check_permissions() { + # System directories that typically require root privileges + SYSTEM_DIRS=("/usr/local/bin" "/usr/bin" "/bin" "/opt") + + NEEDS_ROOT=false + for DIR in "${SYSTEM_DIRS[@]}"; do + if [[ "$INSTALL_DIR" == "$DIR"* ]] && [ ! -w "$INSTALL_DIR" ]; then + NEEDS_ROOT=true + break + fi + done + + if [ "$NEEDS_ROOT" = true ]; then + echo "${YELLOW}Warning: Installing to $INSTALL_DIR may require root privileges.${NORMAL}" + echo "Consider these alternatives:" + echo " • Install to a user-writable location: $0 --install-dir=$HOME/.local/bin" + echo " • Create the directory with correct permissions first:" + echo " sudo mkdir -p $INSTALL_DIR && sudo chown $(whoami) $INSTALL_DIR" + echo "" + + # Check if we already have write permission (might have been set up previously) + if [ ! -w "$INSTALL_DIR" ] && [ ! -w "$(dirname "$INSTALL_DIR")" ]; then + echo "${RED}Error: You don't have write permission to $INSTALL_DIR${NORMAL}" + echo "Please choose a different installation directory or ensure you have the proper permissions." + exit 1 + fi + fi +} + +# Detect OS and architecture +detect_platform() { + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m) + + if [ "$OS" != "darwin" ]; then + echo "${RED}Error: Currently only macOS is supported.${NORMAL}" + exit 1 + fi + + if [ "$ARCH" != "arm64" ]; then + echo "${RED}Error: Lumier only supports macOS on Apple Silicon (ARM64).${NORMAL}" + exit 1 + fi + + PLATFORM="darwin-arm64" + echo "Detected platform: ${BOLD}$PLATFORM${NORMAL}" +} + +# Check dependencies +check_dependencies() { + echo "Checking dependencies..." + + # Check if lume is installed + if ! command -v lume &> /dev/null; then + echo "${RED}Error: Lume is required but not installed.${NORMAL}" + echo "Please install Lume first: https://github.com/trycua/cua/blob/main/libs/lume/README.md" + exit 1 + fi + + # Check if socat is installed + if ! command -v socat &> /dev/null; then + echo "${YELLOW}Warning: socat is required but not installed.${NORMAL}" + echo "Installing socat with Homebrew..." + + # Check if Homebrew is installed + if ! command -v brew &> /dev/null; then + echo "${RED}Error: Homebrew is required to install socat.${NORMAL}" + echo "Please install Homebrew first: https://brew.sh/" + echo "Or install socat manually, then run this script again." + exit 1 + fi + + # Install socat + brew install socat + fi + + # Check if Docker is installed + if ! command -v docker &> /dev/null; then + echo "${YELLOW}Warning: Docker is required but not installed.${NORMAL}" + echo "Please install Docker: https://docs.docker.com/get-docker/" + echo "Continuing with installation, but Lumier will not work without Docker." + fi + + echo "${GREEN}All dependencies are satisfied.${NORMAL}" +} + +# Copy the lumier script directly +copy_lumier() { + echo "Copying lumier script to $INSTALL_DIR..." + cp "$SCRIPT_DIR/lumier" "$INSTALL_DIR/lumier" + chmod +x "$INSTALL_DIR/lumier" +} + +# Main installation flow +main() { + check_permissions + detect_platform + check_dependencies + + echo "Installing Lumier to $INSTALL_DIR..." 
+ + # Create install directory if it doesn't exist + mkdir -p "$INSTALL_DIR" + + # Copy the lumier script + copy_lumier + + echo "${GREEN}Installation complete!${NORMAL}" + echo "Lumier has been installed to ${BOLD}$INSTALL_DIR/lumier${NORMAL}" + + # Check if the installation directory is in PATH + if [[ ":$PATH:" != *":$INSTALL_DIR:"* ]]; then + echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" + echo "To add it, run one of these commands based on your shell:" + echo " For bash: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" + echo " For zsh: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zshrc" + echo " For fish: echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + fi +} + +# Run the installation +main \ No newline at end of file diff --git a/libs/lumier/lumier b/libs/lumier/lumier new file mode 100755 index 00000000..1c3912cb --- /dev/null +++ b/libs/lumier/lumier @@ -0,0 +1,200 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -eo pipefail + +# Always use the current working directory as the build context +SCRIPT_DIR="$(pwd)" +PORT=8080 +DEBUG=${LUMIER_DEBUG:-0} + +usage() { + cat </dev/null | grep LISTEN > /dev/null; then + return 0 # Tunnel is active + else + return 1 # Tunnel is not active + fi +} + +# Start the tunnel if needed +ensure_tunnel() { + if ! is_tunnel_active; then + echo "Tunnel is not active. Starting tunnel..." + "$SCRIPT_DIR/src/bin/server.sh" start + sleep 2 # Wait for the tunnel to start + + if ! is_tunnel_active; then + echo "Failed to start tunnel. Make sure 'lume' is installed on your host." + exit 1 + fi + else + echo "Tunnel is already active." + fi +} + +# Build the Docker image with cache busting +build_image() { + local image_name="${LUMIER_IMAGE:-lumier:latest}" + echo "Building Lumier Docker image: $image_name" + echo "SCRIPT_DIR=$SCRIPT_DIR" + echo "Checking for Dockerfile at: $SCRIPT_DIR/Dockerfile" + ls -l "$SCRIPT_DIR/Dockerfile" || echo "Dockerfile not found at $SCRIPT_DIR/Dockerfile" + + # Pass any additional arguments to docker build with cache busting + docker build --build-arg CACHEBUST=$(date +%s) -t "$image_name" "$SCRIPT_DIR" "$@" + + echo "Lumier image built successfully: $image_name" +} + +# Run the Docker container +run_container() { + local image_name="${LUMIER_IMAGE:-lumier:latest}" + + # Ensure the Docker image exists + if ! docker image inspect "$image_name" &>/dev/null; then + echo "Docker image '$image_name' not found. Building it..." 
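+        # build_image tags the image as ${LUMIER_IMAGE:-lumier:latest}, the same tag used by docker run below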
+ build_image + fi + + # Ensure the tunnel is running + ensure_tunnel + + # Automatically resolve and pass host paths for storage and data + STORAGE_PATH="${HOST_STORAGE_PATH:-$(realpath ./storage)}" + DATA_PATH="${HOST_DATA_PATH:-$(realpath ./shared)}" + + # Only add -e if not already present in args + DOCKER_ARGS=( ) + add_env_var() { + local var="$1"; local val="$2"; local flag="-e $var=" + for arg in "$@"; do + [[ "$arg" == *"$flag"* ]] && return 0 + done + DOCKER_ARGS+=( -e "$var=$val" ) + } + add_env_var HOST_STORAGE_PATH "$STORAGE_PATH" + add_env_var HOST_DATA_PATH "$DATA_PATH" + + # Detect --name argument and set VM_NAME if not already present + local container_name="" + local prev_arg="" + for arg in "$@"; do + if [[ "$prev_arg" == "--name" ]]; then + container_name="$arg" + break + elif [[ "$arg" == --name=* ]]; then + container_name="${arg#--name=}" + break + fi + prev_arg="$arg" + done + # Only add -e VM_NAME if not already present and container_name is set + local vm_name_set=false + for arg in "$@"; do + if [[ "$arg" == "-e" ]] && [[ "$2" == VM_NAME=* ]]; then + vm_name_set=true + break + elif [[ "$arg" == "-eVM_NAME="* ]]; then + vm_name_set=true + break + elif [[ "$arg" == "-e"* ]] && [[ "$arg" == *"VM_NAME="* ]]; then + vm_name_set=true + break + fi + done + if [[ -n "$container_name" && "$vm_name_set" != true ]]; then + DOCKER_ARGS+=( -e "VM_NAME=$container_name" ) + fi + + echo "Running Lumier container with image: $image_name" + if [[ "$*" == *"-p 8006:8006"* || "$*" == *"-p"*"8006:8006"* ]]; then + docker run "${DOCKER_ARGS[@]}" "$@" "$image_name" + else + docker run "${DOCKER_ARGS[@]}" -p 8006:8006 "$@" "$image_name" + fi +} + +# Main command handling +case "${1:-help}" in + run) + shift + run_container "$@" + ;; + tunnel) + # Handle tunnel subcommands + case "${2:-}" in + start) + "$SCRIPT_DIR/src/bin/server.sh" start + ;; + stop) + "$SCRIPT_DIR/src/bin/server.sh" stop + ;; + status) + "$SCRIPT_DIR/src/bin/server.sh" status + ;; + *) + echo "Unknown tunnel subcommand: $2" + usage + exit 1 + ;; + esac + ;; + + build) + shift + build_image "$@" + ;; + help) + usage + ;; + *) + echo "Unknown command: $1" + usage + exit 1 + ;; +esac \ No newline at end of file diff --git a/libs/lumier/mount/server.py b/libs/lumier/mount/server.py new file mode 100644 index 00000000..464c26ad --- /dev/null +++ b/libs/lumier/mount/server.py @@ -0,0 +1,10 @@ +from flask import Flask + +app = Flask(__name__) + +@app.route('/') +def hello_world(): + return 'Hello, World, from VM!' + +if __name__ == '__main__': + app.run(debug=True, host="0.0.0.0", port=5001) \ No newline at end of file diff --git a/libs/lumier/mount/setup.sh b/libs/lumier/mount/setup.sh new file mode 100755 index 00000000..8897896e --- /dev/null +++ b/libs/lumier/mount/setup.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +echo "Creating helloworld.txt on the Desktop..." +if [ ! -f ~/Desktop/helloworld.txt ]; then + echo "Hello, World!" > ~/Desktop/helloworld.txt + echo "helloworld.txt created successfully." +else + echo "helloworld.txt already exists." 
+fi \ No newline at end of file diff --git a/libs/lumier/src/bin/entry.sh b/libs/lumier/src/bin/entry.sh new file mode 100755 index 00000000..66a375ad --- /dev/null +++ b/libs/lumier/src/bin/entry.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source configuration files +CONFIG_DIR="/run/config" +LIB_DIR="/run/lib" + +# Source constants if available +if [ -f "${CONFIG_DIR}/constants.sh" ]; then + source "${CONFIG_DIR}/constants.sh" +fi + +# Import utilities +for lib in "${LIB_DIR}"/*.sh; do + if [ -f "$lib" ]; then + source "$lib" + fi +done + +# Set VM_NAME to env or fallback to container name (from --name) +if [ -z "${VM_NAME:-}" ]; then + VM_NAME="$(cat /etc/hostname)" + export VM_NAME +fi + +# Set HOST_STORAGE_PATH to /storage/$VM_NAME if not set +if [ -z "${HOST_STORAGE_PATH:-}" ]; then + HOST_STORAGE_PATH="/storage/$VM_NAME" + export HOST_STORAGE_PATH +fi + +# Optionally check for mountpoints +if mountpoint -q /storage; then + echo "/storage is mounted" +fi +if mountpoint -q /data; then + echo "/data is mounted" +fi + +# Log startup info +echo "Lumier VM is starting..." + +# Cleanup function to ensure VM and noVNC proxy shutdown on container stop +cleanup() { + set +e # Don't exit on error in cleanup + echo "[cleanup] Caught signal, shutting down..." + echo "[cleanup] Stopping VM..." + stop_vm + # Now gently stop noVNC proxy if running + # if [ -n "${NOVNC_PID:-}" ] && kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] Stopping noVNC proxy (PID $NOVNC_PID)..." + # kill -TERM "$NOVNC_PID" + # # Wait up to 5s for noVNC to exit + # for i in {1..5}; do + # if ! kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] noVNC proxy stopped." + # break + # fi + # sleep 1 + # done + # # Escalate if still running + # if kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] noVNC proxy did not exit, killing..." + # kill -KILL "$NOVNC_PID" 2>/dev/null + # fi + # fi + echo "[cleanup] Done. Exiting." + exit 0 +} +trap cleanup SIGTERM SIGINT + +# Start the VM +start_vm + +# Start noVNC for VNC access +NOVNC_PID="" +if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then + echo "Starting noVNC proxy with optimized color settings..." + ${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 & + NOVNC_PID=$! + disown $NOVNC_PID + echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true&logging=debug" +fi + +# Run any post-startup hooks +if [ -d "/run/hooks" ]; then + for hook in /run/hooks/*; do + if [ -x "$hook" ]; then + echo "Running hook: $(basename "$hook")" + "$hook" + fi + done +fi + +echo "Lumier is running. Press Ctrl+C to stop." 
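+# Keep PID 1 alive so the container keeps running; the VM and noVNC proxy run in the
+# background and are shut down by the cleanup trap above when the container stops.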
+tail -f /dev/null \ No newline at end of file diff --git a/libs/lumier/src/bin/server.sh b/libs/lumier/src/bin/server.sh new file mode 100755 index 00000000..5849d667 --- /dev/null +++ b/libs/lumier/src/bin/server.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source constants if available +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -f "${SCRIPT_DIR}/../config/constants.sh" ]; then + source "${SCRIPT_DIR}/../config/constants.sh" +fi + +# Use the tunnel port from constants if available, otherwise default to 8080 +PORT="${TUNNEL_PORT:-8080}" +TUNNEL_SCRIPT="${SCRIPT_DIR}/tunnel.sh" + +# Function to check if the tunnel is active +is_tunnel_active() { + if lsof -i TCP:$PORT 2>/dev/null | grep LISTEN > /dev/null; then + return 0 # Tunnel is active + else + return 1 # Tunnel is not active + fi +} + +# Function to start the tunnel +start_tunnel() { + echo "Starting tunnel on port $PORT..." + if is_tunnel_active; then + echo "Tunnel is already running on port $PORT." + return 0 + fi + + # Start socat in the background + socat TCP-LISTEN:$PORT,reuseaddr,fork EXEC:"$TUNNEL_SCRIPT" & + SOCAT_PID=$! + + # Check if the tunnel started successfully + sleep 1 + if ! is_tunnel_active; then + echo "Failed to start tunnel on port $PORT." + return 1 + fi + + echo "Tunnel started successfully on port $PORT (PID: $SOCAT_PID)." + return 0 +} + +# Function to stop the tunnel +stop_tunnel() { + echo "Stopping tunnel on port $PORT..." + if ! is_tunnel_active; then + echo "No tunnel running on port $PORT." + return 0 + fi + + # Find and kill the socat process + local pid=$(lsof -i TCP:$PORT | grep LISTEN | awk '{print $2}') + if [ -n "$pid" ]; then + kill $pid + echo "Tunnel stopped (PID: $pid)." + return 0 + else + echo "Failed to find process using port $PORT." + return 1 + fi +} + +# Function to check tunnel status +status_tunnel() { + if is_tunnel_active; then + local pid=$(lsof -i TCP:$PORT | grep LISTEN | awk '{print $2}') + echo "Tunnel is active on port $PORT (PID: $pid)." + return 0 + else + echo "No tunnel running on port $PORT." + return 1 + fi +} + +# Parse command line arguments +case "${1:-}" in + start) + start_tunnel + ;; + stop) + stop_tunnel + ;; + restart) + stop_tunnel + start_tunnel + ;; + status) + status_tunnel + ;; + *) + echo "Usage: $0 {start|stop|restart|status}" + exit 1 + ;; +esac \ No newline at end of file diff --git a/libs/lumier/src/bin/tunnel-script.sh b/libs/lumier/src/bin/tunnel-script.sh new file mode 100755 index 00000000..529839ea --- /dev/null +++ b/libs/lumier/src/bin/tunnel-script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Source constants if running in container context +if [ -f "/run/config/constants.sh" ]; then + source "/run/config/constants.sh" +fi + +# Define server address with fallback +SERVER="${TUNNEL_HOST:-host.docker.internal}:${TUNNEL_PORT:-8080}" + +# Extract the base name of the command and arguments +command=$(basename "$0") +subcommand="$1" +shift +args="$@" + +command="$command $subcommand $args" + +# Concatenate command and any stdin data +full_data="$command" +if [ ! 
-t 0 ]; then + stdin_data=$(cat) + if [ -n "$stdin_data" ]; then + # Format full_data to include stdin data + full_data="$full_data << 'EOF' + $stdin_data +EOF" + fi +fi + +# Trim leading/trailing whitespace and newlines +full_data=$(echo -e "$full_data" | sed 's/^[ \t\n]*//;s/[ \t\n]*$//') + +# Log command if debug is enabled +if [ "${LUMIER_DEBUG:-0}" -eq 1 ]; then + echo "Executing lume command: $full_data" >&2 + echo "Sending to: $SERVER" >&2 +fi + +# Use curl with -N to disable output buffering and -s for silent mode +curl -N -s -X POST \ + -H "Content-Type: application/octet-stream" \ + --data-binary @- \ + "http://$SERVER" <<< "$full_data" \ No newline at end of file diff --git a/libs/lumier/src/bin/tunnel.sh b/libs/lumier/src/bin/tunnel.sh new file mode 100755 index 00000000..6de14282 --- /dev/null +++ b/libs/lumier/src/bin/tunnel.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source constants if available +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -f "${SCRIPT_DIR}/../config/constants.sh" ]; then + source "${SCRIPT_DIR}/../config/constants.sh" +fi + +# Handle errors and cleanup +cleanup() { + local exit_code=$? + # Clean up any temporary files if they exist + [ -n "${temp_file:-}" ] && [ -f "$temp_file" ] && rm "$temp_file" + [ -n "${fifo:-}" ] && [ -p "$fifo" ] && rm "$fifo" + exit $exit_code +} +trap cleanup EXIT INT TERM + +log_debug() { + if [ "${LUMIER_DEBUG:-0}" -eq 1 ]; then + echo "[DEBUG] $*" >&2 + fi +} + +send_error_response() { + local status_code=$1 + local message=$2 + echo "HTTP/1.1 $status_code" + echo "Content-Type: text/plain" + echo "" + echo "$message" + exit 1 +} + +# Read the HTTP request line +read -r request_line +log_debug "Request: $request_line" + +# Read headers and look for Content-Length +content_length=0 +while IFS= read -r header; do + [[ $header == $'\r' ]] && break # End of headers + log_debug "Header: $header" + if [[ "$header" =~ ^Content-Length:\ ([0-9]+) ]]; then + content_length="${BASH_REMATCH[1]}" + fi +done + +# Read the body using the content length +command="" +if [ "$content_length" -gt 0 ]; then + command=$(dd bs=1 count="$content_length" 2>/dev/null) + log_debug "Received command: $command" +fi + +# Determine the executable and arguments based on the command +if [[ "$command" == lume* ]]; then + executable="$(which lume || echo "/usr/local/bin/lume")" + command_args="${command#lume}" # Remove 'lume' from the command +elif [[ "$command" == sshpass* ]]; then + executable="$(which sshpass || echo "/usr/local/bin/sshpass")" + command_args="${command#sshpass}" +else + send_error_response "400 Bad Request" "Unsupported command: $command" +fi + +# Check if executable exists +if [ ! 
-x "$executable" ]; then + send_error_response "500 Internal Server Error" "Executable not found or not executable: $executable" +fi + +# Create a temporary file to store the command +temp_file=$(mktemp) +echo "$executable $command_args" > "$temp_file" +chmod +x "$temp_file" + +# Create a FIFO (named pipe) for capturing output +fifo=$(mktemp -u) +mkfifo "$fifo" + +# Execute the command and pipe its output through awk to ensure line-buffering +{ + log_debug "Executing: $executable $command_args" + "$temp_file" 2>&1 | awk '{ print; fflush() }' > "$fifo" +} & + +# Stream the output from the FIFO as an HTTP response +{ + echo -e "HTTP/1.1 200 OK\r" + echo -e "Content-Type: text/plain\r" + echo -e "\r" + cat "$fifo" +} \ No newline at end of file diff --git a/libs/lumier/src/config/constants.sh b/libs/lumier/src/config/constants.sh new file mode 100644 index 00000000..766c4373 --- /dev/null +++ b/libs/lumier/src/config/constants.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Port configuration +TUNNEL_PORT=8080 +VNC_PORT=8006 + +# Host configuration +TUNNEL_HOST="host.docker.internal" + +# Default VM configuration +DEFAULT_RAM_SIZE="8192" +DEFAULT_CPU_CORES="4" +DEFAULT_DISK_SIZE="100" +DEFAULT_VM_NAME="lumier" +DEFAULT_VM_VERSION="ghcr.io/trycua/macos-sequoia-vanilla:latest" + +# Paths +NOVNC_PATH="/opt/noVNC" +LIFECYCLE_HOOKS_DIR="/run/hooks" + +# VM connection details +HOST_USER="lume" +HOST_PASSWORD="lume" +SSH_RETRY_ATTEMPTS=20 +SSH_RETRY_INTERVAL=5 \ No newline at end of file diff --git a/libs/lumier/src/hooks/on-logon.sh b/libs/lumier/src/hooks/on-logon.sh new file mode 100755 index 00000000..faa817c0 --- /dev/null +++ b/libs/lumier/src/hooks/on-logon.sh @@ -0,0 +1,8 @@ +setup_script="$DATA_FOLDER_PATH/setup.sh" + +if [ -f "$setup_script" ]; then + chmod +x "$setup_script" + source "$setup_script" +else + echo "Setup script not found at: $setup_script" +fi \ No newline at end of file diff --git a/libs/lumier/src/lib/utils.sh b/libs/lumier/src/lib/utils.sh new file mode 100755 index 00000000..7d599669 --- /dev/null +++ b/libs/lumier/src/lib/utils.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +# Function to wait for SSH to become available +wait_for_ssh() { + local host_ip=$1 + local user=$2 + local password=$3 + local retry_interval=${4:-5} # Default retry interval is 5 seconds + local max_retries=${5:-20} # Default maximum retries is 20 (0 for infinite) + + echo "Waiting for SSH to become available on $host_ip..." + + local retry_count=0 + while true; do + # Try to connect via SSH + sshpass -p "$password" ssh -o StrictHostKeyChecking=no "$user@$host_ip" "exit" + + # Check the exit status of the SSH command + if [ $? -eq 0 ]; then + echo "SSH is ready on $host_ip!" + return 0 + fi + + # Increment retry count + ((retry_count++)) + + # Exit if maximum retries are reached + if [ $max_retries -ne 0 ] && [ $retry_count -ge $max_retries ]; then + echo "Maximum retries reached. SSH is not available." + return 1 + fi + + echo "SSH not ready. Retrying in $retry_interval seconds... 
(Attempt $retry_count)" + sleep $retry_interval + done +} + +# Function to execute a script on a remote server using sshpass +execute_remote_script() { + local host="$1" + local user="$2" + local password="$3" + local script_path="$4" + local vnc_password="$5" + local data_folder="$6" + + # Check if all required arguments are provided + if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then + echo "Usage: execute_remote_script [data_folder]" + return 1 + fi + + echo "VNC password exported to VM: $vnc_password" + + data_folder_path="$VM_SHARED_FILES_PATH/$data_folder" + echo "Data folder path in VM: $data_folder_path" + + # Read the script content and prepend the shebang + script_content="#!/usr/bin/env bash\n" + if [ -n "$data_folder" ]; then + script_content+="export VNC_PASSWORD='$vnc_password'\n" + script_content+="export DATA_FOLDER_PATH='$data_folder_path'\n" + fi + script_content+="$(<"$script_path")" + + # Use a here-document to send the script content + sshpass -p "$password" ssh -o StrictHostKeyChecking=no "$user@$host" "bash -s" </dev/null 2>&1 || true + fi + + # Check if VM exists and its status using JSON format + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) + + # Check if VM not found error + if [[ $VM_INFO == *"Virtual machine not found"* ]]; then + IMAGE_NAME="${VERSION##*/}" + lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_NAME" + else + # Parse the JSON status - check if it contains "status" : "running" + if [[ $VM_INFO == *'"status" : "running"'* ]]; then + lume_stop "$VM_NAME" "$STORAGE_NAME" + fi + fi + + # Set VM parameters + lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_NAME" + + # Fetch VM configuration + CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json) + + # Setup data directory args if necessary + SHARED_DIR_ARGS="" + if [ -d "/data" ]; then + if [ -n "$HOST_DATA_PATH" ]; then + SHARED_DIR_ARGS="--shared-dir=$HOST_DATA_PATH" + else + echo "Warning: /data volume exists but HOST_DATA_PATH is not set. Cannot mount volume." + fi + fi + + # Run VM with VNC and shared directory using curl + lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + + # Wait for VM to be running and VNC URL to be available + vm_ip="" + vnc_url="" + max_attempts=30 + attempt=0 + + while [ $attempt -lt $max_attempts ]; do + # Get VM info as JSON + VM_INFO=$(lume get "$VM_NAME" -f json 2>/dev/null) + + # Check if VM has status 'running' + if [[ $VM_INFO == *'"status" : "running"'* ]]; then + # Extract IP address using the existing function from utils.sh + vm_ip=$(extract_json_field "ipAddress" "$VM_INFO") + # Extract VNC URL using the existing function from utils.sh + vnc_url=$(extract_json_field "vncUrl" "$VM_INFO") + + # If we have both IP and VNC URL, break the loop + if [ -n "$vm_ip" ] && [ -n "$vnc_url" ]; then + break + fi + fi + + sleep 2 + attempt=$((attempt + 1)) + done + + if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then + echo "Timed out waiting for VM to start or VNC URL to become available." 
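+        # Best-effort cleanup: stop the partially started VM (output suppressed) before exiting with an error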
+ lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + exit 1 + fi + + + # Parse VNC URL to extract password and port + VNC_PASSWORD=$(echo "$vnc_url" | sed -n 's/.*:\(.*\)@.*/\1/p') + VNC_PORT=$(echo "$vnc_url" | sed -n 's/.*:\([0-9]\+\)$/\1/p') + + # Wait for SSH to become available + wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 + + # Export VNC variables for entry.sh to use + export VNC_PORT + export VNC_PASSWORD + + # Execute on-logon.sh if present + on_logon_script="/run/lifecycle/on-logon.sh" + if [ -f "$on_logon_script" ]; then + execute_remote_script "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" "$on_logon_script" "$VNC_PASSWORD" "$DATA_FOLDER" + fi + + # The VM is still running because we never killed lume run. + # If you want to stop the VM at some point, you can kill $LUME_PID or use lume_stop. +} + +stop_vm() { + echo "Stopping VM '$VM_NAME'..." + STORAGE_NAME="storage_${VM_NAME}" + # Check if the VM exists and is running (use lume get for speed) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>/dev/null) + if [[ -z "$VM_INFO" || $VM_INFO == *"Virtual machine not found"* ]]; then + echo "VM '$VM_NAME' does not exist." + elif [[ $VM_INFO == *'"status" : "running"'* ]]; then + lume_stop "$VM_NAME" "$STORAGE_NAME" + echo "VM '$VM_NAME' was running and is now stopped." + elif [[ $VM_INFO == *'"status" : "stopped"'* ]]; then + echo "VM '$VM_NAME' is already stopped." + else + echo "Unknown VM status for '$VM_NAME'." + fi +} + +is_vm_running() { + lume ls | grep -q "$VM_NAME" +} + +# Stop VM with storage location specified using curl +lume_stop() { + local vm_name="$1" + local storage="$2" + curl --connect-timeout 6000 \ + --max-time 5000 \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"storage":"'$storage'"}' \ + "http://host.docker.internal:3000/lume/vms/${vm_name}/stop" +} + +# Run VM with VNC client started and shared directory using curl +lume_run() { + # Parse args + local shared_dir="" + local storage="ssd" + local vm_name="lume_vm" + local no_display=true + while [[ $# -gt 0 ]]; do + case $1 in + --shared-dir=*) + shared_dir="${1#*=}" + shift + ;; + --storage) + storage="$2" + shift 2 + ;; + --no-display) + no_display=true + shift + ;; + *) + # Assume last arg is VM name if not an option + vm_name="$1" + shift + ;; + esac + done + + # Default to ~/Projects if not provided + if [[ -z "$shared_dir" ]]; then + shared_dir="~/Projects" + fi + + local json_body="{\"noDisplay\": true, \"sharedDirectories\": [{\"hostPath\": \"$shared_dir\", \"readOnly\": false}], \"storage\": \"$storage\", \"recoveryMode\": false}" + local curl_cmd="curl --connect-timeout 6000 \ + --max-time 5000 \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '$json_body' \ + http://host.docker.internal:3000/lume/vms/$vm_name/run" + echo "[lume_run] Running:" + echo "$curl_cmd" + eval "$curl_cmd" +} \ No newline at end of file From 885831f04ea7e8ae740761af46f2413f0e0c44ae Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 22:52:11 -0700 Subject: [PATCH 04/38] Add lume options --- libs/lumier/src/lib/vm.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libs/lumier/src/lib/vm.sh b/libs/lumier/src/lib/vm.sh index 19dcff07..9d3dda06 100755 --- a/libs/lumier/src/lib/vm.sh +++ b/libs/lumier/src/lib/vm.sh @@ -9,7 +9,7 @@ start_vm() { # Check if VM exists and its status using JSON format VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) - + # Check if VM not found error if [[ $VM_INFO == *"Virtual machine not 
found"* ]]; then IMAGE_NAME="${VERSION##*/}" @@ -17,7 +17,8 @@ start_vm() { else # Parse the JSON status - check if it contains "status" : "running" if [[ $VM_INFO == *'"status" : "running"'* ]]; then - lume_stop "$VM_NAME" "$STORAGE_NAME" + # lume_stop "$VM_NAME" "$STORAGE_NAME" + lume stop "$VM_NAME" --storage "$STORAGE_NAME" fi fi @@ -38,7 +39,8 @@ start_vm() { fi # Run VM with VNC and shared directory using curl - lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + # lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + lume run "$VM_NAME" --storage "$STORAGE_NAME" --no-display # Wait for VM to be running and VNC URL to be available vm_ip="" @@ -69,7 +71,8 @@ start_vm() { if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then echo "Timed out waiting for VM to start or VNC URL to become available." - lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + # lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + lume stop "$VM_NAME" --storage "$STORAGE_NAME" > /dev/null 2>&1 exit 1 fi @@ -79,7 +82,7 @@ start_vm() { VNC_PORT=$(echo "$vnc_url" | sed -n 's/.*:\([0-9]\+\)$/\1/p') # Wait for SSH to become available - wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 + wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 # Export VNC variables for entry.sh to use export VNC_PORT From cf75a7e577a68a571ffb0449c45639c9202dc1f2 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 23:06:45 -0700 Subject: [PATCH 05/38] Update to zprofile --- libs/lume/scripts/install.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index aa0529c6..f6313538 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -12,6 +12,8 @@ GREEN=$(tput setaf 2) BLUE=$(tput setaf 4) YELLOW=$(tput setaf 3) + + # Default installation directory (user-specific, doesn't require sudo) DEFAULT_INSTALL_DIR="$HOME/.local/bin" INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" @@ -173,11 +175,25 @@ install_binary() { # Check if the installation directory is in PATH if [ -n "${PATH##*$INSTALL_DIR*}" ]; then + SHELL_NAME=$(basename "$SHELL") echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" - echo "To add it, run one of these commands based on your shell:" - echo " For bash: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" - echo " For zsh: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zshrc" - echo " For fish: echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + case "$SHELL_NAME" in + zsh) + echo "To add it, run:" + echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zprofile" + ;; + bash) + echo "To add it, run:" + echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" + ;; + fish) + echo "To add it, run:" + echo " echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + ;; + *) + echo "Add $INSTALL_DIR to your PATH in your shell profile file." 
+ ;; + esac fi } From a37fa708482eb4c99e9b5fc425e848d85cb85bbc Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:15:25 -0400 Subject: [PATCH 06/38] added basic demo video maker --- examples/video_maker_traj.py | 692 +++++++++++++++++++++++++++++++++++ 1 file changed, 692 insertions(+) create mode 100644 examples/video_maker_traj.py diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py new file mode 100644 index 00000000..34a5ad3c --- /dev/null +++ b/examples/video_maker_traj.py @@ -0,0 +1,692 @@ +""" +Video Maker for Trajectory Dataset + +This script processes a trajectory dataset folder, extracts frames, +and creates an animated video with cursor overlays. +""" + +from utils import load_dotenv_files +load_dotenv_files() + +import os +import json +import math +import shutil +import re +from pathlib import Path +import argparse +import numpy as np +from PIL import Image, ImageDraw, ImageFilter +import requests +from io import BytesIO +from tqdm import tqdm + +# Constants +CURSOR_SCALE = 2 # Scale factor for cursor size +FRAMES_PER_CLICK = 8 # Number of frames to show for click animation +FRAMES_PER_MOVE = 10 # Number of frames to interpolate between cursor positions +CURSOR_NORMAL = "https://mac-cursors.netlify.app/png/default@2x.png" +CURSOR_CLICKING = "https://mac-cursors.netlify.app/png/handpointing@2x.png" +CURSOR_TYPING = "https://mac-cursors.netlify.app/png/textcursor@2x.png" +CURSOR_HOTSPOT = (20, 15) +OUTPUT_DIR = "examples/output/video_frames" + +# Vignette effect constants +VIGNETTE_WIDTH = 10 # Width of the vignette border in pixels +VIGNETTE_COLORS = [(128, 0, 255), (0, 0, 255)] # Purple to Blue gradient colors +VIGNETTE_ANIMATION_SPEED = 0.1 # Controls speed of the animation pulse + +def download_image(url): + """Download an image from a URL.""" + response = requests.get(url) + return Image.open(BytesIO(response.content)) + +def load_cursor_images(): + """Load and resize cursor images.""" + cursor_normal = download_image(CURSOR_NORMAL) + cursor_clicking = download_image(CURSOR_CLICKING) + cursor_typing = download_image(CURSOR_TYPING) + + # Resize all cursors based on CURSOR_SCALE + width_normal, height_normal = cursor_normal.size + width_clicking, height_clicking = cursor_clicking.size + width_typing, height_typing = cursor_typing.size + + cursor_normal = cursor_normal.resize((int(width_normal * CURSOR_SCALE), int(height_normal * CURSOR_SCALE))) + cursor_clicking = cursor_clicking.resize((int(width_clicking * CURSOR_SCALE), int(height_clicking * CURSOR_SCALE))) + cursor_typing = cursor_typing.resize((int(width_typing * CURSOR_SCALE), int(height_typing * CURSOR_SCALE))) + + cursors = { + "normal": cursor_normal, + "clicking": cursor_clicking, + "typing": cursor_typing + } + + return cursors + +# Store the last known cursor position and thought across all frames +last_known_cursor_position = None +last_known_thought = None + +def extract_thought_from_api_response(filename): + """Extract thought from API response for the current frame.""" + global last_known_thought + + turn_dir = os.path.dirname(filename) + api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + + for api_file in api_response_files: + try: + with open(os.path.join(turn_dir, api_file), 'r') as f: + data = json.load(f) + # Extract content from response + content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') + + # Extract the Thought section + thought_match = re.search(r"Thought: (.*?)(?:\nAction:|$)", 
content, re.DOTALL) + if thought_match: + thought = thought_match.group(1).strip() + if thought: + last_known_thought = thought + return thought + except (json.JSONDecodeError, FileNotFoundError, KeyError): + pass + + # Return the last known thought if no new thought is found + return last_known_thought + +def extract_cursor_position_from_filename(filename): + """Extract cursor position from a filename containing click info.""" + global last_known_cursor_position + + # For 'screenshot_NNN_click_TIMESTAMP.png', try to extract coordinates + match = re.search(r'click_(\d+)_(\d+)_\d+\.png$', filename) + if match: + position = (int(match.group(1)), int(match.group(2))) + last_known_cursor_position = position + return position + + # Check if we have position info from API response + turn_dir = os.path.dirname(filename) + api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + + for api_file in api_response_files: + try: + with open(os.path.join(turn_dir, api_file), 'r') as f: + data = json.load(f) + # Extract action from response + content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') + # Look for coordinates in the action + # First try the pattern from the example: click(start_box='(28,15)') + coord_match = re.search(r"click\(start_box='\((\d+),(\d+)\)'\)", content) + if coord_match: + position = (int(coord_match.group(1)), int(coord_match.group(2))) + last_known_cursor_position = position + return position + + # Try alternative pattern: click(start_box='<|box_start|>(x,y)<|box_end|>') + alt_match = re.search(r"click\(start_box='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)", content) + if alt_match: + position = (int(alt_match.group(1)), int(alt_match.group(2))) + last_known_cursor_position = position + return position + except (json.JSONDecodeError, FileNotFoundError, KeyError): + pass + + # No new position found, return the last known position + return last_known_cursor_position + +def extract_action_from_filename(filename): + """Determine the action type from the filename pattern.""" + if 'click' in filename: + return "clicking" + elif 'type' in filename: + return "typing" + else: + return "normal" + +def create_animated_vignette(image, frame_index): + """ + Create an animated purple/blue gradient vignette effect around the border of the image. + The animation pulses the colors and gently varies their intensity over time. 
+ + Args: + image: The base image to apply the vignette to + frame_index: Current frame index for animation timing + + Returns: + Image with vignette effect applied + """ + # Create a copy of the image to work with + result = image.copy() + width, height = result.size + + # Create a blank RGBA image for the vignette overlay + vignette = Image.new('RGBA', (width, height), (0, 0, 0, 0)) + draw = ImageDraw.Draw(vignette) + + # Calculate animation phase based on frame index + phase = math.sin(frame_index * VIGNETTE_ANIMATION_SPEED) * 0.5 + 0.5 # Oscillates between 0 and 1 + + # Interpolate between the vignette colors based on the animation phase + color1 = VIGNETTE_COLORS[0] + color2 = VIGNETTE_COLORS[1] + animated_color = ( + int(color1[0] + (color2[0] - color1[0]) * phase), + int(color1[1] + (color2[1] - color1[1]) * phase), + int(color1[2] + (color2[2] - color1[2]) * phase), + ) + + # Draw gradient borders around each edge + # Top border + for i in range(VIGNETTE_WIDTH): + alpha = int(150 * (1 - i / VIGNETTE_WIDTH)) + border_color = animated_color[:3] + (alpha,) + draw.line([(0, i), (width, i)], fill=border_color, width=1) + draw.line([(0, height-i-1), (width, height-i-1)], fill=border_color, width=1) + draw.line([(i, 0), (i, height)], fill=border_color, width=1) + draw.line([(width-i-1, 0), (width-i-1, height)], fill=border_color, width=1) + + # Apply slight blur to smooth the gradient + vignette = vignette.filter(ImageFilter.GaussianBlur(16)) + + # Composite the vignette over the original image + result = Image.alpha_composite(result.convert('RGBA'), vignette) + + return result.convert('RGB') # Convert back to RGB for consistency + +def scale_cursor_with_animation(cursor, frame, max_frames, cursor_type): + """Create springy scale animation for cursor.""" + if cursor_type == "normal": + return cursor + + # For clicking or typing cursors, create a spring effect + progress = frame / max_frames + + # Spring effect calculation - starts big, gets smaller, then back to normal + if progress < 0.3: + # Start with larger scale, shrink down + scale = 1.3 - progress + elif progress < 0.7: + # Then bounce back up a bit + scale = 0.7 + (progress - 0.3) * 0.8 + else: + # Then settle to normal (1.0) + scale = 1.0 + (1.0 - progress) * 0.3 + + # Apply scale + width, height = cursor.size + new_width = int(width * scale) + new_height = int(height * scale) + return cursor.resize((new_width, new_height)) + +# Store the last thought bubble position +last_thought_bubble_pos = None + +def draw_thought_bubble(image, position, thought_text, frame_index): + """Draw a thought bubble with the AI's thoughts near the cursor position.""" + global last_thought_bubble_pos + + if thought_text is None or position is None: + return image + + # Create a copy of the image to work with + result = image.copy() + + # Set up text parameters + font_size = 16 + try: + # Try to use a nice font if available + from PIL import ImageFont + try: + font = ImageFont.truetype("Arial", font_size) + except IOError: + # Fallback to default font + font = ImageFont.load_default() + except ImportError: + font = None + + # Wrap text to fit in bubble + max_width = 400 # Max width in pixels + wrapped_lines = [] + words = thought_text.split() + current_line = [] + + for word in words: + # Add word to current line + test_line = ' '.join(current_line + [word]) + + # Create a temporary draw object to measure text width if needed + temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) + + # Measure the text width + if font: + if hasattr(temp_draw, 
'textlength'): + text_width = temp_draw.textlength(test_line, font=font) + else: + # Fall back to rough estimation + text_width = len(test_line) * (font_size * 0.6) + else: + # Rough estimation if no font metrics are available + text_width = len(test_line) * (font_size * 0.6) + + if text_width <= max_width: + current_line.append(word) + else: + # Line is full, start a new line + if current_line: + wrapped_lines.append(' '.join(current_line)) + current_line = [word] + + # Don't forget the last line + if current_line: + wrapped_lines.append(' '.join(current_line)) + + # Limit number of lines for very long thoughts + max_lines = 8 + if len(wrapped_lines) > max_lines: + wrapped_lines = wrapped_lines[:max_lines-1] + ["..."] + + # Calculate text dimensions + line_height = font_size + 4 + text_height = len(wrapped_lines) * line_height + + # Find the widest line + if font: + # Create a draw object to measure text width + temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) + if hasattr(temp_draw, 'textlength'): + text_width = max(temp_draw.textlength(line, font=font) for line in wrapped_lines) + else: + # Fall back to rough estimation + text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) + else: + text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) + + # Add padding + padding = 20 + bubble_width = text_width + padding * 2 + bubble_height = text_height + padding * 2 + + # Calculate bubble position - move slowly towards cursor position + x, y = position + screen_width, screen_height = image.size + + # Default initial position if this is the first bubble + target_bubble_x = min(x + 30, screen_width - bubble_width - 10) + target_bubble_y = max(y - bubble_height - 30, 10) + + # Ensure target position is fully on screen + if target_bubble_x < 10: + target_bubble_x = 10 + if target_bubble_y + bubble_height > screen_height - 10: + target_bubble_y = screen_height - bubble_height - 10 + + # Calculate new position with slow movement towards target + # Very slow movement factor (0.01 means it moves 1% of the distance per frame) + movement_factor = 0.001 + + if last_thought_bubble_pos is None: + # First frame, set to target position + bubble_x, bubble_y = target_bubble_x, target_bubble_y + else: + # Interpolate slowly towards target position + last_x, last_y = last_thought_bubble_pos + bubble_x = last_x + (target_bubble_x - last_x) * movement_factor + bubble_y = last_y + (target_bubble_y - last_y) * movement_factor + + # Add a subtle animation effect to the bubble + # animation_offset = math.sin(frame_index * 0.1) * 2 + # bubble_y += int(animation_offset) + + # Store position for next frame + last_thought_bubble_pos = (bubble_x, bubble_y) + + # Draw rounded rectangle for bubble + corner_radius = 15 + + # Background with black gaussian blur + background_color = (0, 0, 0, 180) # Black with transparency + outline_color = (50, 50, 50, 255) # Dark gray outline + + # Draw the bubble background - first create an RGBA version + bubble_img = Image.new('RGBA', result.size, (0, 0, 0, 0)) + bubble_draw = ImageDraw.Draw(bubble_img) + + # Draw rounded rectangle + # Check if rounded_rectangle is available (PIL 8.0.0+) + if hasattr(bubble_draw, 'rounded_rectangle'): + bubble_draw.rounded_rectangle( + [bubble_x, bubble_y, bubble_x + bubble_width, bubble_y + bubble_height], + radius=corner_radius, + fill=background_color, + outline=outline_color, + width=2 + ) + else: + # Fall back to regular rectangle if rounded_rectangle not available + bubble_draw.rectangle( + [bubble_x, bubble_y, 
bubble_x + bubble_width, bubble_y + bubble_height], + fill=background_color, + outline=outline_color + ) + + # Apply gaussian blur to the bubble background + bubble_img = bubble_img.filter(ImageFilter.GaussianBlur(3)) + + # Draw small triangle pointing to cursor + pointer_size = 10 + pointer_x = x + 15 + pointer_y = y - 5 + + # Make sure pointer is under the bubble + if pointer_x > bubble_x + bubble_width: + pointer_x = bubble_x + bubble_width - 20 + elif pointer_x < bubble_x: + pointer_x = bubble_x + 20 + + # Create an overlay for the pointer + pointer_overlay = Image.new('RGBA', result.size, (0, 0, 0, 0)) + pointer_draw = ImageDraw.Draw(pointer_overlay) + + # Draw pointer triangle + # pointer_draw.polygon( + # [ + # (pointer_x, pointer_y), + # (pointer_x - pointer_size, pointer_y - pointer_size), + # (pointer_x + pointer_size, pointer_y - pointer_size) + # ], + # fill=background_color, + # outline=outline_color + # ) + + # Apply gaussian blur to the pointer + pointer_overlay = pointer_overlay.filter(ImageFilter.GaussianBlur(3)) + + # Composite the bubble and pointer onto the original image + result = Image.alpha_composite(result.convert('RGBA'), bubble_img) + result = Image.alpha_composite(result, pointer_overlay) + + # Now draw the text + draw = ImageDraw.Draw(result) + text_x = bubble_x + padding + text_y = bubble_y + padding + + text_color = (255, 255, 255, 255) # White text + for line in wrapped_lines: + draw.text((text_x, text_y), line, font=font, fill=text_color) + text_y += line_height + + return result.convert('RGB') + +def create_cursor_overlay(base_image, position, cursor_images, thought_text=None, cursor_type="normal", animation_frame=0, frame_index=0): + """Create an image with cursor overlaid on the base image and thought bubble if available.""" + # Create a copy of the base image + result = base_image.copy() + + # If position is None, return the image without a cursor + if position is None: + return result + + # Get the appropriate cursor image + cursor = cursor_images[cursor_type] + + # Apply animation scaling if needed + if cursor_type in ["clicking", "typing"]: + cursor = scale_cursor_with_animation(cursor, animation_frame, FRAMES_PER_CLICK, cursor_type) + + # Calculate position to center the cursor hotspot + # Cursor hotspot is at (20,15) of the cursor image + x, y = position + hotspot_x, hotspot_y = CURSOR_HOTSPOT + cursor_x = x - (hotspot_x * CURSOR_SCALE) # X offset for hotspot + cursor_y = y - (hotspot_y * CURSOR_SCALE) # Y offset for hotspot + + # Paste the cursor onto the image + result.paste(cursor, (int(cursor_x), int(cursor_y)), cursor) + + # Add thought bubble if text is available + if thought_text: + result = draw_thought_bubble(result, position, thought_text, frame_index) + + return result + +def get_screenshot_files(trajectory_dir): + """ + Get all screenshot files from a trajectory directory, sorted by sequence number. 
+ + Args: + trajectory_dir: Path to trajectory directory containing turn_XXX folders + + Returns: + List of tuples (path, sequence_number, action_type, position) + """ + screenshot_files = [] + + # List all turn directories in order + turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], + key=lambda x: int(x.split('_')[1])) + + for turn_dir in turn_dirs: + turn_path = os.path.join(trajectory_dir, turn_dir) + if not os.path.isdir(turn_path): + continue + + # Get all screenshot files in this turn + files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] + + for file in files: + file_path = os.path.join(turn_path, file) + + # Extract sequence number from filename (e.g., screenshot_003_...) + seq_match = re.search(r'screenshot_(\d+)', file) + if seq_match: + seq_number = int(seq_match.group(1)) + + # Determine action type from filename + action_type = extract_action_from_filename(file) + + # Get cursor position if available + position = extract_cursor_position_from_filename(file_path) + + screenshot_files.append((file_path, seq_number, action_type, position)) + + # Sort by sequence number + screenshot_files.sort(key=lambda x: x[1]) + + return screenshot_files + +def process_trajectory(trajectory_dir, output_dir, cursors): + """Process a trajectory directory and create output frames.""" + # Get all screenshot files + screenshot_files = get_screenshot_files(trajectory_dir) + + if not screenshot_files: + print(f"No screenshot files found in {trajectory_dir}") + return + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Track frame index + frame_index = 0 + + # Process each screenshot + prev_img = None + prev_cursor_pos = None + + for i, (file_path, seq_number, action_type, position) in enumerate(tqdm(screenshot_files, desc="Processing frames")): + # Load the current image + try: + current_img = Image.open(file_path) + except Exception as e: + print(f"Error loading image {file_path}: {e}") + continue + + # Current cursor position + current_cursor_pos = position + + # Check if the current frame has an action (click/typing) + is_action_frame = action_type in ["clicking", "typing"] + + if is_action_frame: + # If we have a previous frame, use it for the first half of animation + if prev_img is not None: + half_frames = FRAMES_PER_CLICK // 2 + # First half of animation uses PREVIOUS image + for j in range(half_frames): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + prev_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Second half uses CURRENT image + for j in range(half_frames, FRAMES_PER_CLICK): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + else: + # If no previous frame, 
use current for full animation + for j in range(FRAMES_PER_CLICK): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + else: + # Regular frame with normal cursor + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Add position interpolation frames if we're not at the last frame + if i < len(screenshot_files) - 1: + # Get next position + next_cursor_pos = screenshot_files[i+1][3] + + # Only interpolate if both positions are valid and different + if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: + for j in range(1, FRAMES_PER_MOVE): + progress = j / FRAMES_PER_MOVE + interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress + interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress + interp_pos = (int(interp_x), int(interp_y)) + + # Create interpolated movement frame + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, interp_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Save current frame as previous for next iteration + prev_img = current_img + prev_cursor_pos = current_cursor_pos + +def main(): + """Main function to process the trajectory and create video frames.""" + parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') + parser.add_argument('trajectory_dir', type=str, help='Path to the trajectory folder') + parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') + parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') + parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') + parser.add_argument('--skip_ffmpeg', action='store_true', help='Skip running ffmpeg to create video') + args = parser.parse_args() + + trajectory_dir = args.trajectory_dir + output_dir = args.output_dir + fps = args.fps + output_video = args.output_video + skip_ffmpeg = args.skip_ffmpeg + + # Check if trajectory directory exists + if not os.path.exists(trajectory_dir): + print(f"Trajectory directory {trajectory_dir} does not exist") + return + + # Clean output directory if it exists + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + # Create output directory + os.makedirs(output_dir, 
exist_ok=True) + + # Load cursor images + print("Loading cursor images...") + cursors = load_cursor_images() + + # Process the trajectory + print(f"Processing trajectory from {trajectory_dir}...") + process_trajectory(trajectory_dir, output_dir, cursors) + + print(f"Processing complete. Frames saved to {output_dir}") + + # Run ffmpeg to create the video + if not skip_ffmpeg: + print(f"Running ffmpeg to create video: {output_video}") + ffmpeg_cmd = f"ffmpeg -y -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}" + try: + import subprocess + result = subprocess.run(ffmpeg_cmd, shell=True, check=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True) + print(f"Video created successfully: {output_video}") + except subprocess.CalledProcessError as e: + print(f"Error running ffmpeg: {e}") + print(f"ffmpeg output:\n{e.stdout}\n{e.stderr}") + print("\nYou can create a video manually with this command:") + print(ffmpeg_cmd) + else: + print("Skipping ffmpeg. You can create a video from these frames using ffmpeg with this command:") + print(f"ffmpeg -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}") + +if __name__ == "__main__": + main() From d049fa6ebbdfa21128c29b782fb8f62c77b63c9b Mon Sep 17 00:00:00 2001 From: Finn Date: Mon, 28 Apr 2025 21:30:49 -0400 Subject: [PATCH 07/38] Renames os arg to os_type to avoid module collision --- libs/computer/computer/computer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index f4d9d9bf..ddb68f9e 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -29,7 +29,7 @@ class Computer: display: Union[Display, Dict[str, int], str] = "1024x768", memory: str = "8GB", cpu: str = "4", - os: OSType = "macos", + os_type: OSType = "macos", name: str = "", image: str = "macos-sequoia-cua:latest", shared_directories: Optional[List[str]] = None, @@ -68,6 +68,7 @@ class Computer: self.image = image self.port = port self.host = host + self.os_type = os_type # Store telemetry preference self._telemetry_enabled = telemetry_enabled @@ -129,8 +130,8 @@ class Computer: self.shared_paths = [] if shared_directories: for path in shared_directories: - abs_path = os.path.abspath(os.path.expanduser(path)) # type: ignore[attr-defined] - if not os.path.exists(abs_path): # type: ignore[attr-defined] + abs_path = os.path.abspath(os.path.expanduser(path)) + if not os.path.exists(abs_path): raise ValueError(f"Shared directory does not exist: {path}") self.shared_paths.append(abs_path) self._pylume_context = None @@ -188,7 +189,7 @@ class Computer: self._interface = cast( BaseComputerInterface, InterfaceFactory.create_interface_for_os( - os=self.os, ip_address=ip_address # type: ignore[arg-type] + os=self.os_type, ip_address=ip_address # type: ignore[arg-type] ), ) @@ -288,13 +289,13 @@ class Computer: try: # Initialize the interface using the factory with the specified OS - self.logger.info(f"Initializing interface for {self.os} at {ip_address}") + self.logger.info(f"Initializing interface for {self.os_type} at {ip_address}") from .interface.base import BaseComputerInterface self._interface = cast( BaseComputerInterface, InterfaceFactory.create_interface_for_os( - os=self.os, ip_address=ip_address # type: ignore[arg-type] + os=self.os_type, ip_address=ip_address # type: ignore[arg-type] ), ) From d502cbdc991c496669bbd6b8fdf732ebea091958 Mon Sep 17 00:00:00 
2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 19:12:32 -0400 Subject: [PATCH 08/38] fix endpoint not liking string message content --- .../agent/providers/omni/clients/oaicompat.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/omni/clients/oaicompat.py b/libs/agent/agent/providers/omni/clients/oaicompat.py index 6a95896a..b15515fd 100644 --- a/libs/agent/agent/providers/omni/clients/oaicompat.py +++ b/libs/agent/agent/providers/omni/clients/oaicompat.py @@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] # Process messages for item in messages: @@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient): else: message = { "role": item["role"], - "content": [{"type": "text", "text": item["content"]}], + "content": [{ + "type": "text", + "text": item["content"] + }], } final_messages.append(message) else: From 84ed45c0dd621ecd7907b3b6edf4b1db16f4f350 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 09/38] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 3608491419be160503250436fc6d8a1933747b9e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 21:53:58 -0400 Subject: [PATCH 10/38] fix uitars oai provider --- .../agent/agent/providers/uitars/clients/oaicompat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 4567360b..963fb05b 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] - + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] + # Process messages for item in messages: if isinstance(item, dict): From b4af3f67d5643be1c66ea12272f30db97c4fcd52 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:23:03 -0700 Subject: [PATCH 11/38] decreased scroll sensitivity for openai's cua --- libs/agent/agent/providers/openai/tools/computer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index ae4fdce8..5ec9460a 100644 --- 
a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): y = kwargs.get("y") if x is None or y is None: raise ToolError("x and y coordinates are required for scroll action") - scroll_x = kwargs.get("scroll_x", 0) // 20 - scroll_y = kwargs.get("scroll_y", 0) // 20 + scroll_x = kwargs.get("scroll_x", 0) // 50 + scroll_y = kwargs.get("scroll_y", 0) // 50 return await self.handle_scroll(x, y, scroll_x, scroll_y) elif type == "screenshot": return await self.screenshot() From ea31cc63408498d7c3bfe606eb4a15d65cf8defa Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:34:18 -0700 Subject: [PATCH 12/38] added mappings for modifier keys --- libs/computer/computer/interface/models.py | 26 ++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index b586a9f7..c09a092c 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -7,6 +7,9 @@ NavigationKey = Literal['pagedown', 'pageup', 'home', 'end', 'left', 'right', 'u # Special key literals SpecialKey = Literal['enter', 'esc', 'tab', 'space', 'backspace', 'del'] +# Modifier key literals +ModifierKey = Literal['ctrl', 'shift', 'win', 'command', 'option'] + # Function key literals FunctionKey = Literal['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] @@ -35,6 +38,13 @@ class Key(Enum): BACKSPACE = 'backspace' DELETE = 'del' + # Modifier keys + CTRL = 'ctrl' + SHIFT = 'shift' + WIN = 'win' + COMMAND = 'command' + OPTION = 'option' + # Function keys F1 = 'f1' F2 = 'f2' @@ -73,14 +83,26 @@ class Key(Enum): 'escape': cls.ESCAPE, 'esc': cls.ESC, 'delete': cls.DELETE, - 'del': cls.DELETE + 'del': cls.DELETE, + # Modifier key mappings + 'ctrl': cls.CTRL, + 'control': cls.CTRL, + 'shift': cls.SHIFT, + 'win': cls.WIN, + 'windows': cls.WIN, + 'command': cls.COMMAND, + 'cmd': cls.COMMAND, + '⌘': cls.COMMAND, + 'option': cls.OPTION, + 'alt': cls.OPTION, + '⌥': cls.OPTION, } normalized = key.lower().strip() return key_mapping.get(normalized, key) # Combined key type -KeyType = Union[Key, NavigationKey, SpecialKey, FunctionKey, str] +KeyType = Union[Key, NavigationKey, SpecialKey, ModifierKey, FunctionKey, str] class AccessibilityWindow(TypedDict): """Information about a window in the accessibility tree.""" From 1e5ba4832a9e1b7fa97495993658177937f86708 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:41:20 -0700 Subject: [PATCH 13/38] mapping for super key --- libs/computer/computer/interface/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index c09a092c..9a90acb4 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -90,6 +90,7 @@ class Key(Enum): 'shift': cls.SHIFT, 'win': cls.WIN, 'windows': cls.WIN, + 'super': cls.WIN, 'command': cls.COMMAND, 'cmd': cls.COMMAND, '⌘': cls.COMMAND, From 8b939e789057b11f0931a995889dc914f641260d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 21:53:58 -0400 Subject: [PATCH 14/38] fix uitars oai provider --- .../agent/agent/providers/uitars/clients/oaicompat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py 
b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 4567360b..963fb05b 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] - + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] + # Process messages for item in messages: if isinstance(item, dict): From 1df8194de1373e5c7521bb0e12abb6e9ddffe140 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:00:09 -0700 Subject: [PATCH 15/38] fix hotkeys on uitars and openai provider --- libs/agent/agent/providers/openai/tools/computer.py | 6 +----- libs/agent/agent/providers/uitars/tools/computer.py | 8 ++++++-- libs/computer/computer/interface/models.py | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index 5ec9460a..90ef5935 100644 --- a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -240,11 +240,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): if len(mapped_keys) > 1: # For key combinations (like Ctrl+C) - for k in mapped_keys: - await self.computer.interface.press_key(k) - await asyncio.sleep(0.1) - for k in reversed(mapped_keys): - await self.computer.interface.press_key(k) + await self.computer.interface.hotkey(*mapped_keys) else: # Single key press await self.computer.interface.press_key(mapped_keys[0]) diff --git a/libs/agent/agent/providers/uitars/tools/computer.py b/libs/agent/agent/providers/uitars/tools/computer.py index 5cf7f67a..4d5f2ce3 100644 --- a/libs/agent/agent/providers/uitars/tools/computer.py +++ b/libs/agent/agent/providers/uitars/tools/computer.py @@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool): elif action == "hotkey": if "keys" in kwargs: keys = kwargs["keys"] - for key in keys: - await self.computer.interface.press_key(key) + if len(keys) > 1: + await self.computer.interface.hotkey(*keys) + else: + # Single key press + await self.computer.interface.press_key(keys[0]) + # Wait for UI to update await asyncio.sleep(0.3) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index 9a90acb4..e8ec1b47 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -8,7 +8,7 @@ NavigationKey = Literal['pagedown', 'pageup', 'home', 'end', 'left', 'right', 'u SpecialKey = Literal['enter', 'esc', 'tab', 'space', 'backspace', 'del'] # Modifier key literals -ModifierKey = Literal['ctrl', 'shift', 'win', 'command', 'option'] +ModifierKey = Literal['ctrl', 'alt', 'shift', 'win', 'command', 'option'] # Function key literals FunctionKey = Literal['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] @@ -39,6 +39,7 @@ class Key(Enum): DELETE = 'del' # Modifier keys + ALT = 'alt' CTRL = 'ctrl' SHIFT = 'shift' WIN = 'win' @@ -85,6 +86,7 @@ class Key(Enum): 'delete': cls.DELETE, 'del': cls.DELETE, # Modifier key mappings + 'alt': cls.ALT, 'ctrl': cls.CTRL, 'control': cls.CTRL, 'shift': cls.SHIFT, @@ -95,7 +97,6 @@ class Key(Enum): 'cmd': cls.COMMAND, '⌘': cls.COMMAND, 'option': cls.OPTION, - 'alt': cls.OPTION, '⌥': cls.OPTION, } From 
f580be07a1f8e153f8d37630323cf220175ca74c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:01:43 -0700 Subject: [PATCH 16/38] prompt uitars to use the correct hotkeys on mac --- libs/agent/agent/providers/uitars/loop.py | 4 ++-- libs/agent/agent/providers/uitars/prompts.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..84393bd2 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -20,7 +20,7 @@ from computer import Computer from .utils import add_box_token, parse_actions, parse_action_parameters from .tools.manager import ToolManager from .tools.computer import ToolResult -from .prompts import COMPUTER_USE, SYSTEM_PROMPT +from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES from .clients.oaicompat import OAICompatClient @@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop): if first_user_idx is not None and instruction: # Create the computer use prompt user_prompt = COMPUTER_USE.format( - instruction=instruction, + instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]), language="English" ) diff --git a/libs/agent/agent/providers/uitars/prompts.py b/libs/agent/agent/providers/uitars/prompts.py index aa24557d..fe16f0d8 100644 --- a/libs/agent/agent/providers/uitars/prompts.py +++ b/libs/agent/agent/providers/uitars/prompts.py @@ -1,5 +1,9 @@ """Prompts for UI-TARS agent.""" +MAC_SPECIFIC_NOTES = """ +(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).) +""" + SYSTEM_PROMPT = "You are a helpful assistant." COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
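For reference, a minimal sketch of how the pieces added in this patch are combined at request time. The COMPUTER_USE template is heavily abbreviated and the sample instruction is invented; only the join-and-format step mirrors the loop.py change above.

    # Sketch only: the real strings live in libs/agent/agent/providers/uitars/prompts.py.
    MAC_SPECIFIC_NOTES = (
        "(You are operating on macOS, use 'cmd' instead of 'ctrl' for most "
        "shortcuts, e.g. hotkey(key='cmd c') for copy.)"
    )
    COMPUTER_USE = (
        "You are a GUI agent. ...\n"        # abbreviated preamble
        "{language}\n"                      # placeholder position is illustrative
        "## User Instruction\n{instruction}\n"
    )

    instruction = "Open a new browser tab"  # made-up example task
    user_prompt = COMPUTER_USE.format(
        instruction="\n".join([instruction, MAC_SPECIFIC_NOTES]),
        language="English",
    )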
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par ## User Instruction {instruction} -""" \ No newline at end of file +""" From 2e10e0922ab1b883d3bb4cae0f14db9fb9784ea7 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:09:30 -0700 Subject: [PATCH 17/38] add top_p to uitars --- libs/agent/agent/providers/uitars/clients/oaicompat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 963fb05b..1b512997 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -145,8 +145,13 @@ class OAICompatClient(BaseUITarsClient): message = {"role": "user", "content": [{"type": "text", "text": item}]} final_messages.append(message) - payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature} - payload["max_tokens"] = max_tokens or self.max_tokens + payload = { + "model": self.model, + "messages": final_messages, + "max_tokens": max_tokens or self.max_tokens, + "temperature": self.temperature, + "top_p": 0.7, + } try: async with aiohttp.ClientSession() as session: From e8e446f8c2d8db43390e0f5834803161fe764f84 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Tue, 29 Apr 2025 16:37:33 -0700 Subject: [PATCH 18/38] Add lume --storage path --- libs/lume/scripts/install.sh | 79 ++++++++- libs/lume/src/Commands/Create.swift | 2 +- libs/lume/src/Commands/Delete.swift | 2 +- libs/lume/src/Commands/Get.swift | 2 +- libs/lume/src/Commands/List.swift | 11 +- libs/lume/src/Commands/Pull.swift | 2 +- libs/lume/src/Commands/Run.swift | 2 +- libs/lume/src/Commands/Set.swift | 2 +- libs/lume/src/Commands/Stop.swift | 2 +- .../ImageContainerRegistry.swift | 17 +- libs/lume/src/FileSystem/Home.swift | 22 +++ libs/lume/src/FileSystem/VMDirectory.swift | 42 +++-- libs/lume/src/LumeController.swift | 162 +++++++++++++++--- libs/lume/src/Server/Handlers.swift | 4 +- libs/lume/src/Server/Requests.swift | 2 +- libs/lume/src/Server/Server.swift | 6 +- libs/lumier/src/lib/vm.sh | 39 +++-- 17 files changed, 321 insertions(+), 77 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index f6313538..d854c0e4 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -12,8 +12,6 @@ GREEN=$(tput setaf 2) BLUE=$(tput setaf 4) YELLOW=$(tput setaf 3) - - # Default installation directory (user-specific, doesn't require sudo) DEFAULT_INSTALL_DIR="$HOME/.local/bin" INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" @@ -204,11 +202,84 @@ main() { create_temp_dir download_release install_binary - + echo "" echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" echo "Run ${BOLD}lume${NORMAL} to get started." + + # --- LaunchAgent setup for lume daemon --- + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + LUME_BIN="$INSTALL_DIR/lume" + + echo "" + echo "Setting up LaunchAgent to run lume daemon on login..." + + # Create LaunchAgents directory if it doesn't exist + mkdir -p "$HOME/Library/LaunchAgents" + + # Unload existing service if present + if [ -f "$PLIST_PATH" ]; then + echo "Existing LaunchAgent found. Unloading..." 
+ launchctl unload "$PLIST_PATH" 2>/dev/null || true + fi + + # Create the plist file + cat < "$PLIST_PATH" + + + + + Label + $SERVICE_NAME + ProgramArguments + + $LUME_BIN + serve + + RunAtLoad + + KeepAlive + + WorkingDirectory + $HOME + EnvironmentVariables + + PATH + /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$HOME/.local/bin + HOME + $HOME + + StandardOutPath + /tmp/lume_daemon.log + StandardErrorPath + /tmp/lume_daemon.error.log + ProcessType + Interactive + SessionType + Aqua + + +EOF + + # Set permissions + chmod 644 "$PLIST_PATH" + touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log + chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log + + # Load the LaunchAgent + echo "Loading LaunchAgent..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + launchctl load "$PLIST_PATH" + + echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" + echo "To check status: launchctl list | grep $SERVICE_NAME" + echo "To view logs: tail -f /tmp/lume_daemon.log" + echo "" + echo "To remove the lume daemon service, run:" + echo " launchctl unload \"$PLIST_PATH\"" + echo " rm \"$PLIST_PATH\"" } # Run the installation -main \ No newline at end of file +main diff --git a/libs/lume/src/Commands/Create.swift b/libs/lume/src/Commands/Create.swift index b4f02633..db042c69 100644 --- a/libs/lume/src/Commands/Create.swift +++ b/libs/lume/src/Commands/Create.swift @@ -40,7 +40,7 @@ struct Create: AsyncParsableCommand { ) var ipsw: String? - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/Delete.swift b/libs/lume/src/Commands/Delete.swift index c3cd3653..7d78ca6d 100644 --- a/libs/lume/src/Commands/Delete.swift +++ b/libs/lume/src/Commands/Delete.swift @@ -12,7 +12,7 @@ struct Delete: AsyncParsableCommand { @Flag(name: .long, help: "Force deletion without confirmation") var force = false - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() {} diff --git a/libs/lume/src/Commands/Get.swift b/libs/lume/src/Commands/Get.swift index 5ff34113..aad56136 100644 --- a/libs/lume/src/Commands/Get.swift +++ b/libs/lume/src/Commands/Get.swift @@ -12,7 +12,7 @@ struct Get: AsyncParsableCommand { @Option(name: [.long, .customShort("f")], help: "Output format (json|text)") var format: FormatOption = .text - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/List.swift b/libs/lume/src/Commands/List.swift index 6361f899..89a6dc6e 100644 --- a/libs/lume/src/Commands/List.swift +++ b/libs/lume/src/Commands/List.swift @@ -10,15 +10,22 @@ struct List: AsyncParsableCommand { @Option(name: [.long, .customShort("f")], help: "Output format (json|text)") var format: FormatOption = .text + @Option(name: .long, help: "Filter by storage location name") + var storage: String? 
+ init() { } @MainActor func run() async throws { let manager = LumeController() - let vms = try manager.list() + let vms = try manager.list(storage: self.storage) if vms.isEmpty && self.format == .text { - print("No virtual machines found") + if let storageName = self.storage { + print("No virtual machines found in storage '\(storageName)'") + } else { + print("No virtual machines found") + } } else { try VMDetailsPrinter.printStatus(vms, format: self.format) } diff --git a/libs/lume/src/Commands/Pull.swift b/libs/lume/src/Commands/Pull.swift index 074e0fac..cd843381 100644 --- a/libs/lume/src/Commands/Pull.swift +++ b/libs/lume/src/Commands/Pull.swift @@ -19,7 +19,7 @@ struct Pull: AsyncParsableCommand { @Option(help: "Organization to pull from. Defaults to trycua") var organization: String = "trycua" - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() {} diff --git a/libs/lume/src/Commands/Run.swift b/libs/lume/src/Commands/Run.swift index bc659769..273e8ba7 100644 --- a/libs/lume/src/Commands/Run.swift +++ b/libs/lume/src/Commands/Run.swift @@ -48,7 +48,7 @@ struct Run: AsyncParsableCommand { @Option(help: "For MacOS VMs only, boot into the VM in recovery mode") var recoveryMode: Bool = false - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? private var parsedSharedDirectories: [SharedDirectory] { diff --git a/libs/lume/src/Commands/Set.swift b/libs/lume/src/Commands/Set.swift index 73bfe0c9..e2420a68 100644 --- a/libs/lume/src/Commands/Set.swift +++ b/libs/lume/src/Commands/Set.swift @@ -21,7 +21,7 @@ struct Set: AsyncParsableCommand { @Option(help: "New display resolution in format WIDTHxHEIGHT.") var display: VMDisplayResolution? - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/Stop.swift b/libs/lume/src/Commands/Stop.swift index 933019e5..3b921114 100644 --- a/libs/lume/src/Commands/Stop.swift +++ b/libs/lume/src/Commands/Stop.swift @@ -9,7 +9,7 @@ struct Stop: AsyncParsableCommand { @Argument(help: "Name of the virtual machine", completion: .custom(completeVMName)) var name: String - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift b/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift index 714cf1cb..a7a68212 100644 --- a/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift +++ b/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift @@ -643,7 +643,7 @@ class ImageContainerRegistry: @unchecked Sendable { image: String, name: String?, locationName: String? = nil - ) async throws { + ) async throws -> VMDirectory { guard !image.isEmpty else { throw ValidationError("Image name cannot be empty") } @@ -652,7 +652,16 @@ class ImageContainerRegistry: @unchecked Sendable { // Use provided name or derive from image let vmName = name ?? image.split(separator: ":").first.map(String.init) ?? 
"" - let vmDir = try home.getVMDirectory(vmName, storage: locationName) + + // Determine if locationName is a direct path or a named storage location + let vmDir: VMDirectory + if let locationName = locationName, locationName.contains("/") || locationName.contains("\\") { + // Direct path + vmDir = try home.getVMDirectoryFromPath(vmName, storagePath: locationName) + } else { + // Named storage or default location + vmDir = try home.getVMDirectory(vmName, storage: locationName) + } // Optimize network early in the process optimizeNetworkSettings() @@ -991,6 +1000,7 @@ class ImageContainerRegistry: @unchecked Sendable { Logger.info( "Run 'lume run \(vmName)' to reduce the disk image file size by using macOS sparse file system" ) + return vmDir } // Helper function to clean up a specific cache entry @@ -3024,7 +3034,8 @@ class ImageContainerRegistry: @unchecked Sendable { // Replace original with optimized version try FileManager.default.removeItem(at: reassembledFile) - try FileManager.default.moveItem(at: optimizedFile, to: reassembledFile) + try FileManager.default.moveItem( + at: optimizedFile, to: reassembledFile) Logger.info("Using sparse-optimized file for verification") } else { Logger.info( diff --git a/libs/lume/src/FileSystem/Home.swift b/libs/lume/src/FileSystem/Home.swift index b8b4ae54..d83b39b0 100644 --- a/libs/lume/src/FileSystem/Home.swift +++ b/libs/lume/src/FileSystem/Home.swift @@ -92,6 +92,28 @@ final class Home { let baseDir = Path(location.expandedPath) return VMDirectory(baseDir.directory(name)) } + + /// Gets a VM directory from a direct file path + /// + /// - Parameters: + /// - name: Name of the VM directory + /// - storagePath: Direct file system path where the VM is located + /// - Returns: A VMDirectory instance + /// - Throws: HomeError if path is invalid + func getVMDirectoryFromPath(_ name: String, storagePath: String) throws -> VMDirectory { + let baseDir = Path(storagePath) + + // Create the directory if it doesn't exist + if !fileExists(at: storagePath) { + Logger.info("Creating storage directory", metadata: ["path": storagePath]) + try createVMLocation(at: storagePath) + } else if !isValidDirectory(at: storagePath) { + // Path exists but isn't a valid directory + throw HomeError.invalidHomeDirectory + } + + return VMDirectory(baseDir.directory(name)) + } /// Returns all initialized VM directories across all locations /// - Returns: An array of VMDirectory instances with location info diff --git a/libs/lume/src/FileSystem/VMDirectory.swift b/libs/lume/src/FileSystem/VMDirectory.swift index a902e34b..3335107d 100644 --- a/libs/lume/src/FileSystem/VMDirectory.swift +++ b/libs/lume/src/FileSystem/VMDirectory.swift @@ -8,7 +8,7 @@ import Foundation /// - Handling disk operations /// - Managing VM state and locking /// - Providing access to VM-related paths -struct VMDirectory { +struct VMDirectory: Sendable { // MARK: - Constants private enum FileNames { @@ -26,8 +26,6 @@ struct VMDirectory { let configPath: Path let sessionsPath: Path - private let fileManager: FileManager - /// The name of the VM directory var name: String { dir.name } @@ -36,10 +34,8 @@ struct VMDirectory { /// Creates a new VMDirectory instance /// - Parameters: /// - dir: The base directory path for the VM - /// - fileManager: FileManager instance to use for file operations - init(_ dir: Path, fileManager: FileManager = .default) { + init(_ dir: Path) { self.dir = dir - self.fileManager = fileManager self.nvramPath = dir.file(FileNames.nvram) self.diskPath = dir.file(FileNames.disk) 
self.configPath = dir.file(FileNames.config) @@ -52,7 +48,25 @@ struct VMDirectory { extension VMDirectory { /// Checks if the VM directory is fully initialized with all required files func initialized() -> Bool { - configPath.exists() && diskPath.exists() && nvramPath.exists() + // Add detailed logging for debugging + let configExists = configPath.exists() + let diskExists = diskPath.exists() + let nvramExists = nvramPath.exists() + + Logger.info( + "VM directory initialization check", + metadata: [ + "directory": dir.path, + "config_path": configPath.path, + "config_exists": "\(configExists)", + "disk_path": diskPath.path, + "disk_exists": "\(diskExists)", + "nvram_path": nvramPath.path, + "nvram_exists": "\(nvramExists)" + ] + ) + + return configExists && diskExists && nvramExists } /// Checks if the VM directory exists @@ -70,7 +84,7 @@ extension VMDirectory { func setDisk(_ size: UInt64) throws { do { if !diskPath.exists() { - guard fileManager.createFile(atPath: diskPath.path, contents: nil) else { + guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else { throw VMDirectoryError.fileCreationFailed(diskPath.path) } } @@ -96,7 +110,7 @@ extension VMDirectory { do { let data = try encoder.encode(config) - guard fileManager.createFile(atPath: configPath.path, contents: data) else { + guard FileManager.default.createFile(atPath: configPath.path, contents: data) else { throw VMDirectoryError.fileCreationFailed(configPath.path) } } catch { @@ -108,7 +122,7 @@ extension VMDirectory { /// - Returns: The loaded configuration /// - Throws: VMDirectoryError if the load operation fails func loadConfig() throws -> VMConfig { - guard let data = fileManager.contents(atPath: configPath.path) else { + guard let data = FileManager.default.contents(atPath: configPath.path) else { throw VMDirectoryError.configNotFound } @@ -137,7 +151,7 @@ extension VMDirectory { do { let data = try encoder.encode(session) - guard fileManager.createFile(atPath: sessionsPath.path, contents: data) else { + guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else { throw VMDirectoryError.fileCreationFailed(sessionsPath.path) } } catch { @@ -149,7 +163,7 @@ extension VMDirectory { /// - Returns: The loaded VNC session /// - Throws: VMDirectoryError if the load operation fails func loadSession() throws -> VNCSession { - guard let data = fileManager.contents(atPath: sessionsPath.path) else { + guard let data = FileManager.default.contents(atPath: sessionsPath.path) else { throw VMDirectoryError.sessionNotFound } @@ -163,7 +177,7 @@ extension VMDirectory { /// Removes the VNC session information from disk func clearSession() { - try? fileManager.removeItem(atPath: sessionsPath.path) + try? FileManager.default.removeItem(atPath: sessionsPath.path) } } @@ -176,6 +190,6 @@ extension VMDirectory: CustomStringConvertible { extension VMDirectory { func delete() throws { - try fileManager.removeItem(atPath: dir.path) + try FileManager.default.removeItem(atPath: dir.path) } } diff --git a/libs/lume/src/LumeController.swift b/libs/lume/src/LumeController.swift index ecdcec49..f25079ff 100644 --- a/libs/lume/src/LumeController.swift +++ b/libs/lume/src/LumeController.swift @@ -48,15 +48,72 @@ final class LumeController { /// Lists all virtual machines in the system @MainActor - public func list() throws -> [VMDetails] { + public func list(storage: String? 
= nil) throws -> [VMDetails] { do { - let vmLocations = try home.getAllVMDirectories() - let statuses = try vmLocations.map { vmWithLoc in - let vm = try self.get( - name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) - return vm.details + if let storage = storage { + // If storage is specified, only return VMs from that location + if storage.contains("/") || storage.contains("\\") { + // Direct path - check if it exists + if !FileManager.default.fileExists(atPath: storage) { + // Return empty array if the path doesn't exist + return [] + } + + // Try to get all VMs from the specified path + // We need to check which subdirectories are valid VM dirs + let directoryURL = URL(fileURLWithPath: storage) + let contents = try FileManager.default.contentsOfDirectory( + at: directoryURL, + includingPropertiesForKeys: [.isDirectoryKey], + options: .skipsHiddenFiles + ) + + let statuses = try contents.compactMap { subdir -> VMDetails? in + guard let isDirectory = try subdir.resourceValues(forKeys: [.isDirectoryKey]).isDirectory, + isDirectory else { + return nil + } + + let vmName = subdir.lastPathComponent + // Check if it's a valid VM directory + let vmDir = try home.getVMDirectoryFromPath(vmName, storagePath: storage) + if !vmDir.initialized() { + return nil + } + + do { + let vm = try self.get(name: vmName, storage: storage) + return vm.details + } catch { + // Skip invalid VM directories + return nil + } + } + return statuses + } else { + // Named storage + let vmsWithLoc = try home.getAllVMDirectories() + let statuses = try vmsWithLoc.compactMap { vmWithLoc -> VMDetails? in + // Only include VMs from the specified location + if vmWithLoc.locationName != storage { + return nil + } + let vm = try self.get( + name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) + return vm.details + } + return statuses + } + } else { + // No storage filter - get all VMs + let vmsWithLoc = try home.getAllVMDirectories() + let statuses = try vmsWithLoc.compactMap { vmWithLoc -> VMDetails? in + let vm = try self.get( + name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) + return vm.details + } + return statuses } - return statuses } catch { Logger.error("Failed to list VMs", metadata: ["error": error.localizedDescription]) throw error @@ -133,20 +190,42 @@ final class LumeController { public func get(name: String, storage: String? 
= nil) throws -> VM { let normalizedName = normalizeVMName(name: name) do { - // Try to find the VM and get its actual location - let actualLocation = try self.validateVMExists( - normalizedName, storage: storage) + let vm: VM + if let storagePath = storage, storagePath.contains("/") || storagePath.contains("\\") { + // Storage is a direct path + let vmDir = try home.getVMDirectoryFromPath(normalizedName, storagePath: storagePath) + guard vmDir.initialized() else { + // Throw a specific error if the directory exists but isn't a valid VM + if vmDir.exists() { + throw VMError.notInitialized(normalizedName) + } else { + throw VMError.notFound(normalizedName) + } + } + // Pass the path as the storage context + vm = try self.loadVM(vmDir: vmDir, storage: storagePath) + } else { + // Storage is nil or a named location + let actualLocation = try self.validateVMExists( + normalizedName, storage: storage) - // Load the VM from its actual location - let vm = try self.loadVM(name: normalizedName, storage: actualLocation) + let vmDir = try home.getVMDirectory(normalizedName, storage: actualLocation) + // loadVM will re-check initialized, but good practice to keep validateVMExists result. + vm = try self.loadVM(vmDir: vmDir, storage: actualLocation) + } return vm } catch { - Logger.error("Failed to get VM", metadata: ["error": error.localizedDescription]) + Logger.error( + "Failed to get VM", + metadata: [ + "vmName": normalizedName, "storage": storage ?? "default", + "error": error.localizedDescription, + ]) + // Re-throw the original error to preserve its type throw error } } - /// Factory for creating the appropriate VM type based on the OS @MainActor public func create( name: String, @@ -488,7 +567,7 @@ final class LumeController { let imageContainerRegistry = ImageContainerRegistry( registry: registry, organization: organization) - try await imageContainerRegistry.pull( + let _ = try await imageContainerRegistry.pull( image: actualImage, name: vmName, locationName: storage) @@ -752,15 +831,17 @@ final class LumeController { } @MainActor - private func loadVM(name: String, storage: String? = nil) throws -> VM { - let vmDir = try home.getVMDirectory(name, storage: storage) + private func loadVM(vmDir: VMDirectory, storage: String?) throws -> VM { + // vmDir is now passed directly guard vmDir.initialized() else { - throw VMError.notInitialized(name) + throw VMError.notInitialized(vmDir.name) // Use name from vmDir } let config: VMConfig = try vmDir.loadConfig() + // Pass the provided storage (which could be a path or named location) let vmDirContext = VMDirContext( - dir: vmDir, config: config, home: home, storage: storage) + dir: vmDir, config: config, home: home, storage: storage + ) let imageLoader = config.os.lowercased() == "macos" ? imageLoaderFactory.createImageLoader() : nil @@ -808,11 +889,22 @@ final class LumeController { public func validateVMExists(_ name: String, storage: String? = nil) throws -> String? 
{ // If location is specified, only check that location if let storage = storage { - let vmDir = try home.getVMDirectory(name, storage: storage) - guard vmDir.initialized() else { - throw VMError.notFound(name) + // Check if storage is a path by looking for directory separator + if storage.contains("/") || storage.contains("\\") { + // Treat as direct path + let vmDir = try home.getVMDirectoryFromPath(name, storagePath: storage) + guard vmDir.initialized() else { + throw VMError.notFound(name) + } + return storage // Return the path as the location identifier + } else { + // Treat as named storage + let vmDir = try home.getVMDirectory(name, storage: storage) + guard vmDir.initialized() else { + throw VMError.notFound(name) + } + return storage } - return storage } // If no location specified, try to find the VM in any location @@ -846,7 +938,29 @@ final class LumeController { throw ValidationError("Organization cannot be empty") } - let vmDir = try home.getVMDirectory(name, storage: storage) + // Determine if storage is a path or a named storage location + let vmDir: VMDirectory + if let storage = storage, storage.contains("/") || storage.contains("\\") { + // Create the base directory if it doesn't exist + if !FileManager.default.fileExists(atPath: storage) { + Logger.info("Creating VM storage directory", metadata: ["path": storage]) + do { + try FileManager.default.createDirectory( + atPath: storage, + withIntermediateDirectories: true + ) + } catch { + throw HomeError.directoryCreationFailed(path: storage) + } + } + + // Use getVMDirectoryFromPath for direct paths + vmDir = try home.getVMDirectoryFromPath(name, storagePath: storage) + } else { + // Use getVMDirectory for named storage locations + vmDir = try home.getVMDirectory(name, storage: storage) + } + if vmDir.exists() { throw VMError.alreadyExists(name) } diff --git a/libs/lume/src/Server/Handlers.swift b/libs/lume/src/Server/Handlers.swift index c968359a..bf289350 100644 --- a/libs/lume/src/Server/Handlers.swift +++ b/libs/lume/src/Server/Handlers.swift @@ -6,10 +6,10 @@ import Virtualization extension Server { // MARK: - VM Management Handlers - func handleListVMs() async throws -> HTTPResponse { + func handleListVMs(storage: String? = nil) async throws -> HTTPResponse { do { let vmController = LumeController() - let vms = try vmController.list() + let vms = try vmController.list(storage: storage) return try .json(vms) } catch { return .badRequest(message: error.localizedDescription) diff --git a/libs/lume/src/Server/Requests.swift b/libs/lume/src/Server/Requests.swift index da0bf681..5cde19d2 100644 --- a/libs/lume/src/Server/Requests.swift +++ b/libs/lume/src/Server/Requests.swift @@ -109,7 +109,7 @@ struct PushRequest: Codable { let tags: [String] // List of tags to push var registry: String // Registry URL var organization: String // Organization/user in the registry - let storage: String? // Optional VM storage location + let storage: String? // Optional VM storage location or direct path var chunkSizeMb: Int // Chunk size // dryRun and reassemble are less common for API, default to false? 
// verbose is usually handled by server logging diff --git a/libs/lume/src/Server/Server.swift b/libs/lume/src/Server/Server.swift index 71db4a75..98ffc588 100644 --- a/libs/lume/src/Server/Server.swift +++ b/libs/lume/src/Server/Server.swift @@ -79,9 +79,11 @@ final class Server { routes = [ Route( method: "GET", path: "/lume/vms", - handler: { [weak self] _ in + handler: { [weak self] request in guard let self else { throw HTTPError.internalError } - return try await self.handleListVMs() + // Extract storage from query params if present + let storage = self.extractQueryParam(request: request, name: "storage") + return try await self.handleListVMs(storage: storage) }), Route( method: "GET", path: "/lume/vms/:name", diff --git a/libs/lumier/src/lib/vm.sh b/libs/lumier/src/lib/vm.sh index 9d3dda06..5bcd5d7d 100755 --- a/libs/lumier/src/lib/vm.sh +++ b/libs/lumier/src/lib/vm.sh @@ -1,32 +1,32 @@ #!/usr/bin/env bash start_vm() { - # Set up dedicated storage for this VM - STORAGE_NAME="storage_${VM_NAME}" - if [ -n "$HOST_STORAGE_PATH" ]; then - lume config storage add "$STORAGE_NAME" "$HOST_STORAGE_PATH" >/dev/null 2>&1 || true + # Determine storage path for VM + STORAGE_PATH="$HOST_STORAGE_PATH" + if [ -z "$STORAGE_PATH" ]; then + STORAGE_PATH="storage_${VM_NAME}" fi # Check if VM exists and its status using JSON format - VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>&1) # Check if VM not found error if [[ $VM_INFO == *"Virtual machine not found"* ]]; then IMAGE_NAME="${VERSION##*/}" - lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_NAME" + lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_PATH" else # Parse the JSON status - check if it contains "status" : "running" if [[ $VM_INFO == *'"status" : "running"'* ]]; then - # lume_stop "$VM_NAME" "$STORAGE_NAME" - lume stop "$VM_NAME" --storage "$STORAGE_NAME" + lume_stop "$VM_NAME" "$STORAGE_PATH" + # lume stop "$VM_NAME" --storage "$STORAGE_PATH" fi fi # Set VM parameters - lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_NAME" + lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_PATH" # Fetch VM configuration - CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json) + CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json) # Setup data directory args if necessary SHARED_DIR_ARGS="" @@ -39,8 +39,8 @@ start_vm() { fi # Run VM with VNC and shared directory using curl - # lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & - lume run "$VM_NAME" --storage "$STORAGE_NAME" --no-display + lume_run $SHARED_DIR_ARGS --storage "$STORAGE_PATH" "$VM_NAME" & + # lume run "$VM_NAME" --storage "$STORAGE_PATH" --no-display # Wait for VM to be running and VNC URL to be available vm_ip="" @@ -50,7 +50,7 @@ start_vm() { while [ $attempt -lt $max_attempts ]; do # Get VM info as JSON - VM_INFO=$(lume get "$VM_NAME" -f json 2>/dev/null) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>/dev/null) # Check if VM has status 'running' if [[ $VM_INFO == *'"status" : "running"'* ]]; then @@ -71,8 +71,8 @@ start_vm() { if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then echo "Timed out waiting for VM to start or VNC URL to become available." 
- # lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 - lume stop "$VM_NAME" --storage "$STORAGE_NAME" > /dev/null 2>&1 + lume_stop "$VM_NAME" "$STORAGE_PATH" > /dev/null 2>&1 + # lume stop "$VM_NAME" --storage "$STORAGE_PATH" > /dev/null 2>&1 exit 1 fi @@ -100,13 +100,16 @@ start_vm() { stop_vm() { echo "Stopping VM '$VM_NAME'..." - STORAGE_NAME="storage_${VM_NAME}" + STORAGE_PATH="$HOST_STORAGE_PATH" + if [ -z "$STORAGE_PATH" ]; then + STORAGE_PATH="storage_${VM_NAME}" + fi # Check if the VM exists and is running (use lume get for speed) - VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>/dev/null) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>/dev/null) if [[ -z "$VM_INFO" || $VM_INFO == *"Virtual machine not found"* ]]; then echo "VM '$VM_NAME' does not exist." elif [[ $VM_INFO == *'"status" : "running"'* ]]; then - lume_stop "$VM_NAME" "$STORAGE_NAME" + lume_stop "$VM_NAME" "$STORAGE_PATH" echo "VM '$VM_NAME' was running and is now stopped." elif [[ $VM_INFO == *'"status" : "stopped"'* ]]; then echo "VM '$VM_NAME' is already stopped." From 0543e16c1e11b807b90ccaedebfb94a1f3f11b9b Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 23:46:06 +0000 Subject: [PATCH 19/38] docs: update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 53102fcb..eed2c55f 100644 --- a/README.md +++ b/README.md @@ -228,6 +228,7 @@ Apple, macOS, and Apple Silicon are trademarks of Apple Inc. Ubuntu and Canonica Rahim Nathwani

💻 Matt Speck

💻 + FinnBorge

💻 From 8bfb9dbe052e9647d0c6f94669c48410d30178b4 Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 23:46:07 +0000 Subject: [PATCH 20/38] docs: update .all-contributorsrc --- .all-contributorsrc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index d1b3578e..503f0e94 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -151,6 +151,15 @@ "contributions": [ "code" ] + }, + { + "login": "FinnBorge", + "name": "FinnBorge", + "avatar_url": "https://avatars.githubusercontent.com/u/9272726?v=4", + "profile": "https://github.com/FinnBorge", + "contributions": [ + "code" + ] } ] } From 9b78a40cb556ee67e159bb47a40b1af4d31fdf26 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Tue, 29 Apr 2025 17:33:17 -0700 Subject: [PATCH 21/38] Handle lume run storage not found --- libs/lume/src/LumeController.swift | 157 +++++++++++++++++------------ 1 file changed, 93 insertions(+), 64 deletions(-) diff --git a/libs/lume/src/LumeController.swift b/libs/lume/src/LumeController.swift index f25079ff..03db4999 100644 --- a/libs/lume/src/LumeController.swift +++ b/libs/lume/src/LumeController.swift @@ -408,58 +408,84 @@ final class LumeController { "Running VM", metadata: [ "name": normalizedName, - "location": storage ?? "default", "no_display": "\(noDisplay)", "shared_directories": "\(sharedDirectories.map( { $0.string } ).joined(separator: ", "))", "mount": mount?.path ?? "none", "vnc_port": "\(vncPort)", "recovery_mode": "\(recoveryMode)", - "storage_param": storage ?? "default", + "storage_param": storage ?? "default", // Log the original param "usb_storage_devices": "\(usbMassStoragePaths?.count ?? 0)", ]) do { - // Check if this is an image reference (contains a tag) - let components = name.split(separator: ":") - if components.count == 2 { - do { - _ = try self.validateVMExists(normalizedName, storage: storage) - } catch { - // If the VM doesn't exist, try to pull the image + // Check if name is an image ref to auto-pull + let components = normalizedName.split(separator: ":") + if components.count == 2 { // Check if it looks like image:tag + // Attempt to validate if VM exists first, suppressing the error + // This avoids pulling if the VM already exists, even if name looks like an image ref + let vmExists = (try? self.validateVMExists(normalizedName, storage: storage)) != nil + if !vmExists { + Logger.info( + "VM not found, attempting to pull image based on name", + metadata: ["imageRef": normalizedName]) + // Use the potentially new VM name derived from the image ref + let potentialVMName = String(components[0]) try await pullImage( - image: name, - name: nil, + image: normalizedName, // Full image ref + name: potentialVMName, // Name derived from image registry: registry, organization: organization, storage: storage ) + // Important: After pull, the effective name might have changed + // We proceed assuming the user wants to run the VM derived from image name + // normalizedName = potentialVMName // Re-assign normalizedName if pull logic creates it + // Note: Current pullImage doesn't return the final VM name, + // so we assume it matches the name derived from the image. + // This might need refinement if pullImage behaviour changes. } } - // Find VM and get its actual location - let actualLocation = try validateVMExists(normalizedName, storage: storage) + // Determine effective storage path or name AND get the VMDirectory + let effectiveStorage: String? 
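+            // A storage value containing a path separator ("/" or "\") is treated as a
+            // direct path to the VM directory; a nil or plain name is resolved as a named
+            // storage location (or the default).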
+ let vmDir: VMDirectory - // Log if we found the VM in a different location than default - if actualLocation != storage && actualLocation != nil { + if let storagePath = storage, storagePath.contains("/") || storagePath.contains("\\") { + // Storage is a direct path + vmDir = try home.getVMDirectoryFromPath(normalizedName, storagePath: storagePath) + guard vmDir.initialized() else { + if vmDir.exists() { + throw VMError.notInitialized(normalizedName) + } else { + throw VMError.notFound(normalizedName) + } + } + effectiveStorage = storagePath // Use the path string + Logger.info("Using direct storage path", metadata: ["path": storagePath]) + } else { + // Storage is nil or a named location - validate and get the actual name + let actualLocationName = try validateVMExists(normalizedName, storage: storage) + vmDir = try home.getVMDirectory(normalizedName, storage: actualLocationName) // Get VMDir for named location + effectiveStorage = actualLocationName // Use the named location string Logger.info( - "Found VM in location", + "Using named storage location", metadata: [ - "name": normalizedName, - "location": actualLocation ?? "default", + "requested": storage ?? "default", + "actual": actualLocationName ?? "default", ]) } + // Validate parameters using the located VMDirectory try validateRunParameters( - name: normalizedName, + vmDir: vmDir, // Pass vmDir sharedDirectories: sharedDirectories, mount: mount, - storage: actualLocation, usbMassStoragePaths: usbMassStoragePaths ) - // Use the actual VM location that we found - let vm = try get(name: normalizedName, storage: actualLocation) + // Load the VM directly using the located VMDirectory and storage context + let vm = try self.loadVM(vmDir: vmDir, storage: effectiveStorage) SharedVM.shared.setVM(name: normalizedName, vm: vm) try await vm.run( @@ -918,6 +944,51 @@ final class LumeController { throw VMError.notFound(name) } + private func validateRunParameters( + vmDir: VMDirectory, // Changed signature: accept VMDirectory + sharedDirectories: [SharedDirectory]?, + mount: Path?, + usbMassStoragePaths: [Path]? = nil + ) throws { + // VM existence is confirmed by having vmDir, no need for validateVMExists + if let dirs = sharedDirectories { + try self.validateSharedDirectories(dirs) + } + + // Validate USB mass storage paths + if let usbPaths = usbMassStoragePaths { + for path in usbPaths { + if !FileManager.default.fileExists(atPath: path.path) { + throw ValidationError("USB mass storage image not found: \(path.path)") + } + } + + if #available(macOS 15.0, *) { + // USB mass storage is supported + } else { + Logger.info( + "USB mass storage devices require macOS 15.0 or later. They will be ignored.") + } + } + + // Load config directly from vmDir + let vmConfig = try vmDir.loadConfig() + switch vmConfig.os.lowercased() { + case "macos": + if mount != nil { + throw ValidationError( + "Mounting disk images is not supported for macOS VMs. If you are looking to mount a IPSW, please use the --ipsw option in the create command." + ) + } + case "linux": + if let mount = mount, !FileManager.default.fileExists(atPath: mount.path) { + throw ValidationError("Mount file not found: \(mount.path)") + } + default: + break + } + } + private func validatePullParameters( image: String, name: String, @@ -966,48 +1037,6 @@ final class LumeController { } } - private func validateRunParameters( - name: String, sharedDirectories: [SharedDirectory]?, mount: Path?, - storage: String? = nil, usbMassStoragePaths: [Path]? 
= nil - ) throws { - _ = try self.validateVMExists(name, storage: storage) - if let dirs = sharedDirectories { - try self.validateSharedDirectories(dirs) - } - - // Validate USB mass storage paths - if let usbPaths = usbMassStoragePaths { - for path in usbPaths { - if !FileManager.default.fileExists(atPath: path.path) { - throw ValidationError("USB mass storage image not found: \(path.path)") - } - } - - if #available(macOS 15.0, *) { - // USB mass storage is supported - } else { - Logger.info( - "USB mass storage devices require macOS 15.0 or later. They will be ignored.") - } - } - - let vmConfig = try home.getVMDirectory(name, storage: storage).loadConfig() - switch vmConfig.os.lowercased() { - case "macos": - if mount != nil { - throw ValidationError( - "Mounting disk images is not supported for macOS VMs. If you are looking to mount a IPSW, please use the --ipsw option in the create command." - ) - } - case "linux": - if let mount = mount, !FileManager.default.fileExists(atPath: mount.path) { - throw ValidationError("Mount file not found: \(mount.path)") - } - default: - break - } - } - private func validatePushParameters( name: String, imageName: String, From 5fc627ed69734b79ab854c39f9ec06983b1403d6 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 11:02:39 -0700 Subject: [PATCH 22/38] added dragging along path --- .../providers/anthropic/tools/computer.py | 20 +++++---- .../agent/providers/openai/tools/computer.py | 41 +++++++++++++++++++ .../computer_server/handlers/base.py | 13 +++++- .../computer_server/handlers/macos.py | 35 +++++++++++++++- libs/computer-server/computer_server/main.py | 1 + libs/computer/computer/interface/base.py | 11 +++++ libs/computer/computer/interface/macos.py | 5 +++ 7 files changed, 115 insertions(+), 11 deletions(-) diff --git a/libs/agent/agent/providers/anthropic/tools/computer.py b/libs/agent/agent/providers/anthropic/tools/computer.py index 8425f35f..ecf232bd 100644 --- a/libs/agent/agent/providers/anthropic/tools/computer.py +++ b/libs/agent/agent/providers/anthropic/tools/computer.py @@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): self.logger.info(f"Moving cursor to ({x}, {y})") await self.computer.interface.move_cursor(x, y) elif action == "left_click_drag": - self.logger.info(f"Dragging from ({x}, {y})") - # First move to the position - await self.computer.interface.move_cursor(x, y) - # Then perform drag operation - check if drag_to exists or we need to use other methods - try: - await self.computer.interface.drag_to(x, y) - except Exception as e: - self.logger.error(f"Error during drag operation: {str(e)}") - raise ToolError(f"Failed to perform drag: {str(e)}") + # Get the start coordinate from kwargs + start_coordinate = kwargs.get("start_coordinate") + if not start_coordinate: + raise ToolError("start_coordinate is required for left_click_drag action") + + start_x, start_y = start_coordinate + end_x, end_y = x, y + + self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})") + await self.computer.interface.move_cursor(start_x, start_y) + await self.computer.interface.drag_to(end_x, end_y) # Wait briefly for any UI changes await asyncio.sleep(0.5) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index 90ef5935..c5602f4e 100644 --- a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -44,6 +44,7 @@ Action = Literal[ "double_click", 
"screenshot", "scroll", + "drag", ] @@ -165,6 +166,11 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): scroll_x = kwargs.get("scroll_x", 0) // 50 scroll_y = kwargs.get("scroll_y", 0) // 50 return await self.handle_scroll(x, y, scroll_x, scroll_y) + elif type == "drag": + path = kwargs.get("path") + if not path or not isinstance(path, list) or len(path) < 2: + raise ToolError("path is required for drag action and must contain at least 2 points") + return await self.handle_drag(path) elif type == "screenshot": return await self.screenshot() elif type == "wait": @@ -302,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): self.logger.error(f"Error in handle_scroll: {str(e)}") raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}") + async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult: + """Handle mouse drag operation using a path of coordinates. + + Args: + path: List of coordinate points {"x": int, "y": int} defining the drag path + + Returns: + ToolResult with the operation result and screenshot + """ + try: + # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format + points = [(p["x"], p["y"]) for p in path] + + # Perform drag action + if len(points) == 2: + await self.computer.interface.move_cursor(points[0][0], points[0][1]) + await self.computer.interface.drag_to(points[1][0], points[1][1]) + else: + await self.computer.interface.drag(points, button="left") + + # Wait for UI to update + await asyncio.sleep(0.5) + + # Take screenshot after action + screenshot = await self.computer.interface.screenshot() + base64_screenshot = base64.b64encode(screenshot).decode("utf-8") + + return ToolResult( + output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})", + base64_image=base64_screenshot, + ) + except Exception as e: + self.logger.error(f"Error in handle_drag: {str(e)}") + raise ToolError(f"Failed to perform drag operation: {str(e)}") + async def screenshot(self) -> ToolResult: """Take a screenshot.""" try: diff --git a/libs/computer-server/computer_server/handlers/base.py b/libs/computer-server/computer_server/handlers/base.py index 818d367c..08d57ad5 100644 --- a/libs/computer-server/computer_server/handlers/base.py +++ b/libs/computer-server/computer_server/handlers/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List, Tuple class BaseAccessibilityHandler(ABC): """Abstract base class for OS-specific accessibility handlers.""" @@ -59,6 +59,17 @@ class BaseAutomationHandler(ABC): duration: How long the drag should take in seconds """ pass + + @abstractmethod + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]: + """Drag the cursor from current position to specified coordinates. 
+ + Args: + path: A list of tuples of x and y coordinates to drag to + button: The mouse button to use ('left', 'middle', 'right') + duration: How long the drag should take in seconds + """ + pass # Keyboard Actions @abstractmethod diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py index 180f083a..abdedc41 100644 --- a/libs/computer-server/computer_server/handlers/macos.py +++ b/libs/computer-server/computer_server/handlers/macos.py @@ -1,7 +1,7 @@ import pyautogui import base64 from io import BytesIO -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Tuple from ctypes import byref, c_void_p, POINTER from AppKit import NSWorkspace # type: ignore import AppKit @@ -563,6 +563,39 @@ class MacOSAutomationHandler(BaseAutomationHandler): except Exception as e: return {"success": False, "error": str(e)} + async def drag( + self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5 + ) -> Dict[str, Any]: + try: + if not path or len(path) < 2: + return {"success": False, "error": "Path must contain at least 2 points"} + + # Move to the first point + start_x, start_y = path[0] + pyautogui.moveTo(start_x, start_y) + + # Press the mouse button + pyautogui.mouseDown(button=button) + + # Calculate time between points to distribute duration evenly + step_duration = duration / (len(path) - 1) if len(path) > 1 else duration + + # Move through each subsequent point + for x, y in path[1:]: + pyautogui.moveTo(x, y, duration=step_duration) + + # Release the mouse button + pyautogui.mouseUp(button=button) + + return {"success": True} + except Exception as e: + # Make sure to release the mouse button if an error occurs + try: + pyautogui.mouseUp(button=button) + except: + pass + return {"success": False, "error": str(e)} + # Keyboard Actions async def type_text(self, text: str) -> Dict[str, Any]: try: diff --git a/libs/computer-server/computer_server/main.py b/libs/computer-server/computer_server/main.py index c95918d8..d7f66f89 100644 --- a/libs/computer-server/computer_server/main.py +++ b/libs/computer-server/computer_server/main.py @@ -65,6 +65,7 @@ async def websocket_endpoint(websocket: WebSocket): "type_text": manager.automation_handler.type_text, "press_key": manager.automation_handler.press_key, "drag_to": manager.automation_handler.drag_to, + "drag": manager.automation_handler.drag, "hotkey": manager.automation_handler.hotkey, "get_cursor_position": manager.automation_handler.get_cursor_position, "get_screen_size": manager.automation_handler.get_screen_size, diff --git a/libs/computer/computer/interface/base.py b/libs/computer/computer/interface/base.py index 31106c14..8fcbd21c 100644 --- a/libs/computer/computer/interface/base.py +++ b/libs/computer/computer/interface/base.py @@ -79,6 +79,17 @@ class BaseComputerInterface(ABC): """ pass + @abstractmethod + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> None: + """Drag the cursor along a path of coordinates. 
+ + Args: + path: List of (x, y) coordinate tuples defining the drag path + button: The mouse button to use ('left', 'middle', 'right') + duration: Total time in seconds that the drag operation should take + """ + pass + # Keyboard Actions @abstractmethod async def type_text(self, text: str) -> None: diff --git a/libs/computer/computer/interface/macos.py b/libs/computer/computer/interface/macos.py index a3b99f7d..2460086c 100644 --- a/libs/computer/computer/interface/macos.py +++ b/libs/computer/computer/interface/macos.py @@ -328,6 +328,11 @@ class MacOSComputerInterface(BaseComputerInterface): "drag_to", {"x": x, "y": y, "button": button, "duration": duration} ) + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> None: + await self._send_command( + "drag", {"path": path, "button": button, "duration": duration} + ) + # Keyboard Actions async def type_text(self, text: str) -> None: await self._send_command("type_text", {"text": text}) From 7981000820f222b40f2a473987afaf29de335e31 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 11:02:54 -0700 Subject: [PATCH 23/38] added message when scalable oai endpoint is still warming up --- .../providers/uitars/clients/oaicompat.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 1b512997..423b1d3a 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -190,25 +190,21 @@ class OAICompatClient(BaseUITarsClient): response_text = await response.text() logger.debug(f"Response content: {response_text}") + # if 503, then the endpoint is still warming up + if response.status == 503: + logger.error(f"Endpoint is still warming up, please try again later") + raise Exception(f"Endpoint is still warming up: {response_text}") + # Try to parse as JSON if the content type is appropriate if "application/json" in response.headers.get('Content-Type', ''): response_json = await response.json() else: raise Exception(f"Response is not JSON format") - # # Optionally try to parse it anyway - # try: - # import json - # response_json = json.loads(response_text) - # except json.JSONDecodeError as e: - # print(f"Failed to parse response as JSON: {e}") if response.status != 200: - error_msg = response_json.get("error", {}).get( - "message", str(response_json) - ) - logger.error(f"Error in API call: {error_msg}") - raise Exception(f"API error: {error_msg}") - + logger.error(f"Error in API call: {response_text}") + raise Exception(f"API error: {response_text}") + return response_json except Exception as e: From e55e649cd6254fc06f0699583e99b651d3c4d523 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 14:47:27 -0700 Subject: [PATCH 24/38] improved display of AgentResponse objects in gradio ui, and standardized uitars agent output --- libs/agent/agent/providers/uitars/loop.py | 44 ++----- libs/agent/agent/providers/uitars/utils.py | 113 +++++++++++++++++- libs/agent/agent/ui/gradio/app.py | 128 ++++++++++----------- 3 files changed, 179 insertions(+), 106 deletions(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 71d2c739..a30d3bee 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -17,7 +17,7 @@ from ...core.types import AgentResponse, LLMProvider from 
...core.visualization import VisualizationHelper from computer import Computer -from .utils import add_box_token, parse_actions, parse_action_parameters +from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format from .tools.manager import ToolManager from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES @@ -507,41 +507,14 @@ class UITARSLoop(BaseLoop): # Update whether an action screenshot was saved this turn action_screenshot_saved = action_screenshot_saved or new_screenshot_saved - - # Parse actions from the raw response - raw_response = response["choices"][0]["message"]["content"] - parsed_actions = parse_actions(raw_response) - # Extract thought content if available - thought = "" - if "Thought:" in raw_response: - thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL) - if thought_match: - thought = thought_match.group(1).strip() + agent_response = await to_agent_response_format( + response, + messages, + model=self.model, + ) + yield agent_response - # Create standardized thought response format - thought_response = { - "role": "assistant", - "content": thought or raw_response, - "metadata": { - "title": "🧠 UI-TARS Thoughts" - } - } - - # Create action response format - action_response = { - "role": "assistant", - "content": str(parsed_actions), - "metadata": { - "title": "🖱️ UI-TARS Actions", - } - } - - # Yield both responses to the caller (thoughts first, then actions) - yield thought_response - if parsed_actions: - yield action_response - # Check if we should continue this conversation running = should_continue @@ -562,7 +535,8 @@ class UITARSLoop(BaseLoop): logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}") yield { - "error": str(e), + "role": "assistant", + "content": f"Error: {str(e)}", "metadata": {"title": "❌ Error"}, } diff --git a/libs/agent/agent/providers/uitars/utils.py b/libs/agent/agent/providers/uitars/utils.py index 00565b88..cc904115 100644 --- a/libs/agent/agent/providers/uitars/utils.py +++ b/libs/agent/agent/providers/uitars/utils.py @@ -4,9 +4,114 @@ import logging import base64 import re from typing import Any, Dict, List, Optional, Union, Tuple +from datetime import datetime logger = logging.getLogger(__name__) +from ...core.types import AgentResponse + +async def to_agent_response_format( + response: Dict[str, Any], + messages: List[Dict[str, Any]], + model: Optional[str] = None, +) -> AgentResponse: + """Convert raw UI-TARS response to agent response format. 
+ + Args: + response: Raw UI-TARS response + messages: List of messages in standard format + model: Optional model name + + Returns: + AgentResponse: Standardized agent response format + """ + # Create unique IDs for this response + response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}" + reasoning_id = f"rs_{response_id}" + action_id = f"cu_{response_id}" + call_id = f"call_{response_id}" + + # Parse actions from the raw response + content = response["choices"][0]["message"]["content"] + actions = parse_actions(content) + + # Extract thought content if available + reasoning_text = "" + if "Thought:" in content: + thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL) + if thought_match: + reasoning_text = thought_match.group(1).strip() + + # Create output items + output_items = [] + if reasoning_text: + output_items.append({ + "type": "reasoning", + "id": reasoning_id, + "text": reasoning_text + }) + if actions: + for i, action in enumerate(actions): + action_name, tool_args = parse_action_parameters(action) + if action_name == "finished": + output_items.append({ + "type": "message", + "role": "assistant", + "content": [{ + "type": "output_text", + "text": tool_args["content"] + }], + "id": f"action_{i}_{action_id}", + "status": "completed" + }) + else: + if tool_args.get("action") == action_name: + del tool_args["action"] + output_items.append({ + "type": "computer_call", + "id": f"{action}_{i}_{action_id}", + "call_id": f"call_{i}_{action_id}", + "action": { "type": action_name, **tool_args }, + "pending_safety_checks": [], + "status": "completed" + }) + + # Create agent response + agent_response = AgentResponse( + id=response_id, + object="response", + created_at=int(datetime.now().timestamp()), + status="completed", + error=None, + incomplete_details=None, + instructions=None, + max_output_tokens=None, + model=model or response["model"], + output=output_items, + parallel_tool_calls=True, + previous_response_id=None, + reasoning={"effort": "medium"}, + store=True, + temperature=0.0, + top_p=0.7, + text={"format": {"type": "text"}}, + tool_choice="auto", + tools=[ + { + "type": "computer_use_preview", + "display_height": 768, + "display_width": 1024, + "environment": "mac", + } + ], + truncation="auto", + usage=response["usage"], + user=None, + metadata={}, + response=response + ) + return agent_response + def add_box_token(input_string: str) -> str: """Add box tokens to the coordinates in the model response. 
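The hunk above converts a raw UI-TARS completion into the shared `AgentResponse` shape used by the other loops. A rough, hypothetical sketch of exercising it (the payload, model name, and token counts below are made up, and the exact parsed items depend on `parse_actions`):

```python
# Hypothetical usage sketch; not part of this patch.
import asyncio
from agent.providers.uitars.utils import to_agent_response_format

raw = {  # minimal fake UI-TARS completion
    "model": "ui-tars-1.5-7b",
    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    "choices": [{"message": {"content":
        "Thought: open the browser\nAction: click(start_box='(28,15)')"}}],
}

resp = asyncio.run(to_agent_response_format(raw, messages=[], model=None))
# Expected rough shape: a "reasoning" item carrying the thought text, plus a
# "computer_call" item whose "action" dict holds the parsed click parameters
# (assuming parse_actions recognizes the action string above).
print([item["type"] for item in resp["output"]])
```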
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]: """ # Handle "finished" action if action.startswith("finished"): - return "finished", {} + # Parse content if it exists + content_match = re.search(r"content='([^']*)'", action) + if content_match: + content = content_match.group(1) + return "finished", {"content": content} + else: + return "finished", {} # Parse action parameters action_match = re.match(r'(\w+)\((.*)\)', action) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index c6ac57ea..b8ab480a 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -35,6 +35,7 @@ from pathlib import Path from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union import gradio as gr from gradio.components.chatbot import MetadataDict +from typing import cast # Import from agent package from agent.core.types import AgentResponse @@ -447,66 +448,6 @@ def create_agent( return global_agent - -def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]: - """Process agent results for the Gradio UI.""" - # Extract text content - text_obj = result.get("text", {}) - metadata = result.get("metadata", {}) - - # Create a properly typed MetadataDict - metadata_dict = MetadataDict() - metadata_dict["title"] = metadata.get("title", "") - metadata_dict["status"] = "done" - metadata = metadata_dict - - # For OpenAI's Computer-Use Agent, text field is an object with format property - if ( - text_obj - and isinstance(text_obj, dict) - and "format" in text_obj - and not text_obj.get("value", "") - ): - content, metadata = extract_synthesized_text(result) - else: - if not text_obj: - text_obj = result - - # For other types of results, try to get text directly - if isinstance(text_obj, dict): - if "value" in text_obj: - content = text_obj["value"] - elif "text" in text_obj: - content = text_obj["text"] - elif "content" in text_obj: - content = text_obj["content"] - else: - content = "" - else: - content = str(text_obj) if text_obj else "" - - # If still no content but we have outputs, create a summary - if not content and "output" in result and result["output"]: - output = result["output"] - for out in output: - if out.get("type") == "reasoning": - content = out.get("content", "") - if content: - break - elif out.get("type") == "computer_call": - action = out.get("action", {}) - action_type = action.get("type", "") - if action_type: - content = f"Performing action: {action_type}" - break - - # Clean up the text - ensure content is a string - if not isinstance(content, str): - content = str(content) if content else "" - - return content, metadata - - def create_gradio_ui( provider_name: str = "openai", model_name: str = "gpt-4o", @@ -907,17 +848,64 @@ def create_gradio_ui( # Stream responses from the agent async for result in global_agent.run(last_user_message): - # Process result - content, metadata = process_agent_result(result) - - # Skip empty content - if content or metadata.get("title"): - history.append( - gr.ChatMessage( - role="assistant", content=content, metadata=metadata + print(f"DEBUG - Agent response ------- START") + from pprint import pprint + pprint(result) + print(f"DEBUG - Agent response ------- END") + + def generate_gradio_messages(): + if result.get("content"): + yield gr.ChatMessage( + role="assistant", + content=result.get("content", ""), + metadata=cast(MetadataDict, result.get("metadata", {})) ) - ) - yield history + else: + outputs 
= result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + yield gr.ChatMessage( + role=output.get("role", "assistant"), + content=content_part.get("text", ""), + metadata=content_part.get("metadata", {}) + ) + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + yield gr.ChatMessage( + role="assistant", + content=summary_part.get("text", "") + ) + else: + summary_content = output.get("text", "") + if summary_content: + yield gr.ChatMessage( + role="assistant", + content=summary_content, + ) + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + yield gr.ChatMessage( + role="assistant", + content=f"```json\n{json.dumps(action)}\n```", + metadata={"title": action_title} + ) + + for message in generate_gradio_messages(): + history.append(message) + yield history + except Exception as e: import traceback From 2e6d3e4d2d5ceb3349a217fad339d17c22b0294c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 14:48:51 -0700 Subject: [PATCH 25/38] old code removal --- libs/agent/agent/ui/gradio/app.py | 57 ------------------------------- 1 file changed, 57 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index b8ab480a..a4541019 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -323,63 +323,6 @@ def get_ollama_models() -> List[str]: logging.error(f"Error getting Ollama models: {e}") return [] - -def extract_synthesized_text( - result: Union[AgentResponse, Dict[str, Any]], -) -> Tuple[str, MetadataDict]: - """Extract synthesized text from the agent result.""" - synthesized_text = "" - metadata = MetadataDict() - - if "output" in result and result["output"]: - for output in result["output"]: - if output.get("type") == "reasoning": - metadata["title"] = "🧠 Reasoning" - content = output.get("content", "") - if content: - synthesized_text += f"{content}\n" - elif output.get("type") == "message": - # Handle message type outputs - can contain rich content - content = output.get("content", []) - - # Content is usually an array of content blocks - if isinstance(content, list): - for block in content: - if isinstance(block, dict) and block.get("type") == "output_text": - text_value = block.get("text", "") - if text_value: - synthesized_text += f"{text_value}\n" - - elif output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - - # Create a descriptive text about the action - if action_type == "click": - button = action.get("button", "") - x = action.get("x", "") - y = action.get("y", "") - synthesized_text += f"Clicked {button} at position ({x}, {y}).\n" - elif action_type == "type": - text = action.get("text", "") - synthesized_text += f"Typed: {text}.\n" - elif action_type == "keypress": - # Extract key correctly from either keys array or key field - if isinstance(action.get("keys"), list): - key = ", ".join(action.get("keys")) - else: - key = action.get("key", "") - - 
synthesized_text += f"Pressed key: {key}\n" - else: - synthesized_text += f"Performed {action_type} action.\n" - - metadata["status"] = "done" - metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}" - - return synthesized_text.strip(), metadata - - def create_computer_instance(verbosity: int = logging.INFO) -> Computer: """Create or get the global Computer instance.""" global global_computer From a5ec926922f55c03303261e90aa4805ce4fc146d Mon Sep 17 00:00:00 2001 From: f-trycua Date: Wed, 30 Apr 2025 15:11:49 -0700 Subject: [PATCH 26/38] Add --no-background-service option --- libs/lume/scripts/install.sh | 91 ++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index d854c0e4..4c1efdc9 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -20,24 +20,32 @@ INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" GITHUB_REPO="trycua/cua" LATEST_RELEASE_URL="https://api.github.com/repos/$GITHUB_REPO/releases/latest" +# Option to skip background service setup (default: install it) +INSTALL_BACKGROUND_SERVICE=true + # Parse command line arguments while [ "$#" -gt 0 ]; do case "$1" in --install-dir=*) INSTALL_DIR="${1#*=}" ;; + --no-background-service|--skip-background-service) + INSTALL_BACKGROUND_SERVICE=false + ;; --help) echo "${BOLD}${BLUE}Lume Installer${NORMAL}" echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" - echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" - echo " --help Display this help message" + echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" + echo " --no-background-service Do not setup the Lume background service (LaunchAgent)" + echo " --help Display this help message" echo "" echo "Examples:" - echo " $0 # Install to $DEFAULT_INSTALL_DIR" - echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" - echo " INSTALL_DIR=/opt/lume $0 # Install to /opt/lume (legacy env var support)" + echo " $0 # Install to $DEFAULT_INSTALL_DIR and setup background service" + echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" + echo " $0 --no-background-service # Install without setting up the background service" + echo " INSTALL_DIR=/opt/lume $0 # Install to /opt/lume (legacy env var support)" exit 0 ;; *) @@ -207,25 +215,26 @@ main() { echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" echo "Run ${BOLD}lume${NORMAL} to get started." - # --- LaunchAgent setup for lume daemon --- - SERVICE_NAME="com.trycua.lume_daemon" - PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" - LUME_BIN="$INSTALL_DIR/lume" + if [ "$INSTALL_BACKGROUND_SERVICE" = true ]; then + # --- Setup background service (LaunchAgent) for Lume --- + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + LUME_BIN="$INSTALL_DIR/lume" - echo "" - echo "Setting up LaunchAgent to run lume daemon on login..." + echo "" + echo "Setting up LaunchAgent to run lume daemon on login..." - # Create LaunchAgents directory if it doesn't exist - mkdir -p "$HOME/Library/LaunchAgents" + # Create LaunchAgents directory if it doesn't exist + mkdir -p "$HOME/Library/LaunchAgents" - # Unload existing service if present - if [ -f "$PLIST_PATH" ]; then - echo "Existing LaunchAgent found. Unloading..." 
- launchctl unload "$PLIST_PATH" 2>/dev/null || true - fi + # Unload existing service if present + if [ -f "$PLIST_PATH" ]; then + echo "Existing LaunchAgent found. Unloading..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + fi - # Create the plist file - cat < "$PLIST_PATH" + # Create the plist file + cat < "$PLIST_PATH" @@ -262,23 +271,35 @@ main() { EOF - # Set permissions - chmod 644 "$PLIST_PATH" - touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log - chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log + # Set permissions + chmod 644 "$PLIST_PATH" + touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log + chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log - # Load the LaunchAgent - echo "Loading LaunchAgent..." - launchctl unload "$PLIST_PATH" 2>/dev/null || true - launchctl load "$PLIST_PATH" + # Load the LaunchAgent + echo "Loading LaunchAgent..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + launchctl load "$PLIST_PATH" - echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" - echo "To check status: launchctl list | grep $SERVICE_NAME" - echo "To view logs: tail -f /tmp/lume_daemon.log" - echo "" - echo "To remove the lume daemon service, run:" - echo " launchctl unload \"$PLIST_PATH\"" - echo " rm \"$PLIST_PATH\"" + echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" + echo "To check status: launchctl list | grep $SERVICE_NAME" + echo "To view logs: tail -f /tmp/lume_daemon.log" + echo "" + echo "To remove the lume daemon service, run:" + echo " launchctl unload \"$PLIST_PATH\"" + echo " rm \"$PLIST_PATH\"" + else + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + if [ -f "$PLIST_PATH" ]; then + echo "Removing existing Lume background service (LaunchAgent)..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + rm "$PLIST_PATH" + echo "Lume background service (LaunchAgent) removed." + else + echo "Skipping Lume background service (LaunchAgent) setup as requested (use --no-background-service)." + fi + fi } # Run the installation From db40dae0803247a58dab817845da88a00079294a Mon Sep 17 00:00:00 2001 From: f-trycua Date: Wed, 30 Apr 2025 15:21:10 -0700 Subject: [PATCH 27/38] Add background service docs --- README.md | 18 ++++++++++-------- libs/lume/README.md | 8 ++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 53102fcb..ae6254a8 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,13 @@ If you only need the virtualization capabilities: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` +Optionally, if you don't want Lume to run as a background service: +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh) --no-background-service" +``` + +**Note:** If you choose this option, you'll need to manually start the Lume API service whenever needed by running `lume serve` in your terminal. This applies to Option 2 after completing step 1. + For Lume usage instructions, refer to the [Lume documentation](./libs/lume/README.md). ### Option 2: Full Computer-Use Agent Capabilities @@ -62,17 +69,12 @@ If you want to use AI agents with virtualized environments: lume pull macos-sequoia-cua:latest ``` -3. Start Lume daemon service: - ```bash - lume serve - ``` - -4. Install the Python libraries: +3. 
Install the Python libraries: ```bash pip install cua-computer cua-agent[all] ``` -5. Use the libraries in your Python code: +4. Use the libraries in your Python code: ```python from computer import Computer from agent import ComputerAgent, LLM, AgentLoop, LLMProvider @@ -95,7 +97,7 @@ If you want to use AI agents with virtualized environments: Explore the [Agent Notebook](./notebooks/) for a ready-to-run example. -6. Optionally, you can use the Agent with a Gradio UI: +5. Optionally, you can use the Agent with a Gradio UI: ```python from utils import load_dotenv_files diff --git a/libs/lume/README.md b/libs/lume/README.md index 3d9c0524..b7112b07 100644 --- a/libs/lume/README.md +++ b/libs/lume/README.md @@ -147,6 +147,14 @@ Install with a single command: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` +By default, Lume is installed as a background service that starts automatically on login. If you prefer to start the Lume API service manually when needed, you can use the `--no-background-service` option: + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh) --no-background-service" +``` + +**Note:** With this option, you'll need to manually start the Lume API service by running `lume serve` in your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the Computer-Use Agent). + You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/lume/releases), extract it, and install the package manually. ## Prebuilt Images From c4f9da50079f97fa58809816c2f084cc2754e46a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 16:38:40 -0700 Subject: [PATCH 28/38] added mcp development guide, and vision output to tool calls --- libs/mcp-server/README.md | 35 +++++++++++ libs/mcp-server/mcp_server/server.py | 93 +++++++++++++++++++--------- 2 files changed, 99 insertions(+), 29 deletions(-) diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index a4307616..5649cc19 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -75,6 +75,41 @@ You can then use the script in your MCP configuration like this: } ``` +## Development Guide + +If you want to develop with the cua-mcp-server directly without installation, you can use this configuration: + +```json +{ + "mcpServers": { + "cua-agent": { + "command": "/Users/YOURUSERNAME/cua/.venv/bin/python", + "args": ["-m", "mcp_server.server"], + "env": { + "PYTHONPATH": "/Users/YOURUSERNAME/cua/libs/mcp-server:/Users/YOURUSERNAME/cua/libs/agent:/Users/YOURUSERNAME/cua/libs/computer:/Users/YOURUSERNAME/cua/libs/core:/Users/YOURUSERNAME/cua/libs/pylume", + "CUA_AGENT_LOOP": "UITARS", + "CUA_MODEL_PROVIDER": "OAICOMPAT", + "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", + "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1" + } + } + } +} +``` + +To see the logs: +``` +tail -n 20 -f ~/Library/Logs/Claude/mcp*.log +``` + +This configuration: +- Uses your local Python virtual environment to run the server module directly +- Sets the Python path to include all necessary library dependencies +- Works with Claude Desktop, Cursor, or any other MCP client +- Automatically uses your development code without requiring installation + +Just add this to your MCP client's configuration and it will use your local development version of the server. 
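Before wiring the development tree into an MCP client, a quick import smoke test can confirm the paths resolve. This is a minimal, hypothetical sketch (it assumes the `PYTHONPATH` entries from the sample config are visible to the interpreter and the dependencies are installed in the referenced virtual environment):

```python
# Hypothetical smoke test for the development setup; not part of the official docs.
from mcp_server.server import serve

server = serve()              # builds the FastMCP server and registers the CUA tools
print(type(server).__name__)  # expect "FastMCP" if all imports resolved
```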
+ ## Claude Desktop Integration To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`): diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index f6692f9a..f5fb37b2 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -1,9 +1,10 @@ import asyncio +import base64 import logging import os import sys import traceback -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Tuple # Configure logging to output to stderr for debug visibility logging.basicConfig( @@ -17,7 +18,7 @@ logger = logging.getLogger("mcp-server") logger.debug("MCP Server module loading...") try: - from mcp.server.fastmcp import Context, FastMCP + from mcp.server.fastmcp import Context, FastMCP, Image logger.debug("Successfully imported FastMCP") except ImportError as e: @@ -49,7 +50,28 @@ def serve() -> FastMCP: server = FastMCP("cua-agent") @server.tool() - async def run_cua_task(ctx: Context, task: str) -> str: + async def screenshot_cua(ctx: Context) -> Image: + """ + Take a screenshot of the current screen and return the image. + + Args: + ctx: The MCP context + + Returns: + An image resource containing the screenshot + """ + global global_computer + if global_computer is None: + global_computer = Computer(verbosity=logging.INFO) + await global_computer.run() + screenshot = await global_computer.interface.screenshot() + return Image( + format="png", + data=screenshot + ) + + @server.tool() + async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]: """ Run a Computer-Use Agent (CUA) task and return the results. @@ -58,7 +80,7 @@ def serve() -> FastMCP: task: The instruction or task for the agent to perform Returns: - A string containing the agent's response + A tuple containing the agent's response and the final screenshot """ global global_computer @@ -76,6 +98,8 @@ def serve() -> FastMCP: loop = AgentLoop.OPENAI elif loop_str == "ANTHROPIC": loop = AgentLoop.ANTHROPIC + elif loop_str == "UITARS": + loop = AgentLoop.UITARS else: loop = AgentLoop.OMNI @@ -107,33 +131,34 @@ def serve() -> FastMCP: full_result = "" async for result in agent.run(task): logger.info(f"Agent step complete: {result.get('id', 'unknown')}") + ctx.info(f"Agent step complete: {result.get('id', 'unknown')}") # Add response ID to output full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n" - - # Extract and concatenate text responses - if "text" in result: - # Handle both string and dict responses - text_response = result.get("text", "") - if isinstance(text_response, str): - full_result += f"Response: {text_response}\n" - else: - # If it's a dict or other structure, convert to string representation - full_result += f"Response: {str(text_response)}\n" - - # Log detailed information - if "tools" in result: - tools_info = result.get("tools") - logger.debug(f"Tools used: {tools_info}") - full_result += f"\nTools used: {tools_info}\n" + + if "content" in result: + full_result += f"Response: {result.get('content', '')}\n" # Process output if available outputs = result.get("output", []) for output in outputs: output_type = output.get("type") - if output_type == "reasoning": + if output_type == "message": + logger.debug(f"Message: {output}") + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + full_result += f"\nMessage: {content_part.get('text', '')}\n" 
+ elif output_type == "reasoning": logger.debug(f"Reasoning: {output}") - full_result += f"\nReasoning: {output.get('content', '')}\n" + + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("text"): + full_result += f"\nReasoning: {summary_part.get('text', '')}\n" + else: + full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n" elif output_type == "computer_call": logger.debug(f"Computer call: {output}") action = output.get("action", "") @@ -144,15 +169,23 @@ def serve() -> FastMCP: full_result += "\n" + "-" * 40 + "\n" logger.info(f"CUA task completed successfully") - return full_result or "Task completed with no text output." + ctx.info(f"CUA task completed successfully") + return ( + full_result or "Task completed with no text output.", + Image( + format="png", + data=await global_computer.interface.screenshot() + ) + ) except Exception as e: error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}" logger.error(error_msg) + ctx.error(error_msg) return f"Error during task execution: {str(e)}" @server.tool() - async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str: + async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List: """ Run multiple CUA tasks in sequence and return the combined results. @@ -164,13 +197,15 @@ def serve() -> FastMCP: Combined results from all tasks """ results = [] - for i, task in enumerate(tasks): logger.info(f"Running task {i+1}/{len(tasks)}: {task}") - result = await run_cua_task(ctx, task) - results.append(f"Task {i+1}: {task}\nResult: {result}\n") - - return "\n".join(results) + ctx.info(f"Running task {i+1}/{len(tasks)}: {task}") + + ctx.report_progress(i / len(tasks)) + results.extend(await run_cua_task(ctx, task)) + ctx.report_progress((i + 1) / len(tasks)) + + return results return server From 60bcb0716cfa720084faafb1d0c9b1f716a6945c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 17:05:49 -0700 Subject: [PATCH 29/38] improved mcp prompting --- libs/mcp-server/mcp_server/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index f5fb37b2..67f7fe12 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -52,7 +52,7 @@ def serve() -> FastMCP: @server.tool() async def screenshot_cua(ctx: Context) -> Image: """ - Take a screenshot of the current screen and return the image. + Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state. Args: ctx: The MCP context @@ -73,7 +73,7 @@ def serve() -> FastMCP: @server.tool() async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]: """ - Run a Computer-Use Agent (CUA) task and return the results. + Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results. Args: ctx: The MCP context @@ -187,7 +187,7 @@ def serve() -> FastMCP: @server.tool() async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List: """ - Run multiple CUA tasks in sequence and return the combined results. + Run multiple CUA tasks in a MacOS VM in sequence and return the combined results. 
Args: ctx: The MCP context From cf7d05421239d403da08e7227e724d3945daf58e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 11:58:30 -0700 Subject: [PATCH 30/38] added standardized logging for easy parsing of trajectories --- examples/video_maker_traj.py | 360 +++++++++++-------- libs/agent/agent/providers/anthropic/loop.py | 2 + libs/agent/agent/providers/omni/loop.py | 2 + libs/agent/agent/providers/openai/loop.py | 4 + libs/agent/agent/providers/uitars/loop.py | 4 +- 5 files changed, 231 insertions(+), 141 deletions(-) diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py index 34a5ad3c..b9966aa0 100644 --- a/examples/video_maker_traj.py +++ b/examples/video_maker_traj.py @@ -68,82 +68,148 @@ def load_cursor_images(): last_known_cursor_position = None last_known_thought = None -def extract_thought_from_api_response(filename): - """Extract thought from API response for the current frame.""" +def parse_agent_response(filename_or_turn_dir): + """Parse agent response JSON file to extract text, actions, and cursor positions.""" + + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir + else: + turn_dir = os.path.dirname(filename_or_turn_dir) + + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] + + result = { + "text": [], + "actions": [], + "cursor_positions": [] + } + + for agent_file in agent_response_files: + try: + with open(os.path.join(turn_dir, agent_file), 'r') as f: + data = json.load(f) + response_data = data.get('response', {}) + + # First check for content field (simple text response) + if response_data.get("content"): + result["text"].append(response_data.get("content", "")) + + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + output_type = output.get("type") + + if output_type == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + result["text"].append(content_part.get("text", "")) + + elif output_type == "reasoning": + # Handle reasoning (thought) content + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + result["text"].append(summary_part.get("text", "")) + else: + summary_text = output.get("text", "") + if summary_text: + result["text"].append(summary_text) + + elif output_type == "computer_call": + action = output.get("action", {}) + if action: + result["actions"].append(action) + # Extract cursor position if available + if action.get("x") is not None and action.get("y") is not None: + result["cursor_positions"].append((action.get("x"), action.get("y"))) + except Exception as e: + print(f"Error processing {agent_file}: {e}") + + return result + +def extract_thought_from_agent_response(filename_or_turn_dir): + """Extract thought from agent response for the current frame.""" global last_known_thought - turn_dir = os.path.dirname(filename) - api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + agent_response = parse_agent_response(filename_or_turn_dir) - for api_file in api_response_files: - try: - with open(os.path.join(turn_dir, api_file), 'r') as f: - data = json.load(f) - # Extract content from response - content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') - - # 
Extract the Thought section - thought_match = re.search(r"Thought: (.*?)(?:\nAction:|$)", content, re.DOTALL) - if thought_match: - thought = thought_match.group(1).strip() - if thought: - last_known_thought = thought - return thought - except (json.JSONDecodeError, FileNotFoundError, KeyError): - pass + if agent_response["text"]: + # Use the first text entry as the thought + last_known_thought = agent_response["text"][0] + return last_known_thought # Return the last known thought if no new thought is found return last_known_thought -def extract_cursor_position_from_filename(filename): - """Extract cursor position from a filename containing click info.""" +def extract_cursor_position_from_agent_response(filename_or_turn_dir): + """Extract cursor position from agent response.""" global last_known_cursor_position - # For 'screenshot_NNN_click_TIMESTAMP.png', try to extract coordinates - match = re.search(r'click_(\d+)_(\d+)_\d+\.png$', filename) - if match: - position = (int(match.group(1)), int(match.group(2))) - last_known_cursor_position = position - return position + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir + else: + turn_dir = os.path.dirname(filename_or_turn_dir) - # Check if we have position info from API response - turn_dir = os.path.dirname(filename) - api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - for api_file in api_response_files: + for agent_file in agent_response_files: try: - with open(os.path.join(turn_dir, api_file), 'r') as f: + with open(os.path.join(turn_dir, agent_file), 'r') as f: data = json.load(f) - # Extract action from response - content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') - # Look for coordinates in the action - # First try the pattern from the example: click(start_box='(28,15)') - coord_match = re.search(r"click\(start_box='\((\d+),(\d+)\)'\)", content) - if coord_match: - position = (int(coord_match.group(1)), int(coord_match.group(2))) - last_known_cursor_position = position - return position + response_data = data.get('response', {}) - # Try alternative pattern: click(start_box='<|box_start|>(x,y)<|box_end|>') - alt_match = re.search(r"click\(start_box='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)", content) - if alt_match: - position = (int(alt_match.group(1)), int(alt_match.group(2))) - last_known_cursor_position = position - return position - except (json.JSONDecodeError, FileNotFoundError, KeyError): - pass + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + if output.get("type") == "computer_call": + action = output.get("action", {}) + if action.get("x") is not None and action.get("y") is not None: + position = (action.get("x"), action.get("y")) + last_known_cursor_position = position + return position + except Exception as e: + print(f"Error processing {agent_file}: {e}") - # No new position found, return the last known position + # No position found in agent response, return the last known position return last_known_cursor_position -def extract_action_from_filename(filename): - """Determine the action type from the filename pattern.""" - if 'click' in filename: - return "clicking" - elif 'type' in filename: - return "typing" +def 
extract_action_from_agent_response(filename_or_turn_dir): + """Determine the action type from agent response.""" + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir else: - return "normal" + turn_dir = os.path.dirname(filename_or_turn_dir) + + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] + + for agent_file in agent_response_files: + try: + with open(os.path.join(turn_dir, agent_file), 'r') as f: + data = json.load(f) + response_data = data.get('response', {}) + + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + if output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type == "click": + return "clicking" + elif action_type == "type" or action_type == "input": + return "typing" + except Exception as e: + print(f"Error processing {agent_file}: {e}") + + return "normal" def create_animated_vignette(image, frame_index): """ @@ -451,58 +517,54 @@ def create_cursor_overlay(base_image, position, cursor_images, thought_text=None return result -def get_screenshot_files(trajectory_dir): +def get_turns(trajectory_dir): """ - Get all screenshot files from a trajectory directory, sorted by sequence number. + Get all turn folders from a trajectory directory and their corresponding files. Args: - trajectory_dir: Path to trajectory directory containing turn_XXX folders + trajectory_dir: Path to trajectory directory Returns: - List of tuples (path, sequence_number, action_type, position) + List of tuples (turn_dir, agent_response_path, image_file_path) """ - screenshot_files = [] + turns = [] # List all turn directories in order turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], - key=lambda x: int(x.split('_')[1])) + key=lambda x: int(x.split('_')[1])) - for turn_dir in turn_dirs: - turn_path = os.path.join(trajectory_dir, turn_dir) + for turn_dir_name in turn_dirs: + turn_path = os.path.join(trajectory_dir, turn_dir_name) if not os.path.isdir(turn_path): continue - - # Get all screenshot files in this turn - files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] - for file in files: - file_path = os.path.join(turn_path, file) - - # Extract sequence number from filename (e.g., screenshot_003_...) 
- seq_match = re.search(r'screenshot_(\d+)', file) - if seq_match: - seq_number = int(seq_match.group(1)) - - # Determine action type from filename - action_type = extract_action_from_filename(file) - - # Get cursor position if available - position = extract_cursor_position_from_filename(file_path) - - screenshot_files.append((file_path, seq_number, action_type, position)) + # Find agent response files (if any) + agent_response_files = [f for f in os.listdir(turn_path) if f.endswith('_agent_response.json')] + agent_response_path = None + if agent_response_files: + agent_response_path = os.path.join(turn_path, agent_response_files[0]) + + # Find screenshot files (if any) + screenshot_files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] + screenshot_path = None + if screenshot_files: + # Sort by sequence number to get the main one + sorted_screenshots = sorted(screenshot_files, + key=lambda x: int(re.search(r'screenshot_(\d+)', x).group(1) + if re.search(r'screenshot_(\d+)', x) else 0)) + screenshot_path = os.path.join(turn_path, sorted_screenshots[0]) if sorted_screenshots else None + + turns.append((turn_path, agent_response_path, screenshot_path)) - # Sort by sequence number - screenshot_files.sort(key=lambda x: x[1]) - - return screenshot_files + return turns def process_trajectory(trajectory_dir, output_dir, cursors): """Process a trajectory directory and create output frames.""" - # Get all screenshot files - screenshot_files = get_screenshot_files(trajectory_dir) + # Get all turns with their associated files + turns = get_turns(trajectory_dir) - if not screenshot_files: - print(f"No screenshot files found in {trajectory_dir}") + if not turns: + print(f"No turn directories found in {trajectory_dir}") return # Create output directory @@ -511,20 +573,27 @@ def process_trajectory(trajectory_dir, output_dir, cursors): # Track frame index frame_index = 0 - # Process each screenshot + # Process each turn prev_img = None prev_cursor_pos = None - for i, (file_path, seq_number, action_type, position) in enumerate(tqdm(screenshot_files, desc="Processing frames")): + for turn_path, agent_response_path, screenshot_path in tqdm(turns, desc="Processing turns"): + if not screenshot_path: + continue # Skip turns without screenshots + # Load the current image try: - current_img = Image.open(file_path) + current_img = Image.open(screenshot_path) except Exception as e: - print(f"Error loading image {file_path}: {e}") + print(f"Error loading image {screenshot_path}: {e}") continue - # Current cursor position - current_cursor_pos = position + # Extract action and position from agent response + action_type = extract_action_from_agent_response(turn_path) + current_cursor_pos = extract_cursor_position_from_agent_response(turn_path) + + # Extract thought from agent response + current_thought = extract_thought_from_agent_response(turn_path) # Check if the current frame has an action (click/typing) is_action_frame = action_type in ["clicking", "typing"] @@ -535,9 +604,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): half_frames = FRAMES_PER_CLICK // 2 # First half of animation uses PREVIOUS image for j in range(half_frames): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( prev_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -552,9 +618,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): # Second half uses CURRENT image for j 
in range(half_frames, FRAMES_PER_CLICK): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -569,9 +632,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): else: # If no previous frame, use current for full animation for j in range(FRAMES_PER_CLICK): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -585,9 +645,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): frame_index += 1 else: # Regular frame with normal cursor - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -599,42 +656,43 @@ def process_trajectory(trajectory_dir, output_dir, cursors): output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) frame_index += 1 - # Add position interpolation frames if we're not at the last frame - if i < len(screenshot_files) - 1: - # Get next position - next_cursor_pos = screenshot_files[i+1][3] - - # Only interpolate if both positions are valid and different - if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: - for j in range(1, FRAMES_PER_MOVE): - progress = j / FRAMES_PER_MOVE - interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress - interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress - interp_pos = (int(interp_x), int(interp_y)) - - # Create interpolated movement frame - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - - output_img = create_cursor_overlay( - current_img, interp_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Save current frame as previous for next iteration + # Store current frame as previous for next iteration prev_img = current_img prev_cursor_pos = current_cursor_pos + + # Add position interpolation frames if we have both current and next turn data + current_turn_index = turns.index((turn_path, agent_response_path, screenshot_path)) + if current_turn_index < len(turns) - 1: + # Get next turn data + next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] + if next_screenshot_path: # Only if next turn has a screenshot + # Get next position + next_cursor_pos = extract_cursor_position_from_agent_response(next_turn_path) + + # Only interpolate if both positions are valid and different + if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: + for j in range(1, FRAMES_PER_MOVE): + progress = j / FRAMES_PER_MOVE + interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress + interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress + interp_pos = (int(interp_x), int(interp_y)) + + # Create interpolated movement frame + output_img = 
create_cursor_overlay( + current_img, interp_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 def main(): """Main function to process the trajectory and create video frames.""" parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') - parser.add_argument('trajectory_dir', type=str, help='Path to the trajectory folder') + parser.add_argument('trajectory_dir', type=str, nargs='?', help='Path to the trajectory folder') parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') @@ -642,6 +700,28 @@ def main(): args = parser.parse_args() trajectory_dir = args.trajectory_dir + + # If trajectory_dir is not provided, find the latest folder in './trajectories' + if trajectory_dir is None: + trajectories_base_dir = "./trajectories" + if os.path.exists(trajectories_base_dir) and os.path.isdir(trajectories_base_dir): + # Get all directories in the trajectories folder + trajectory_folders = [os.path.join(trajectories_base_dir, d) for d in os.listdir(trajectories_base_dir) + if os.path.isdir(os.path.join(trajectories_base_dir, d))] + + if trajectory_folders: + # Sort folders by modification time, most recent last + trajectory_folders.sort(key=lambda x: os.path.getmtime(x)) + # Use the most recent folder + trajectory_dir = trajectory_folders[-1] + print(f"No trajectory directory specified, using latest: {trajectory_dir}") + else: + print(f"No trajectory folders found in {trajectories_base_dir}") + return + else: + print(f"Trajectories directory {trajectories_base_dir} does not exist") + return + output_dir = args.output_dir fps = args.fps output_video = args.output_video diff --git a/libs/agent/agent/providers/anthropic/loop.py b/libs/agent/agent/providers/anthropic/loop.py index 0ccdc79a..130a43cb 100644 --- a/libs/agent/agent/providers/anthropic/loop.py +++ b/libs/agent/agent/providers/anthropic/loop.py @@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop): messages, model=self.model, ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=openai_compatible_response) await queue.put(openai_compatible_response) if not should_continue: diff --git a/libs/agent/agent/providers/omni/loop.py b/libs/agent/agent/providers/omni/loop.py index b53c120c..18e0375f 100644 --- a/libs/agent/agent/providers/omni/loop.py +++ b/libs/agent/agent/providers/omni/loop.py @@ -670,6 +670,8 @@ class OmniLoop(BaseLoop): parsed_screen=parsed_screen, parser=self.parser ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=openai_compatible_response) # Yield the response to the caller yield openai_compatible_response diff --git a/libs/agent/agent/providers/openai/loop.py b/libs/agent/agent/providers/openai/loop.py index 8e507a1b..c4e0dfb5 100644 --- a/libs/agent/agent/providers/openai/loop.py +++ b/libs/agent/agent/providers/openai/loop.py @@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop): ) # Don't reset last_response_id to None - keep the previous value if available + + # Log standardized 
response for ease of parsing + # Since this is the openAI responses format, we don't need to convert it to agent response format + self._log_api_call("agent_response", request=None, response=response) # Process API response await queue.put(response) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index a30d3bee..ac14ed1e 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -440,7 +440,7 @@ class UITARSLoop(BaseLoop): # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD ########################################### - async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]: + async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]: """Run the agent loop with provided messages. Args: @@ -513,6 +513,8 @@ class UITARSLoop(BaseLoop): messages, model=self.model, ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=agent_response) yield agent_response # Check if we should continue this conversation From f449005751d5ac699a1b050e9f5bb45398bbca47 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 12:07:43 -0700 Subject: [PATCH 31/38] code cleanup --- examples/video_maker_traj.py | 123 ++++++++++------------------------- 1 file changed, 34 insertions(+), 89 deletions(-) diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py index b9966aa0..552969b6 100644 --- a/examples/video_maker_traj.py +++ b/examples/video_maker_traj.py @@ -69,7 +69,8 @@ last_known_cursor_position = None last_known_thought = None def parse_agent_response(filename_or_turn_dir): - """Parse agent response JSON file to extract text, actions, and cursor positions.""" + """Parse agent response JSON file to extract text, actions, cursor positions, thought, and action type.""" + global last_known_cursor_position, last_known_thought # Check if we're getting a filename or turn directory if os.path.isdir(filename_or_turn_dir): @@ -83,7 +84,9 @@ def parse_agent_response(filename_or_turn_dir): result = { "text": [], "actions": [], - "cursor_positions": [] + "cursor_positions": [], + "thought": None, + "action_type": "normal" } for agent_file in agent_response_files: @@ -125,92 +128,32 @@ def parse_agent_response(filename_or_turn_dir): result["actions"].append(action) # Extract cursor position if available if action.get("x") is not None and action.get("y") is not None: - result["cursor_positions"].append((action.get("x"), action.get("y"))) + position = (action.get("x"), action.get("y")) + result["cursor_positions"].append(position) + last_known_cursor_position = position + + # Determine action type + action_type = action.get("type", "") + if action_type == "click": + result["action_type"] = "clicking" + elif action_type == "type" or action_type == "input": + result["action_type"] = "typing" except Exception as e: print(f"Error processing {agent_file}: {e}") + # Set thought from text if available + if result["text"]: + result["thought"] = ' '.join(result["text"]) + last_known_thought = result["thought"] + else: + result["thought"] = last_known_thought + + # Set cursor position if not found + if not result["cursor_positions"]: + result["cursor_positions"] = [last_known_cursor_position] if last_known_cursor_position else [] + return result -def extract_thought_from_agent_response(filename_or_turn_dir): - """Extract thought from agent response for the current frame.""" - global 
last_known_thought - - agent_response = parse_agent_response(filename_or_turn_dir) - - if agent_response["text"]: - # Use the first text entry as the thought - last_known_thought = agent_response["text"][0] - return last_known_thought - - # Return the last known thought if no new thought is found - return last_known_thought - -def extract_cursor_position_from_agent_response(filename_or_turn_dir): - """Extract cursor position from agent response.""" - global last_known_cursor_position - - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - if output.get("type") == "computer_call": - action = output.get("action", {}) - if action.get("x") is not None and action.get("y") is not None: - position = (action.get("x"), action.get("y")) - last_known_cursor_position = position - return position - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - # No position found in agent response, return the last known position - return last_known_cursor_position - -def extract_action_from_agent_response(filename_or_turn_dir): - """Determine the action type from agent response.""" - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - if output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - if action_type == "click": - return "clicking" - elif action_type == "type" or action_type == "input": - return "typing" - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - return "normal" - def create_animated_vignette(image, frame_index): """ Create an animated purple/blue gradient vignette effect around the border of the image. 
@@ -588,12 +531,13 @@ def process_trajectory(trajectory_dir, output_dir, cursors): print(f"Error loading image {screenshot_path}: {e}") continue - # Extract action and position from agent response - action_type = extract_action_from_agent_response(turn_path) - current_cursor_pos = extract_cursor_position_from_agent_response(turn_path) + # Parse agent response + agent_response = parse_agent_response(turn_path) - # Extract thought from agent response - current_thought = extract_thought_from_agent_response(turn_path) + # Extract action type, cursor position, and thought + action_type = agent_response["action_type"] + current_cursor_pos = agent_response["cursor_positions"][0] if agent_response["cursor_positions"] else None + current_thought = agent_response["thought"] # Check if the current frame has an action (click/typing) is_action_frame = action_type in ["clicking", "typing"] @@ -667,7 +611,8 @@ def process_trajectory(trajectory_dir, output_dir, cursors): next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] if next_screenshot_path: # Only if next turn has a screenshot # Get next position - next_cursor_pos = extract_cursor_position_from_agent_response(next_turn_path) + next_agent_response = parse_agent_response(next_turn_path) + next_cursor_pos = next_agent_response["cursor_positions"][0] if next_agent_response["cursor_positions"] else None # Only interpolate if both positions are valid and different if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: From f2e390ba553f9e4b85515b6aa8e7d43090deca5d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 14:30:14 -0700 Subject: [PATCH 32/38] added env variable for CUA_PROVIDER_API_KEY --- libs/mcp-server/README.md | 5 +++-- libs/mcp-server/mcp_server/server.py | 13 +++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index 5649cc19..86430e95 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -68,7 +68,7 @@ You can then use the script in your MCP configuration like this: "CUA_AGENT_LOOP": "OMNI", "CUA_MODEL_PROVIDER": "ANTHROPIC", "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219", - "ANTHROPIC_API_KEY": "your-api-key" + "CUA_PROVIDER_API_KEY": "your-api-key" } } } @@ -90,7 +90,8 @@ If you want to develop with the cua-mcp-server directly without installation, yo "CUA_AGENT_LOOP": "UITARS", "CUA_MODEL_PROVIDER": "OAICOMPAT", "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", - "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1" + "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1", + "CUA_PROVIDER_API_KEY": "your-api-key" } } } diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index 67f7fe12..03971cb6 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -94,14 +94,7 @@ def serve() -> FastMCP: # Determine which loop to use loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI") - if loop_str == "OPENAI": - loop = AgentLoop.OPENAI - elif loop_str == "ANTHROPIC": - loop = AgentLoop.ANTHROPIC - elif loop_str == "UITARS": - loop = AgentLoop.UITARS - else: - loop = AgentLoop.OMNI + loop = getattr(AgentLoop, loop_str) # Determine provider provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC") @@ -113,6 +106,9 @@ def serve() -> FastMCP: # Get base URL for provider (if needed) 
provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None) + # Get api key for provider (if needed) + api_key = os.getenv("CUA_PROVIDER_API_KEY", None) + # Create agent with the specified configuration agent = ComputerAgent( computer=global_computer, @@ -122,6 +118,7 @@ def serve() -> FastMCP: name=model_name, provider_base_url=provider_base_url, ), + api_key=api_key, save_trajectory=False, only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")), verbosity=logging.INFO, From 3ec479368b6af825eeff9a5b35f797fdb6b762da Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 15:04:20 -0700 Subject: [PATCH 33/38] changed dev guide to use a .sh script --- libs/mcp-server/README.md | 22 +++++++++++---------- libs/mcp-server/scripts/start_mcp_server.sh | 14 +++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) create mode 100755 libs/mcp-server/scripts/start_mcp_server.sh diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index 86430e95..736ab364 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -83,10 +83,9 @@ If you want to develop with the cua-mcp-server directly without installation, yo { "mcpServers": { "cua-agent": { - "command": "/Users/YOURUSERNAME/cua/.venv/bin/python", - "args": ["-m", "mcp_server.server"], + "command": "/bin/bash", + "args": ["~/cua/libs/mcp-server/scripts/start_mcp_server.sh"], "env": { - "PYTHONPATH": "/Users/YOURUSERNAME/cua/libs/mcp-server:/Users/YOURUSERNAME/cua/libs/agent:/Users/YOURUSERNAME/cua/libs/computer:/Users/YOURUSERNAME/cua/libs/core:/Users/YOURUSERNAME/cua/libs/pylume", "CUA_AGENT_LOOP": "UITARS", "CUA_MODEL_PROVIDER": "OAICOMPAT", "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", @@ -98,19 +97,22 @@ If you want to develop with the cua-mcp-server directly without installation, yo } ``` -To see the logs: -``` -tail -n 20 -f ~/Library/Logs/Claude/mcp*.log -``` - This configuration: -- Uses your local Python virtual environment to run the server module directly -- Sets the Python path to include all necessary library dependencies +- Uses the start_mcp_server.sh script which automatically sets up the Python path and runs the server module - Works with Claude Desktop, Cursor, or any other MCP client - Automatically uses your development code without requiring installation Just add this to your MCP client's configuration and it will use your local development version of the server. +### Troubleshooting + +If you get a `/bin/bash: ~/cua/libs/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative. + +To see the logs: +``` +tail -n 20 -f ~/Library/Logs/Claude/mcp*.log +``` + ## Claude Desktop Integration To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`): diff --git a/libs/mcp-server/scripts/start_mcp_server.sh b/libs/mcp-server/scripts/start_mcp_server.sh new file mode 100755 index 00000000..17fd9dab --- /dev/null +++ b/libs/mcp-server/scripts/start_mcp_server.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +# Set the CUA repository path based on script location +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CUA_REPO_DIR="$( cd "$SCRIPT_DIR/../../.." 
&> /dev/null && pwd )" +PYTHON_PATH="${CUA_REPO_DIR}/.venv/bin/python" + +# Set Python path to include all necessary libraries +export PYTHONPATH="${CUA_REPO_DIR}/libs/mcp-server:${CUA_REPO_DIR}/libs/agent:${CUA_REPO_DIR}/libs/computer:${CUA_REPO_DIR}/libs/core:${CUA_REPO_DIR}/libs/pylume" + +# Run the MCP server directly as a module +$PYTHON_PATH -m mcp_server.server \ No newline at end of file From 9a00c510740ee42d3a7a8d24efe16ca65791ae9a Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:34:05 -0700 Subject: [PATCH 34/38] Fix model name param --- libs/agent/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/README.md b/libs/agent/README.md index e5dad869..cdcdb8f7 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -50,10 +50,10 @@ async with Computer() as macos_computer: # model=LLM(provider=LLMProvider.ANTHROPIC) # or # loop=AgentLoop.OMNI, - # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3") + # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") # or # loop=AgentLoop.UITARS, - # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + # model=LLM(provider=LLMProvider.OAICOMPAT, name="name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") ) tasks = [ From 6072d59cd93aa1385298cc449d723452a00ce2fc Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:34:49 -0700 Subject: [PATCH 35/38] Update README.md --- libs/agent/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/agent/README.md b/libs/agent/README.md index cdcdb8f7..c1aac96c 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -53,7 +53,7 @@ async with Computer() as macos_computer: # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") # or # loop=AgentLoop.UITARS, - # model=LLM(provider=LLMProvider.OAICOMPAT, name="name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") ) tasks = [ From aa6bbdf3a89ea4b8fdb7af6fde86cf122e85c2f4 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:36:10 -0700 Subject: [PATCH 36/38] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 45a97758..8eb12022 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ If you want to use AI agents with virtualized environments: async with Computer(verbosity=logging.DEBUG) as macos_computer: agent = ComputerAgent( computer=macos_computer, - loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI + loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.UITARS, or AgentLoop.OMNI model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC) ) From e9a9c03b637916eb307af0e7e3c8a7113f723449 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 18:27:54 -0700 Subject: [PATCH 37/38] Fix handleStop --- libs/lume/src/Server/Server.swift | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/libs/lume/src/Server/Server.swift b/libs/lume/src/Server/Server.swift index 98ffc588..782efa70 100644 --- a/libs/lume/src/Server/Server.swift +++ b/libs/lume/src/Server/Server.swift @@ -179,8 +179,21 @@ 
final class Server { return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") } - // Extract storage from query params if present - let storage = self.extractQueryParam(request: request, name: "storage") + Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path]) + + // Extract storage from the request body + var storage: String? = nil + if let bodyData = request.body, !bodyData.isEmpty { + do { + if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any], + let bodyStorage = json["storage"] as? String { + storage = bodyStorage + Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage]) + } + } catch { + Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription]) + } + } return try await self.handleStopVM(name: name, storage: storage) }), From d55f566aa19491f3047565728ee227cfa7f2cdeb Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 20:29:56 -0700 Subject: [PATCH 38/38] delete video maker python script in favor of video player react component --- examples/video_maker_traj.py | 717 ----------------------------------- 1 file changed, 717 deletions(-) delete mode 100644 examples/video_maker_traj.py diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py deleted file mode 100644 index 552969b6..00000000 --- a/examples/video_maker_traj.py +++ /dev/null @@ -1,717 +0,0 @@ -""" -Video Maker for Trajectory Dataset - -This script processes a trajectory dataset folder, extracts frames, -and creates an animated video with cursor overlays. -""" - -from utils import load_dotenv_files -load_dotenv_files() - -import os -import json -import math -import shutil -import re -from pathlib import Path -import argparse -import numpy as np -from PIL import Image, ImageDraw, ImageFilter -import requests -from io import BytesIO -from tqdm import tqdm - -# Constants -CURSOR_SCALE = 2 # Scale factor for cursor size -FRAMES_PER_CLICK = 8 # Number of frames to show for click animation -FRAMES_PER_MOVE = 10 # Number of frames to interpolate between cursor positions -CURSOR_NORMAL = "https://mac-cursors.netlify.app/png/default@2x.png" -CURSOR_CLICKING = "https://mac-cursors.netlify.app/png/handpointing@2x.png" -CURSOR_TYPING = "https://mac-cursors.netlify.app/png/textcursor@2x.png" -CURSOR_HOTSPOT = (20, 15) -OUTPUT_DIR = "examples/output/video_frames" - -# Vignette effect constants -VIGNETTE_WIDTH = 10 # Width of the vignette border in pixels -VIGNETTE_COLORS = [(128, 0, 255), (0, 0, 255)] # Purple to Blue gradient colors -VIGNETTE_ANIMATION_SPEED = 0.1 # Controls speed of the animation pulse - -def download_image(url): - """Download an image from a URL.""" - response = requests.get(url) - return Image.open(BytesIO(response.content)) - -def load_cursor_images(): - """Load and resize cursor images.""" - cursor_normal = download_image(CURSOR_NORMAL) - cursor_clicking = download_image(CURSOR_CLICKING) - cursor_typing = download_image(CURSOR_TYPING) - - # Resize all cursors based on CURSOR_SCALE - width_normal, height_normal = cursor_normal.size - width_clicking, height_clicking = cursor_clicking.size - width_typing, height_typing = cursor_typing.size - - cursor_normal = cursor_normal.resize((int(width_normal * CURSOR_SCALE), int(height_normal * CURSOR_SCALE))) - cursor_clicking = cursor_clicking.resize((int(width_clicking * CURSOR_SCALE), int(height_clicking * CURSOR_SCALE))) - cursor_typing = cursor_typing.resize((int(width_typing * 
CURSOR_SCALE), int(height_typing * CURSOR_SCALE))) - - cursors = { - "normal": cursor_normal, - "clicking": cursor_clicking, - "typing": cursor_typing - } - - return cursors - -# Store the last known cursor position and thought across all frames -last_known_cursor_position = None -last_known_thought = None - -def parse_agent_response(filename_or_turn_dir): - """Parse agent response JSON file to extract text, actions, cursor positions, thought, and action type.""" - global last_known_cursor_position, last_known_thought - - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - result = { - "text": [], - "actions": [], - "cursor_positions": [], - "thought": None, - "action_type": "normal" - } - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # First check for content field (simple text response) - if response_data.get("content"): - result["text"].append(response_data.get("content", "")) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - output_type = output.get("type") - - if output_type == "message": - content = output.get("content", []) - for content_part in content: - if content_part.get("text"): - result["text"].append(content_part.get("text", "")) - - elif output_type == "reasoning": - # Handle reasoning (thought) content - summary_content = output.get("summary", []) - if summary_content: - for summary_part in summary_content: - if summary_part.get("type") == "summary_text": - result["text"].append(summary_part.get("text", "")) - else: - summary_text = output.get("text", "") - if summary_text: - result["text"].append(summary_text) - - elif output_type == "computer_call": - action = output.get("action", {}) - if action: - result["actions"].append(action) - # Extract cursor position if available - if action.get("x") is not None and action.get("y") is not None: - position = (action.get("x"), action.get("y")) - result["cursor_positions"].append(position) - last_known_cursor_position = position - - # Determine action type - action_type = action.get("type", "") - if action_type == "click": - result["action_type"] = "clicking" - elif action_type == "type" or action_type == "input": - result["action_type"] = "typing" - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - # Set thought from text if available - if result["text"]: - result["thought"] = ' '.join(result["text"]) - last_known_thought = result["thought"] - else: - result["thought"] = last_known_thought - - # Set cursor position if not found - if not result["cursor_positions"]: - result["cursor_positions"] = [last_known_cursor_position] if last_known_cursor_position else [] - - return result - -def create_animated_vignette(image, frame_index): - """ - Create an animated purple/blue gradient vignette effect around the border of the image. - The animation pulses the colors and gently varies their intensity over time. 
- - Args: - image: The base image to apply the vignette to - frame_index: Current frame index for animation timing - - Returns: - Image with vignette effect applied - """ - # Create a copy of the image to work with - result = image.copy() - width, height = result.size - - # Create a blank RGBA image for the vignette overlay - vignette = Image.new('RGBA', (width, height), (0, 0, 0, 0)) - draw = ImageDraw.Draw(vignette) - - # Calculate animation phase based on frame index - phase = math.sin(frame_index * VIGNETTE_ANIMATION_SPEED) * 0.5 + 0.5 # Oscillates between 0 and 1 - - # Interpolate between the vignette colors based on the animation phase - color1 = VIGNETTE_COLORS[0] - color2 = VIGNETTE_COLORS[1] - animated_color = ( - int(color1[0] + (color2[0] - color1[0]) * phase), - int(color1[1] + (color2[1] - color1[1]) * phase), - int(color1[2] + (color2[2] - color1[2]) * phase), - ) - - # Draw gradient borders around each edge - # Top border - for i in range(VIGNETTE_WIDTH): - alpha = int(150 * (1 - i / VIGNETTE_WIDTH)) - border_color = animated_color[:3] + (alpha,) - draw.line([(0, i), (width, i)], fill=border_color, width=1) - draw.line([(0, height-i-1), (width, height-i-1)], fill=border_color, width=1) - draw.line([(i, 0), (i, height)], fill=border_color, width=1) - draw.line([(width-i-1, 0), (width-i-1, height)], fill=border_color, width=1) - - # Apply slight blur to smooth the gradient - vignette = vignette.filter(ImageFilter.GaussianBlur(16)) - - # Composite the vignette over the original image - result = Image.alpha_composite(result.convert('RGBA'), vignette) - - return result.convert('RGB') # Convert back to RGB for consistency - -def scale_cursor_with_animation(cursor, frame, max_frames, cursor_type): - """Create springy scale animation for cursor.""" - if cursor_type == "normal": - return cursor - - # For clicking or typing cursors, create a spring effect - progress = frame / max_frames - - # Spring effect calculation - starts big, gets smaller, then back to normal - if progress < 0.3: - # Start with larger scale, shrink down - scale = 1.3 - progress - elif progress < 0.7: - # Then bounce back up a bit - scale = 0.7 + (progress - 0.3) * 0.8 - else: - # Then settle to normal (1.0) - scale = 1.0 + (1.0 - progress) * 0.3 - - # Apply scale - width, height = cursor.size - new_width = int(width * scale) - new_height = int(height * scale) - return cursor.resize((new_width, new_height)) - -# Store the last thought bubble position -last_thought_bubble_pos = None - -def draw_thought_bubble(image, position, thought_text, frame_index): - """Draw a thought bubble with the AI's thoughts near the cursor position.""" - global last_thought_bubble_pos - - if thought_text is None or position is None: - return image - - # Create a copy of the image to work with - result = image.copy() - - # Set up text parameters - font_size = 16 - try: - # Try to use a nice font if available - from PIL import ImageFont - try: - font = ImageFont.truetype("Arial", font_size) - except IOError: - # Fallback to default font - font = ImageFont.load_default() - except ImportError: - font = None - - # Wrap text to fit in bubble - max_width = 400 # Max width in pixels - wrapped_lines = [] - words = thought_text.split() - current_line = [] - - for word in words: - # Add word to current line - test_line = ' '.join(current_line + [word]) - - # Create a temporary draw object to measure text width if needed - temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) - - # Measure the text width - if font: - if hasattr(temp_draw, 
'textlength'): - text_width = temp_draw.textlength(test_line, font=font) - else: - # Fall back to rough estimation - text_width = len(test_line) * (font_size * 0.6) - else: - # Rough estimation if no font metrics are available - text_width = len(test_line) * (font_size * 0.6) - - if text_width <= max_width: - current_line.append(word) - else: - # Line is full, start a new line - if current_line: - wrapped_lines.append(' '.join(current_line)) - current_line = [word] - - # Don't forget the last line - if current_line: - wrapped_lines.append(' '.join(current_line)) - - # Limit number of lines for very long thoughts - max_lines = 8 - if len(wrapped_lines) > max_lines: - wrapped_lines = wrapped_lines[:max_lines-1] + ["..."] - - # Calculate text dimensions - line_height = font_size + 4 - text_height = len(wrapped_lines) * line_height - - # Find the widest line - if font: - # Create a draw object to measure text width - temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) - if hasattr(temp_draw, 'textlength'): - text_width = max(temp_draw.textlength(line, font=font) for line in wrapped_lines) - else: - # Fall back to rough estimation - text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) - else: - text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) - - # Add padding - padding = 20 - bubble_width = text_width + padding * 2 - bubble_height = text_height + padding * 2 - - # Calculate bubble position - move slowly towards cursor position - x, y = position - screen_width, screen_height = image.size - - # Default initial position if this is the first bubble - target_bubble_x = min(x + 30, screen_width - bubble_width - 10) - target_bubble_y = max(y - bubble_height - 30, 10) - - # Ensure target position is fully on screen - if target_bubble_x < 10: - target_bubble_x = 10 - if target_bubble_y + bubble_height > screen_height - 10: - target_bubble_y = screen_height - bubble_height - 10 - - # Calculate new position with slow movement towards target - # Very slow movement factor (0.01 means it moves 1% of the distance per frame) - movement_factor = 0.001 - - if last_thought_bubble_pos is None: - # First frame, set to target position - bubble_x, bubble_y = target_bubble_x, target_bubble_y - else: - # Interpolate slowly towards target position - last_x, last_y = last_thought_bubble_pos - bubble_x = last_x + (target_bubble_x - last_x) * movement_factor - bubble_y = last_y + (target_bubble_y - last_y) * movement_factor - - # Add a subtle animation effect to the bubble - # animation_offset = math.sin(frame_index * 0.1) * 2 - # bubble_y += int(animation_offset) - - # Store position for next frame - last_thought_bubble_pos = (bubble_x, bubble_y) - - # Draw rounded rectangle for bubble - corner_radius = 15 - - # Background with black gaussian blur - background_color = (0, 0, 0, 180) # Black with transparency - outline_color = (50, 50, 50, 255) # Dark gray outline - - # Draw the bubble background - first create an RGBA version - bubble_img = Image.new('RGBA', result.size, (0, 0, 0, 0)) - bubble_draw = ImageDraw.Draw(bubble_img) - - # Draw rounded rectangle - # Check if rounded_rectangle is available (PIL 8.0.0+) - if hasattr(bubble_draw, 'rounded_rectangle'): - bubble_draw.rounded_rectangle( - [bubble_x, bubble_y, bubble_x + bubble_width, bubble_y + bubble_height], - radius=corner_radius, - fill=background_color, - outline=outline_color, - width=2 - ) - else: - # Fall back to regular rectangle if rounded_rectangle not available - bubble_draw.rectangle( - [bubble_x, bubble_y, 
bubble_x + bubble_width, bubble_y + bubble_height], - fill=background_color, - outline=outline_color - ) - - # Apply gaussian blur to the bubble background - bubble_img = bubble_img.filter(ImageFilter.GaussianBlur(3)) - - # Draw small triangle pointing to cursor - pointer_size = 10 - pointer_x = x + 15 - pointer_y = y - 5 - - # Make sure pointer is under the bubble - if pointer_x > bubble_x + bubble_width: - pointer_x = bubble_x + bubble_width - 20 - elif pointer_x < bubble_x: - pointer_x = bubble_x + 20 - - # Create an overlay for the pointer - pointer_overlay = Image.new('RGBA', result.size, (0, 0, 0, 0)) - pointer_draw = ImageDraw.Draw(pointer_overlay) - - # Draw pointer triangle - # pointer_draw.polygon( - # [ - # (pointer_x, pointer_y), - # (pointer_x - pointer_size, pointer_y - pointer_size), - # (pointer_x + pointer_size, pointer_y - pointer_size) - # ], - # fill=background_color, - # outline=outline_color - # ) - - # Apply gaussian blur to the pointer - pointer_overlay = pointer_overlay.filter(ImageFilter.GaussianBlur(3)) - - # Composite the bubble and pointer onto the original image - result = Image.alpha_composite(result.convert('RGBA'), bubble_img) - result = Image.alpha_composite(result, pointer_overlay) - - # Now draw the text - draw = ImageDraw.Draw(result) - text_x = bubble_x + padding - text_y = bubble_y + padding - - text_color = (255, 255, 255, 255) # White text - for line in wrapped_lines: - draw.text((text_x, text_y), line, font=font, fill=text_color) - text_y += line_height - - return result.convert('RGB') - -def create_cursor_overlay(base_image, position, cursor_images, thought_text=None, cursor_type="normal", animation_frame=0, frame_index=0): - """Create an image with cursor overlaid on the base image and thought bubble if available.""" - # Create a copy of the base image - result = base_image.copy() - - # If position is None, return the image without a cursor - if position is None: - return result - - # Get the appropriate cursor image - cursor = cursor_images[cursor_type] - - # Apply animation scaling if needed - if cursor_type in ["clicking", "typing"]: - cursor = scale_cursor_with_animation(cursor, animation_frame, FRAMES_PER_CLICK, cursor_type) - - # Calculate position to center the cursor hotspot - # Cursor hotspot is at (20,15) of the cursor image - x, y = position - hotspot_x, hotspot_y = CURSOR_HOTSPOT - cursor_x = x - (hotspot_x * CURSOR_SCALE) # X offset for hotspot - cursor_y = y - (hotspot_y * CURSOR_SCALE) # Y offset for hotspot - - # Paste the cursor onto the image - result.paste(cursor, (int(cursor_x), int(cursor_y)), cursor) - - # Add thought bubble if text is available - if thought_text: - result = draw_thought_bubble(result, position, thought_text, frame_index) - - return result - -def get_turns(trajectory_dir): - """ - Get all turn folders from a trajectory directory and their corresponding files. 
- - Args: - trajectory_dir: Path to trajectory directory - - Returns: - List of tuples (turn_dir, agent_response_path, image_file_path) - """ - turns = [] - - # List all turn directories in order - turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], - key=lambda x: int(x.split('_')[1])) - - for turn_dir_name in turn_dirs: - turn_path = os.path.join(trajectory_dir, turn_dir_name) - if not os.path.isdir(turn_path): - continue - - # Find agent response files (if any) - agent_response_files = [f for f in os.listdir(turn_path) if f.endswith('_agent_response.json')] - agent_response_path = None - if agent_response_files: - agent_response_path = os.path.join(turn_path, agent_response_files[0]) - - # Find screenshot files (if any) - screenshot_files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] - screenshot_path = None - if screenshot_files: - # Sort by sequence number to get the main one - sorted_screenshots = sorted(screenshot_files, - key=lambda x: int(re.search(r'screenshot_(\d+)', x).group(1) - if re.search(r'screenshot_(\d+)', x) else 0)) - screenshot_path = os.path.join(turn_path, sorted_screenshots[0]) if sorted_screenshots else None - - turns.append((turn_path, agent_response_path, screenshot_path)) - - return turns - -def process_trajectory(trajectory_dir, output_dir, cursors): - """Process a trajectory directory and create output frames.""" - # Get all turns with their associated files - turns = get_turns(trajectory_dir) - - if not turns: - print(f"No turn directories found in {trajectory_dir}") - return - - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Track frame index - frame_index = 0 - - # Process each turn - prev_img = None - prev_cursor_pos = None - - for turn_path, agent_response_path, screenshot_path in tqdm(turns, desc="Processing turns"): - if not screenshot_path: - continue # Skip turns without screenshots - - # Load the current image - try: - current_img = Image.open(screenshot_path) - except Exception as e: - print(f"Error loading image {screenshot_path}: {e}") - continue - - # Parse agent response - agent_response = parse_agent_response(turn_path) - - # Extract action type, cursor position, and thought - action_type = agent_response["action_type"] - current_cursor_pos = agent_response["cursor_positions"][0] if agent_response["cursor_positions"] else None - current_thought = agent_response["thought"] - - # Check if the current frame has an action (click/typing) - is_action_frame = action_type in ["clicking", "typing"] - - if is_action_frame: - # If we have a previous frame, use it for the first half of animation - if prev_img is not None: - half_frames = FRAMES_PER_CLICK // 2 - # First half of animation uses PREVIOUS image - for j in range(half_frames): - output_img = create_cursor_overlay( - prev_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Second half uses CURRENT image - for j in range(half_frames, FRAMES_PER_CLICK): - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = 
create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - else: - # If no previous frame, use current for full animation - for j in range(FRAMES_PER_CLICK): - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - else: - # Regular frame with normal cursor - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Store current frame as previous for next iteration - prev_img = current_img - prev_cursor_pos = current_cursor_pos - - # Add position interpolation frames if we have both current and next turn data - current_turn_index = turns.index((turn_path, agent_response_path, screenshot_path)) - if current_turn_index < len(turns) - 1: - # Get next turn data - next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] - if next_screenshot_path: # Only if next turn has a screenshot - # Get next position - next_agent_response = parse_agent_response(next_turn_path) - next_cursor_pos = next_agent_response["cursor_positions"][0] if next_agent_response["cursor_positions"] else None - - # Only interpolate if both positions are valid and different - if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: - for j in range(1, FRAMES_PER_MOVE): - progress = j / FRAMES_PER_MOVE - interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress - interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress - interp_pos = (int(interp_x), int(interp_y)) - - # Create interpolated movement frame - output_img = create_cursor_overlay( - current_img, interp_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - -def main(): - """Main function to process the trajectory and create video frames.""" - parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') - parser.add_argument('trajectory_dir', type=str, nargs='?', help='Path to the trajectory folder') - parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') - parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') - parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') - parser.add_argument('--skip_ffmpeg', action='store_true', help='Skip running ffmpeg to create video') - args = parser.parse_args() - - trajectory_dir = args.trajectory_dir - - # If trajectory_dir is not provided, find the latest folder in './trajectories' - if trajectory_dir is None: - trajectories_base_dir = 
"./trajectories" - if os.path.exists(trajectories_base_dir) and os.path.isdir(trajectories_base_dir): - # Get all directories in the trajectories folder - trajectory_folders = [os.path.join(trajectories_base_dir, d) for d in os.listdir(trajectories_base_dir) - if os.path.isdir(os.path.join(trajectories_base_dir, d))] - - if trajectory_folders: - # Sort folders by modification time, most recent last - trajectory_folders.sort(key=lambda x: os.path.getmtime(x)) - # Use the most recent folder - trajectory_dir = trajectory_folders[-1] - print(f"No trajectory directory specified, using latest: {trajectory_dir}") - else: - print(f"No trajectory folders found in {trajectories_base_dir}") - return - else: - print(f"Trajectories directory {trajectories_base_dir} does not exist") - return - - output_dir = args.output_dir - fps = args.fps - output_video = args.output_video - skip_ffmpeg = args.skip_ffmpeg - - # Check if trajectory directory exists - if not os.path.exists(trajectory_dir): - print(f"Trajectory directory {trajectory_dir} does not exist") - return - - # Clean output directory if it exists - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Load cursor images - print("Loading cursor images...") - cursors = load_cursor_images() - - # Process the trajectory - print(f"Processing trajectory from {trajectory_dir}...") - process_trajectory(trajectory_dir, output_dir, cursors) - - print(f"Processing complete. Frames saved to {output_dir}") - - # Run ffmpeg to create the video - if not skip_ffmpeg: - print(f"Running ffmpeg to create video: {output_video}") - ffmpeg_cmd = f"ffmpeg -y -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}" - try: - import subprocess - result = subprocess.run(ffmpeg_cmd, shell=True, check=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True) - print(f"Video created successfully: {output_video}") - except subprocess.CalledProcessError as e: - print(f"Error running ffmpeg: {e}") - print(f"ffmpeg output:\n{e.stdout}\n{e.stderr}") - print("\nYou can create a video manually with this command:") - print(ffmpeg_cmd) - else: - print("Skipping ffmpeg. You can create a video from these frames using ffmpeg with this command:") - print(f"ffmpeg -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}") - -if __name__ == "__main__": - main()