From 7fb5e75711685464e121f1b9361820227d3a7257 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 01/38] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 505a9a5f453d1b738005ba3a41082e7a8e6ea0bc Mon Sep 17 00:00:00 2001 From: Finn Date: Sat, 26 Apr 2025 20:58:21 -0400 Subject: [PATCH 02/38] docs: fix wait action --- notebooks/blog/build-your-own-operator-on-macos-1.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/notebooks/blog/build-your-own-operator-on-macos-1.ipynb b/notebooks/blog/build-your-own-operator-on-macos-1.ipynb index 22db332d..70c0e6ea 100644 --- a/notebooks/blog/build-your-own-operator-on-macos-1.ipynb +++ b/notebooks/blog/build-your-own-operator-on-macos-1.ipynb @@ -145,9 +145,8 @@ " await computer.interface.press_key(key)\n", " \n", " elif action_type == \"wait\":\n", - " wait_time = action.time\n", - " print(f\"Waiting for {wait_time} seconds\")\n", - " await asyncio.sleep(wait_time)\n", + " print(f\"Waiting for 2 seconds\")\n", + " await asyncio.sleep(2)\n", " \n", " elif action_type == \"screenshot\":\n", " print(\"Taking screenshot\")\n", From 967a732bbad03159c916d6cbed9560e79d89e264 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 22:43:34 -0700 Subject: [PATCH 03/38] Add Lumier --- .gitignore | 8 +- .vscode/lumier.code-workspace | 30 ++++ libs/lumier/.dockerignore | 24 ++++ libs/lumier/Dockerfile | 74 ++++++++++ libs/lumier/README.md | 175 +++++++++++++++++++++++ libs/lumier/install.sh | 176 +++++++++++++++++++++++ libs/lumier/lumier | 200 +++++++++++++++++++++++++++ libs/lumier/mount/server.py | 10 ++ libs/lumier/mount/setup.sh | 9 ++ libs/lumier/src/bin/entry.sh | 98 +++++++++++++ libs/lumier/src/bin/server.sh | 99 +++++++++++++ libs/lumier/src/bin/tunnel-script.sh | 44 ++++++ libs/lumier/src/bin/tunnel.sh | 96 +++++++++++++ libs/lumier/src/config/constants.sh | 25 ++++ libs/lumier/src/hooks/on-logon.sh | 8 ++ libs/lumier/src/lib/utils.sh | 106 ++++++++++++++ libs/lumier/src/lib/vm.sh | 175 +++++++++++++++++++++++ 17 files changed, 1355 insertions(+), 2 deletions(-) create mode 100644 .vscode/lumier.code-workspace create mode 100644 libs/lumier/.dockerignore create mode 100644 libs/lumier/Dockerfile create mode 100644 libs/lumier/README.md create mode 100755 libs/lumier/install.sh create mode 100755 libs/lumier/lumier create mode 100644 libs/lumier/mount/server.py create mode 100755 libs/lumier/mount/setup.sh create mode 100755 libs/lumier/src/bin/entry.sh create mode 100755 libs/lumier/src/bin/server.sh create mode 100755 libs/lumier/src/bin/tunnel-script.sh create mode 100755 libs/lumier/src/bin/tunnel.sh create mode 100644 libs/lumier/src/config/constants.sh create mode 100755 libs/lumier/src/hooks/on-logon.sh create mode 100755 
libs/lumier/src/lib/utils.sh create mode 100755 libs/lumier/src/lib/vm.sh diff --git a/.gitignore b/.gitignore index ce8445bf..8265a5a1 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ dist/ downloads/ eggs/ .eggs/ -lib/ +lib/* +!libs/lumier/src/lib/ lib64/ parts/ sdist/ @@ -242,4 +243,7 @@ trajectories/ .storage/ # Gradio settings -.gradio_settings.json \ No newline at end of file +.gradio_settings.json + +# Lumier Storage +storage/ \ No newline at end of file diff --git a/.vscode/lumier.code-workspace b/.vscode/lumier.code-workspace new file mode 100644 index 00000000..26e12846 --- /dev/null +++ b/.vscode/lumier.code-workspace @@ -0,0 +1,30 @@ +{ + "folders": [ + { + "name": "lumier", + "path": "../libs/lumier" + }, + { + "name": "lume", + "path": "../libs/lume" + } + ], + "settings": { + "files.exclude": { + "**/.git": true, + "**/.svn": true, + "**/.hg": true, + "**/CVS": true, + "**/.DS_Store": true + } + }, + "tasks": { + "version": "2.0.0", + "tasks": [ + ] + }, + "launch": { + "configurations": [ + ] + } +} \ No newline at end of file diff --git a/libs/lumier/.dockerignore b/libs/lumier/.dockerignore new file mode 100644 index 00000000..3e0f9c98 --- /dev/null +++ b/libs/lumier/.dockerignore @@ -0,0 +1,24 @@ +# Ignore macOS system files and trash +.DS_Store +.Trashes +**/.Trashes +**/.* + +# Ignore Python cache +__pycache__/ +*.pyc +*.pyo + +# Ignore virtual environments +.venv/ +venv/ + +# Ignore editor/project files +.vscode/ +.idea/ +*.swp + +# Ignore test artifacts +test-results/ + +# Ignore anything else you don't want in the Docker build context diff --git a/libs/lumier/Dockerfile b/libs/lumier/Dockerfile new file mode 100644 index 00000000..710eb80b --- /dev/null +++ b/libs/lumier/Dockerfile @@ -0,0 +1,74 @@ +# Base image using Debian for arm64 architecture (optimized for Apple Silicon) +FROM debian:bullseye-slim AS lumier-base + +# Set environment variables for Lume API server configuration +ENV LUME_API_HOST="host.docker.internal" +ENV LUME_API_PORT="8080" + +# Default VM configuration (can be overridden at runtime) +ENV VERSION="ghcr.io/trycua/macos-sequoia-vanilla:latest" +ENV RAM_SIZE="8192" +ENV CPU_CORES="4" +ENV DISK_SIZE="100" +ENV DISPLAY="1024x768" +ENV VM_NAME="lumier" +ENV HOST_DATA_PATH="" +ENV LUMIER_DEBUG="0" + +# Install necessary tools and noVNC dependencies +RUN apt-get update && \ + apt-get install -y \ + netcat-traditional \ + curl \ + sshpass \ + wget \ + unzip \ + git \ + python3 \ + python3-pip \ + python3-numpy \ + procps && \ + rm -rf /var/lib/apt/lists/* + +# Add a dummy environment variable to invalidate cache +ENV CACHEBUST=1 + +# Download and install noVNC without caching +RUN wget https://github.com/trycua/noVNC/archive/refs/heads/master.zip -O master1.zip && \ + unzip master1.zip && \ + mv noVNC-master /opt/noVNC && \ + rm master1.zip + +# Set environment variables for noVNC +ENV NOVNC_PATH="/opt/noVNC" + +# Create directory structure +RUN mkdir -p /run/bin /run/lib /run/config /run/hooks + +# Copy scripts to the container +COPY src/bin/tunnel.sh /run/bin/ +COPY src/bin/tunnel-script.sh /usr/local/bin/lume +COPY src/bin/tunnel-script.sh /usr/local/bin/sshpass +COPY src/config/constants.sh /run/config/ +COPY src/bin/entry.sh /run/bin/entry.sh + +# Copy library files if they exist +COPY src/lib/ /run/lib/ +COPY src/hooks/ /run/hooks/ + +# Make scripts executable +RUN chmod +x /usr/local/bin/lume \ + /usr/local/bin/sshpass \ + /run/bin/* \ + /run/hooks/* 2>/dev/null || true + +# Expose ports for noVNC and Lume API +EXPOSE 8080 +EXPOSE 
8006 + +# VOLUME setup +VOLUME [ "/storage" ] +VOLUME [ "/data" ] + +# Default entrypoint +ENTRYPOINT ["/run/bin/entry.sh"] \ No newline at end of file diff --git a/libs/lumier/README.md b/libs/lumier/README.md new file mode 100644 index 00000000..65803e39 --- /dev/null +++ b/libs/lumier/README.md @@ -0,0 +1,175 @@ +
+  [project logo image (alt text: "Shows my svg")]
+
+  [![Swift 6](https://img.shields.io/badge/Swift_6-F54A2A?logo=swift&logoColor=white&labelColor=F54A2A)](#)
+  [![macOS](https://img.shields.io/badge/macOS-000000?logo=apple&logoColor=F0F0F0)](#)
+  [![Homebrew](https://img.shields.io/badge/Homebrew-FBB040?logo=homebrew&logoColor=fff)](#install)
+  [![Discord](https://img.shields.io/badge/Discord-%235865F2.svg?&logo=discord&logoColor=white)](https://discord.com/invite/mVnXXpdE85)
+
+ +**Lumier** provides a Docker-based interface for the `lume` CLI, allowing you to easily run macOS virtual machines inside a container with VNC access. It creates a secure tunnel to execute lume commands on your host machine while providing a containerized environment for your applications. + +## Requirements + +Before using Lumier, make sure you have: + +1. Install [lume](https://github.com/trycua/cua/blob/main/libs/lume/README.md) on your host machine +2. Docker installed on your host machine +3. `socat` installed for the tunnel (install with Homebrew: `brew install socat`) + +## Installation + +You can use Lumier directly from its directory or install it to your system: + +```bash +# Option 1: Install to your user's bin directory (recommended) +./install.sh + +# Option 2: Install to a custom directory +./install.sh --install-dir=/usr/local/bin # May require sudo + +# Option 3: View installation options +./install.sh --help +``` + +After installation, you can run `lumier` from anywhere in your terminal. + +If you get a "command not found" error, make sure the installation directory is in your PATH. The installer will warn you if it isn't and provide instructions to add it. + +## Usage + +There are two ways to use Lumier: with the provided script or directly with Docker. + +### Option 1: Using the Lumier Script + +Lumier provides a simple CLI interface to manage VMs in Docker with full Docker compatibility: + +```bash +# Show help and available commands +lumier help + +# Start the tunnel to connect to lume +lumier start + +# Check if the tunnel is running +lumier status + +# Stop the tunnel +lumier stop + +# Build the Docker image (optional, happens automatically on first run) +lumier build + +# Run a VM with default settings +lumier run -it --rm + +# Run a VM with custom settings using Docker's -e flag +lumier run -it --rm \ + --name lumier-vm \ + -p 8006:8006 \ + -v $(pwd)/storage:/storage \ + -v $(pwd)/shared:/data \ + -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ + -e CPU_CORES=4 \ + -e RAM_SIZE=8192 + +# Note: +# The lumier script now automatically detects the real host paths for ./storage and ./shared +# and passes them to the container as HOST_STORAGE_PATH and HOST_DATA_PATH. +# You do NOT need to specify these environment variables manually. +# The VM name is always set from the container name. +``` + +### Option 2: Using Docker Directly + +You can also use Docker commands directly without the lumier utility: + +```bash +# 1. Start the tunnel manually +cd libs/lumier +socat TCP-LISTEN:8080,reuseaddr,fork EXEC:"$PWD/src/bin/tunnel.sh" & +TUNNEL_PID=$! + +# 2. Build the Docker image +docker build -t lumier:latest . + +# 3. Run the container +docker run -it --rm \ + --name lumier-vm \ + -p 8006:8006 \ + -v $(pwd)/storage:/storage \ + -v $(pwd)/shared:/data \ + -e VM_NAME=lumier-vm \ + -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \ + -e CPU_CORES=4 \ + -e RAM_SIZE=8192 \ + -e HOST_STORAGE_PATH=$(pwd)/storage \ + -e HOST_DATA_PATH=$(pwd)/shared \ + lumier:latest + +# 4. 
Stop the tunnel when you're done +kill $TUNNEL_PID + +# Alternatively, find and kill the tunnel process +# First, find the process +lsof -i TCP:8080 +# Then kill it by PID +kill +``` + +Note that when using Docker directly, you're responsible for: +- Starting and managing the tunnel +- Building the Docker image +- Providing the correct environment variables + +## Available Environment Variables + +These variables can be set using Docker's `-e` flag: + +- `VM_NAME`: Set the VM name (default: lumier) +- `VERSION`: Set the VM image (default: ghcr.io/trycua/macos-sequoia-vanilla:latest) +- `CPU_CORES`: Set the number of CPU cores (default: 4) +- `RAM_SIZE`: Set the memory size in MB (default: 8192) +- `DISPLAY`: Set the display resolution (default: 1024x768) +- `HOST_DATA_PATH`: Path on the host to share with the VM +- `LUMIER_DEBUG`: Enable debug mode (set to 1) + +## Project Structure + +The project is organized as follows: + +``` +lumier/ +├── Dockerfile # Main Docker image definition +├── README.md # This file +├── lumier # Main CLI script +├── install.sh # Installation script +├── src/ # Source code +│ ├── bin/ # Executable scripts +│ │ ├── entry.sh # Docker entrypoint +│ │ ├── server.sh # Tunnel server manager +│ │ └── tunnel.sh # Tunnel request handler +│ ├── config/ # Configuration +│ │ └── constants.sh # Shared constants +│ ├── hooks/ # Lifecycle hooks +│ │ └── on-logon.sh # Run after VM boots +│ └── lib/ # Shared library code +│ ├── utils.sh # Utility functions +│ └── vm.sh # VM management functions +└── mount/ # Default shared directory +``` + +## VNC Access + +When a VM is running, you can access it via VNC through: +http://localhost:8006/vnc.html + +The password is displayed in the console output when the VM starts. \ No newline at end of file diff --git a/libs/lumier/install.sh b/libs/lumier/install.sh new file mode 100755 index 00000000..bd9e3b6b --- /dev/null +++ b/libs/lumier/install.sh @@ -0,0 +1,176 @@ +#!/bin/bash +set -e + +# Lumier Installer +# This script installs Lumier to your system + +# Define colors for output +BOLD=$(tput bold) +NORMAL=$(tput sgr0) +RED=$(tput setaf 1) +GREEN=$(tput setaf 2) +BLUE=$(tput setaf 4) +YELLOW=$(tput setaf 3) + +# Default installation directory (user-specific, doesn't require sudo) +DEFAULT_INSTALL_DIR="$HOME/.local/bin" +INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" + +# Script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Parse command line arguments +while [ "$#" -gt 0 ]; do + case "$1" in + --install-dir=*) + INSTALL_DIR="${1#*=}" + ;; + --help) + echo "${BOLD}${BLUE}Lumier Installer${NORMAL}" + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Options:" + echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" + echo " --help Display this help message" + echo "" + echo "Examples:" + echo " $0 # Install to $DEFAULT_INSTALL_DIR" + echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" + echo " INSTALL_DIR=/opt/lumier $0 # Install to /opt/lumier (legacy env var support)" + exit 0 + ;; + *) + echo "${RED}Unknown option: $1${NORMAL}" + echo "Use --help for usage information" + exit 1 + ;; + esac + shift +done + +echo "${BOLD}${BLUE}Lumier Installer${NORMAL}" +echo "This script will install Lumier to your system." 
+ +# Check if we're running with appropriate permissions +check_permissions() { + # System directories that typically require root privileges + SYSTEM_DIRS=("/usr/local/bin" "/usr/bin" "/bin" "/opt") + + NEEDS_ROOT=false + for DIR in "${SYSTEM_DIRS[@]}"; do + if [[ "$INSTALL_DIR" == "$DIR"* ]] && [ ! -w "$INSTALL_DIR" ]; then + NEEDS_ROOT=true + break + fi + done + + if [ "$NEEDS_ROOT" = true ]; then + echo "${YELLOW}Warning: Installing to $INSTALL_DIR may require root privileges.${NORMAL}" + echo "Consider these alternatives:" + echo " • Install to a user-writable location: $0 --install-dir=$HOME/.local/bin" + echo " • Create the directory with correct permissions first:" + echo " sudo mkdir -p $INSTALL_DIR && sudo chown $(whoami) $INSTALL_DIR" + echo "" + + # Check if we already have write permission (might have been set up previously) + if [ ! -w "$INSTALL_DIR" ] && [ ! -w "$(dirname "$INSTALL_DIR")" ]; then + echo "${RED}Error: You don't have write permission to $INSTALL_DIR${NORMAL}" + echo "Please choose a different installation directory or ensure you have the proper permissions." + exit 1 + fi + fi +} + +# Detect OS and architecture +detect_platform() { + OS=$(uname -s | tr '[:upper:]' '[:lower:]') + ARCH=$(uname -m) + + if [ "$OS" != "darwin" ]; then + echo "${RED}Error: Currently only macOS is supported.${NORMAL}" + exit 1 + fi + + if [ "$ARCH" != "arm64" ]; then + echo "${RED}Error: Lumier only supports macOS on Apple Silicon (ARM64).${NORMAL}" + exit 1 + fi + + PLATFORM="darwin-arm64" + echo "Detected platform: ${BOLD}$PLATFORM${NORMAL}" +} + +# Check dependencies +check_dependencies() { + echo "Checking dependencies..." + + # Check if lume is installed + if ! command -v lume &> /dev/null; then + echo "${RED}Error: Lume is required but not installed.${NORMAL}" + echo "Please install Lume first: https://github.com/trycua/cua/blob/main/libs/lume/README.md" + exit 1 + fi + + # Check if socat is installed + if ! command -v socat &> /dev/null; then + echo "${YELLOW}Warning: socat is required but not installed.${NORMAL}" + echo "Installing socat with Homebrew..." + + # Check if Homebrew is installed + if ! command -v brew &> /dev/null; then + echo "${RED}Error: Homebrew is required to install socat.${NORMAL}" + echo "Please install Homebrew first: https://brew.sh/" + echo "Or install socat manually, then run this script again." + exit 1 + fi + + # Install socat + brew install socat + fi + + # Check if Docker is installed + if ! command -v docker &> /dev/null; then + echo "${YELLOW}Warning: Docker is required but not installed.${NORMAL}" + echo "Please install Docker: https://docs.docker.com/get-docker/" + echo "Continuing with installation, but Lumier will not work without Docker." + fi + + echo "${GREEN}All dependencies are satisfied.${NORMAL}" +} + +# Copy the lumier script directly +copy_lumier() { + echo "Copying lumier script to $INSTALL_DIR..." + cp "$SCRIPT_DIR/lumier" "$INSTALL_DIR/lumier" + chmod +x "$INSTALL_DIR/lumier" +} + +# Main installation flow +main() { + check_permissions + detect_platform + check_dependencies + + echo "Installing Lumier to $INSTALL_DIR..." 
+ + # Create install directory if it doesn't exist + mkdir -p "$INSTALL_DIR" + + # Copy the lumier script + copy_lumier + + echo "${GREEN}Installation complete!${NORMAL}" + echo "Lumier has been installed to ${BOLD}$INSTALL_DIR/lumier${NORMAL}" + + # Check if the installation directory is in PATH + if [[ ":$PATH:" != *":$INSTALL_DIR:"* ]]; then + echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" + echo "To add it, run one of these commands based on your shell:" + echo " For bash: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" + echo " For zsh: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zshrc" + echo " For fish: echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + fi +} + +# Run the installation +main \ No newline at end of file diff --git a/libs/lumier/lumier b/libs/lumier/lumier new file mode 100755 index 00000000..1c3912cb --- /dev/null +++ b/libs/lumier/lumier @@ -0,0 +1,200 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -eo pipefail + +# Always use the current working directory as the build context +SCRIPT_DIR="$(pwd)" +PORT=8080 +DEBUG=${LUMIER_DEBUG:-0} + +usage() { + cat </dev/null | grep LISTEN > /dev/null; then + return 0 # Tunnel is active + else + return 1 # Tunnel is not active + fi +} + +# Start the tunnel if needed +ensure_tunnel() { + if ! is_tunnel_active; then + echo "Tunnel is not active. Starting tunnel..." + "$SCRIPT_DIR/src/bin/server.sh" start + sleep 2 # Wait for the tunnel to start + + if ! is_tunnel_active; then + echo "Failed to start tunnel. Make sure 'lume' is installed on your host." + exit 1 + fi + else + echo "Tunnel is already active." + fi +} + +# Build the Docker image with cache busting +build_image() { + local image_name="${LUMIER_IMAGE:-lumier:latest}" + echo "Building Lumier Docker image: $image_name" + echo "SCRIPT_DIR=$SCRIPT_DIR" + echo "Checking for Dockerfile at: $SCRIPT_DIR/Dockerfile" + ls -l "$SCRIPT_DIR/Dockerfile" || echo "Dockerfile not found at $SCRIPT_DIR/Dockerfile" + + # Pass any additional arguments to docker build with cache busting + docker build --build-arg CACHEBUST=$(date +%s) -t "$image_name" "$SCRIPT_DIR" "$@" + + echo "Lumier image built successfully: $image_name" +} + +# Run the Docker container +run_container() { + local image_name="${LUMIER_IMAGE:-lumier:latest}" + + # Ensure the Docker image exists + if ! docker image inspect "$image_name" &>/dev/null; then + echo "Docker image '$image_name' not found. Building it..." 
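+        # build_image tags the image as ${LUMIER_IMAGE:-lumier:latest}, the same tag used by docker run below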
+ build_image + fi + + # Ensure the tunnel is running + ensure_tunnel + + # Automatically resolve and pass host paths for storage and data + STORAGE_PATH="${HOST_STORAGE_PATH:-$(realpath ./storage)}" + DATA_PATH="${HOST_DATA_PATH:-$(realpath ./shared)}" + + # Only add -e if not already present in args + DOCKER_ARGS=( ) + add_env_var() { + local var="$1"; local val="$2"; local flag="-e $var=" + for arg in "$@"; do + [[ "$arg" == *"$flag"* ]] && return 0 + done + DOCKER_ARGS+=( -e "$var=$val" ) + } + add_env_var HOST_STORAGE_PATH "$STORAGE_PATH" + add_env_var HOST_DATA_PATH "$DATA_PATH" + + # Detect --name argument and set VM_NAME if not already present + local container_name="" + local prev_arg="" + for arg in "$@"; do + if [[ "$prev_arg" == "--name" ]]; then + container_name="$arg" + break + elif [[ "$arg" == --name=* ]]; then + container_name="${arg#--name=}" + break + fi + prev_arg="$arg" + done + # Only add -e VM_NAME if not already present and container_name is set + local vm_name_set=false + for arg in "$@"; do + if [[ "$arg" == "-e" ]] && [[ "$2" == VM_NAME=* ]]; then + vm_name_set=true + break + elif [[ "$arg" == "-eVM_NAME="* ]]; then + vm_name_set=true + break + elif [[ "$arg" == "-e"* ]] && [[ "$arg" == *"VM_NAME="* ]]; then + vm_name_set=true + break + fi + done + if [[ -n "$container_name" && "$vm_name_set" != true ]]; then + DOCKER_ARGS+=( -e "VM_NAME=$container_name" ) + fi + + echo "Running Lumier container with image: $image_name" + if [[ "$*" == *"-p 8006:8006"* || "$*" == *"-p"*"8006:8006"* ]]; then + docker run "${DOCKER_ARGS[@]}" "$@" "$image_name" + else + docker run "${DOCKER_ARGS[@]}" -p 8006:8006 "$@" "$image_name" + fi +} + +# Main command handling +case "${1:-help}" in + run) + shift + run_container "$@" + ;; + tunnel) + # Handle tunnel subcommands + case "${2:-}" in + start) + "$SCRIPT_DIR/src/bin/server.sh" start + ;; + stop) + "$SCRIPT_DIR/src/bin/server.sh" stop + ;; + status) + "$SCRIPT_DIR/src/bin/server.sh" status + ;; + *) + echo "Unknown tunnel subcommand: $2" + usage + exit 1 + ;; + esac + ;; + + build) + shift + build_image "$@" + ;; + help) + usage + ;; + *) + echo "Unknown command: $1" + usage + exit 1 + ;; +esac \ No newline at end of file diff --git a/libs/lumier/mount/server.py b/libs/lumier/mount/server.py new file mode 100644 index 00000000..464c26ad --- /dev/null +++ b/libs/lumier/mount/server.py @@ -0,0 +1,10 @@ +from flask import Flask + +app = Flask(__name__) + +@app.route('/') +def hello_world(): + return 'Hello, World, from VM!' + +if __name__ == '__main__': + app.run(debug=True, host="0.0.0.0", port=5001) \ No newline at end of file diff --git a/libs/lumier/mount/setup.sh b/libs/lumier/mount/setup.sh new file mode 100755 index 00000000..8897896e --- /dev/null +++ b/libs/lumier/mount/setup.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +echo "Creating helloworld.txt on the Desktop..." +if [ ! -f ~/Desktop/helloworld.txt ]; then + echo "Hello, World!" > ~/Desktop/helloworld.txt + echo "helloworld.txt created successfully." +else + echo "helloworld.txt already exists." 
+fi \ No newline at end of file diff --git a/libs/lumier/src/bin/entry.sh b/libs/lumier/src/bin/entry.sh new file mode 100755 index 00000000..66a375ad --- /dev/null +++ b/libs/lumier/src/bin/entry.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source configuration files +CONFIG_DIR="/run/config" +LIB_DIR="/run/lib" + +# Source constants if available +if [ -f "${CONFIG_DIR}/constants.sh" ]; then + source "${CONFIG_DIR}/constants.sh" +fi + +# Import utilities +for lib in "${LIB_DIR}"/*.sh; do + if [ -f "$lib" ]; then + source "$lib" + fi +done + +# Set VM_NAME to env or fallback to container name (from --name) +if [ -z "${VM_NAME:-}" ]; then + VM_NAME="$(cat /etc/hostname)" + export VM_NAME +fi + +# Set HOST_STORAGE_PATH to /storage/$VM_NAME if not set +if [ -z "${HOST_STORAGE_PATH:-}" ]; then + HOST_STORAGE_PATH="/storage/$VM_NAME" + export HOST_STORAGE_PATH +fi + +# Optionally check for mountpoints +if mountpoint -q /storage; then + echo "/storage is mounted" +fi +if mountpoint -q /data; then + echo "/data is mounted" +fi + +# Log startup info +echo "Lumier VM is starting..." + +# Cleanup function to ensure VM and noVNC proxy shutdown on container stop +cleanup() { + set +e # Don't exit on error in cleanup + echo "[cleanup] Caught signal, shutting down..." + echo "[cleanup] Stopping VM..." + stop_vm + # Now gently stop noVNC proxy if running + # if [ -n "${NOVNC_PID:-}" ] && kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] Stopping noVNC proxy (PID $NOVNC_PID)..." + # kill -TERM "$NOVNC_PID" + # # Wait up to 5s for noVNC to exit + # for i in {1..5}; do + # if ! kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] noVNC proxy stopped." + # break + # fi + # sleep 1 + # done + # # Escalate if still running + # if kill -0 "$NOVNC_PID" 2>/dev/null; then + # echo "[cleanup] noVNC proxy did not exit, killing..." + # kill -KILL "$NOVNC_PID" 2>/dev/null + # fi + # fi + echo "[cleanup] Done. Exiting." + exit 0 +} +trap cleanup SIGTERM SIGINT + +# Start the VM +start_vm + +# Start noVNC for VNC access +NOVNC_PID="" +if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then + echo "Starting noVNC proxy with optimized color settings..." + ${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 & + NOVNC_PID=$! + disown $NOVNC_PID + echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true&logging=debug" +fi + +# Run any post-startup hooks +if [ -d "/run/hooks" ]; then + for hook in /run/hooks/*; do + if [ -x "$hook" ]; then + echo "Running hook: $(basename "$hook")" + "$hook" + fi + done +fi + +echo "Lumier is running. Press Ctrl+C to stop." 
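+# Keep PID 1 alive so the container keeps running; the VM and noVNC proxy run in the
+# background and are shut down by the cleanup trap above when the container stops.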
+tail -f /dev/null \ No newline at end of file diff --git a/libs/lumier/src/bin/server.sh b/libs/lumier/src/bin/server.sh new file mode 100755 index 00000000..5849d667 --- /dev/null +++ b/libs/lumier/src/bin/server.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source constants if available +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -f "${SCRIPT_DIR}/../config/constants.sh" ]; then + source "${SCRIPT_DIR}/../config/constants.sh" +fi + +# Use the tunnel port from constants if available, otherwise default to 8080 +PORT="${TUNNEL_PORT:-8080}" +TUNNEL_SCRIPT="${SCRIPT_DIR}/tunnel.sh" + +# Function to check if the tunnel is active +is_tunnel_active() { + if lsof -i TCP:$PORT 2>/dev/null | grep LISTEN > /dev/null; then + return 0 # Tunnel is active + else + return 1 # Tunnel is not active + fi +} + +# Function to start the tunnel +start_tunnel() { + echo "Starting tunnel on port $PORT..." + if is_tunnel_active; then + echo "Tunnel is already running on port $PORT." + return 0 + fi + + # Start socat in the background + socat TCP-LISTEN:$PORT,reuseaddr,fork EXEC:"$TUNNEL_SCRIPT" & + SOCAT_PID=$! + + # Check if the tunnel started successfully + sleep 1 + if ! is_tunnel_active; then + echo "Failed to start tunnel on port $PORT." + return 1 + fi + + echo "Tunnel started successfully on port $PORT (PID: $SOCAT_PID)." + return 0 +} + +# Function to stop the tunnel +stop_tunnel() { + echo "Stopping tunnel on port $PORT..." + if ! is_tunnel_active; then + echo "No tunnel running on port $PORT." + return 0 + fi + + # Find and kill the socat process + local pid=$(lsof -i TCP:$PORT | grep LISTEN | awk '{print $2}') + if [ -n "$pid" ]; then + kill $pid + echo "Tunnel stopped (PID: $pid)." + return 0 + else + echo "Failed to find process using port $PORT." + return 1 + fi +} + +# Function to check tunnel status +status_tunnel() { + if is_tunnel_active; then + local pid=$(lsof -i TCP:$PORT | grep LISTEN | awk '{print $2}') + echo "Tunnel is active on port $PORT (PID: $pid)." + return 0 + else + echo "No tunnel running on port $PORT." + return 1 + fi +} + +# Parse command line arguments +case "${1:-}" in + start) + start_tunnel + ;; + stop) + stop_tunnel + ;; + restart) + stop_tunnel + start_tunnel + ;; + status) + status_tunnel + ;; + *) + echo "Usage: $0 {start|stop|restart|status}" + exit 1 + ;; +esac \ No newline at end of file diff --git a/libs/lumier/src/bin/tunnel-script.sh b/libs/lumier/src/bin/tunnel-script.sh new file mode 100755 index 00000000..529839ea --- /dev/null +++ b/libs/lumier/src/bin/tunnel-script.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# Source constants if running in container context +if [ -f "/run/config/constants.sh" ]; then + source "/run/config/constants.sh" +fi + +# Define server address with fallback +SERVER="${TUNNEL_HOST:-host.docker.internal}:${TUNNEL_PORT:-8080}" + +# Extract the base name of the command and arguments +command=$(basename "$0") +subcommand="$1" +shift +args="$@" + +command="$command $subcommand $args" + +# Concatenate command and any stdin data +full_data="$command" +if [ ! 
-t 0 ]; then + stdin_data=$(cat) + if [ -n "$stdin_data" ]; then + # Format full_data to include stdin data + full_data="$full_data << 'EOF' + $stdin_data +EOF" + fi +fi + +# Trim leading/trailing whitespace and newlines +full_data=$(echo -e "$full_data" | sed 's/^[ \t\n]*//;s/[ \t\n]*$//') + +# Log command if debug is enabled +if [ "${LUMIER_DEBUG:-0}" -eq 1 ]; then + echo "Executing lume command: $full_data" >&2 + echo "Sending to: $SERVER" >&2 +fi + +# Use curl with -N to disable output buffering and -s for silent mode +curl -N -s -X POST \ + -H "Content-Type: application/octet-stream" \ + --data-binary @- \ + "http://$SERVER" <<< "$full_data" \ No newline at end of file diff --git a/libs/lumier/src/bin/tunnel.sh b/libs/lumier/src/bin/tunnel.sh new file mode 100755 index 00000000..6de14282 --- /dev/null +++ b/libs/lumier/src/bin/tunnel.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash + +# Exit on errors, undefined variables, and propagate errors in pipes +set -euo pipefail + +# Source constants if available +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -f "${SCRIPT_DIR}/../config/constants.sh" ]; then + source "${SCRIPT_DIR}/../config/constants.sh" +fi + +# Handle errors and cleanup +cleanup() { + local exit_code=$? + # Clean up any temporary files if they exist + [ -n "${temp_file:-}" ] && [ -f "$temp_file" ] && rm "$temp_file" + [ -n "${fifo:-}" ] && [ -p "$fifo" ] && rm "$fifo" + exit $exit_code +} +trap cleanup EXIT INT TERM + +log_debug() { + if [ "${LUMIER_DEBUG:-0}" -eq 1 ]; then + echo "[DEBUG] $*" >&2 + fi +} + +send_error_response() { + local status_code=$1 + local message=$2 + echo "HTTP/1.1 $status_code" + echo "Content-Type: text/plain" + echo "" + echo "$message" + exit 1 +} + +# Read the HTTP request line +read -r request_line +log_debug "Request: $request_line" + +# Read headers and look for Content-Length +content_length=0 +while IFS= read -r header; do + [[ $header == $'\r' ]] && break # End of headers + log_debug "Header: $header" + if [[ "$header" =~ ^Content-Length:\ ([0-9]+) ]]; then + content_length="${BASH_REMATCH[1]}" + fi +done + +# Read the body using the content length +command="" +if [ "$content_length" -gt 0 ]; then + command=$(dd bs=1 count="$content_length" 2>/dev/null) + log_debug "Received command: $command" +fi + +# Determine the executable and arguments based on the command +if [[ "$command" == lume* ]]; then + executable="$(which lume || echo "/usr/local/bin/lume")" + command_args="${command#lume}" # Remove 'lume' from the command +elif [[ "$command" == sshpass* ]]; then + executable="$(which sshpass || echo "/usr/local/bin/sshpass")" + command_args="${command#sshpass}" +else + send_error_response "400 Bad Request" "Unsupported command: $command" +fi + +# Check if executable exists +if [ ! 
-x "$executable" ]; then + send_error_response "500 Internal Server Error" "Executable not found or not executable: $executable" +fi + +# Create a temporary file to store the command +temp_file=$(mktemp) +echo "$executable $command_args" > "$temp_file" +chmod +x "$temp_file" + +# Create a FIFO (named pipe) for capturing output +fifo=$(mktemp -u) +mkfifo "$fifo" + +# Execute the command and pipe its output through awk to ensure line-buffering +{ + log_debug "Executing: $executable $command_args" + "$temp_file" 2>&1 | awk '{ print; fflush() }' > "$fifo" +} & + +# Stream the output from the FIFO as an HTTP response +{ + echo -e "HTTP/1.1 200 OK\r" + echo -e "Content-Type: text/plain\r" + echo -e "\r" + cat "$fifo" +} \ No newline at end of file diff --git a/libs/lumier/src/config/constants.sh b/libs/lumier/src/config/constants.sh new file mode 100644 index 00000000..766c4373 --- /dev/null +++ b/libs/lumier/src/config/constants.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Port configuration +TUNNEL_PORT=8080 +VNC_PORT=8006 + +# Host configuration +TUNNEL_HOST="host.docker.internal" + +# Default VM configuration +DEFAULT_RAM_SIZE="8192" +DEFAULT_CPU_CORES="4" +DEFAULT_DISK_SIZE="100" +DEFAULT_VM_NAME="lumier" +DEFAULT_VM_VERSION="ghcr.io/trycua/macos-sequoia-vanilla:latest" + +# Paths +NOVNC_PATH="/opt/noVNC" +LIFECYCLE_HOOKS_DIR="/run/hooks" + +# VM connection details +HOST_USER="lume" +HOST_PASSWORD="lume" +SSH_RETRY_ATTEMPTS=20 +SSH_RETRY_INTERVAL=5 \ No newline at end of file diff --git a/libs/lumier/src/hooks/on-logon.sh b/libs/lumier/src/hooks/on-logon.sh new file mode 100755 index 00000000..faa817c0 --- /dev/null +++ b/libs/lumier/src/hooks/on-logon.sh @@ -0,0 +1,8 @@ +setup_script="$DATA_FOLDER_PATH/setup.sh" + +if [ -f "$setup_script" ]; then + chmod +x "$setup_script" + source "$setup_script" +else + echo "Setup script not found at: $setup_script" +fi \ No newline at end of file diff --git a/libs/lumier/src/lib/utils.sh b/libs/lumier/src/lib/utils.sh new file mode 100755 index 00000000..7d599669 --- /dev/null +++ b/libs/lumier/src/lib/utils.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +# Function to wait for SSH to become available +wait_for_ssh() { + local host_ip=$1 + local user=$2 + local password=$3 + local retry_interval=${4:-5} # Default retry interval is 5 seconds + local max_retries=${5:-20} # Default maximum retries is 20 (0 for infinite) + + echo "Waiting for SSH to become available on $host_ip..." + + local retry_count=0 + while true; do + # Try to connect via SSH + sshpass -p "$password" ssh -o StrictHostKeyChecking=no "$user@$host_ip" "exit" + + # Check the exit status of the SSH command + if [ $? -eq 0 ]; then + echo "SSH is ready on $host_ip!" + return 0 + fi + + # Increment retry count + ((retry_count++)) + + # Exit if maximum retries are reached + if [ $max_retries -ne 0 ] && [ $retry_count -ge $max_retries ]; then + echo "Maximum retries reached. SSH is not available." + return 1 + fi + + echo "SSH not ready. Retrying in $retry_interval seconds... 
(Attempt $retry_count)" + sleep $retry_interval + done +} + +# Function to execute a script on a remote server using sshpass +execute_remote_script() { + local host="$1" + local user="$2" + local password="$3" + local script_path="$4" + local vnc_password="$5" + local data_folder="$6" + + # Check if all required arguments are provided + if [ -z "$host" ] || [ -z "$user" ] || [ -z "$password" ] || [ -z "$script_path" ] || [ -z "$vnc_password" ]; then + echo "Usage: execute_remote_script [data_folder]" + return 1 + fi + + echo "VNC password exported to VM: $vnc_password" + + data_folder_path="$VM_SHARED_FILES_PATH/$data_folder" + echo "Data folder path in VM: $data_folder_path" + + # Read the script content and prepend the shebang + script_content="#!/usr/bin/env bash\n" + if [ -n "$data_folder" ]; then + script_content+="export VNC_PASSWORD='$vnc_password'\n" + script_content+="export DATA_FOLDER_PATH='$data_folder_path'\n" + fi + script_content+="$(<"$script_path")" + + # Use a here-document to send the script content + sshpass -p "$password" ssh -o StrictHostKeyChecking=no "$user@$host" "bash -s" </dev/null 2>&1 || true + fi + + # Check if VM exists and its status using JSON format + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) + + # Check if VM not found error + if [[ $VM_INFO == *"Virtual machine not found"* ]]; then + IMAGE_NAME="${VERSION##*/}" + lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_NAME" + else + # Parse the JSON status - check if it contains "status" : "running" + if [[ $VM_INFO == *'"status" : "running"'* ]]; then + lume_stop "$VM_NAME" "$STORAGE_NAME" + fi + fi + + # Set VM parameters + lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_NAME" + + # Fetch VM configuration + CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json) + + # Setup data directory args if necessary + SHARED_DIR_ARGS="" + if [ -d "/data" ]; then + if [ -n "$HOST_DATA_PATH" ]; then + SHARED_DIR_ARGS="--shared-dir=$HOST_DATA_PATH" + else + echo "Warning: /data volume exists but HOST_DATA_PATH is not set. Cannot mount volume." + fi + fi + + # Run VM with VNC and shared directory using curl + lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + + # Wait for VM to be running and VNC URL to be available + vm_ip="" + vnc_url="" + max_attempts=30 + attempt=0 + + while [ $attempt -lt $max_attempts ]; do + # Get VM info as JSON + VM_INFO=$(lume get "$VM_NAME" -f json 2>/dev/null) + + # Check if VM has status 'running' + if [[ $VM_INFO == *'"status" : "running"'* ]]; then + # Extract IP address using the existing function from utils.sh + vm_ip=$(extract_json_field "ipAddress" "$VM_INFO") + # Extract VNC URL using the existing function from utils.sh + vnc_url=$(extract_json_field "vncUrl" "$VM_INFO") + + # If we have both IP and VNC URL, break the loop + if [ -n "$vm_ip" ] && [ -n "$vnc_url" ]; then + break + fi + fi + + sleep 2 + attempt=$((attempt + 1)) + done + + if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then + echo "Timed out waiting for VM to start or VNC URL to become available." 
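+        # Best-effort cleanup: stop the partially started VM (output suppressed) before exiting with an error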
+ lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + exit 1 + fi + + + # Parse VNC URL to extract password and port + VNC_PASSWORD=$(echo "$vnc_url" | sed -n 's/.*:\(.*\)@.*/\1/p') + VNC_PORT=$(echo "$vnc_url" | sed -n 's/.*:\([0-9]\+\)$/\1/p') + + # Wait for SSH to become available + wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 + + # Export VNC variables for entry.sh to use + export VNC_PORT + export VNC_PASSWORD + + # Execute on-logon.sh if present + on_logon_script="/run/lifecycle/on-logon.sh" + if [ -f "$on_logon_script" ]; then + execute_remote_script "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" "$on_logon_script" "$VNC_PASSWORD" "$DATA_FOLDER" + fi + + # The VM is still running because we never killed lume run. + # If you want to stop the VM at some point, you can kill $LUME_PID or use lume_stop. +} + +stop_vm() { + echo "Stopping VM '$VM_NAME'..." + STORAGE_NAME="storage_${VM_NAME}" + # Check if the VM exists and is running (use lume get for speed) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>/dev/null) + if [[ -z "$VM_INFO" || $VM_INFO == *"Virtual machine not found"* ]]; then + echo "VM '$VM_NAME' does not exist." + elif [[ $VM_INFO == *'"status" : "running"'* ]]; then + lume_stop "$VM_NAME" "$STORAGE_NAME" + echo "VM '$VM_NAME' was running and is now stopped." + elif [[ $VM_INFO == *'"status" : "stopped"'* ]]; then + echo "VM '$VM_NAME' is already stopped." + else + echo "Unknown VM status for '$VM_NAME'." + fi +} + +is_vm_running() { + lume ls | grep -q "$VM_NAME" +} + +# Stop VM with storage location specified using curl +lume_stop() { + local vm_name="$1" + local storage="$2" + curl --connect-timeout 6000 \ + --max-time 5000 \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"storage":"'$storage'"}' \ + "http://host.docker.internal:3000/lume/vms/${vm_name}/stop" +} + +# Run VM with VNC client started and shared directory using curl +lume_run() { + # Parse args + local shared_dir="" + local storage="ssd" + local vm_name="lume_vm" + local no_display=true + while [[ $# -gt 0 ]]; do + case $1 in + --shared-dir=*) + shared_dir="${1#*=}" + shift + ;; + --storage) + storage="$2" + shift 2 + ;; + --no-display) + no_display=true + shift + ;; + *) + # Assume last arg is VM name if not an option + vm_name="$1" + shift + ;; + esac + done + + # Default to ~/Projects if not provided + if [[ -z "$shared_dir" ]]; then + shared_dir="~/Projects" + fi + + local json_body="{\"noDisplay\": true, \"sharedDirectories\": [{\"hostPath\": \"$shared_dir\", \"readOnly\": false}], \"storage\": \"$storage\", \"recoveryMode\": false}" + local curl_cmd="curl --connect-timeout 6000 \ + --max-time 5000 \ + -X POST \ + -H 'Content-Type: application/json' \ + -d '$json_body' \ + http://host.docker.internal:3000/lume/vms/$vm_name/run" + echo "[lume_run] Running:" + echo "$curl_cmd" + eval "$curl_cmd" +} \ No newline at end of file From 885831f04ea7e8ae740761af46f2413f0e0c44ae Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 22:52:11 -0700 Subject: [PATCH 04/38] Add lume options --- libs/lumier/src/lib/vm.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libs/lumier/src/lib/vm.sh b/libs/lumier/src/lib/vm.sh index 19dcff07..9d3dda06 100755 --- a/libs/lumier/src/lib/vm.sh +++ b/libs/lumier/src/lib/vm.sh @@ -9,7 +9,7 @@ start_vm() { # Check if VM exists and its status using JSON format VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) - + # Check if VM not found error if [[ $VM_INFO == *"Virtual machine not 
found"* ]]; then IMAGE_NAME="${VERSION##*/}" @@ -17,7 +17,8 @@ start_vm() { else # Parse the JSON status - check if it contains "status" : "running" if [[ $VM_INFO == *'"status" : "running"'* ]]; then - lume_stop "$VM_NAME" "$STORAGE_NAME" + # lume_stop "$VM_NAME" "$STORAGE_NAME" + lume stop "$VM_NAME" --storage "$STORAGE_NAME" fi fi @@ -38,7 +39,8 @@ start_vm() { fi # Run VM with VNC and shared directory using curl - lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + # lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & + lume run "$VM_NAME" --storage "$STORAGE_NAME" --no-display # Wait for VM to be running and VNC URL to be available vm_ip="" @@ -69,7 +71,8 @@ start_vm() { if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then echo "Timed out waiting for VM to start or VNC URL to become available." - lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + # lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 + lume stop "$VM_NAME" --storage "$STORAGE_NAME" > /dev/null 2>&1 exit 1 fi @@ -79,7 +82,7 @@ start_vm() { VNC_PORT=$(echo "$vnc_url" | sed -n 's/.*:\([0-9]\+\)$/\1/p') # Wait for SSH to become available - wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 + wait_for_ssh "$vm_ip" "$HOST_USER" "$HOST_PASSWORD" 5 20 # Export VNC variables for entry.sh to use export VNC_PORT From cf75a7e577a68a571ffb0449c45639c9202dc1f2 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sun, 27 Apr 2025 23:06:45 -0700 Subject: [PATCH 05/38] Update to zprofile --- libs/lume/scripts/install.sh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index aa0529c6..f6313538 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -12,6 +12,8 @@ GREEN=$(tput setaf 2) BLUE=$(tput setaf 4) YELLOW=$(tput setaf 3) + + # Default installation directory (user-specific, doesn't require sudo) DEFAULT_INSTALL_DIR="$HOME/.local/bin" INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" @@ -173,11 +175,25 @@ install_binary() { # Check if the installation directory is in PATH if [ -n "${PATH##*$INSTALL_DIR*}" ]; then + SHELL_NAME=$(basename "$SHELL") echo "${YELLOW}Warning: $INSTALL_DIR is not in your PATH.${NORMAL}" - echo "To add it, run one of these commands based on your shell:" - echo " For bash: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" - echo " For zsh: echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zshrc" - echo " For fish: echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + case "$SHELL_NAME" in + zsh) + echo "To add it, run:" + echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.zprofile" + ;; + bash) + echo "To add it, run:" + echo " echo 'export PATH=\"\$PATH:$INSTALL_DIR\"' >> ~/.bash_profile" + ;; + fish) + echo "To add it, run:" + echo " echo 'fish_add_path $INSTALL_DIR' >> ~/.config/fish/config.fish" + ;; + *) + echo "Add $INSTALL_DIR to your PATH in your shell profile file." 
+ ;; + esac fi } From a37fa708482eb4c99e9b5fc425e848d85cb85bbc Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 10:15:25 -0400 Subject: [PATCH 06/38] added basic demo video maker --- examples/video_maker_traj.py | 692 +++++++++++++++++++++++++++++++++++ 1 file changed, 692 insertions(+) create mode 100644 examples/video_maker_traj.py diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py new file mode 100644 index 00000000..34a5ad3c --- /dev/null +++ b/examples/video_maker_traj.py @@ -0,0 +1,692 @@ +""" +Video Maker for Trajectory Dataset + +This script processes a trajectory dataset folder, extracts frames, +and creates an animated video with cursor overlays. +""" + +from utils import load_dotenv_files +load_dotenv_files() + +import os +import json +import math +import shutil +import re +from pathlib import Path +import argparse +import numpy as np +from PIL import Image, ImageDraw, ImageFilter +import requests +from io import BytesIO +from tqdm import tqdm + +# Constants +CURSOR_SCALE = 2 # Scale factor for cursor size +FRAMES_PER_CLICK = 8 # Number of frames to show for click animation +FRAMES_PER_MOVE = 10 # Number of frames to interpolate between cursor positions +CURSOR_NORMAL = "https://mac-cursors.netlify.app/png/default@2x.png" +CURSOR_CLICKING = "https://mac-cursors.netlify.app/png/handpointing@2x.png" +CURSOR_TYPING = "https://mac-cursors.netlify.app/png/textcursor@2x.png" +CURSOR_HOTSPOT = (20, 15) +OUTPUT_DIR = "examples/output/video_frames" + +# Vignette effect constants +VIGNETTE_WIDTH = 10 # Width of the vignette border in pixels +VIGNETTE_COLORS = [(128, 0, 255), (0, 0, 255)] # Purple to Blue gradient colors +VIGNETTE_ANIMATION_SPEED = 0.1 # Controls speed of the animation pulse + +def download_image(url): + """Download an image from a URL.""" + response = requests.get(url) + return Image.open(BytesIO(response.content)) + +def load_cursor_images(): + """Load and resize cursor images.""" + cursor_normal = download_image(CURSOR_NORMAL) + cursor_clicking = download_image(CURSOR_CLICKING) + cursor_typing = download_image(CURSOR_TYPING) + + # Resize all cursors based on CURSOR_SCALE + width_normal, height_normal = cursor_normal.size + width_clicking, height_clicking = cursor_clicking.size + width_typing, height_typing = cursor_typing.size + + cursor_normal = cursor_normal.resize((int(width_normal * CURSOR_SCALE), int(height_normal * CURSOR_SCALE))) + cursor_clicking = cursor_clicking.resize((int(width_clicking * CURSOR_SCALE), int(height_clicking * CURSOR_SCALE))) + cursor_typing = cursor_typing.resize((int(width_typing * CURSOR_SCALE), int(height_typing * CURSOR_SCALE))) + + cursors = { + "normal": cursor_normal, + "clicking": cursor_clicking, + "typing": cursor_typing + } + + return cursors + +# Store the last known cursor position and thought across all frames +last_known_cursor_position = None +last_known_thought = None + +def extract_thought_from_api_response(filename): + """Extract thought from API response for the current frame.""" + global last_known_thought + + turn_dir = os.path.dirname(filename) + api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + + for api_file in api_response_files: + try: + with open(os.path.join(turn_dir, api_file), 'r') as f: + data = json.load(f) + # Extract content from response + content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') + + # Extract the Thought section + thought_match = re.search(r"Thought: (.*?)(?:\nAction:|$)", 
content, re.DOTALL) + if thought_match: + thought = thought_match.group(1).strip() + if thought: + last_known_thought = thought + return thought + except (json.JSONDecodeError, FileNotFoundError, KeyError): + pass + + # Return the last known thought if no new thought is found + return last_known_thought + +def extract_cursor_position_from_filename(filename): + """Extract cursor position from a filename containing click info.""" + global last_known_cursor_position + + # For 'screenshot_NNN_click_TIMESTAMP.png', try to extract coordinates + match = re.search(r'click_(\d+)_(\d+)_\d+\.png$', filename) + if match: + position = (int(match.group(1)), int(match.group(2))) + last_known_cursor_position = position + return position + + # Check if we have position info from API response + turn_dir = os.path.dirname(filename) + api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + + for api_file in api_response_files: + try: + with open(os.path.join(turn_dir, api_file), 'r') as f: + data = json.load(f) + # Extract action from response + content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') + # Look for coordinates in the action + # First try the pattern from the example: click(start_box='(28,15)') + coord_match = re.search(r"click\(start_box='\((\d+),(\d+)\)'\)", content) + if coord_match: + position = (int(coord_match.group(1)), int(coord_match.group(2))) + last_known_cursor_position = position + return position + + # Try alternative pattern: click(start_box='<|box_start|>(x,y)<|box_end|>') + alt_match = re.search(r"click\(start_box='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)", content) + if alt_match: + position = (int(alt_match.group(1)), int(alt_match.group(2))) + last_known_cursor_position = position + return position + except (json.JSONDecodeError, FileNotFoundError, KeyError): + pass + + # No new position found, return the last known position + return last_known_cursor_position + +def extract_action_from_filename(filename): + """Determine the action type from the filename pattern.""" + if 'click' in filename: + return "clicking" + elif 'type' in filename: + return "typing" + else: + return "normal" + +def create_animated_vignette(image, frame_index): + """ + Create an animated purple/blue gradient vignette effect around the border of the image. + The animation pulses the colors and gently varies their intensity over time. 
+ + Args: + image: The base image to apply the vignette to + frame_index: Current frame index for animation timing + + Returns: + Image with vignette effect applied + """ + # Create a copy of the image to work with + result = image.copy() + width, height = result.size + + # Create a blank RGBA image for the vignette overlay + vignette = Image.new('RGBA', (width, height), (0, 0, 0, 0)) + draw = ImageDraw.Draw(vignette) + + # Calculate animation phase based on frame index + phase = math.sin(frame_index * VIGNETTE_ANIMATION_SPEED) * 0.5 + 0.5 # Oscillates between 0 and 1 + + # Interpolate between the vignette colors based on the animation phase + color1 = VIGNETTE_COLORS[0] + color2 = VIGNETTE_COLORS[1] + animated_color = ( + int(color1[0] + (color2[0] - color1[0]) * phase), + int(color1[1] + (color2[1] - color1[1]) * phase), + int(color1[2] + (color2[2] - color1[2]) * phase), + ) + + # Draw gradient borders around each edge + # Top border + for i in range(VIGNETTE_WIDTH): + alpha = int(150 * (1 - i / VIGNETTE_WIDTH)) + border_color = animated_color[:3] + (alpha,) + draw.line([(0, i), (width, i)], fill=border_color, width=1) + draw.line([(0, height-i-1), (width, height-i-1)], fill=border_color, width=1) + draw.line([(i, 0), (i, height)], fill=border_color, width=1) + draw.line([(width-i-1, 0), (width-i-1, height)], fill=border_color, width=1) + + # Apply slight blur to smooth the gradient + vignette = vignette.filter(ImageFilter.GaussianBlur(16)) + + # Composite the vignette over the original image + result = Image.alpha_composite(result.convert('RGBA'), vignette) + + return result.convert('RGB') # Convert back to RGB for consistency + +def scale_cursor_with_animation(cursor, frame, max_frames, cursor_type): + """Create springy scale animation for cursor.""" + if cursor_type == "normal": + return cursor + + # For clicking or typing cursors, create a spring effect + progress = frame / max_frames + + # Spring effect calculation - starts big, gets smaller, then back to normal + if progress < 0.3: + # Start with larger scale, shrink down + scale = 1.3 - progress + elif progress < 0.7: + # Then bounce back up a bit + scale = 0.7 + (progress - 0.3) * 0.8 + else: + # Then settle to normal (1.0) + scale = 1.0 + (1.0 - progress) * 0.3 + + # Apply scale + width, height = cursor.size + new_width = int(width * scale) + new_height = int(height * scale) + return cursor.resize((new_width, new_height)) + +# Store the last thought bubble position +last_thought_bubble_pos = None + +def draw_thought_bubble(image, position, thought_text, frame_index): + """Draw a thought bubble with the AI's thoughts near the cursor position.""" + global last_thought_bubble_pos + + if thought_text is None or position is None: + return image + + # Create a copy of the image to work with + result = image.copy() + + # Set up text parameters + font_size = 16 + try: + # Try to use a nice font if available + from PIL import ImageFont + try: + font = ImageFont.truetype("Arial", font_size) + except IOError: + # Fallback to default font + font = ImageFont.load_default() + except ImportError: + font = None + + # Wrap text to fit in bubble + max_width = 400 # Max width in pixels + wrapped_lines = [] + words = thought_text.split() + current_line = [] + + for word in words: + # Add word to current line + test_line = ' '.join(current_line + [word]) + + # Create a temporary draw object to measure text width if needed + temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) + + # Measure the text width + if font: + if hasattr(temp_draw, 
'textlength'): + text_width = temp_draw.textlength(test_line, font=font) + else: + # Fall back to rough estimation + text_width = len(test_line) * (font_size * 0.6) + else: + # Rough estimation if no font metrics are available + text_width = len(test_line) * (font_size * 0.6) + + if text_width <= max_width: + current_line.append(word) + else: + # Line is full, start a new line + if current_line: + wrapped_lines.append(' '.join(current_line)) + current_line = [word] + + # Don't forget the last line + if current_line: + wrapped_lines.append(' '.join(current_line)) + + # Limit number of lines for very long thoughts + max_lines = 8 + if len(wrapped_lines) > max_lines: + wrapped_lines = wrapped_lines[:max_lines-1] + ["..."] + + # Calculate text dimensions + line_height = font_size + 4 + text_height = len(wrapped_lines) * line_height + + # Find the widest line + if font: + # Create a draw object to measure text width + temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) + if hasattr(temp_draw, 'textlength'): + text_width = max(temp_draw.textlength(line, font=font) for line in wrapped_lines) + else: + # Fall back to rough estimation + text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) + else: + text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) + + # Add padding + padding = 20 + bubble_width = text_width + padding * 2 + bubble_height = text_height + padding * 2 + + # Calculate bubble position - move slowly towards cursor position + x, y = position + screen_width, screen_height = image.size + + # Default initial position if this is the first bubble + target_bubble_x = min(x + 30, screen_width - bubble_width - 10) + target_bubble_y = max(y - bubble_height - 30, 10) + + # Ensure target position is fully on screen + if target_bubble_x < 10: + target_bubble_x = 10 + if target_bubble_y + bubble_height > screen_height - 10: + target_bubble_y = screen_height - bubble_height - 10 + + # Calculate new position with slow movement towards target + # Very slow movement factor (0.01 means it moves 1% of the distance per frame) + movement_factor = 0.001 + + if last_thought_bubble_pos is None: + # First frame, set to target position + bubble_x, bubble_y = target_bubble_x, target_bubble_y + else: + # Interpolate slowly towards target position + last_x, last_y = last_thought_bubble_pos + bubble_x = last_x + (target_bubble_x - last_x) * movement_factor + bubble_y = last_y + (target_bubble_y - last_y) * movement_factor + + # Add a subtle animation effect to the bubble + # animation_offset = math.sin(frame_index * 0.1) * 2 + # bubble_y += int(animation_offset) + + # Store position for next frame + last_thought_bubble_pos = (bubble_x, bubble_y) + + # Draw rounded rectangle for bubble + corner_radius = 15 + + # Background with black gaussian blur + background_color = (0, 0, 0, 180) # Black with transparency + outline_color = (50, 50, 50, 255) # Dark gray outline + + # Draw the bubble background - first create an RGBA version + bubble_img = Image.new('RGBA', result.size, (0, 0, 0, 0)) + bubble_draw = ImageDraw.Draw(bubble_img) + + # Draw rounded rectangle + # Check if rounded_rectangle is available (PIL 8.0.0+) + if hasattr(bubble_draw, 'rounded_rectangle'): + bubble_draw.rounded_rectangle( + [bubble_x, bubble_y, bubble_x + bubble_width, bubble_y + bubble_height], + radius=corner_radius, + fill=background_color, + outline=outline_color, + width=2 + ) + else: + # Fall back to regular rectangle if rounded_rectangle not available + bubble_draw.rectangle( + [bubble_x, bubble_y, 
bubble_x + bubble_width, bubble_y + bubble_height], + fill=background_color, + outline=outline_color + ) + + # Apply gaussian blur to the bubble background + bubble_img = bubble_img.filter(ImageFilter.GaussianBlur(3)) + + # Draw small triangle pointing to cursor + pointer_size = 10 + pointer_x = x + 15 + pointer_y = y - 5 + + # Make sure pointer is under the bubble + if pointer_x > bubble_x + bubble_width: + pointer_x = bubble_x + bubble_width - 20 + elif pointer_x < bubble_x: + pointer_x = bubble_x + 20 + + # Create an overlay for the pointer + pointer_overlay = Image.new('RGBA', result.size, (0, 0, 0, 0)) + pointer_draw = ImageDraw.Draw(pointer_overlay) + + # Draw pointer triangle + # pointer_draw.polygon( + # [ + # (pointer_x, pointer_y), + # (pointer_x - pointer_size, pointer_y - pointer_size), + # (pointer_x + pointer_size, pointer_y - pointer_size) + # ], + # fill=background_color, + # outline=outline_color + # ) + + # Apply gaussian blur to the pointer + pointer_overlay = pointer_overlay.filter(ImageFilter.GaussianBlur(3)) + + # Composite the bubble and pointer onto the original image + result = Image.alpha_composite(result.convert('RGBA'), bubble_img) + result = Image.alpha_composite(result, pointer_overlay) + + # Now draw the text + draw = ImageDraw.Draw(result) + text_x = bubble_x + padding + text_y = bubble_y + padding + + text_color = (255, 255, 255, 255) # White text + for line in wrapped_lines: + draw.text((text_x, text_y), line, font=font, fill=text_color) + text_y += line_height + + return result.convert('RGB') + +def create_cursor_overlay(base_image, position, cursor_images, thought_text=None, cursor_type="normal", animation_frame=0, frame_index=0): + """Create an image with cursor overlaid on the base image and thought bubble if available.""" + # Create a copy of the base image + result = base_image.copy() + + # If position is None, return the image without a cursor + if position is None: + return result + + # Get the appropriate cursor image + cursor = cursor_images[cursor_type] + + # Apply animation scaling if needed + if cursor_type in ["clicking", "typing"]: + cursor = scale_cursor_with_animation(cursor, animation_frame, FRAMES_PER_CLICK, cursor_type) + + # Calculate position to center the cursor hotspot + # Cursor hotspot is at (20,15) of the cursor image + x, y = position + hotspot_x, hotspot_y = CURSOR_HOTSPOT + cursor_x = x - (hotspot_x * CURSOR_SCALE) # X offset for hotspot + cursor_y = y - (hotspot_y * CURSOR_SCALE) # Y offset for hotspot + + # Paste the cursor onto the image + result.paste(cursor, (int(cursor_x), int(cursor_y)), cursor) + + # Add thought bubble if text is available + if thought_text: + result = draw_thought_bubble(result, position, thought_text, frame_index) + + return result + +def get_screenshot_files(trajectory_dir): + """ + Get all screenshot files from a trajectory directory, sorted by sequence number. 
+ + Args: + trajectory_dir: Path to trajectory directory containing turn_XXX folders + + Returns: + List of tuples (path, sequence_number, action_type, position) + """ + screenshot_files = [] + + # List all turn directories in order + turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], + key=lambda x: int(x.split('_')[1])) + + for turn_dir in turn_dirs: + turn_path = os.path.join(trajectory_dir, turn_dir) + if not os.path.isdir(turn_path): + continue + + # Get all screenshot files in this turn + files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] + + for file in files: + file_path = os.path.join(turn_path, file) + + # Extract sequence number from filename (e.g., screenshot_003_...) + seq_match = re.search(r'screenshot_(\d+)', file) + if seq_match: + seq_number = int(seq_match.group(1)) + + # Determine action type from filename + action_type = extract_action_from_filename(file) + + # Get cursor position if available + position = extract_cursor_position_from_filename(file_path) + + screenshot_files.append((file_path, seq_number, action_type, position)) + + # Sort by sequence number + screenshot_files.sort(key=lambda x: x[1]) + + return screenshot_files + +def process_trajectory(trajectory_dir, output_dir, cursors): + """Process a trajectory directory and create output frames.""" + # Get all screenshot files + screenshot_files = get_screenshot_files(trajectory_dir) + + if not screenshot_files: + print(f"No screenshot files found in {trajectory_dir}") + return + + # Create output directory + os.makedirs(output_dir, exist_ok=True) + + # Track frame index + frame_index = 0 + + # Process each screenshot + prev_img = None + prev_cursor_pos = None + + for i, (file_path, seq_number, action_type, position) in enumerate(tqdm(screenshot_files, desc="Processing frames")): + # Load the current image + try: + current_img = Image.open(file_path) + except Exception as e: + print(f"Error loading image {file_path}: {e}") + continue + + # Current cursor position + current_cursor_pos = position + + # Check if the current frame has an action (click/typing) + is_action_frame = action_type in ["clicking", "typing"] + + if is_action_frame: + # If we have a previous frame, use it for the first half of animation + if prev_img is not None: + half_frames = FRAMES_PER_CLICK // 2 + # First half of animation uses PREVIOUS image + for j in range(half_frames): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + prev_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Second half uses CURRENT image + for j in range(half_frames, FRAMES_PER_CLICK): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + else: + # If no previous frame, 
use current for full animation + for j in range(FRAMES_PER_CLICK): + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type=action_type, + animation_frame=j, + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + else: + # Regular frame with normal cursor + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, current_cursor_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Add position interpolation frames if we're not at the last frame + if i < len(screenshot_files) - 1: + # Get next position + next_cursor_pos = screenshot_files[i+1][3] + + # Only interpolate if both positions are valid and different + if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: + for j in range(1, FRAMES_PER_MOVE): + progress = j / FRAMES_PER_MOVE + interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress + interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress + interp_pos = (int(interp_x), int(interp_y)) + + # Create interpolated movement frame + # Get the thought from the API response + current_thought = extract_thought_from_api_response(file_path) + + output_img = create_cursor_overlay( + current_img, interp_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 + + # Save current frame as previous for next iteration + prev_img = current_img + prev_cursor_pos = current_cursor_pos + +def main(): + """Main function to process the trajectory and create video frames.""" + parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') + parser.add_argument('trajectory_dir', type=str, help='Path to the trajectory folder') + parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') + parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') + parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') + parser.add_argument('--skip_ffmpeg', action='store_true', help='Skip running ffmpeg to create video') + args = parser.parse_args() + + trajectory_dir = args.trajectory_dir + output_dir = args.output_dir + fps = args.fps + output_video = args.output_video + skip_ffmpeg = args.skip_ffmpeg + + # Check if trajectory directory exists + if not os.path.exists(trajectory_dir): + print(f"Trajectory directory {trajectory_dir} does not exist") + return + + # Clean output directory if it exists + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + # Create output directory + os.makedirs(output_dir, 
exist_ok=True) + + # Load cursor images + print("Loading cursor images...") + cursors = load_cursor_images() + + # Process the trajectory + print(f"Processing trajectory from {trajectory_dir}...") + process_trajectory(trajectory_dir, output_dir, cursors) + + print(f"Processing complete. Frames saved to {output_dir}") + + # Run ffmpeg to create the video + if not skip_ffmpeg: + print(f"Running ffmpeg to create video: {output_video}") + ffmpeg_cmd = f"ffmpeg -y -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}" + try: + import subprocess + result = subprocess.run(ffmpeg_cmd, shell=True, check=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True) + print(f"Video created successfully: {output_video}") + except subprocess.CalledProcessError as e: + print(f"Error running ffmpeg: {e}") + print(f"ffmpeg output:\n{e.stdout}\n{e.stderr}") + print("\nYou can create a video manually with this command:") + print(ffmpeg_cmd) + else: + print("Skipping ffmpeg. You can create a video from these frames using ffmpeg with this command:") + print(f"ffmpeg -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}") + +if __name__ == "__main__": + main() From d049fa6ebbdfa21128c29b782fb8f62c77b63c9b Mon Sep 17 00:00:00 2001 From: Finn Date: Mon, 28 Apr 2025 21:30:49 -0400 Subject: [PATCH 07/38] Renames os arg to os_type to avoid module collision --- libs/computer/computer/computer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index f4d9d9bf..ddb68f9e 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -29,7 +29,7 @@ class Computer: display: Union[Display, Dict[str, int], str] = "1024x768", memory: str = "8GB", cpu: str = "4", - os: OSType = "macos", + os_type: OSType = "macos", name: str = "", image: str = "macos-sequoia-cua:latest", shared_directories: Optional[List[str]] = None, @@ -68,6 +68,7 @@ class Computer: self.image = image self.port = port self.host = host + self.os_type = os_type # Store telemetry preference self._telemetry_enabled = telemetry_enabled @@ -129,8 +130,8 @@ class Computer: self.shared_paths = [] if shared_directories: for path in shared_directories: - abs_path = os.path.abspath(os.path.expanduser(path)) # type: ignore[attr-defined] - if not os.path.exists(abs_path): # type: ignore[attr-defined] + abs_path = os.path.abspath(os.path.expanduser(path)) + if not os.path.exists(abs_path): raise ValueError(f"Shared directory does not exist: {path}") self.shared_paths.append(abs_path) self._pylume_context = None @@ -188,7 +189,7 @@ class Computer: self._interface = cast( BaseComputerInterface, InterfaceFactory.create_interface_for_os( - os=self.os, ip_address=ip_address # type: ignore[arg-type] + os=self.os_type, ip_address=ip_address # type: ignore[arg-type] ), ) @@ -288,13 +289,13 @@ class Computer: try: # Initialize the interface using the factory with the specified OS - self.logger.info(f"Initializing interface for {self.os} at {ip_address}") + self.logger.info(f"Initializing interface for {self.os_type} at {ip_address}") from .interface.base import BaseComputerInterface self._interface = cast( BaseComputerInterface, InterfaceFactory.create_interface_for_os( - os=self.os, ip_address=ip_address # type: ignore[arg-type] + os=self.os_type, ip_address=ip_address # type: ignore[arg-type] ), ) From d502cbdc991c496669bbd6b8fdf732ebea091958 Mon Sep 17 00:00:00 
2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 19:12:32 -0400 Subject: [PATCH 08/38] fix endpoint not liking string message content --- .../agent/providers/omni/clients/oaicompat.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/omni/clients/oaicompat.py b/libs/agent/agent/providers/omni/clients/oaicompat.py index 6a95896a..b15515fd 100644 --- a/libs/agent/agent/providers/omni/clients/oaicompat.py +++ b/libs/agent/agent/providers/omni/clients/oaicompat.py @@ -93,7 +93,14 @@ class OAICompatClient(BaseOmniClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] # Process messages for item in messages: @@ -117,7 +124,10 @@ class OAICompatClient(BaseOmniClient): else: message = { "role": item["role"], - "content": [{"type": "text", "text": item["content"]}], + "content": [{ + "type": "text", + "text": item["content"] + }], } final_messages.append(message) else: From 84ed45c0dd621ecd7907b3b6edf4b1db16f4f350 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 24 Apr 2025 19:24:12 -0400 Subject: [PATCH 09/38] consistency with other loops --- libs/agent/agent/providers/uitars/loop.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..0d3bc9f7 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -232,8 +232,11 @@ class UITARSLoop(BaseLoop): if self.client is None: raise RuntimeError("Failed to initialize client") - # Convert messages to UI-TARS format + # Get messages in standard format from the message manager + self.message_manager.messages = messages.copy() prepared_messages = self.message_manager.get_messages() + + # Convert messages to UI-TARS format uitars_messages = self.to_uitars_format(prepared_messages) # Log request From 3608491419be160503250436fc6d8a1933747b9e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 21:53:58 -0400 Subject: [PATCH 10/38] fix uitars oai provider --- .../agent/agent/providers/uitars/clients/oaicompat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 4567360b..963fb05b 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] - + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] + # Process messages for item in messages: if isinstance(item, dict): From b4af3f67d5643be1c66ea12272f30db97c4fcd52 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:23:03 -0700 Subject: [PATCH 11/38] decreased scroll sensitivity for openai's cua --- libs/agent/agent/providers/openai/tools/computer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index ae4fdce8..5ec9460a 100644 --- 
a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -162,8 +162,8 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): y = kwargs.get("y") if x is None or y is None: raise ToolError("x and y coordinates are required for scroll action") - scroll_x = kwargs.get("scroll_x", 0) // 20 - scroll_y = kwargs.get("scroll_y", 0) // 20 + scroll_x = kwargs.get("scroll_x", 0) // 50 + scroll_y = kwargs.get("scroll_y", 0) // 50 return await self.handle_scroll(x, y, scroll_x, scroll_y) elif type == "screenshot": return await self.screenshot() From ea31cc63408498d7c3bfe606eb4a15d65cf8defa Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:34:18 -0700 Subject: [PATCH 12/38] added mappings for modifier keys --- libs/computer/computer/interface/models.py | 26 ++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index b586a9f7..c09a092c 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -7,6 +7,9 @@ NavigationKey = Literal['pagedown', 'pageup', 'home', 'end', 'left', 'right', 'u # Special key literals SpecialKey = Literal['enter', 'esc', 'tab', 'space', 'backspace', 'del'] +# Modifier key literals +ModifierKey = Literal['ctrl', 'shift', 'win', 'command', 'option'] + # Function key literals FunctionKey = Literal['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] @@ -35,6 +38,13 @@ class Key(Enum): BACKSPACE = 'backspace' DELETE = 'del' + # Modifier keys + CTRL = 'ctrl' + SHIFT = 'shift' + WIN = 'win' + COMMAND = 'command' + OPTION = 'option' + # Function keys F1 = 'f1' F2 = 'f2' @@ -73,14 +83,26 @@ class Key(Enum): 'escape': cls.ESCAPE, 'esc': cls.ESC, 'delete': cls.DELETE, - 'del': cls.DELETE + 'del': cls.DELETE, + # Modifier key mappings + 'ctrl': cls.CTRL, + 'control': cls.CTRL, + 'shift': cls.SHIFT, + 'win': cls.WIN, + 'windows': cls.WIN, + 'command': cls.COMMAND, + 'cmd': cls.COMMAND, + '⌘': cls.COMMAND, + 'option': cls.OPTION, + 'alt': cls.OPTION, + '⌥': cls.OPTION, } normalized = key.lower().strip() return key_mapping.get(normalized, key) # Combined key type -KeyType = Union[Key, NavigationKey, SpecialKey, FunctionKey, str] +KeyType = Union[Key, NavigationKey, SpecialKey, ModifierKey, FunctionKey, str] class AccessibilityWindow(TypedDict): """Information about a window in the accessibility tree.""" From 1e5ba4832a9e1b7fa97495993658177937f86708 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 11:41:20 -0700 Subject: [PATCH 13/38] mapping for super key --- libs/computer/computer/interface/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index c09a092c..9a90acb4 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -90,6 +90,7 @@ class Key(Enum): 'shift': cls.SHIFT, 'win': cls.WIN, 'windows': cls.WIN, + 'super': cls.WIN, 'command': cls.COMMAND, 'cmd': cls.COMMAND, '⌘': cls.COMMAND, From 8b939e789057b11f0931a995889dc914f641260d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 21:53:58 -0400 Subject: [PATCH 14/38] fix uitars oai provider --- .../agent/agent/providers/uitars/clients/oaicompat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py 
b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 4567360b..963fb05b 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -94,8 +94,15 @@ class OAICompatClient(BaseUITarsClient): """ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - final_messages = [{"role": "system", "content": system}] - + final_messages = [ + { + "role": "system", + "content": [ + { "type": "text", "text": system } + ] + } + ] + # Process messages for item in messages: if isinstance(item, dict): From 1df8194de1373e5c7521bb0e12abb6e9ddffe140 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:00:09 -0700 Subject: [PATCH 15/38] fix hotkeys on uitars and openai provider --- libs/agent/agent/providers/openai/tools/computer.py | 6 +----- libs/agent/agent/providers/uitars/tools/computer.py | 8 ++++++-- libs/computer/computer/interface/models.py | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index 5ec9460a..90ef5935 100644 --- a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -240,11 +240,7 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): if len(mapped_keys) > 1: # For key combinations (like Ctrl+C) - for k in mapped_keys: - await self.computer.interface.press_key(k) - await asyncio.sleep(0.1) - for k in reversed(mapped_keys): - await self.computer.interface.press_key(k) + await self.computer.interface.hotkey(*mapped_keys) else: # Single key press await self.computer.interface.press_key(mapped_keys[0]) diff --git a/libs/agent/agent/providers/uitars/tools/computer.py b/libs/agent/agent/providers/uitars/tools/computer.py index 5cf7f67a..4d5f2ce3 100644 --- a/libs/agent/agent/providers/uitars/tools/computer.py +++ b/libs/agent/agent/providers/uitars/tools/computer.py @@ -173,9 +173,13 @@ class ComputerTool(BaseComputerTool): elif action == "hotkey": if "keys" in kwargs: keys = kwargs["keys"] - for key in keys: - await self.computer.interface.press_key(key) + if len(keys) > 1: + await self.computer.interface.hotkey(*keys) + else: + # Single key press + await self.computer.interface.press_key(keys[0]) + # Wait for UI to update await asyncio.sleep(0.3) diff --git a/libs/computer/computer/interface/models.py b/libs/computer/computer/interface/models.py index 9a90acb4..e8ec1b47 100644 --- a/libs/computer/computer/interface/models.py +++ b/libs/computer/computer/interface/models.py @@ -8,7 +8,7 @@ NavigationKey = Literal['pagedown', 'pageup', 'home', 'end', 'left', 'right', 'u SpecialKey = Literal['enter', 'esc', 'tab', 'space', 'backspace', 'del'] # Modifier key literals -ModifierKey = Literal['ctrl', 'shift', 'win', 'command', 'option'] +ModifierKey = Literal['ctrl', 'alt', 'shift', 'win', 'command', 'option'] # Function key literals FunctionKey = Literal['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12'] @@ -39,6 +39,7 @@ class Key(Enum): DELETE = 'del' # Modifier keys + ALT = 'alt' CTRL = 'ctrl' SHIFT = 'shift' WIN = 'win' @@ -85,6 +86,7 @@ class Key(Enum): 'delete': cls.DELETE, 'del': cls.DELETE, # Modifier key mappings + 'alt': cls.ALT, 'ctrl': cls.CTRL, 'control': cls.CTRL, 'shift': cls.SHIFT, @@ -95,7 +97,6 @@ class Key(Enum): 'cmd': cls.COMMAND, '⌘': cls.COMMAND, 'option': cls.OPTION, - 'alt': cls.OPTION, '⌥': cls.OPTION, } From 
f580be07a1f8e153f8d37630323cf220175ca74c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:01:43 -0700 Subject: [PATCH 16/38] prompt uitars to use the correct hotkeys on mac --- libs/agent/agent/providers/uitars/loop.py | 4 ++-- libs/agent/agent/providers/uitars/prompts.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 99132365..84393bd2 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -20,7 +20,7 @@ from computer import Computer from .utils import add_box_token, parse_actions, parse_action_parameters from .tools.manager import ToolManager from .tools.computer import ToolResult -from .prompts import COMPUTER_USE, SYSTEM_PROMPT +from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES from .clients.oaicompat import OAICompatClient @@ -184,7 +184,7 @@ class UITARSLoop(BaseLoop): if first_user_idx is not None and instruction: # Create the computer use prompt user_prompt = COMPUTER_USE.format( - instruction=instruction, + instruction='\n'.join([instruction, MAC_SPECIFIC_NOTES]), language="English" ) diff --git a/libs/agent/agent/providers/uitars/prompts.py b/libs/agent/agent/providers/uitars/prompts.py index aa24557d..fe16f0d8 100644 --- a/libs/agent/agent/providers/uitars/prompts.py +++ b/libs/agent/agent/providers/uitars/prompts.py @@ -1,5 +1,9 @@ """Prompts for UI-TARS agent.""" +MAC_SPECIFIC_NOTES = """ +(You are operating on macOS, use 'cmd' instead of 'ctrl' for most shortcuts e.g., hotkey(key='cmd c') for copy, hotkey(key='cmd v') for paste, hotkey(key='cmd t') for new tab).) +""" + SYSTEM_PROMPT = "You are a helpful assistant." COMPUTER_USE = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. 
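For reference, a minimal sketch of how the pieces added in this patch are combined at request time. The COMPUTER_USE template is heavily abbreviated and the sample instruction is invented; only the join-and-format step mirrors the loop.py change above.

    # Sketch only: the real strings live in libs/agent/agent/providers/uitars/prompts.py.
    MAC_SPECIFIC_NOTES = (
        "(You are operating on macOS, use 'cmd' instead of 'ctrl' for most "
        "shortcuts, e.g. hotkey(key='cmd c') for copy.)"
    )
    COMPUTER_USE = (
        "You are a GUI agent. ...\n"        # abbreviated preamble
        "{language}\n"                      # placeholder position is illustrative
        "## User Instruction\n{instruction}\n"
    )

    instruction = "Open a new browser tab"  # made-up example task
    user_prompt = COMPUTER_USE.format(
        instruction="\n".join([instruction, MAC_SPECIFIC_NOTES]),
        language="English",
    )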
@@ -56,4 +60,4 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par ## User Instruction {instruction} -""" \ No newline at end of file +""" From 2e10e0922ab1b883d3bb4cae0f14db9fb9784ea7 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 29 Apr 2025 12:09:30 -0700 Subject: [PATCH 17/38] add top_p to uitars --- libs/agent/agent/providers/uitars/clients/oaicompat.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 963fb05b..1b512997 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -145,8 +145,13 @@ class OAICompatClient(BaseUITarsClient): message = {"role": "user", "content": [{"type": "text", "text": item}]} final_messages.append(message) - payload = {"model": self.model, "messages": final_messages, "temperature": self.temperature} - payload["max_tokens"] = max_tokens or self.max_tokens + payload = { + "model": self.model, + "messages": final_messages, + "max_tokens": max_tokens or self.max_tokens, + "temperature": self.temperature, + "top_p": 0.7, + } try: async with aiohttp.ClientSession() as session: From e8e446f8c2d8db43390e0f5834803161fe764f84 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Tue, 29 Apr 2025 16:37:33 -0700 Subject: [PATCH 18/38] Add lume --storage path --- libs/lume/scripts/install.sh | 79 ++++++++- libs/lume/src/Commands/Create.swift | 2 +- libs/lume/src/Commands/Delete.swift | 2 +- libs/lume/src/Commands/Get.swift | 2 +- libs/lume/src/Commands/List.swift | 11 +- libs/lume/src/Commands/Pull.swift | 2 +- libs/lume/src/Commands/Run.swift | 2 +- libs/lume/src/Commands/Set.swift | 2 +- libs/lume/src/Commands/Stop.swift | 2 +- .../ImageContainerRegistry.swift | 17 +- libs/lume/src/FileSystem/Home.swift | 22 +++ libs/lume/src/FileSystem/VMDirectory.swift | 42 +++-- libs/lume/src/LumeController.swift | 162 +++++++++++++++--- libs/lume/src/Server/Handlers.swift | 4 +- libs/lume/src/Server/Requests.swift | 2 +- libs/lume/src/Server/Server.swift | 6 +- libs/lumier/src/lib/vm.sh | 39 +++-- 17 files changed, 321 insertions(+), 77 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index f6313538..d854c0e4 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -12,8 +12,6 @@ GREEN=$(tput setaf 2) BLUE=$(tput setaf 4) YELLOW=$(tput setaf 3) - - # Default installation directory (user-specific, doesn't require sudo) DEFAULT_INSTALL_DIR="$HOME/.local/bin" INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" @@ -204,11 +202,84 @@ main() { create_temp_dir download_release install_binary - + echo "" echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" echo "Run ${BOLD}lume${NORMAL} to get started." + + # --- LaunchAgent setup for lume daemon --- + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + LUME_BIN="$INSTALL_DIR/lume" + + echo "" + echo "Setting up LaunchAgent to run lume daemon on login..." + + # Create LaunchAgents directory if it doesn't exist + mkdir -p "$HOME/Library/LaunchAgents" + + # Unload existing service if present + if [ -f "$PLIST_PATH" ]; then + echo "Existing LaunchAgent found. Unloading..." 
+ launchctl unload "$PLIST_PATH" 2>/dev/null || true + fi + + # Create the plist file + cat < "$PLIST_PATH" + + + + + Label + $SERVICE_NAME + ProgramArguments + + $LUME_BIN + serve + + RunAtLoad + + KeepAlive + + WorkingDirectory + $HOME + EnvironmentVariables + + PATH + /usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$HOME/.local/bin + HOME + $HOME + + StandardOutPath + /tmp/lume_daemon.log + StandardErrorPath + /tmp/lume_daemon.error.log + ProcessType + Interactive + SessionType + Aqua + + +EOF + + # Set permissions + chmod 644 "$PLIST_PATH" + touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log + chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log + + # Load the LaunchAgent + echo "Loading LaunchAgent..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + launchctl load "$PLIST_PATH" + + echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" + echo "To check status: launchctl list | grep $SERVICE_NAME" + echo "To view logs: tail -f /tmp/lume_daemon.log" + echo "" + echo "To remove the lume daemon service, run:" + echo " launchctl unload \"$PLIST_PATH\"" + echo " rm \"$PLIST_PATH\"" } # Run the installation -main \ No newline at end of file +main diff --git a/libs/lume/src/Commands/Create.swift b/libs/lume/src/Commands/Create.swift index b4f02633..db042c69 100644 --- a/libs/lume/src/Commands/Create.swift +++ b/libs/lume/src/Commands/Create.swift @@ -40,7 +40,7 @@ struct Create: AsyncParsableCommand { ) var ipsw: String? - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/Delete.swift b/libs/lume/src/Commands/Delete.swift index c3cd3653..7d78ca6d 100644 --- a/libs/lume/src/Commands/Delete.swift +++ b/libs/lume/src/Commands/Delete.swift @@ -12,7 +12,7 @@ struct Delete: AsyncParsableCommand { @Flag(name: .long, help: "Force deletion without confirmation") var force = false - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() {} diff --git a/libs/lume/src/Commands/Get.swift b/libs/lume/src/Commands/Get.swift index 5ff34113..aad56136 100644 --- a/libs/lume/src/Commands/Get.swift +++ b/libs/lume/src/Commands/Get.swift @@ -12,7 +12,7 @@ struct Get: AsyncParsableCommand { @Option(name: [.long, .customShort("f")], help: "Output format (json|text)") var format: FormatOption = .text - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/List.swift b/libs/lume/src/Commands/List.swift index 6361f899..89a6dc6e 100644 --- a/libs/lume/src/Commands/List.swift +++ b/libs/lume/src/Commands/List.swift @@ -10,15 +10,22 @@ struct List: AsyncParsableCommand { @Option(name: [.long, .customShort("f")], help: "Output format (json|text)") var format: FormatOption = .text + @Option(name: .long, help: "Filter by storage location name") + var storage: String? 
+ init() { } @MainActor func run() async throws { let manager = LumeController() - let vms = try manager.list() + let vms = try manager.list(storage: self.storage) if vms.isEmpty && self.format == .text { - print("No virtual machines found") + if let storageName = self.storage { + print("No virtual machines found in storage '\(storageName)'") + } else { + print("No virtual machines found") + } } else { try VMDetailsPrinter.printStatus(vms, format: self.format) } diff --git a/libs/lume/src/Commands/Pull.swift b/libs/lume/src/Commands/Pull.swift index 074e0fac..cd843381 100644 --- a/libs/lume/src/Commands/Pull.swift +++ b/libs/lume/src/Commands/Pull.swift @@ -19,7 +19,7 @@ struct Pull: AsyncParsableCommand { @Option(help: "Organization to pull from. Defaults to trycua") var organization: String = "trycua" - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() {} diff --git a/libs/lume/src/Commands/Run.swift b/libs/lume/src/Commands/Run.swift index bc659769..273e8ba7 100644 --- a/libs/lume/src/Commands/Run.swift +++ b/libs/lume/src/Commands/Run.swift @@ -48,7 +48,7 @@ struct Run: AsyncParsableCommand { @Option(help: "For MacOS VMs only, boot into the VM in recovery mode") var recoveryMode: Bool = false - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? private var parsedSharedDirectories: [SharedDirectory] { diff --git a/libs/lume/src/Commands/Set.swift b/libs/lume/src/Commands/Set.swift index 73bfe0c9..e2420a68 100644 --- a/libs/lume/src/Commands/Set.swift +++ b/libs/lume/src/Commands/Set.swift @@ -21,7 +21,7 @@ struct Set: AsyncParsableCommand { @Option(help: "New display resolution in format WIDTHxHEIGHT.") var display: VMDisplayResolution? - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/Commands/Stop.swift b/libs/lume/src/Commands/Stop.swift index 933019e5..3b921114 100644 --- a/libs/lume/src/Commands/Stop.swift +++ b/libs/lume/src/Commands/Stop.swift @@ -9,7 +9,7 @@ struct Stop: AsyncParsableCommand { @Argument(help: "Name of the virtual machine", completion: .custom(completeVMName)) var name: String - @Option(name: .customLong("storage"), help: "VM storage location to use") + @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location") var storage: String? init() { diff --git a/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift b/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift index 714cf1cb..a7a68212 100644 --- a/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift +++ b/libs/lume/src/ContainerRegistry/ImageContainerRegistry.swift @@ -643,7 +643,7 @@ class ImageContainerRegistry: @unchecked Sendable { image: String, name: String?, locationName: String? = nil - ) async throws { + ) async throws -> VMDirectory { guard !image.isEmpty else { throw ValidationError("Image name cannot be empty") } @@ -652,7 +652,16 @@ class ImageContainerRegistry: @unchecked Sendable { // Use provided name or derive from image let vmName = name ?? image.split(separator: ":").first.map(String.init) ?? 
"" - let vmDir = try home.getVMDirectory(vmName, storage: locationName) + + // Determine if locationName is a direct path or a named storage location + let vmDir: VMDirectory + if let locationName = locationName, locationName.contains("/") || locationName.contains("\\") { + // Direct path + vmDir = try home.getVMDirectoryFromPath(vmName, storagePath: locationName) + } else { + // Named storage or default location + vmDir = try home.getVMDirectory(vmName, storage: locationName) + } // Optimize network early in the process optimizeNetworkSettings() @@ -991,6 +1000,7 @@ class ImageContainerRegistry: @unchecked Sendable { Logger.info( "Run 'lume run \(vmName)' to reduce the disk image file size by using macOS sparse file system" ) + return vmDir } // Helper function to clean up a specific cache entry @@ -3024,7 +3034,8 @@ class ImageContainerRegistry: @unchecked Sendable { // Replace original with optimized version try FileManager.default.removeItem(at: reassembledFile) - try FileManager.default.moveItem(at: optimizedFile, to: reassembledFile) + try FileManager.default.moveItem( + at: optimizedFile, to: reassembledFile) Logger.info("Using sparse-optimized file for verification") } else { Logger.info( diff --git a/libs/lume/src/FileSystem/Home.swift b/libs/lume/src/FileSystem/Home.swift index b8b4ae54..d83b39b0 100644 --- a/libs/lume/src/FileSystem/Home.swift +++ b/libs/lume/src/FileSystem/Home.swift @@ -92,6 +92,28 @@ final class Home { let baseDir = Path(location.expandedPath) return VMDirectory(baseDir.directory(name)) } + + /// Gets a VM directory from a direct file path + /// + /// - Parameters: + /// - name: Name of the VM directory + /// - storagePath: Direct file system path where the VM is located + /// - Returns: A VMDirectory instance + /// - Throws: HomeError if path is invalid + func getVMDirectoryFromPath(_ name: String, storagePath: String) throws -> VMDirectory { + let baseDir = Path(storagePath) + + // Create the directory if it doesn't exist + if !fileExists(at: storagePath) { + Logger.info("Creating storage directory", metadata: ["path": storagePath]) + try createVMLocation(at: storagePath) + } else if !isValidDirectory(at: storagePath) { + // Path exists but isn't a valid directory + throw HomeError.invalidHomeDirectory + } + + return VMDirectory(baseDir.directory(name)) + } /// Returns all initialized VM directories across all locations /// - Returns: An array of VMDirectory instances with location info diff --git a/libs/lume/src/FileSystem/VMDirectory.swift b/libs/lume/src/FileSystem/VMDirectory.swift index a902e34b..3335107d 100644 --- a/libs/lume/src/FileSystem/VMDirectory.swift +++ b/libs/lume/src/FileSystem/VMDirectory.swift @@ -8,7 +8,7 @@ import Foundation /// - Handling disk operations /// - Managing VM state and locking /// - Providing access to VM-related paths -struct VMDirectory { +struct VMDirectory: Sendable { // MARK: - Constants private enum FileNames { @@ -26,8 +26,6 @@ struct VMDirectory { let configPath: Path let sessionsPath: Path - private let fileManager: FileManager - /// The name of the VM directory var name: String { dir.name } @@ -36,10 +34,8 @@ struct VMDirectory { /// Creates a new VMDirectory instance /// - Parameters: /// - dir: The base directory path for the VM - /// - fileManager: FileManager instance to use for file operations - init(_ dir: Path, fileManager: FileManager = .default) { + init(_ dir: Path) { self.dir = dir - self.fileManager = fileManager self.nvramPath = dir.file(FileNames.nvram) self.diskPath = dir.file(FileNames.disk) 
self.configPath = dir.file(FileNames.config) @@ -52,7 +48,25 @@ struct VMDirectory { extension VMDirectory { /// Checks if the VM directory is fully initialized with all required files func initialized() -> Bool { - configPath.exists() && diskPath.exists() && nvramPath.exists() + // Add detailed logging for debugging + let configExists = configPath.exists() + let diskExists = diskPath.exists() + let nvramExists = nvramPath.exists() + + Logger.info( + "VM directory initialization check", + metadata: [ + "directory": dir.path, + "config_path": configPath.path, + "config_exists": "\(configExists)", + "disk_path": diskPath.path, + "disk_exists": "\(diskExists)", + "nvram_path": nvramPath.path, + "nvram_exists": "\(nvramExists)" + ] + ) + + return configExists && diskExists && nvramExists } /// Checks if the VM directory exists @@ -70,7 +84,7 @@ extension VMDirectory { func setDisk(_ size: UInt64) throws { do { if !diskPath.exists() { - guard fileManager.createFile(atPath: diskPath.path, contents: nil) else { + guard FileManager.default.createFile(atPath: diskPath.path, contents: nil) else { throw VMDirectoryError.fileCreationFailed(diskPath.path) } } @@ -96,7 +110,7 @@ extension VMDirectory { do { let data = try encoder.encode(config) - guard fileManager.createFile(atPath: configPath.path, contents: data) else { + guard FileManager.default.createFile(atPath: configPath.path, contents: data) else { throw VMDirectoryError.fileCreationFailed(configPath.path) } } catch { @@ -108,7 +122,7 @@ extension VMDirectory { /// - Returns: The loaded configuration /// - Throws: VMDirectoryError if the load operation fails func loadConfig() throws -> VMConfig { - guard let data = fileManager.contents(atPath: configPath.path) else { + guard let data = FileManager.default.contents(atPath: configPath.path) else { throw VMDirectoryError.configNotFound } @@ -137,7 +151,7 @@ extension VMDirectory { do { let data = try encoder.encode(session) - guard fileManager.createFile(atPath: sessionsPath.path, contents: data) else { + guard FileManager.default.createFile(atPath: sessionsPath.path, contents: data) else { throw VMDirectoryError.fileCreationFailed(sessionsPath.path) } } catch { @@ -149,7 +163,7 @@ extension VMDirectory { /// - Returns: The loaded VNC session /// - Throws: VMDirectoryError if the load operation fails func loadSession() throws -> VNCSession { - guard let data = fileManager.contents(atPath: sessionsPath.path) else { + guard let data = FileManager.default.contents(atPath: sessionsPath.path) else { throw VMDirectoryError.sessionNotFound } @@ -163,7 +177,7 @@ extension VMDirectory { /// Removes the VNC session information from disk func clearSession() { - try? fileManager.removeItem(atPath: sessionsPath.path) + try? FileManager.default.removeItem(atPath: sessionsPath.path) } } @@ -176,6 +190,6 @@ extension VMDirectory: CustomStringConvertible { extension VMDirectory { func delete() throws { - try fileManager.removeItem(atPath: dir.path) + try FileManager.default.removeItem(atPath: dir.path) } } diff --git a/libs/lume/src/LumeController.swift b/libs/lume/src/LumeController.swift index ecdcec49..f25079ff 100644 --- a/libs/lume/src/LumeController.swift +++ b/libs/lume/src/LumeController.swift @@ -48,15 +48,72 @@ final class LumeController { /// Lists all virtual machines in the system @MainActor - public func list() throws -> [VMDetails] { + public func list(storage: String? 
= nil) throws -> [VMDetails] { do { - let vmLocations = try home.getAllVMDirectories() - let statuses = try vmLocations.map { vmWithLoc in - let vm = try self.get( - name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) - return vm.details + if let storage = storage { + // If storage is specified, only return VMs from that location + if storage.contains("/") || storage.contains("\\") { + // Direct path - check if it exists + if !FileManager.default.fileExists(atPath: storage) { + // Return empty array if the path doesn't exist + return [] + } + + // Try to get all VMs from the specified path + // We need to check which subdirectories are valid VM dirs + let directoryURL = URL(fileURLWithPath: storage) + let contents = try FileManager.default.contentsOfDirectory( + at: directoryURL, + includingPropertiesForKeys: [.isDirectoryKey], + options: .skipsHiddenFiles + ) + + let statuses = try contents.compactMap { subdir -> VMDetails? in + guard let isDirectory = try subdir.resourceValues(forKeys: [.isDirectoryKey]).isDirectory, + isDirectory else { + return nil + } + + let vmName = subdir.lastPathComponent + // Check if it's a valid VM directory + let vmDir = try home.getVMDirectoryFromPath(vmName, storagePath: storage) + if !vmDir.initialized() { + return nil + } + + do { + let vm = try self.get(name: vmName, storage: storage) + return vm.details + } catch { + // Skip invalid VM directories + return nil + } + } + return statuses + } else { + // Named storage + let vmsWithLoc = try home.getAllVMDirectories() + let statuses = try vmsWithLoc.compactMap { vmWithLoc -> VMDetails? in + // Only include VMs from the specified location + if vmWithLoc.locationName != storage { + return nil + } + let vm = try self.get( + name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) + return vm.details + } + return statuses + } + } else { + // No storage filter - get all VMs + let vmsWithLoc = try home.getAllVMDirectories() + let statuses = try vmsWithLoc.compactMap { vmWithLoc -> VMDetails? in + let vm = try self.get( + name: vmWithLoc.directory.name, storage: vmWithLoc.locationName) + return vm.details + } + return statuses } - return statuses } catch { Logger.error("Failed to list VMs", metadata: ["error": error.localizedDescription]) throw error @@ -133,20 +190,42 @@ final class LumeController { public func get(name: String, storage: String? 
= nil) throws -> VM { let normalizedName = normalizeVMName(name: name) do { - // Try to find the VM and get its actual location - let actualLocation = try self.validateVMExists( - normalizedName, storage: storage) + let vm: VM + if let storagePath = storage, storagePath.contains("/") || storagePath.contains("\\") { + // Storage is a direct path + let vmDir = try home.getVMDirectoryFromPath(normalizedName, storagePath: storagePath) + guard vmDir.initialized() else { + // Throw a specific error if the directory exists but isn't a valid VM + if vmDir.exists() { + throw VMError.notInitialized(normalizedName) + } else { + throw VMError.notFound(normalizedName) + } + } + // Pass the path as the storage context + vm = try self.loadVM(vmDir: vmDir, storage: storagePath) + } else { + // Storage is nil or a named location + let actualLocation = try self.validateVMExists( + normalizedName, storage: storage) - // Load the VM from its actual location - let vm = try self.loadVM(name: normalizedName, storage: actualLocation) + let vmDir = try home.getVMDirectory(normalizedName, storage: actualLocation) + // loadVM will re-check initialized, but good practice to keep validateVMExists result. + vm = try self.loadVM(vmDir: vmDir, storage: actualLocation) + } return vm } catch { - Logger.error("Failed to get VM", metadata: ["error": error.localizedDescription]) + Logger.error( + "Failed to get VM", + metadata: [ + "vmName": normalizedName, "storage": storage ?? "default", + "error": error.localizedDescription, + ]) + // Re-throw the original error to preserve its type throw error } } - /// Factory for creating the appropriate VM type based on the OS @MainActor public func create( name: String, @@ -488,7 +567,7 @@ final class LumeController { let imageContainerRegistry = ImageContainerRegistry( registry: registry, organization: organization) - try await imageContainerRegistry.pull( + let _ = try await imageContainerRegistry.pull( image: actualImage, name: vmName, locationName: storage) @@ -752,15 +831,17 @@ final class LumeController { } @MainActor - private func loadVM(name: String, storage: String? = nil) throws -> VM { - let vmDir = try home.getVMDirectory(name, storage: storage) + private func loadVM(vmDir: VMDirectory, storage: String?) throws -> VM { + // vmDir is now passed directly guard vmDir.initialized() else { - throw VMError.notInitialized(name) + throw VMError.notInitialized(vmDir.name) // Use name from vmDir } let config: VMConfig = try vmDir.loadConfig() + // Pass the provided storage (which could be a path or named location) let vmDirContext = VMDirContext( - dir: vmDir, config: config, home: home, storage: storage) + dir: vmDir, config: config, home: home, storage: storage + ) let imageLoader = config.os.lowercased() == "macos" ? imageLoaderFactory.createImageLoader() : nil @@ -808,11 +889,22 @@ final class LumeController { public func validateVMExists(_ name: String, storage: String? = nil) throws -> String? 
{ // If location is specified, only check that location if let storage = storage { - let vmDir = try home.getVMDirectory(name, storage: storage) - guard vmDir.initialized() else { - throw VMError.notFound(name) + // Check if storage is a path by looking for directory separator + if storage.contains("/") || storage.contains("\\") { + // Treat as direct path + let vmDir = try home.getVMDirectoryFromPath(name, storagePath: storage) + guard vmDir.initialized() else { + throw VMError.notFound(name) + } + return storage // Return the path as the location identifier + } else { + // Treat as named storage + let vmDir = try home.getVMDirectory(name, storage: storage) + guard vmDir.initialized() else { + throw VMError.notFound(name) + } + return storage } - return storage } // If no location specified, try to find the VM in any location @@ -846,7 +938,29 @@ final class LumeController { throw ValidationError("Organization cannot be empty") } - let vmDir = try home.getVMDirectory(name, storage: storage) + // Determine if storage is a path or a named storage location + let vmDir: VMDirectory + if let storage = storage, storage.contains("/") || storage.contains("\\") { + // Create the base directory if it doesn't exist + if !FileManager.default.fileExists(atPath: storage) { + Logger.info("Creating VM storage directory", metadata: ["path": storage]) + do { + try FileManager.default.createDirectory( + atPath: storage, + withIntermediateDirectories: true + ) + } catch { + throw HomeError.directoryCreationFailed(path: storage) + } + } + + // Use getVMDirectoryFromPath for direct paths + vmDir = try home.getVMDirectoryFromPath(name, storagePath: storage) + } else { + // Use getVMDirectory for named storage locations + vmDir = try home.getVMDirectory(name, storage: storage) + } + if vmDir.exists() { throw VMError.alreadyExists(name) } diff --git a/libs/lume/src/Server/Handlers.swift b/libs/lume/src/Server/Handlers.swift index c968359a..bf289350 100644 --- a/libs/lume/src/Server/Handlers.swift +++ b/libs/lume/src/Server/Handlers.swift @@ -6,10 +6,10 @@ import Virtualization extension Server { // MARK: - VM Management Handlers - func handleListVMs() async throws -> HTTPResponse { + func handleListVMs(storage: String? = nil) async throws -> HTTPResponse { do { let vmController = LumeController() - let vms = try vmController.list() + let vms = try vmController.list(storage: storage) return try .json(vms) } catch { return .badRequest(message: error.localizedDescription) diff --git a/libs/lume/src/Server/Requests.swift b/libs/lume/src/Server/Requests.swift index da0bf681..5cde19d2 100644 --- a/libs/lume/src/Server/Requests.swift +++ b/libs/lume/src/Server/Requests.swift @@ -109,7 +109,7 @@ struct PushRequest: Codable { let tags: [String] // List of tags to push var registry: String // Registry URL var organization: String // Organization/user in the registry - let storage: String? // Optional VM storage location + let storage: String? // Optional VM storage location or direct path var chunkSizeMb: Int // Chunk size // dryRun and reassemble are less common for API, default to false? 
// verbose is usually handled by server logging diff --git a/libs/lume/src/Server/Server.swift b/libs/lume/src/Server/Server.swift index 71db4a75..98ffc588 100644 --- a/libs/lume/src/Server/Server.swift +++ b/libs/lume/src/Server/Server.swift @@ -79,9 +79,11 @@ final class Server { routes = [ Route( method: "GET", path: "/lume/vms", - handler: { [weak self] _ in + handler: { [weak self] request in guard let self else { throw HTTPError.internalError } - return try await self.handleListVMs() + // Extract storage from query params if present + let storage = self.extractQueryParam(request: request, name: "storage") + return try await self.handleListVMs(storage: storage) }), Route( method: "GET", path: "/lume/vms/:name", diff --git a/libs/lumier/src/lib/vm.sh b/libs/lumier/src/lib/vm.sh index 9d3dda06..5bcd5d7d 100755 --- a/libs/lumier/src/lib/vm.sh +++ b/libs/lumier/src/lib/vm.sh @@ -1,32 +1,32 @@ #!/usr/bin/env bash start_vm() { - # Set up dedicated storage for this VM - STORAGE_NAME="storage_${VM_NAME}" - if [ -n "$HOST_STORAGE_PATH" ]; then - lume config storage add "$STORAGE_NAME" "$HOST_STORAGE_PATH" >/dev/null 2>&1 || true + # Determine storage path for VM + STORAGE_PATH="$HOST_STORAGE_PATH" + if [ -z "$STORAGE_PATH" ]; then + STORAGE_PATH="storage_${VM_NAME}" fi # Check if VM exists and its status using JSON format - VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>&1) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>&1) # Check if VM not found error if [[ $VM_INFO == *"Virtual machine not found"* ]]; then IMAGE_NAME="${VERSION##*/}" - lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_NAME" + lume pull "$IMAGE_NAME" "$VM_NAME" --storage "$STORAGE_PATH" else # Parse the JSON status - check if it contains "status" : "running" if [[ $VM_INFO == *'"status" : "running"'* ]]; then - # lume_stop "$VM_NAME" "$STORAGE_NAME" - lume stop "$VM_NAME" --storage "$STORAGE_NAME" + lume_stop "$VM_NAME" "$STORAGE_PATH" + # lume stop "$VM_NAME" --storage "$STORAGE_PATH" fi fi # Set VM parameters - lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_NAME" + lume set "$VM_NAME" --cpu "$CPU_CORES" --memory "${RAM_SIZE}MB" --display "$DISPLAY" --storage "$STORAGE_PATH" # Fetch VM configuration - CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json) + CONFIG_JSON=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json) # Setup data directory args if necessary SHARED_DIR_ARGS="" @@ -39,8 +39,8 @@ start_vm() { fi # Run VM with VNC and shared directory using curl - # lume_run $SHARED_DIR_ARGS --storage "$STORAGE_NAME" "$VM_NAME" & - lume run "$VM_NAME" --storage "$STORAGE_NAME" --no-display + lume_run $SHARED_DIR_ARGS --storage "$STORAGE_PATH" "$VM_NAME" & + # lume run "$VM_NAME" --storage "$STORAGE_PATH" --no-display # Wait for VM to be running and VNC URL to be available vm_ip="" @@ -50,7 +50,7 @@ start_vm() { while [ $attempt -lt $max_attempts ]; do # Get VM info as JSON - VM_INFO=$(lume get "$VM_NAME" -f json 2>/dev/null) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>/dev/null) # Check if VM has status 'running' if [[ $VM_INFO == *'"status" : "running"'* ]]; then @@ -71,8 +71,8 @@ start_vm() { if [ -z "$vm_ip" ] || [ -z "$vnc_url" ]; then echo "Timed out waiting for VM to start or VNC URL to become available." 
- # lume_stop "$VM_NAME" "$STORAGE_NAME" > /dev/null 2>&1 - lume stop "$VM_NAME" --storage "$STORAGE_NAME" > /dev/null 2>&1 + lume_stop "$VM_NAME" "$STORAGE_PATH" > /dev/null 2>&1 + # lume stop "$VM_NAME" --storage "$STORAGE_PATH" > /dev/null 2>&1 exit 1 fi @@ -100,13 +100,16 @@ start_vm() { stop_vm() { echo "Stopping VM '$VM_NAME'..." - STORAGE_NAME="storage_${VM_NAME}" + STORAGE_PATH="$HOST_STORAGE_PATH" + if [ -z "$STORAGE_PATH" ]; then + STORAGE_PATH="storage_${VM_NAME}" + fi # Check if the VM exists and is running (use lume get for speed) - VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_NAME" -f json 2>/dev/null) + VM_INFO=$(lume get "$VM_NAME" --storage "$STORAGE_PATH" -f json 2>/dev/null) if [[ -z "$VM_INFO" || $VM_INFO == *"Virtual machine not found"* ]]; then echo "VM '$VM_NAME' does not exist." elif [[ $VM_INFO == *'"status" : "running"'* ]]; then - lume_stop "$VM_NAME" "$STORAGE_NAME" + lume_stop "$VM_NAME" "$STORAGE_PATH" echo "VM '$VM_NAME' was running and is now stopped." elif [[ $VM_INFO == *'"status" : "stopped"'* ]]; then echo "VM '$VM_NAME' is already stopped." From 0543e16c1e11b807b90ccaedebfb94a1f3f11b9b Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 23:46:06 +0000 Subject: [PATCH 19/38] docs: update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 53102fcb..eed2c55f 100644 --- a/README.md +++ b/README.md @@ -228,6 +228,7 @@ Apple, macOS, and Apple Silicon are trademarks of Apple Inc. Ubuntu and Canonica Rahim Nathwani

💻 Matt Speck

💻 + FinnBorge

💻 From 8bfb9dbe052e9647d0c6f94669c48410d30178b4 Mon Sep 17 00:00:00 2001 From: "allcontributors[bot]" <46447321+allcontributors[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 23:46:07 +0000 Subject: [PATCH 20/38] docs: update .all-contributorsrc --- .all-contributorsrc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index d1b3578e..503f0e94 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -151,6 +151,15 @@ "contributions": [ "code" ] + }, + { + "login": "FinnBorge", + "name": "FinnBorge", + "avatar_url": "https://avatars.githubusercontent.com/u/9272726?v=4", + "profile": "https://github.com/FinnBorge", + "contributions": [ + "code" + ] } ] } From 9b78a40cb556ee67e159bb47a40b1af4d31fdf26 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Tue, 29 Apr 2025 17:33:17 -0700 Subject: [PATCH 21/38] Handle lume run storage not found --- libs/lume/src/LumeController.swift | 157 +++++++++++++++++------------ 1 file changed, 93 insertions(+), 64 deletions(-) diff --git a/libs/lume/src/LumeController.swift b/libs/lume/src/LumeController.swift index f25079ff..03db4999 100644 --- a/libs/lume/src/LumeController.swift +++ b/libs/lume/src/LumeController.swift @@ -408,58 +408,84 @@ final class LumeController { "Running VM", metadata: [ "name": normalizedName, - "location": storage ?? "default", "no_display": "\(noDisplay)", "shared_directories": "\(sharedDirectories.map( { $0.string } ).joined(separator: ", "))", "mount": mount?.path ?? "none", "vnc_port": "\(vncPort)", "recovery_mode": "\(recoveryMode)", - "storage_param": storage ?? "default", + "storage_param": storage ?? "default", // Log the original param "usb_storage_devices": "\(usbMassStoragePaths?.count ?? 0)", ]) do { - // Check if this is an image reference (contains a tag) - let components = name.split(separator: ":") - if components.count == 2 { - do { - _ = try self.validateVMExists(normalizedName, storage: storage) - } catch { - // If the VM doesn't exist, try to pull the image + // Check if name is an image ref to auto-pull + let components = normalizedName.split(separator: ":") + if components.count == 2 { // Check if it looks like image:tag + // Attempt to validate if VM exists first, suppressing the error + // This avoids pulling if the VM already exists, even if name looks like an image ref + let vmExists = (try? self.validateVMExists(normalizedName, storage: storage)) != nil + if !vmExists { + Logger.info( + "VM not found, attempting to pull image based on name", + metadata: ["imageRef": normalizedName]) + // Use the potentially new VM name derived from the image ref + let potentialVMName = String(components[0]) try await pullImage( - image: name, - name: nil, + image: normalizedName, // Full image ref + name: potentialVMName, // Name derived from image registry: registry, organization: organization, storage: storage ) + // Important: After pull, the effective name might have changed + // We proceed assuming the user wants to run the VM derived from image name + // normalizedName = potentialVMName // Re-assign normalizedName if pull logic creates it + // Note: Current pullImage doesn't return the final VM name, + // so we assume it matches the name derived from the image. + // This might need refinement if pullImage behaviour changes. } } - // Find VM and get its actual location - let actualLocation = try validateVMExists(normalizedName, storage: storage) + // Determine effective storage path or name AND get the VMDirectory + let effectiveStorage: String? 
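+            // A storage value containing a path separator ("/" or "\") is treated as a
+            // direct path to the VM directory; a nil or plain name is resolved as a named
+            // storage location (or the default).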
+ let vmDir: VMDirectory - // Log if we found the VM in a different location than default - if actualLocation != storage && actualLocation != nil { + if let storagePath = storage, storagePath.contains("/") || storagePath.contains("\\") { + // Storage is a direct path + vmDir = try home.getVMDirectoryFromPath(normalizedName, storagePath: storagePath) + guard vmDir.initialized() else { + if vmDir.exists() { + throw VMError.notInitialized(normalizedName) + } else { + throw VMError.notFound(normalizedName) + } + } + effectiveStorage = storagePath // Use the path string + Logger.info("Using direct storage path", metadata: ["path": storagePath]) + } else { + // Storage is nil or a named location - validate and get the actual name + let actualLocationName = try validateVMExists(normalizedName, storage: storage) + vmDir = try home.getVMDirectory(normalizedName, storage: actualLocationName) // Get VMDir for named location + effectiveStorage = actualLocationName // Use the named location string Logger.info( - "Found VM in location", + "Using named storage location", metadata: [ - "name": normalizedName, - "location": actualLocation ?? "default", + "requested": storage ?? "default", + "actual": actualLocationName ?? "default", ]) } + // Validate parameters using the located VMDirectory try validateRunParameters( - name: normalizedName, + vmDir: vmDir, // Pass vmDir sharedDirectories: sharedDirectories, mount: mount, - storage: actualLocation, usbMassStoragePaths: usbMassStoragePaths ) - // Use the actual VM location that we found - let vm = try get(name: normalizedName, storage: actualLocation) + // Load the VM directly using the located VMDirectory and storage context + let vm = try self.loadVM(vmDir: vmDir, storage: effectiveStorage) SharedVM.shared.setVM(name: normalizedName, vm: vm) try await vm.run( @@ -918,6 +944,51 @@ final class LumeController { throw VMError.notFound(name) } + private func validateRunParameters( + vmDir: VMDirectory, // Changed signature: accept VMDirectory + sharedDirectories: [SharedDirectory]?, + mount: Path?, + usbMassStoragePaths: [Path]? = nil + ) throws { + // VM existence is confirmed by having vmDir, no need for validateVMExists + if let dirs = sharedDirectories { + try self.validateSharedDirectories(dirs) + } + + // Validate USB mass storage paths + if let usbPaths = usbMassStoragePaths { + for path in usbPaths { + if !FileManager.default.fileExists(atPath: path.path) { + throw ValidationError("USB mass storage image not found: \(path.path)") + } + } + + if #available(macOS 15.0, *) { + // USB mass storage is supported + } else { + Logger.info( + "USB mass storage devices require macOS 15.0 or later. They will be ignored.") + } + } + + // Load config directly from vmDir + let vmConfig = try vmDir.loadConfig() + switch vmConfig.os.lowercased() { + case "macos": + if mount != nil { + throw ValidationError( + "Mounting disk images is not supported for macOS VMs. If you are looking to mount a IPSW, please use the --ipsw option in the create command." + ) + } + case "linux": + if let mount = mount, !FileManager.default.fileExists(atPath: mount.path) { + throw ValidationError("Mount file not found: \(mount.path)") + } + default: + break + } + } + private func validatePullParameters( image: String, name: String, @@ -966,48 +1037,6 @@ final class LumeController { } } - private func validateRunParameters( - name: String, sharedDirectories: [SharedDirectory]?, mount: Path?, - storage: String? = nil, usbMassStoragePaths: [Path]? 
= nil - ) throws { - _ = try self.validateVMExists(name, storage: storage) - if let dirs = sharedDirectories { - try self.validateSharedDirectories(dirs) - } - - // Validate USB mass storage paths - if let usbPaths = usbMassStoragePaths { - for path in usbPaths { - if !FileManager.default.fileExists(atPath: path.path) { - throw ValidationError("USB mass storage image not found: \(path.path)") - } - } - - if #available(macOS 15.0, *) { - // USB mass storage is supported - } else { - Logger.info( - "USB mass storage devices require macOS 15.0 or later. They will be ignored.") - } - } - - let vmConfig = try home.getVMDirectory(name, storage: storage).loadConfig() - switch vmConfig.os.lowercased() { - case "macos": - if mount != nil { - throw ValidationError( - "Mounting disk images is not supported for macOS VMs. If you are looking to mount a IPSW, please use the --ipsw option in the create command." - ) - } - case "linux": - if let mount = mount, !FileManager.default.fileExists(atPath: mount.path) { - throw ValidationError("Mount file not found: \(mount.path)") - } - default: - break - } - } - private func validatePushParameters( name: String, imageName: String, From 5fc627ed69734b79ab854c39f9ec06983b1403d6 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 11:02:39 -0700 Subject: [PATCH 22/38] added dragging along path --- .../providers/anthropic/tools/computer.py | 20 +++++---- .../agent/providers/openai/tools/computer.py | 41 +++++++++++++++++++ .../computer_server/handlers/base.py | 13 +++++- .../computer_server/handlers/macos.py | 35 +++++++++++++++- libs/computer-server/computer_server/main.py | 1 + libs/computer/computer/interface/base.py | 11 +++++ libs/computer/computer/interface/macos.py | 5 +++ 7 files changed, 115 insertions(+), 11 deletions(-) diff --git a/libs/agent/agent/providers/anthropic/tools/computer.py b/libs/agent/agent/providers/anthropic/tools/computer.py index 8425f35f..ecf232bd 100644 --- a/libs/agent/agent/providers/anthropic/tools/computer.py +++ b/libs/agent/agent/providers/anthropic/tools/computer.py @@ -161,15 +161,17 @@ class ComputerTool(BaseComputerTool, BaseAnthropicTool): self.logger.info(f"Moving cursor to ({x}, {y})") await self.computer.interface.move_cursor(x, y) elif action == "left_click_drag": - self.logger.info(f"Dragging from ({x}, {y})") - # First move to the position - await self.computer.interface.move_cursor(x, y) - # Then perform drag operation - check if drag_to exists or we need to use other methods - try: - await self.computer.interface.drag_to(x, y) - except Exception as e: - self.logger.error(f"Error during drag operation: {str(e)}") - raise ToolError(f"Failed to perform drag: {str(e)}") + # Get the start coordinate from kwargs + start_coordinate = kwargs.get("start_coordinate") + if not start_coordinate: + raise ToolError("start_coordinate is required for left_click_drag action") + + start_x, start_y = start_coordinate + end_x, end_y = x, y + + self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({end_x}, {end_y})") + await self.computer.interface.move_cursor(start_x, start_y) + await self.computer.interface.drag_to(end_x, end_y) # Wait briefly for any UI changes await asyncio.sleep(0.5) diff --git a/libs/agent/agent/providers/openai/tools/computer.py b/libs/agent/agent/providers/openai/tools/computer.py index 90ef5935..c5602f4e 100644 --- a/libs/agent/agent/providers/openai/tools/computer.py +++ b/libs/agent/agent/providers/openai/tools/computer.py @@ -44,6 +44,7 @@ Action = Literal[ "double_click", 
"screenshot", "scroll", + "drag", ] @@ -165,6 +166,11 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): scroll_x = kwargs.get("scroll_x", 0) // 50 scroll_y = kwargs.get("scroll_y", 0) // 50 return await self.handle_scroll(x, y, scroll_x, scroll_y) + elif type == "drag": + path = kwargs.get("path") + if not path or not isinstance(path, list) or len(path) < 2: + raise ToolError("path is required for drag action and must contain at least 2 points") + return await self.handle_drag(path) elif type == "screenshot": return await self.screenshot() elif type == "wait": @@ -302,6 +308,41 @@ class ComputerTool(BaseComputerTool, BaseOpenAITool): self.logger.error(f"Error in handle_scroll: {str(e)}") raise ToolError(f"Failed to scroll at ({x}, {y}): {str(e)}") + async def handle_drag(self, path: List[Dict[str, int]]) -> ToolResult: + """Handle mouse drag operation using a path of coordinates. + + Args: + path: List of coordinate points {"x": int, "y": int} defining the drag path + + Returns: + ToolResult with the operation result and screenshot + """ + try: + # Convert from [{"x": x, "y": y}, ...] format to [(x, y), ...] format + points = [(p["x"], p["y"]) for p in path] + + # Perform drag action + if len(points) == 2: + await self.computer.interface.move_cursor(points[0][0], points[0][1]) + await self.computer.interface.drag_to(points[1][0], points[1][1]) + else: + await self.computer.interface.drag(points, button="left") + + # Wait for UI to update + await asyncio.sleep(0.5) + + # Take screenshot after action + screenshot = await self.computer.interface.screenshot() + base64_screenshot = base64.b64encode(screenshot).decode("utf-8") + + return ToolResult( + output=f"Dragged from ({path[0]['x']}, {path[0]['y']}) to ({path[-1]['x']}, {path[-1]['y']})", + base64_image=base64_screenshot, + ) + except Exception as e: + self.logger.error(f"Error in handle_drag: {str(e)}") + raise ToolError(f"Failed to perform drag operation: {str(e)}") + async def screenshot(self) -> ToolResult: """Take a screenshot.""" try: diff --git a/libs/computer-server/computer_server/handlers/base.py b/libs/computer-server/computer_server/handlers/base.py index 818d367c..08d57ad5 100644 --- a/libs/computer-server/computer_server/handlers/base.py +++ b/libs/computer-server/computer_server/handlers/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List, Tuple class BaseAccessibilityHandler(ABC): """Abstract base class for OS-specific accessibility handlers.""" @@ -59,6 +59,17 @@ class BaseAutomationHandler(ABC): duration: How long the drag should take in seconds """ pass + + @abstractmethod + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]: + """Drag the cursor from current position to specified coordinates. 
+ + Args: + path: A list of tuples of x and y coordinates to drag to + button: The mouse button to use ('left', 'middle', 'right') + duration: How long the drag should take in seconds + """ + pass # Keyboard Actions @abstractmethod diff --git a/libs/computer-server/computer_server/handlers/macos.py b/libs/computer-server/computer_server/handlers/macos.py index 180f083a..abdedc41 100644 --- a/libs/computer-server/computer_server/handlers/macos.py +++ b/libs/computer-server/computer_server/handlers/macos.py @@ -1,7 +1,7 @@ import pyautogui import base64 from io import BytesIO -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any, List, Tuple from ctypes import byref, c_void_p, POINTER from AppKit import NSWorkspace # type: ignore import AppKit @@ -563,6 +563,39 @@ class MacOSAutomationHandler(BaseAutomationHandler): except Exception as e: return {"success": False, "error": str(e)} + async def drag( + self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5 + ) -> Dict[str, Any]: + try: + if not path or len(path) < 2: + return {"success": False, "error": "Path must contain at least 2 points"} + + # Move to the first point + start_x, start_y = path[0] + pyautogui.moveTo(start_x, start_y) + + # Press the mouse button + pyautogui.mouseDown(button=button) + + # Calculate time between points to distribute duration evenly + step_duration = duration / (len(path) - 1) if len(path) > 1 else duration + + # Move through each subsequent point + for x, y in path[1:]: + pyautogui.moveTo(x, y, duration=step_duration) + + # Release the mouse button + pyautogui.mouseUp(button=button) + + return {"success": True} + except Exception as e: + # Make sure to release the mouse button if an error occurs + try: + pyautogui.mouseUp(button=button) + except: + pass + return {"success": False, "error": str(e)} + # Keyboard Actions async def type_text(self, text: str) -> Dict[str, Any]: try: diff --git a/libs/computer-server/computer_server/main.py b/libs/computer-server/computer_server/main.py index c95918d8..d7f66f89 100644 --- a/libs/computer-server/computer_server/main.py +++ b/libs/computer-server/computer_server/main.py @@ -65,6 +65,7 @@ async def websocket_endpoint(websocket: WebSocket): "type_text": manager.automation_handler.type_text, "press_key": manager.automation_handler.press_key, "drag_to": manager.automation_handler.drag_to, + "drag": manager.automation_handler.drag, "hotkey": manager.automation_handler.hotkey, "get_cursor_position": manager.automation_handler.get_cursor_position, "get_screen_size": manager.automation_handler.get_screen_size, diff --git a/libs/computer/computer/interface/base.py b/libs/computer/computer/interface/base.py index 31106c14..8fcbd21c 100644 --- a/libs/computer/computer/interface/base.py +++ b/libs/computer/computer/interface/base.py @@ -79,6 +79,17 @@ class BaseComputerInterface(ABC): """ pass + @abstractmethod + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> None: + """Drag the cursor along a path of coordinates. 
+ + Args: + path: List of (x, y) coordinate tuples defining the drag path + button: The mouse button to use ('left', 'middle', 'right') + duration: Total time in seconds that the drag operation should take + """ + pass + # Keyboard Actions @abstractmethod async def type_text(self, text: str) -> None: diff --git a/libs/computer/computer/interface/macos.py b/libs/computer/computer/interface/macos.py index a3b99f7d..2460086c 100644 --- a/libs/computer/computer/interface/macos.py +++ b/libs/computer/computer/interface/macos.py @@ -328,6 +328,11 @@ class MacOSComputerInterface(BaseComputerInterface): "drag_to", {"x": x, "y": y, "button": button, "duration": duration} ) + async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> None: + await self._send_command( + "drag", {"path": path, "button": button, "duration": duration} + ) + # Keyboard Actions async def type_text(self, text: str) -> None: await self._send_command("type_text", {"text": text}) From 7981000820f222b40f2a473987afaf29de335e31 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 11:02:54 -0700 Subject: [PATCH 23/38] added message when scalable oai endpoint is still warming up --- .../providers/uitars/clients/oaicompat.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/libs/agent/agent/providers/uitars/clients/oaicompat.py b/libs/agent/agent/providers/uitars/clients/oaicompat.py index 1b512997..423b1d3a 100644 --- a/libs/agent/agent/providers/uitars/clients/oaicompat.py +++ b/libs/agent/agent/providers/uitars/clients/oaicompat.py @@ -190,25 +190,21 @@ class OAICompatClient(BaseUITarsClient): response_text = await response.text() logger.debug(f"Response content: {response_text}") + # if 503, then the endpoint is still warming up + if response.status == 503: + logger.error(f"Endpoint is still warming up, please try again later") + raise Exception(f"Endpoint is still warming up: {response_text}") + # Try to parse as JSON if the content type is appropriate if "application/json" in response.headers.get('Content-Type', ''): response_json = await response.json() else: raise Exception(f"Response is not JSON format") - # # Optionally try to parse it anyway - # try: - # import json - # response_json = json.loads(response_text) - # except json.JSONDecodeError as e: - # print(f"Failed to parse response as JSON: {e}") if response.status != 200: - error_msg = response_json.get("error", {}).get( - "message", str(response_json) - ) - logger.error(f"Error in API call: {error_msg}") - raise Exception(f"API error: {error_msg}") - + logger.error(f"Error in API call: {response_text}") + raise Exception(f"API error: {response_text}") + return response_json except Exception as e: From e55e649cd6254fc06f0699583e99b651d3c4d523 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 14:47:27 -0700 Subject: [PATCH 24/38] improved display of AgentResponse objects in gradio ui, and standardized uitars agent output --- libs/agent/agent/providers/uitars/loop.py | 44 ++----- libs/agent/agent/providers/uitars/utils.py | 113 +++++++++++++++++- libs/agent/agent/ui/gradio/app.py | 128 ++++++++++----------- 3 files changed, 179 insertions(+), 106 deletions(-) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 71d2c739..a30d3bee 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -17,7 +17,7 @@ from ...core.types import AgentResponse, LLMProvider from 
...core.visualization import VisualizationHelper from computer import Computer -from .utils import add_box_token, parse_actions, parse_action_parameters +from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format from .tools.manager import ToolManager from .tools.computer import ToolResult from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES @@ -507,41 +507,14 @@ class UITARSLoop(BaseLoop): # Update whether an action screenshot was saved this turn action_screenshot_saved = action_screenshot_saved or new_screenshot_saved - - # Parse actions from the raw response - raw_response = response["choices"][0]["message"]["content"] - parsed_actions = parse_actions(raw_response) - # Extract thought content if available - thought = "" - if "Thought:" in raw_response: - thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL) - if thought_match: - thought = thought_match.group(1).strip() + agent_response = await to_agent_response_format( + response, + messages, + model=self.model, + ) + yield agent_response - # Create standardized thought response format - thought_response = { - "role": "assistant", - "content": thought or raw_response, - "metadata": { - "title": "🧠 UI-TARS Thoughts" - } - } - - # Create action response format - action_response = { - "role": "assistant", - "content": str(parsed_actions), - "metadata": { - "title": "🖱️ UI-TARS Actions", - } - } - - # Yield both responses to the caller (thoughts first, then actions) - yield thought_response - if parsed_actions: - yield action_response - # Check if we should continue this conversation running = should_continue @@ -562,7 +535,8 @@ class UITARSLoop(BaseLoop): logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}") yield { - "error": str(e), + "role": "assistant", + "content": f"Error: {str(e)}", "metadata": {"title": "❌ Error"}, } diff --git a/libs/agent/agent/providers/uitars/utils.py b/libs/agent/agent/providers/uitars/utils.py index 00565b88..cc904115 100644 --- a/libs/agent/agent/providers/uitars/utils.py +++ b/libs/agent/agent/providers/uitars/utils.py @@ -4,9 +4,114 @@ import logging import base64 import re from typing import Any, Dict, List, Optional, Union, Tuple +from datetime import datetime logger = logging.getLogger(__name__) +from ...core.types import AgentResponse + +async def to_agent_response_format( + response: Dict[str, Any], + messages: List[Dict[str, Any]], + model: Optional[str] = None, +) -> AgentResponse: + """Convert raw UI-TARS response to agent response format. 
+ + Args: + response: Raw UI-TARS response + messages: List of messages in standard format + model: Optional model name + + Returns: + AgentResponse: Standardized agent response format + """ + # Create unique IDs for this response + response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}" + reasoning_id = f"rs_{response_id}" + action_id = f"cu_{response_id}" + call_id = f"call_{response_id}" + + # Parse actions from the raw response + content = response["choices"][0]["message"]["content"] + actions = parse_actions(content) + + # Extract thought content if available + reasoning_text = "" + if "Thought:" in content: + thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL) + if thought_match: + reasoning_text = thought_match.group(1).strip() + + # Create output items + output_items = [] + if reasoning_text: + output_items.append({ + "type": "reasoning", + "id": reasoning_id, + "text": reasoning_text + }) + if actions: + for i, action in enumerate(actions): + action_name, tool_args = parse_action_parameters(action) + if action_name == "finished": + output_items.append({ + "type": "message", + "role": "assistant", + "content": [{ + "type": "output_text", + "text": tool_args["content"] + }], + "id": f"action_{i}_{action_id}", + "status": "completed" + }) + else: + if tool_args.get("action") == action_name: + del tool_args["action"] + output_items.append({ + "type": "computer_call", + "id": f"{action}_{i}_{action_id}", + "call_id": f"call_{i}_{action_id}", + "action": { "type": action_name, **tool_args }, + "pending_safety_checks": [], + "status": "completed" + }) + + # Create agent response + agent_response = AgentResponse( + id=response_id, + object="response", + created_at=int(datetime.now().timestamp()), + status="completed", + error=None, + incomplete_details=None, + instructions=None, + max_output_tokens=None, + model=model or response["model"], + output=output_items, + parallel_tool_calls=True, + previous_response_id=None, + reasoning={"effort": "medium"}, + store=True, + temperature=0.0, + top_p=0.7, + text={"format": {"type": "text"}}, + tool_choice="auto", + tools=[ + { + "type": "computer_use_preview", + "display_height": 768, + "display_width": 1024, + "environment": "mac", + } + ], + truncation="auto", + usage=response["usage"], + user=None, + metadata={}, + response=response + ) + return agent_response + def add_box_token(input_string: str) -> str: """Add box tokens to the coordinates in the model response. 
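The hunk above converts a raw UI-TARS completion into the shared `AgentResponse` shape used by the other loops. A rough, hypothetical sketch of exercising it (the payload, model name, and token counts below are made up, and the exact parsed items depend on `parse_actions`):

```python
# Hypothetical usage sketch; not part of this patch.
import asyncio
from agent.providers.uitars.utils import to_agent_response_format

raw = {  # minimal fake UI-TARS completion
    "model": "ui-tars-1.5-7b",
    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    "choices": [{"message": {"content":
        "Thought: open the browser\nAction: click(start_box='(28,15)')"}}],
}

resp = asyncio.run(to_agent_response_format(raw, messages=[], model=None))
# Expected rough shape: a "reasoning" item carrying the thought text, plus a
# "computer_call" item whose "action" dict holds the parsed click parameters
# (assuming parse_actions recognizes the action string above).
print([item["type"] for item in resp["output"]])
```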
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]: """ # Handle "finished" action if action.startswith("finished"): - return "finished", {} + # Parse content if it exists + content_match = re.search(r"content='([^']*)'", action) + if content_match: + content = content_match.group(1) + return "finished", {"content": content} + else: + return "finished", {} # Parse action parameters action_match = re.match(r'(\w+)\((.*)\)', action) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index c6ac57ea..b8ab480a 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -35,6 +35,7 @@ from pathlib import Path from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union import gradio as gr from gradio.components.chatbot import MetadataDict +from typing import cast # Import from agent package from agent.core.types import AgentResponse @@ -447,66 +448,6 @@ def create_agent( return global_agent - -def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]: - """Process agent results for the Gradio UI.""" - # Extract text content - text_obj = result.get("text", {}) - metadata = result.get("metadata", {}) - - # Create a properly typed MetadataDict - metadata_dict = MetadataDict() - metadata_dict["title"] = metadata.get("title", "") - metadata_dict["status"] = "done" - metadata = metadata_dict - - # For OpenAI's Computer-Use Agent, text field is an object with format property - if ( - text_obj - and isinstance(text_obj, dict) - and "format" in text_obj - and not text_obj.get("value", "") - ): - content, metadata = extract_synthesized_text(result) - else: - if not text_obj: - text_obj = result - - # For other types of results, try to get text directly - if isinstance(text_obj, dict): - if "value" in text_obj: - content = text_obj["value"] - elif "text" in text_obj: - content = text_obj["text"] - elif "content" in text_obj: - content = text_obj["content"] - else: - content = "" - else: - content = str(text_obj) if text_obj else "" - - # If still no content but we have outputs, create a summary - if not content and "output" in result and result["output"]: - output = result["output"] - for out in output: - if out.get("type") == "reasoning": - content = out.get("content", "") - if content: - break - elif out.get("type") == "computer_call": - action = out.get("action", {}) - action_type = action.get("type", "") - if action_type: - content = f"Performing action: {action_type}" - break - - # Clean up the text - ensure content is a string - if not isinstance(content, str): - content = str(content) if content else "" - - return content, metadata - - def create_gradio_ui( provider_name: str = "openai", model_name: str = "gpt-4o", @@ -907,17 +848,64 @@ def create_gradio_ui( # Stream responses from the agent async for result in global_agent.run(last_user_message): - # Process result - content, metadata = process_agent_result(result) - - # Skip empty content - if content or metadata.get("title"): - history.append( - gr.ChatMessage( - role="assistant", content=content, metadata=metadata + print(f"DEBUG - Agent response ------- START") + from pprint import pprint + pprint(result) + print(f"DEBUG - Agent response ------- END") + + def generate_gradio_messages(): + if result.get("content"): + yield gr.ChatMessage( + role="assistant", + content=result.get("content", ""), + metadata=cast(MetadataDict, result.get("metadata", {})) ) - ) - yield history + else: + outputs 
= result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + yield gr.ChatMessage( + role=output.get("role", "assistant"), + content=content_part.get("text", ""), + metadata=content_part.get("metadata", {}) + ) + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + yield gr.ChatMessage( + role="assistant", + content=summary_part.get("text", "") + ) + else: + summary_content = output.get("text", "") + if summary_content: + yield gr.ChatMessage( + role="assistant", + content=summary_content, + ) + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + yield gr.ChatMessage( + role="assistant", + content=f"```json\n{json.dumps(action)}\n```", + metadata={"title": action_title} + ) + + for message in generate_gradio_messages(): + history.append(message) + yield history + except Exception as e: import traceback From 2e6d3e4d2d5ceb3349a217fad339d17c22b0294c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 14:48:51 -0700 Subject: [PATCH 25/38] old code removal --- libs/agent/agent/ui/gradio/app.py | 57 ------------------------------- 1 file changed, 57 deletions(-) diff --git a/libs/agent/agent/ui/gradio/app.py b/libs/agent/agent/ui/gradio/app.py index b8ab480a..a4541019 100644 --- a/libs/agent/agent/ui/gradio/app.py +++ b/libs/agent/agent/ui/gradio/app.py @@ -323,63 +323,6 @@ def get_ollama_models() -> List[str]: logging.error(f"Error getting Ollama models: {e}") return [] - -def extract_synthesized_text( - result: Union[AgentResponse, Dict[str, Any]], -) -> Tuple[str, MetadataDict]: - """Extract synthesized text from the agent result.""" - synthesized_text = "" - metadata = MetadataDict() - - if "output" in result and result["output"]: - for output in result["output"]: - if output.get("type") == "reasoning": - metadata["title"] = "🧠 Reasoning" - content = output.get("content", "") - if content: - synthesized_text += f"{content}\n" - elif output.get("type") == "message": - # Handle message type outputs - can contain rich content - content = output.get("content", []) - - # Content is usually an array of content blocks - if isinstance(content, list): - for block in content: - if isinstance(block, dict) and block.get("type") == "output_text": - text_value = block.get("text", "") - if text_value: - synthesized_text += f"{text_value}\n" - - elif output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - - # Create a descriptive text about the action - if action_type == "click": - button = action.get("button", "") - x = action.get("x", "") - y = action.get("y", "") - synthesized_text += f"Clicked {button} at position ({x}, {y}).\n" - elif action_type == "type": - text = action.get("text", "") - synthesized_text += f"Typed: {text}.\n" - elif action_type == "keypress": - # Extract key correctly from either keys array or key field - if isinstance(action.get("keys"), list): - key = ", ".join(action.get("keys")) - else: - key = action.get("key", "") - - 
synthesized_text += f"Pressed key: {key}\n" - else: - synthesized_text += f"Performed {action_type} action.\n" - - metadata["status"] = "done" - metadata["title"] = f"🛠️ {synthesized_text.strip().splitlines()[-1]}" - - return synthesized_text.strip(), metadata - - def create_computer_instance(verbosity: int = logging.INFO) -> Computer: """Create or get the global Computer instance.""" global global_computer From a5ec926922f55c03303261e90aa4805ce4fc146d Mon Sep 17 00:00:00 2001 From: f-trycua Date: Wed, 30 Apr 2025 15:11:49 -0700 Subject: [PATCH 26/38] Add --no-background-service option --- libs/lume/scripts/install.sh | 91 ++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 35 deletions(-) diff --git a/libs/lume/scripts/install.sh b/libs/lume/scripts/install.sh index d854c0e4..4c1efdc9 100755 --- a/libs/lume/scripts/install.sh +++ b/libs/lume/scripts/install.sh @@ -20,24 +20,32 @@ INSTALL_DIR="${INSTALL_DIR:-$DEFAULT_INSTALL_DIR}" GITHUB_REPO="trycua/cua" LATEST_RELEASE_URL="https://api.github.com/repos/$GITHUB_REPO/releases/latest" +# Option to skip background service setup (default: install it) +INSTALL_BACKGROUND_SERVICE=true + # Parse command line arguments while [ "$#" -gt 0 ]; do case "$1" in --install-dir=*) INSTALL_DIR="${1#*=}" ;; + --no-background-service|--skip-background-service) + INSTALL_BACKGROUND_SERVICE=false + ;; --help) echo "${BOLD}${BLUE}Lume Installer${NORMAL}" echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" - echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" - echo " --help Display this help message" + echo " --install-dir=DIR Install to the specified directory (default: $DEFAULT_INSTALL_DIR)" + echo " --no-background-service Do not setup the Lume background service (LaunchAgent)" + echo " --help Display this help message" echo "" echo "Examples:" - echo " $0 # Install to $DEFAULT_INSTALL_DIR" - echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" - echo " INSTALL_DIR=/opt/lume $0 # Install to /opt/lume (legacy env var support)" + echo " $0 # Install to $DEFAULT_INSTALL_DIR and setup background service" + echo " $0 --install-dir=/usr/local/bin # Install to system directory (may require root privileges)" + echo " $0 --no-background-service # Install without setting up the background service" + echo " INSTALL_DIR=/opt/lume $0 # Install to /opt/lume (legacy env var support)" exit 0 ;; *) @@ -207,25 +215,26 @@ main() { echo "${GREEN}${BOLD}Lume has been successfully installed!${NORMAL}" echo "Run ${BOLD}lume${NORMAL} to get started." - # --- LaunchAgent setup for lume daemon --- - SERVICE_NAME="com.trycua.lume_daemon" - PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" - LUME_BIN="$INSTALL_DIR/lume" + if [ "$INSTALL_BACKGROUND_SERVICE" = true ]; then + # --- Setup background service (LaunchAgent) for Lume --- + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + LUME_BIN="$INSTALL_DIR/lume" - echo "" - echo "Setting up LaunchAgent to run lume daemon on login..." + echo "" + echo "Setting up LaunchAgent to run lume daemon on login..." - # Create LaunchAgents directory if it doesn't exist - mkdir -p "$HOME/Library/LaunchAgents" + # Create LaunchAgents directory if it doesn't exist + mkdir -p "$HOME/Library/LaunchAgents" - # Unload existing service if present - if [ -f "$PLIST_PATH" ]; then - echo "Existing LaunchAgent found. Unloading..." 
- launchctl unload "$PLIST_PATH" 2>/dev/null || true - fi + # Unload existing service if present + if [ -f "$PLIST_PATH" ]; then + echo "Existing LaunchAgent found. Unloading..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + fi - # Create the plist file - cat < "$PLIST_PATH" + # Create the plist file + cat < "$PLIST_PATH" @@ -262,23 +271,35 @@ main() { EOF - # Set permissions - chmod 644 "$PLIST_PATH" - touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log - chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log + # Set permissions + chmod 644 "$PLIST_PATH" + touch /tmp/lume_daemon.log /tmp/lume_daemon.error.log + chmod 644 /tmp/lume_daemon.log /tmp/lume_daemon.error.log - # Load the LaunchAgent - echo "Loading LaunchAgent..." - launchctl unload "$PLIST_PATH" 2>/dev/null || true - launchctl load "$PLIST_PATH" + # Load the LaunchAgent + echo "Loading LaunchAgent..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + launchctl load "$PLIST_PATH" - echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" - echo "To check status: launchctl list | grep $SERVICE_NAME" - echo "To view logs: tail -f /tmp/lume_daemon.log" - echo "" - echo "To remove the lume daemon service, run:" - echo " launchctl unload \"$PLIST_PATH\"" - echo " rm \"$PLIST_PATH\"" + echo "${GREEN}Lume daemon LaunchAgent installed and loaded. It will start automatically on login!${NORMAL}" + echo "To check status: launchctl list | grep $SERVICE_NAME" + echo "To view logs: tail -f /tmp/lume_daemon.log" + echo "" + echo "To remove the lume daemon service, run:" + echo " launchctl unload \"$PLIST_PATH\"" + echo " rm \"$PLIST_PATH\"" + else + SERVICE_NAME="com.trycua.lume_daemon" + PLIST_PATH="$HOME/Library/LaunchAgents/$SERVICE_NAME.plist" + if [ -f "$PLIST_PATH" ]; then + echo "Removing existing Lume background service (LaunchAgent)..." + launchctl unload "$PLIST_PATH" 2>/dev/null || true + rm "$PLIST_PATH" + echo "Lume background service (LaunchAgent) removed." + else + echo "Skipping Lume background service (LaunchAgent) setup as requested (use --no-background-service)." + fi + fi } # Run the installation From db40dae0803247a58dab817845da88a00079294a Mon Sep 17 00:00:00 2001 From: f-trycua Date: Wed, 30 Apr 2025 15:21:10 -0700 Subject: [PATCH 27/38] Add background service docs --- README.md | 18 ++++++++++-------- libs/lume/README.md | 8 ++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 53102fcb..ae6254a8 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,13 @@ If you only need the virtualization capabilities: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` +Optionally, if you don't want Lume to run as a background service: +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh) --no-background-service" +``` + +**Note:** If you choose this option, you'll need to manually start the Lume API service whenever needed by running `lume serve` in your terminal. This applies to Option 2 after completing step 1. + For Lume usage instructions, refer to the [Lume documentation](./libs/lume/README.md). ### Option 2: Full Computer-Use Agent Capabilities @@ -62,17 +69,12 @@ If you want to use AI agents with virtualized environments: lume pull macos-sequoia-cua:latest ``` -3. Start Lume daemon service: - ```bash - lume serve - ``` - -4. Install the Python libraries: +3. 
Install the Python libraries: ```bash pip install cua-computer cua-agent[all] ``` -5. Use the libraries in your Python code: +4. Use the libraries in your Python code: ```python from computer import Computer from agent import ComputerAgent, LLM, AgentLoop, LLMProvider @@ -95,7 +97,7 @@ If you want to use AI agents with virtualized environments: Explore the [Agent Notebook](./notebooks/) for a ready-to-run example. -6. Optionally, you can use the Agent with a Gradio UI: +5. Optionally, you can use the Agent with a Gradio UI: ```python from utils import load_dotenv_files diff --git a/libs/lume/README.md b/libs/lume/README.md index 3d9c0524..b7112b07 100644 --- a/libs/lume/README.md +++ b/libs/lume/README.md @@ -147,6 +147,14 @@ Install with a single command: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" ``` +By default, Lume is installed as a background service that starts automatically on login. If you prefer to start the Lume API service manually when needed, you can use the `--no-background-service` option: + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh) --no-background-service" +``` + +**Note:** With this option, you'll need to manually start the Lume API service by running `lume serve` in your terminal whenever you need to use tools or libraries that rely on the Lume API (such as the Computer-Use Agent). + You can also download the `lume.pkg.tar.gz` archive from the [latest release](https://github.com/trycua/lume/releases), extract it, and install the package manually. ## Prebuilt Images From c4f9da50079f97fa58809816c2f084cc2754e46a Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 16:38:40 -0700 Subject: [PATCH 28/38] added mcp development guide, and vision output to tool calls --- libs/mcp-server/README.md | 35 +++++++++++ libs/mcp-server/mcp_server/server.py | 93 +++++++++++++++++++--------- 2 files changed, 99 insertions(+), 29 deletions(-) diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index a4307616..5649cc19 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -75,6 +75,41 @@ You can then use the script in your MCP configuration like this: } ``` +## Development Guide + +If you want to develop with the cua-mcp-server directly without installation, you can use this configuration: + +```json +{ + "mcpServers": { + "cua-agent": { + "command": "/Users/YOURUSERNAME/cua/.venv/bin/python", + "args": ["-m", "mcp_server.server"], + "env": { + "PYTHONPATH": "/Users/YOURUSERNAME/cua/libs/mcp-server:/Users/YOURUSERNAME/cua/libs/agent:/Users/YOURUSERNAME/cua/libs/computer:/Users/YOURUSERNAME/cua/libs/core:/Users/YOURUSERNAME/cua/libs/pylume", + "CUA_AGENT_LOOP": "UITARS", + "CUA_MODEL_PROVIDER": "OAICOMPAT", + "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", + "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1" + } + } + } +} +``` + +To see the logs: +``` +tail -n 20 -f ~/Library/Logs/Claude/mcp*.log +``` + +This configuration: +- Uses your local Python virtual environment to run the server module directly +- Sets the Python path to include all necessary library dependencies +- Works with Claude Desktop, Cursor, or any other MCP client +- Automatically uses your development code without requiring installation + +Just add this to your MCP client's configuration and it will use your local development version of the server. 
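Before wiring the development tree into an MCP client, a quick import smoke test can confirm the paths resolve. This is a minimal, hypothetical sketch (it assumes the `PYTHONPATH` entries from the sample config are visible to the interpreter and the dependencies are installed in the referenced virtual environment):

```python
# Hypothetical smoke test for the development setup; not part of the official docs.
from mcp_server.server import serve

server = serve()              # builds the FastMCP server and registers the CUA tools
print(type(server).__name__)  # expect "FastMCP" if all imports resolved
```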
+ ## Claude Desktop Integration To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`): diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index f6692f9a..f5fb37b2 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -1,9 +1,10 @@ import asyncio +import base64 import logging import os import sys import traceback -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Tuple # Configure logging to output to stderr for debug visibility logging.basicConfig( @@ -17,7 +18,7 @@ logger = logging.getLogger("mcp-server") logger.debug("MCP Server module loading...") try: - from mcp.server.fastmcp import Context, FastMCP + from mcp.server.fastmcp import Context, FastMCP, Image logger.debug("Successfully imported FastMCP") except ImportError as e: @@ -49,7 +50,28 @@ def serve() -> FastMCP: server = FastMCP("cua-agent") @server.tool() - async def run_cua_task(ctx: Context, task: str) -> str: + async def screenshot_cua(ctx: Context) -> Image: + """ + Take a screenshot of the current screen and return the image. + + Args: + ctx: The MCP context + + Returns: + An image resource containing the screenshot + """ + global global_computer + if global_computer is None: + global_computer = Computer(verbosity=logging.INFO) + await global_computer.run() + screenshot = await global_computer.interface.screenshot() + return Image( + format="png", + data=screenshot + ) + + @server.tool() + async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]: """ Run a Computer-Use Agent (CUA) task and return the results. @@ -58,7 +80,7 @@ def serve() -> FastMCP: task: The instruction or task for the agent to perform Returns: - A string containing the agent's response + A tuple containing the agent's response and the final screenshot """ global global_computer @@ -76,6 +98,8 @@ def serve() -> FastMCP: loop = AgentLoop.OPENAI elif loop_str == "ANTHROPIC": loop = AgentLoop.ANTHROPIC + elif loop_str == "UITARS": + loop = AgentLoop.UITARS else: loop = AgentLoop.OMNI @@ -107,33 +131,34 @@ def serve() -> FastMCP: full_result = "" async for result in agent.run(task): logger.info(f"Agent step complete: {result.get('id', 'unknown')}") + ctx.info(f"Agent step complete: {result.get('id', 'unknown')}") # Add response ID to output full_result += f"\n[Response ID: {result.get('id', 'unknown')}]\n" - - # Extract and concatenate text responses - if "text" in result: - # Handle both string and dict responses - text_response = result.get("text", "") - if isinstance(text_response, str): - full_result += f"Response: {text_response}\n" - else: - # If it's a dict or other structure, convert to string representation - full_result += f"Response: {str(text_response)}\n" - - # Log detailed information - if "tools" in result: - tools_info = result.get("tools") - logger.debug(f"Tools used: {tools_info}") - full_result += f"\nTools used: {tools_info}\n" + + if "content" in result: + full_result += f"Response: {result.get('content', '')}\n" # Process output if available outputs = result.get("output", []) for output in outputs: output_type = output.get("type") - if output_type == "reasoning": + if output_type == "message": + logger.debug(f"Message: {output}") + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + full_result += f"\nMessage: {content_part.get('text', '')}\n" 
+ elif output_type == "reasoning": logger.debug(f"Reasoning: {output}") - full_result += f"\nReasoning: {output.get('content', '')}\n" + + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("text"): + full_result += f"\nReasoning: {summary_part.get('text', '')}\n" + else: + full_result += f"\nReasoning: {output.get('text', output.get('content', ''))}\n" elif output_type == "computer_call": logger.debug(f"Computer call: {output}") action = output.get("action", "") @@ -144,15 +169,23 @@ def serve() -> FastMCP: full_result += "\n" + "-" * 40 + "\n" logger.info(f"CUA task completed successfully") - return full_result or "Task completed with no text output." + ctx.info(f"CUA task completed successfully") + return ( + full_result or "Task completed with no text output.", + Image( + format="png", + data=await global_computer.interface.screenshot() + ) + ) except Exception as e: error_msg = f"Error running CUA task: {str(e)}\n{traceback.format_exc()}" logger.error(error_msg) + ctx.error(error_msg) return f"Error during task execution: {str(e)}" @server.tool() - async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> str: + async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List: """ Run multiple CUA tasks in sequence and return the combined results. @@ -164,13 +197,15 @@ def serve() -> FastMCP: Combined results from all tasks """ results = [] - for i, task in enumerate(tasks): logger.info(f"Running task {i+1}/{len(tasks)}: {task}") - result = await run_cua_task(ctx, task) - results.append(f"Task {i+1}: {task}\nResult: {result}\n") - - return "\n".join(results) + ctx.info(f"Running task {i+1}/{len(tasks)}: {task}") + + ctx.report_progress(i / len(tasks)) + results.extend(await run_cua_task(ctx, task)) + ctx.report_progress((i + 1) / len(tasks)) + + return results return server From 60bcb0716cfa720084faafb1d0c9b1f716a6945c Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Wed, 30 Apr 2025 17:05:49 -0700 Subject: [PATCH 29/38] improved mcp prompting --- libs/mcp-server/mcp_server/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index f5fb37b2..67f7fe12 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -52,7 +52,7 @@ def serve() -> FastMCP: @server.tool() async def screenshot_cua(ctx: Context) -> Image: """ - Take a screenshot of the current screen and return the image. + Take a screenshot of the current MacOS VM screen and return the image. Use this before running a CUA task to get a snapshot of the current state. Args: ctx: The MCP context @@ -73,7 +73,7 @@ def serve() -> FastMCP: @server.tool() async def run_cua_task(ctx: Context, task: str) -> Tuple[str, Image]: """ - Run a Computer-Use Agent (CUA) task and return the results. + Run a Computer-Use Agent (CUA) task in a MacOS VM and return the results. Args: ctx: The MCP context @@ -187,7 +187,7 @@ def serve() -> FastMCP: @server.tool() async def run_multi_cua_tasks(ctx: Context, tasks: List[str]) -> List: """ - Run multiple CUA tasks in sequence and return the combined results. + Run multiple CUA tasks in a MacOS VM in sequence and return the combined results. 
Args: ctx: The MCP context From cf7d05421239d403da08e7227e724d3945daf58e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 11:58:30 -0700 Subject: [PATCH 30/38] added standardized logging for easy parsing of trajectories --- examples/video_maker_traj.py | 360 +++++++++++-------- libs/agent/agent/providers/anthropic/loop.py | 2 + libs/agent/agent/providers/omni/loop.py | 2 + libs/agent/agent/providers/openai/loop.py | 4 + libs/agent/agent/providers/uitars/loop.py | 4 +- 5 files changed, 231 insertions(+), 141 deletions(-) diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py index 34a5ad3c..b9966aa0 100644 --- a/examples/video_maker_traj.py +++ b/examples/video_maker_traj.py @@ -68,82 +68,148 @@ def load_cursor_images(): last_known_cursor_position = None last_known_thought = None -def extract_thought_from_api_response(filename): - """Extract thought from API response for the current frame.""" +def parse_agent_response(filename_or_turn_dir): + """Parse agent response JSON file to extract text, actions, and cursor positions.""" + + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir + else: + turn_dir = os.path.dirname(filename_or_turn_dir) + + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] + + result = { + "text": [], + "actions": [], + "cursor_positions": [] + } + + for agent_file in agent_response_files: + try: + with open(os.path.join(turn_dir, agent_file), 'r') as f: + data = json.load(f) + response_data = data.get('response', {}) + + # First check for content field (simple text response) + if response_data.get("content"): + result["text"].append(response_data.get("content", "")) + + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + output_type = output.get("type") + + if output_type == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + result["text"].append(content_part.get("text", "")) + + elif output_type == "reasoning": + # Handle reasoning (thought) content + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + result["text"].append(summary_part.get("text", "")) + else: + summary_text = output.get("text", "") + if summary_text: + result["text"].append(summary_text) + + elif output_type == "computer_call": + action = output.get("action", {}) + if action: + result["actions"].append(action) + # Extract cursor position if available + if action.get("x") is not None and action.get("y") is not None: + result["cursor_positions"].append((action.get("x"), action.get("y"))) + except Exception as e: + print(f"Error processing {agent_file}: {e}") + + return result + +def extract_thought_from_agent_response(filename_or_turn_dir): + """Extract thought from agent response for the current frame.""" global last_known_thought - turn_dir = os.path.dirname(filename) - api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + agent_response = parse_agent_response(filename_or_turn_dir) - for api_file in api_response_files: - try: - with open(os.path.join(turn_dir, api_file), 'r') as f: - data = json.load(f) - # Extract content from response - content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') - - # 
Extract the Thought section - thought_match = re.search(r"Thought: (.*?)(?:\nAction:|$)", content, re.DOTALL) - if thought_match: - thought = thought_match.group(1).strip() - if thought: - last_known_thought = thought - return thought - except (json.JSONDecodeError, FileNotFoundError, KeyError): - pass + if agent_response["text"]: + # Use the first text entry as the thought + last_known_thought = agent_response["text"][0] + return last_known_thought # Return the last known thought if no new thought is found return last_known_thought -def extract_cursor_position_from_filename(filename): - """Extract cursor position from a filename containing click info.""" +def extract_cursor_position_from_agent_response(filename_or_turn_dir): + """Extract cursor position from agent response.""" global last_known_cursor_position - # For 'screenshot_NNN_click_TIMESTAMP.png', try to extract coordinates - match = re.search(r'click_(\d+)_(\d+)_\d+\.png$', filename) - if match: - position = (int(match.group(1)), int(match.group(2))) - last_known_cursor_position = position - return position + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir + else: + turn_dir = os.path.dirname(filename_or_turn_dir) - # Check if we have position info from API response - turn_dir = os.path.dirname(filename) - api_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_response.json')] + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - for api_file in api_response_files: + for agent_file in agent_response_files: try: - with open(os.path.join(turn_dir, api_file), 'r') as f: + with open(os.path.join(turn_dir, agent_file), 'r') as f: data = json.load(f) - # Extract action from response - content = data.get('response', {}).get('choices', [{}])[0].get('message', {}).get('content', '') - # Look for coordinates in the action - # First try the pattern from the example: click(start_box='(28,15)') - coord_match = re.search(r"click\(start_box='\((\d+),(\d+)\)'\)", content) - if coord_match: - position = (int(coord_match.group(1)), int(coord_match.group(2))) - last_known_cursor_position = position - return position + response_data = data.get('response', {}) - # Try alternative pattern: click(start_box='<|box_start|>(x,y)<|box_end|>') - alt_match = re.search(r"click\(start_box='<\|box_start\|>\((\d+),(\d+)\)<\|box_end\|>'\)", content) - if alt_match: - position = (int(alt_match.group(1)), int(alt_match.group(2))) - last_known_cursor_position = position - return position - except (json.JSONDecodeError, FileNotFoundError, KeyError): - pass + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + if output.get("type") == "computer_call": + action = output.get("action", {}) + if action.get("x") is not None and action.get("y") is not None: + position = (action.get("x"), action.get("y")) + last_known_cursor_position = position + return position + except Exception as e: + print(f"Error processing {agent_file}: {e}") - # No new position found, return the last known position + # No position found in agent response, return the last known position return last_known_cursor_position -def extract_action_from_filename(filename): - """Determine the action type from the filename pattern.""" - if 'click' in filename: - return "clicking" - elif 'type' in filename: - return "typing" +def 
extract_action_from_agent_response(filename_or_turn_dir): + """Determine the action type from agent response.""" + # Check if we're getting a filename or turn directory + if os.path.isdir(filename_or_turn_dir): + turn_dir = filename_or_turn_dir else: - return "normal" + turn_dir = os.path.dirname(filename_or_turn_dir) + + # Find agent response files in the turn directory + agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] + + for agent_file in agent_response_files: + try: + with open(os.path.join(turn_dir, agent_file), 'r') as f: + data = json.load(f) + response_data = data.get('response', {}) + + # Process outputs array if present + outputs = response_data.get("output", []) + for output in outputs: + if output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type == "click": + return "clicking" + elif action_type == "type" or action_type == "input": + return "typing" + except Exception as e: + print(f"Error processing {agent_file}: {e}") + + return "normal" def create_animated_vignette(image, frame_index): """ @@ -451,58 +517,54 @@ def create_cursor_overlay(base_image, position, cursor_images, thought_text=None return result -def get_screenshot_files(trajectory_dir): +def get_turns(trajectory_dir): """ - Get all screenshot files from a trajectory directory, sorted by sequence number. + Get all turn folders from a trajectory directory and their corresponding files. Args: - trajectory_dir: Path to trajectory directory containing turn_XXX folders + trajectory_dir: Path to trajectory directory Returns: - List of tuples (path, sequence_number, action_type, position) + List of tuples (turn_dir, agent_response_path, image_file_path) """ - screenshot_files = [] + turns = [] # List all turn directories in order turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], - key=lambda x: int(x.split('_')[1])) + key=lambda x: int(x.split('_')[1])) - for turn_dir in turn_dirs: - turn_path = os.path.join(trajectory_dir, turn_dir) + for turn_dir_name in turn_dirs: + turn_path = os.path.join(trajectory_dir, turn_dir_name) if not os.path.isdir(turn_path): continue - - # Get all screenshot files in this turn - files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] - for file in files: - file_path = os.path.join(turn_path, file) - - # Extract sequence number from filename (e.g., screenshot_003_...) 
- seq_match = re.search(r'screenshot_(\d+)', file) - if seq_match: - seq_number = int(seq_match.group(1)) - - # Determine action type from filename - action_type = extract_action_from_filename(file) - - # Get cursor position if available - position = extract_cursor_position_from_filename(file_path) - - screenshot_files.append((file_path, seq_number, action_type, position)) + # Find agent response files (if any) + agent_response_files = [f for f in os.listdir(turn_path) if f.endswith('_agent_response.json')] + agent_response_path = None + if agent_response_files: + agent_response_path = os.path.join(turn_path, agent_response_files[0]) + + # Find screenshot files (if any) + screenshot_files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] + screenshot_path = None + if screenshot_files: + # Sort by sequence number to get the main one + sorted_screenshots = sorted(screenshot_files, + key=lambda x: int(re.search(r'screenshot_(\d+)', x).group(1) + if re.search(r'screenshot_(\d+)', x) else 0)) + screenshot_path = os.path.join(turn_path, sorted_screenshots[0]) if sorted_screenshots else None + + turns.append((turn_path, agent_response_path, screenshot_path)) - # Sort by sequence number - screenshot_files.sort(key=lambda x: x[1]) - - return screenshot_files + return turns def process_trajectory(trajectory_dir, output_dir, cursors): """Process a trajectory directory and create output frames.""" - # Get all screenshot files - screenshot_files = get_screenshot_files(trajectory_dir) + # Get all turns with their associated files + turns = get_turns(trajectory_dir) - if not screenshot_files: - print(f"No screenshot files found in {trajectory_dir}") + if not turns: + print(f"No turn directories found in {trajectory_dir}") return # Create output directory @@ -511,20 +573,27 @@ def process_trajectory(trajectory_dir, output_dir, cursors): # Track frame index frame_index = 0 - # Process each screenshot + # Process each turn prev_img = None prev_cursor_pos = None - for i, (file_path, seq_number, action_type, position) in enumerate(tqdm(screenshot_files, desc="Processing frames")): + for turn_path, agent_response_path, screenshot_path in tqdm(turns, desc="Processing turns"): + if not screenshot_path: + continue # Skip turns without screenshots + # Load the current image try: - current_img = Image.open(file_path) + current_img = Image.open(screenshot_path) except Exception as e: - print(f"Error loading image {file_path}: {e}") + print(f"Error loading image {screenshot_path}: {e}") continue - # Current cursor position - current_cursor_pos = position + # Extract action and position from agent response + action_type = extract_action_from_agent_response(turn_path) + current_cursor_pos = extract_cursor_position_from_agent_response(turn_path) + + # Extract thought from agent response + current_thought = extract_thought_from_agent_response(turn_path) # Check if the current frame has an action (click/typing) is_action_frame = action_type in ["clicking", "typing"] @@ -535,9 +604,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): half_frames = FRAMES_PER_CLICK // 2 # First half of animation uses PREVIOUS image for j in range(half_frames): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( prev_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -552,9 +618,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): # Second half uses CURRENT image for j 
in range(half_frames, FRAMES_PER_CLICK): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -569,9 +632,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): else: # If no previous frame, use current for full animation for j in range(FRAMES_PER_CLICK): - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -585,9 +645,6 @@ def process_trajectory(trajectory_dir, output_dir, cursors): frame_index += 1 else: # Regular frame with normal cursor - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - output_img = create_cursor_overlay( current_img, current_cursor_pos, cursors, thought_text=current_thought, @@ -599,42 +656,43 @@ def process_trajectory(trajectory_dir, output_dir, cursors): output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) frame_index += 1 - # Add position interpolation frames if we're not at the last frame - if i < len(screenshot_files) - 1: - # Get next position - next_cursor_pos = screenshot_files[i+1][3] - - # Only interpolate if both positions are valid and different - if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: - for j in range(1, FRAMES_PER_MOVE): - progress = j / FRAMES_PER_MOVE - interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress - interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress - interp_pos = (int(interp_x), int(interp_y)) - - # Create interpolated movement frame - # Get the thought from the API response - current_thought = extract_thought_from_api_response(file_path) - - output_img = create_cursor_overlay( - current_img, interp_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Save current frame as previous for next iteration + # Store current frame as previous for next iteration prev_img = current_img prev_cursor_pos = current_cursor_pos + + # Add position interpolation frames if we have both current and next turn data + current_turn_index = turns.index((turn_path, agent_response_path, screenshot_path)) + if current_turn_index < len(turns) - 1: + # Get next turn data + next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] + if next_screenshot_path: # Only if next turn has a screenshot + # Get next position + next_cursor_pos = extract_cursor_position_from_agent_response(next_turn_path) + + # Only interpolate if both positions are valid and different + if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: + for j in range(1, FRAMES_PER_MOVE): + progress = j / FRAMES_PER_MOVE + interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress + interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress + interp_pos = (int(interp_x), int(interp_y)) + + # Create interpolated movement frame + output_img = 
create_cursor_overlay( + current_img, interp_pos, cursors, + thought_text=current_thought, + cursor_type="normal", + frame_index=frame_index + ) + # Apply animated vignette effect + output_img = create_animated_vignette(output_img, frame_index) + output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) + frame_index += 1 def main(): """Main function to process the trajectory and create video frames.""" parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') - parser.add_argument('trajectory_dir', type=str, help='Path to the trajectory folder') + parser.add_argument('trajectory_dir', type=str, nargs='?', help='Path to the trajectory folder') parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') @@ -642,6 +700,28 @@ def main(): args = parser.parse_args() trajectory_dir = args.trajectory_dir + + # If trajectory_dir is not provided, find the latest folder in './trajectories' + if trajectory_dir is None: + trajectories_base_dir = "./trajectories" + if os.path.exists(trajectories_base_dir) and os.path.isdir(trajectories_base_dir): + # Get all directories in the trajectories folder + trajectory_folders = [os.path.join(trajectories_base_dir, d) for d in os.listdir(trajectories_base_dir) + if os.path.isdir(os.path.join(trajectories_base_dir, d))] + + if trajectory_folders: + # Sort folders by modification time, most recent last + trajectory_folders.sort(key=lambda x: os.path.getmtime(x)) + # Use the most recent folder + trajectory_dir = trajectory_folders[-1] + print(f"No trajectory directory specified, using latest: {trajectory_dir}") + else: + print(f"No trajectory folders found in {trajectories_base_dir}") + return + else: + print(f"Trajectories directory {trajectories_base_dir} does not exist") + return + output_dir = args.output_dir fps = args.fps output_video = args.output_video diff --git a/libs/agent/agent/providers/anthropic/loop.py b/libs/agent/agent/providers/anthropic/loop.py index 0ccdc79a..130a43cb 100644 --- a/libs/agent/agent/providers/anthropic/loop.py +++ b/libs/agent/agent/providers/anthropic/loop.py @@ -279,6 +279,8 @@ class AnthropicLoop(BaseLoop): messages, model=self.model, ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=openai_compatible_response) await queue.put(openai_compatible_response) if not should_continue: diff --git a/libs/agent/agent/providers/omni/loop.py b/libs/agent/agent/providers/omni/loop.py index b53c120c..18e0375f 100644 --- a/libs/agent/agent/providers/omni/loop.py +++ b/libs/agent/agent/providers/omni/loop.py @@ -670,6 +670,8 @@ class OmniLoop(BaseLoop): parsed_screen=parsed_screen, parser=self.parser ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=openai_compatible_response) # Yield the response to the caller yield openai_compatible_response diff --git a/libs/agent/agent/providers/openai/loop.py b/libs/agent/agent/providers/openai/loop.py index 8e507a1b..c4e0dfb5 100644 --- a/libs/agent/agent/providers/openai/loop.py +++ b/libs/agent/agent/providers/openai/loop.py @@ -276,6 +276,10 @@ class OpenAILoop(BaseLoop): ) # Don't reset last_response_id to None - keep the previous value if available + + # Log standardized 
response for ease of parsing + # Since this is the openAI responses format, we don't need to convert it to agent response format + self._log_api_call("agent_response", request=None, response=response) # Process API response await queue.put(response) diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index a30d3bee..ac14ed1e 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -440,7 +440,7 @@ class UITARSLoop(BaseLoop): # MAIN LOOP - IMPLEMENTING ABSTRACT METHOD ########################################### - async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[Dict[str, Any], None]: + async def run(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]: """Run the agent loop with provided messages. Args: @@ -513,6 +513,8 @@ class UITARSLoop(BaseLoop): messages, model=self.model, ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=agent_response) yield agent_response # Check if we should continue this conversation From f449005751d5ac699a1b050e9f5bb45398bbca47 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 12:07:43 -0700 Subject: [PATCH 31/38] code cleanup --- examples/video_maker_traj.py | 123 ++++++++++------------------------- 1 file changed, 34 insertions(+), 89 deletions(-) diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py index b9966aa0..552969b6 100644 --- a/examples/video_maker_traj.py +++ b/examples/video_maker_traj.py @@ -69,7 +69,8 @@ last_known_cursor_position = None last_known_thought = None def parse_agent_response(filename_or_turn_dir): - """Parse agent response JSON file to extract text, actions, and cursor positions.""" + """Parse agent response JSON file to extract text, actions, cursor positions, thought, and action type.""" + global last_known_cursor_position, last_known_thought # Check if we're getting a filename or turn directory if os.path.isdir(filename_or_turn_dir): @@ -83,7 +84,9 @@ def parse_agent_response(filename_or_turn_dir): result = { "text": [], "actions": [], - "cursor_positions": [] + "cursor_positions": [], + "thought": None, + "action_type": "normal" } for agent_file in agent_response_files: @@ -125,92 +128,32 @@ def parse_agent_response(filename_or_turn_dir): result["actions"].append(action) # Extract cursor position if available if action.get("x") is not None and action.get("y") is not None: - result["cursor_positions"].append((action.get("x"), action.get("y"))) + position = (action.get("x"), action.get("y")) + result["cursor_positions"].append(position) + last_known_cursor_position = position + + # Determine action type + action_type = action.get("type", "") + if action_type == "click": + result["action_type"] = "clicking" + elif action_type == "type" or action_type == "input": + result["action_type"] = "typing" except Exception as e: print(f"Error processing {agent_file}: {e}") + # Set thought from text if available + if result["text"]: + result["thought"] = ' '.join(result["text"]) + last_known_thought = result["thought"] + else: + result["thought"] = last_known_thought + + # Set cursor position if not found + if not result["cursor_positions"]: + result["cursor_positions"] = [last_known_cursor_position] if last_known_cursor_position else [] + return result -def extract_thought_from_agent_response(filename_or_turn_dir): - """Extract thought from agent response for the current frame.""" - global 
last_known_thought - - agent_response = parse_agent_response(filename_or_turn_dir) - - if agent_response["text"]: - # Use the first text entry as the thought - last_known_thought = agent_response["text"][0] - return last_known_thought - - # Return the last known thought if no new thought is found - return last_known_thought - -def extract_cursor_position_from_agent_response(filename_or_turn_dir): - """Extract cursor position from agent response.""" - global last_known_cursor_position - - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - if output.get("type") == "computer_call": - action = output.get("action", {}) - if action.get("x") is not None and action.get("y") is not None: - position = (action.get("x"), action.get("y")) - last_known_cursor_position = position - return position - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - # No position found in agent response, return the last known position - return last_known_cursor_position - -def extract_action_from_agent_response(filename_or_turn_dir): - """Determine the action type from agent response.""" - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - if output.get("type") == "computer_call": - action = output.get("action", {}) - action_type = action.get("type", "") - if action_type == "click": - return "clicking" - elif action_type == "type" or action_type == "input": - return "typing" - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - return "normal" - def create_animated_vignette(image, frame_index): """ Create an animated purple/blue gradient vignette effect around the border of the image. 
@@ -588,12 +531,13 @@ def process_trajectory(trajectory_dir, output_dir, cursors): print(f"Error loading image {screenshot_path}: {e}") continue - # Extract action and position from agent response - action_type = extract_action_from_agent_response(turn_path) - current_cursor_pos = extract_cursor_position_from_agent_response(turn_path) + # Parse agent response + agent_response = parse_agent_response(turn_path) - # Extract thought from agent response - current_thought = extract_thought_from_agent_response(turn_path) + # Extract action type, cursor position, and thought + action_type = agent_response["action_type"] + current_cursor_pos = agent_response["cursor_positions"][0] if agent_response["cursor_positions"] else None + current_thought = agent_response["thought"] # Check if the current frame has an action (click/typing) is_action_frame = action_type in ["clicking", "typing"] @@ -667,7 +611,8 @@ def process_trajectory(trajectory_dir, output_dir, cursors): next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] if next_screenshot_path: # Only if next turn has a screenshot # Get next position - next_cursor_pos = extract_cursor_position_from_agent_response(next_turn_path) + next_agent_response = parse_agent_response(next_turn_path) + next_cursor_pos = next_agent_response["cursor_positions"][0] if next_agent_response["cursor_positions"] else None # Only interpolate if both positions are valid and different if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: From f2e390ba553f9e4b85515b6aa8e7d43090deca5d Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 14:30:14 -0700 Subject: [PATCH 32/38] added env variable for CUA_PROVIDER_API_KEY --- libs/mcp-server/README.md | 5 +++-- libs/mcp-server/mcp_server/server.py | 13 +++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index 5649cc19..86430e95 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -68,7 +68,7 @@ You can then use the script in your MCP configuration like this: "CUA_AGENT_LOOP": "OMNI", "CUA_MODEL_PROVIDER": "ANTHROPIC", "CUA_MODEL_NAME": "claude-3-7-sonnet-20250219", - "ANTHROPIC_API_KEY": "your-api-key" + "CUA_PROVIDER_API_KEY": "your-api-key" } } } @@ -90,7 +90,8 @@ If you want to develop with the cua-mcp-server directly without installation, yo "CUA_AGENT_LOOP": "UITARS", "CUA_MODEL_PROVIDER": "OAICOMPAT", "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", - "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1" + "CUA_PROVIDER_BASE_URL": "https://****************.us-east-1.aws.endpoints.huggingface.cloud/v1", + "CUA_PROVIDER_API_KEY": "your-api-key" } } } diff --git a/libs/mcp-server/mcp_server/server.py b/libs/mcp-server/mcp_server/server.py index 67f7fe12..03971cb6 100644 --- a/libs/mcp-server/mcp_server/server.py +++ b/libs/mcp-server/mcp_server/server.py @@ -94,14 +94,7 @@ def serve() -> FastMCP: # Determine which loop to use loop_str = os.getenv("CUA_AGENT_LOOP", "OMNI") - if loop_str == "OPENAI": - loop = AgentLoop.OPENAI - elif loop_str == "ANTHROPIC": - loop = AgentLoop.ANTHROPIC - elif loop_str == "UITARS": - loop = AgentLoop.UITARS - else: - loop = AgentLoop.OMNI + loop = getattr(AgentLoop, loop_str) # Determine provider provider_str = os.getenv("CUA_MODEL_PROVIDER", "ANTHROPIC") @@ -113,6 +106,9 @@ def serve() -> FastMCP: # Get base URL for provider (if needed) 
provider_base_url = os.getenv("CUA_PROVIDER_BASE_URL", None) + # Get api key for provider (if needed) + api_key = os.getenv("CUA_PROVIDER_API_KEY", None) + # Create agent with the specified configuration agent = ComputerAgent( computer=global_computer, @@ -122,6 +118,7 @@ def serve() -> FastMCP: name=model_name, provider_base_url=provider_base_url, ), + api_key=api_key, save_trajectory=False, only_n_most_recent_images=int(os.getenv("CUA_MAX_IMAGES", "3")), verbosity=logging.INFO, From 3ec479368b6af825eeff9a5b35f797fdb6b762da Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 15:04:20 -0700 Subject: [PATCH 33/38] changed dev guide to use a .sh script --- libs/mcp-server/README.md | 22 +++++++++++---------- libs/mcp-server/scripts/start_mcp_server.sh | 14 +++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) create mode 100755 libs/mcp-server/scripts/start_mcp_server.sh diff --git a/libs/mcp-server/README.md b/libs/mcp-server/README.md index 86430e95..736ab364 100644 --- a/libs/mcp-server/README.md +++ b/libs/mcp-server/README.md @@ -83,10 +83,9 @@ If you want to develop with the cua-mcp-server directly without installation, yo { "mcpServers": { "cua-agent": { - "command": "/Users/YOURUSERNAME/cua/.venv/bin/python", - "args": ["-m", "mcp_server.server"], + "command": "/bin/bash", + "args": ["~/cua/libs/mcp-server/scripts/start_mcp_server.sh"], "env": { - "PYTHONPATH": "/Users/YOURUSERNAME/cua/libs/mcp-server:/Users/YOURUSERNAME/cua/libs/agent:/Users/YOURUSERNAME/cua/libs/computer:/Users/YOURUSERNAME/cua/libs/core:/Users/YOURUSERNAME/cua/libs/pylume", "CUA_AGENT_LOOP": "UITARS", "CUA_MODEL_PROVIDER": "OAICOMPAT", "CUA_MODEL_NAME": "ByteDance-Seed/UI-TARS-1.5-7B", @@ -98,19 +97,22 @@ If you want to develop with the cua-mcp-server directly without installation, yo } ``` -To see the logs: -``` -tail -n 20 -f ~/Library/Logs/Claude/mcp*.log -``` - This configuration: -- Uses your local Python virtual environment to run the server module directly -- Sets the Python path to include all necessary library dependencies +- Uses the start_mcp_server.sh script which automatically sets up the Python path and runs the server module - Works with Claude Desktop, Cursor, or any other MCP client - Automatically uses your development code without requiring installation Just add this to your MCP client's configuration and it will use your local development version of the server. +### Troubleshooting + +If you get a `/bin/bash: ~/cua/libs/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative. + +To see the logs: +``` +tail -n 20 -f ~/Library/Logs/Claude/mcp*.log +``` + ## Claude Desktop Integration To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`): diff --git a/libs/mcp-server/scripts/start_mcp_server.sh b/libs/mcp-server/scripts/start_mcp_server.sh new file mode 100755 index 00000000..17fd9dab --- /dev/null +++ b/libs/mcp-server/scripts/start_mcp_server.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +# Set the CUA repository path based on script location +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +CUA_REPO_DIR="$( cd "$SCRIPT_DIR/../../.." 
&> /dev/null && pwd )" +PYTHON_PATH="${CUA_REPO_DIR}/.venv/bin/python" + +# Set Python path to include all necessary libraries +export PYTHONPATH="${CUA_REPO_DIR}/libs/mcp-server:${CUA_REPO_DIR}/libs/agent:${CUA_REPO_DIR}/libs/computer:${CUA_REPO_DIR}/libs/core:${CUA_REPO_DIR}/libs/pylume" + +# Run the MCP server directly as a module +$PYTHON_PATH -m mcp_server.server \ No newline at end of file From 9a00c510740ee42d3a7a8d24efe16ca65791ae9a Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:34:05 -0700 Subject: [PATCH 34/38] Fix model name param --- libs/agent/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/agent/README.md b/libs/agent/README.md index e5dad869..cdcdb8f7 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -50,10 +50,10 @@ async with Computer() as macos_computer: # model=LLM(provider=LLMProvider.ANTHROPIC) # or # loop=AgentLoop.OMNI, - # model=LLM(provider=LLMProvider.OLLAMA, model="gemma3") + # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") # or # loop=AgentLoop.UITARS, - # model=LLM(provider=LLMProvider.OAICOMPAT, model="tgi", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + # model=LLM(provider=LLMProvider.OAICOMPAT, name="name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") ) tasks = [ From 6072d59cd93aa1385298cc449d723452a00ce2fc Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:34:49 -0700 Subject: [PATCH 35/38] Update README.md --- libs/agent/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/agent/README.md b/libs/agent/README.md index cdcdb8f7..c1aac96c 100644 --- a/libs/agent/README.md +++ b/libs/agent/README.md @@ -53,7 +53,7 @@ async with Computer() as macos_computer: # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3") # or # loop=AgentLoop.UITARS, - # model=LLM(provider=LLMProvider.OAICOMPAT, name="name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") + # model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1") ) tasks = [ From aa6bbdf3a89ea4b8fdb7af6fde86cf122e85c2f4 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 16:36:10 -0700 Subject: [PATCH 36/38] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 45a97758..8eb12022 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ If you want to use AI agents with virtualized environments: async with Computer(verbosity=logging.DEBUG) as macos_computer: agent = ComputerAgent( computer=macos_computer, - loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.OMNI + loop=AgentLoop.OPENAI, # or AgentLoop.ANTHROPIC, or AgentLoop.UITARS, or AgentLoop.OMNI model=LLM(provider=LLMProvider.OPENAI) # or LLM(provider=LLMProvider.ANTHROPIC) ) From e9a9c03b637916eb307af0e7e3c8a7113f723449 Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 1 May 2025 18:27:54 -0700 Subject: [PATCH 37/38] Fix handleStop --- libs/lume/src/Server/Server.swift | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/libs/lume/src/Server/Server.swift b/libs/lume/src/Server/Server.swift index 98ffc588..782efa70 100644 --- a/libs/lume/src/Server/Server.swift +++ b/libs/lume/src/Server/Server.swift @@ -179,8 +179,21 @@ 
final class Server { return HTTPResponse(statusCode: .badRequest, body: "Missing VM name") } - // Extract storage from query params if present - let storage = self.extractQueryParam(request: request, name: "storage") + Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path]) + + // Extract storage from the request body + var storage: String? = nil + if let bodyData = request.body, !bodyData.isEmpty { + do { + if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any], + let bodyStorage = json["storage"] as? String { + storage = bodyStorage + Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage]) + } + } catch { + Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription]) + } + } return try await self.handleStopVM(name: name, storage: storage) }), From d55f566aa19491f3047565728ee227cfa7f2cdeb Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Thu, 1 May 2025 20:29:56 -0700 Subject: [PATCH 38/38] delete video maker python script in favor of video player react component --- examples/video_maker_traj.py | 717 ----------------------------------- 1 file changed, 717 deletions(-) delete mode 100644 examples/video_maker_traj.py diff --git a/examples/video_maker_traj.py b/examples/video_maker_traj.py deleted file mode 100644 index 552969b6..00000000 --- a/examples/video_maker_traj.py +++ /dev/null @@ -1,717 +0,0 @@ -""" -Video Maker for Trajectory Dataset - -This script processes a trajectory dataset folder, extracts frames, -and creates an animated video with cursor overlays. -""" - -from utils import load_dotenv_files -load_dotenv_files() - -import os -import json -import math -import shutil -import re -from pathlib import Path -import argparse -import numpy as np -from PIL import Image, ImageDraw, ImageFilter -import requests -from io import BytesIO -from tqdm import tqdm - -# Constants -CURSOR_SCALE = 2 # Scale factor for cursor size -FRAMES_PER_CLICK = 8 # Number of frames to show for click animation -FRAMES_PER_MOVE = 10 # Number of frames to interpolate between cursor positions -CURSOR_NORMAL = "https://mac-cursors.netlify.app/png/default@2x.png" -CURSOR_CLICKING = "https://mac-cursors.netlify.app/png/handpointing@2x.png" -CURSOR_TYPING = "https://mac-cursors.netlify.app/png/textcursor@2x.png" -CURSOR_HOTSPOT = (20, 15) -OUTPUT_DIR = "examples/output/video_frames" - -# Vignette effect constants -VIGNETTE_WIDTH = 10 # Width of the vignette border in pixels -VIGNETTE_COLORS = [(128, 0, 255), (0, 0, 255)] # Purple to Blue gradient colors -VIGNETTE_ANIMATION_SPEED = 0.1 # Controls speed of the animation pulse - -def download_image(url): - """Download an image from a URL.""" - response = requests.get(url) - return Image.open(BytesIO(response.content)) - -def load_cursor_images(): - """Load and resize cursor images.""" - cursor_normal = download_image(CURSOR_NORMAL) - cursor_clicking = download_image(CURSOR_CLICKING) - cursor_typing = download_image(CURSOR_TYPING) - - # Resize all cursors based on CURSOR_SCALE - width_normal, height_normal = cursor_normal.size - width_clicking, height_clicking = cursor_clicking.size - width_typing, height_typing = cursor_typing.size - - cursor_normal = cursor_normal.resize((int(width_normal * CURSOR_SCALE), int(height_normal * CURSOR_SCALE))) - cursor_clicking = cursor_clicking.resize((int(width_clicking * CURSOR_SCALE), int(height_clicking * CURSOR_SCALE))) - cursor_typing = cursor_typing.resize((int(width_typing * 
CURSOR_SCALE), int(height_typing * CURSOR_SCALE))) - - cursors = { - "normal": cursor_normal, - "clicking": cursor_clicking, - "typing": cursor_typing - } - - return cursors - -# Store the last known cursor position and thought across all frames -last_known_cursor_position = None -last_known_thought = None - -def parse_agent_response(filename_or_turn_dir): - """Parse agent response JSON file to extract text, actions, cursor positions, thought, and action type.""" - global last_known_cursor_position, last_known_thought - - # Check if we're getting a filename or turn directory - if os.path.isdir(filename_or_turn_dir): - turn_dir = filename_or_turn_dir - else: - turn_dir = os.path.dirname(filename_or_turn_dir) - - # Find agent response files in the turn directory - agent_response_files = [f for f in os.listdir(turn_dir) if f.endswith('_agent_response.json')] - - result = { - "text": [], - "actions": [], - "cursor_positions": [], - "thought": None, - "action_type": "normal" - } - - for agent_file in agent_response_files: - try: - with open(os.path.join(turn_dir, agent_file), 'r') as f: - data = json.load(f) - response_data = data.get('response', {}) - - # First check for content field (simple text response) - if response_data.get("content"): - result["text"].append(response_data.get("content", "")) - - # Process outputs array if present - outputs = response_data.get("output", []) - for output in outputs: - output_type = output.get("type") - - if output_type == "message": - content = output.get("content", []) - for content_part in content: - if content_part.get("text"): - result["text"].append(content_part.get("text", "")) - - elif output_type == "reasoning": - # Handle reasoning (thought) content - summary_content = output.get("summary", []) - if summary_content: - for summary_part in summary_content: - if summary_part.get("type") == "summary_text": - result["text"].append(summary_part.get("text", "")) - else: - summary_text = output.get("text", "") - if summary_text: - result["text"].append(summary_text) - - elif output_type == "computer_call": - action = output.get("action", {}) - if action: - result["actions"].append(action) - # Extract cursor position if available - if action.get("x") is not None and action.get("y") is not None: - position = (action.get("x"), action.get("y")) - result["cursor_positions"].append(position) - last_known_cursor_position = position - - # Determine action type - action_type = action.get("type", "") - if action_type == "click": - result["action_type"] = "clicking" - elif action_type == "type" or action_type == "input": - result["action_type"] = "typing" - except Exception as e: - print(f"Error processing {agent_file}: {e}") - - # Set thought from text if available - if result["text"]: - result["thought"] = ' '.join(result["text"]) - last_known_thought = result["thought"] - else: - result["thought"] = last_known_thought - - # Set cursor position if not found - if not result["cursor_positions"]: - result["cursor_positions"] = [last_known_cursor_position] if last_known_cursor_position else [] - - return result - -def create_animated_vignette(image, frame_index): - """ - Create an animated purple/blue gradient vignette effect around the border of the image. - The animation pulses the colors and gently varies their intensity over time. 
- - Args: - image: The base image to apply the vignette to - frame_index: Current frame index for animation timing - - Returns: - Image with vignette effect applied - """ - # Create a copy of the image to work with - result = image.copy() - width, height = result.size - - # Create a blank RGBA image for the vignette overlay - vignette = Image.new('RGBA', (width, height), (0, 0, 0, 0)) - draw = ImageDraw.Draw(vignette) - - # Calculate animation phase based on frame index - phase = math.sin(frame_index * VIGNETTE_ANIMATION_SPEED) * 0.5 + 0.5 # Oscillates between 0 and 1 - - # Interpolate between the vignette colors based on the animation phase - color1 = VIGNETTE_COLORS[0] - color2 = VIGNETTE_COLORS[1] - animated_color = ( - int(color1[0] + (color2[0] - color1[0]) * phase), - int(color1[1] + (color2[1] - color1[1]) * phase), - int(color1[2] + (color2[2] - color1[2]) * phase), - ) - - # Draw gradient borders around each edge - # Top border - for i in range(VIGNETTE_WIDTH): - alpha = int(150 * (1 - i / VIGNETTE_WIDTH)) - border_color = animated_color[:3] + (alpha,) - draw.line([(0, i), (width, i)], fill=border_color, width=1) - draw.line([(0, height-i-1), (width, height-i-1)], fill=border_color, width=1) - draw.line([(i, 0), (i, height)], fill=border_color, width=1) - draw.line([(width-i-1, 0), (width-i-1, height)], fill=border_color, width=1) - - # Apply slight blur to smooth the gradient - vignette = vignette.filter(ImageFilter.GaussianBlur(16)) - - # Composite the vignette over the original image - result = Image.alpha_composite(result.convert('RGBA'), vignette) - - return result.convert('RGB') # Convert back to RGB for consistency - -def scale_cursor_with_animation(cursor, frame, max_frames, cursor_type): - """Create springy scale animation for cursor.""" - if cursor_type == "normal": - return cursor - - # For clicking or typing cursors, create a spring effect - progress = frame / max_frames - - # Spring effect calculation - starts big, gets smaller, then back to normal - if progress < 0.3: - # Start with larger scale, shrink down - scale = 1.3 - progress - elif progress < 0.7: - # Then bounce back up a bit - scale = 0.7 + (progress - 0.3) * 0.8 - else: - # Then settle to normal (1.0) - scale = 1.0 + (1.0 - progress) * 0.3 - - # Apply scale - width, height = cursor.size - new_width = int(width * scale) - new_height = int(height * scale) - return cursor.resize((new_width, new_height)) - -# Store the last thought bubble position -last_thought_bubble_pos = None - -def draw_thought_bubble(image, position, thought_text, frame_index): - """Draw a thought bubble with the AI's thoughts near the cursor position.""" - global last_thought_bubble_pos - - if thought_text is None or position is None: - return image - - # Create a copy of the image to work with - result = image.copy() - - # Set up text parameters - font_size = 16 - try: - # Try to use a nice font if available - from PIL import ImageFont - try: - font = ImageFont.truetype("Arial", font_size) - except IOError: - # Fallback to default font - font = ImageFont.load_default() - except ImportError: - font = None - - # Wrap text to fit in bubble - max_width = 400 # Max width in pixels - wrapped_lines = [] - words = thought_text.split() - current_line = [] - - for word in words: - # Add word to current line - test_line = ' '.join(current_line + [word]) - - # Create a temporary draw object to measure text width if needed - temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) - - # Measure the text width - if font: - if hasattr(temp_draw, 
'textlength'): - text_width = temp_draw.textlength(test_line, font=font) - else: - # Fall back to rough estimation - text_width = len(test_line) * (font_size * 0.6) - else: - # Rough estimation if no font metrics are available - text_width = len(test_line) * (font_size * 0.6) - - if text_width <= max_width: - current_line.append(word) - else: - # Line is full, start a new line - if current_line: - wrapped_lines.append(' '.join(current_line)) - current_line = [word] - - # Don't forget the last line - if current_line: - wrapped_lines.append(' '.join(current_line)) - - # Limit number of lines for very long thoughts - max_lines = 8 - if len(wrapped_lines) > max_lines: - wrapped_lines = wrapped_lines[:max_lines-1] + ["..."] - - # Calculate text dimensions - line_height = font_size + 4 - text_height = len(wrapped_lines) * line_height - - # Find the widest line - if font: - # Create a draw object to measure text width - temp_draw = ImageDraw.Draw(Image.new('RGB', (1, 1))) - if hasattr(temp_draw, 'textlength'): - text_width = max(temp_draw.textlength(line, font=font) for line in wrapped_lines) - else: - # Fall back to rough estimation - text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) - else: - text_width = max(len(line) * (font_size * 0.6) for line in wrapped_lines) - - # Add padding - padding = 20 - bubble_width = text_width + padding * 2 - bubble_height = text_height + padding * 2 - - # Calculate bubble position - move slowly towards cursor position - x, y = position - screen_width, screen_height = image.size - - # Default initial position if this is the first bubble - target_bubble_x = min(x + 30, screen_width - bubble_width - 10) - target_bubble_y = max(y - bubble_height - 30, 10) - - # Ensure target position is fully on screen - if target_bubble_x < 10: - target_bubble_x = 10 - if target_bubble_y + bubble_height > screen_height - 10: - target_bubble_y = screen_height - bubble_height - 10 - - # Calculate new position with slow movement towards target - # Very slow movement factor (0.01 means it moves 1% of the distance per frame) - movement_factor = 0.001 - - if last_thought_bubble_pos is None: - # First frame, set to target position - bubble_x, bubble_y = target_bubble_x, target_bubble_y - else: - # Interpolate slowly towards target position - last_x, last_y = last_thought_bubble_pos - bubble_x = last_x + (target_bubble_x - last_x) * movement_factor - bubble_y = last_y + (target_bubble_y - last_y) * movement_factor - - # Add a subtle animation effect to the bubble - # animation_offset = math.sin(frame_index * 0.1) * 2 - # bubble_y += int(animation_offset) - - # Store position for next frame - last_thought_bubble_pos = (bubble_x, bubble_y) - - # Draw rounded rectangle for bubble - corner_radius = 15 - - # Background with black gaussian blur - background_color = (0, 0, 0, 180) # Black with transparency - outline_color = (50, 50, 50, 255) # Dark gray outline - - # Draw the bubble background - first create an RGBA version - bubble_img = Image.new('RGBA', result.size, (0, 0, 0, 0)) - bubble_draw = ImageDraw.Draw(bubble_img) - - # Draw rounded rectangle - # Check if rounded_rectangle is available (PIL 8.0.0+) - if hasattr(bubble_draw, 'rounded_rectangle'): - bubble_draw.rounded_rectangle( - [bubble_x, bubble_y, bubble_x + bubble_width, bubble_y + bubble_height], - radius=corner_radius, - fill=background_color, - outline=outline_color, - width=2 - ) - else: - # Fall back to regular rectangle if rounded_rectangle not available - bubble_draw.rectangle( - [bubble_x, bubble_y, 
bubble_x + bubble_width, bubble_y + bubble_height], - fill=background_color, - outline=outline_color - ) - - # Apply gaussian blur to the bubble background - bubble_img = bubble_img.filter(ImageFilter.GaussianBlur(3)) - - # Draw small triangle pointing to cursor - pointer_size = 10 - pointer_x = x + 15 - pointer_y = y - 5 - - # Make sure pointer is under the bubble - if pointer_x > bubble_x + bubble_width: - pointer_x = bubble_x + bubble_width - 20 - elif pointer_x < bubble_x: - pointer_x = bubble_x + 20 - - # Create an overlay for the pointer - pointer_overlay = Image.new('RGBA', result.size, (0, 0, 0, 0)) - pointer_draw = ImageDraw.Draw(pointer_overlay) - - # Draw pointer triangle - # pointer_draw.polygon( - # [ - # (pointer_x, pointer_y), - # (pointer_x - pointer_size, pointer_y - pointer_size), - # (pointer_x + pointer_size, pointer_y - pointer_size) - # ], - # fill=background_color, - # outline=outline_color - # ) - - # Apply gaussian blur to the pointer - pointer_overlay = pointer_overlay.filter(ImageFilter.GaussianBlur(3)) - - # Composite the bubble and pointer onto the original image - result = Image.alpha_composite(result.convert('RGBA'), bubble_img) - result = Image.alpha_composite(result, pointer_overlay) - - # Now draw the text - draw = ImageDraw.Draw(result) - text_x = bubble_x + padding - text_y = bubble_y + padding - - text_color = (255, 255, 255, 255) # White text - for line in wrapped_lines: - draw.text((text_x, text_y), line, font=font, fill=text_color) - text_y += line_height - - return result.convert('RGB') - -def create_cursor_overlay(base_image, position, cursor_images, thought_text=None, cursor_type="normal", animation_frame=0, frame_index=0): - """Create an image with cursor overlaid on the base image and thought bubble if available.""" - # Create a copy of the base image - result = base_image.copy() - - # If position is None, return the image without a cursor - if position is None: - return result - - # Get the appropriate cursor image - cursor = cursor_images[cursor_type] - - # Apply animation scaling if needed - if cursor_type in ["clicking", "typing"]: - cursor = scale_cursor_with_animation(cursor, animation_frame, FRAMES_PER_CLICK, cursor_type) - - # Calculate position to center the cursor hotspot - # Cursor hotspot is at (20,15) of the cursor image - x, y = position - hotspot_x, hotspot_y = CURSOR_HOTSPOT - cursor_x = x - (hotspot_x * CURSOR_SCALE) # X offset for hotspot - cursor_y = y - (hotspot_y * CURSOR_SCALE) # Y offset for hotspot - - # Paste the cursor onto the image - result.paste(cursor, (int(cursor_x), int(cursor_y)), cursor) - - # Add thought bubble if text is available - if thought_text: - result = draw_thought_bubble(result, position, thought_text, frame_index) - - return result - -def get_turns(trajectory_dir): - """ - Get all turn folders from a trajectory directory and their corresponding files. 
- - Args: - trajectory_dir: Path to trajectory directory - - Returns: - List of tuples (turn_dir, agent_response_path, image_file_path) - """ - turns = [] - - # List all turn directories in order - turn_dirs = sorted([d for d in os.listdir(trajectory_dir) if d.startswith('turn_')], - key=lambda x: int(x.split('_')[1])) - - for turn_dir_name in turn_dirs: - turn_path = os.path.join(trajectory_dir, turn_dir_name) - if not os.path.isdir(turn_path): - continue - - # Find agent response files (if any) - agent_response_files = [f for f in os.listdir(turn_path) if f.endswith('_agent_response.json')] - agent_response_path = None - if agent_response_files: - agent_response_path = os.path.join(turn_path, agent_response_files[0]) - - # Find screenshot files (if any) - screenshot_files = [f for f in os.listdir(turn_path) if f.startswith('screenshot_') and f.endswith('.png')] - screenshot_path = None - if screenshot_files: - # Sort by sequence number to get the main one - sorted_screenshots = sorted(screenshot_files, - key=lambda x: int(re.search(r'screenshot_(\d+)', x).group(1) - if re.search(r'screenshot_(\d+)', x) else 0)) - screenshot_path = os.path.join(turn_path, sorted_screenshots[0]) if sorted_screenshots else None - - turns.append((turn_path, agent_response_path, screenshot_path)) - - return turns - -def process_trajectory(trajectory_dir, output_dir, cursors): - """Process a trajectory directory and create output frames.""" - # Get all turns with their associated files - turns = get_turns(trajectory_dir) - - if not turns: - print(f"No turn directories found in {trajectory_dir}") - return - - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Track frame index - frame_index = 0 - - # Process each turn - prev_img = None - prev_cursor_pos = None - - for turn_path, agent_response_path, screenshot_path in tqdm(turns, desc="Processing turns"): - if not screenshot_path: - continue # Skip turns without screenshots - - # Load the current image - try: - current_img = Image.open(screenshot_path) - except Exception as e: - print(f"Error loading image {screenshot_path}: {e}") - continue - - # Parse agent response - agent_response = parse_agent_response(turn_path) - - # Extract action type, cursor position, and thought - action_type = agent_response["action_type"] - current_cursor_pos = agent_response["cursor_positions"][0] if agent_response["cursor_positions"] else None - current_thought = agent_response["thought"] - - # Check if the current frame has an action (click/typing) - is_action_frame = action_type in ["clicking", "typing"] - - if is_action_frame: - # If we have a previous frame, use it for the first half of animation - if prev_img is not None: - half_frames = FRAMES_PER_CLICK // 2 - # First half of animation uses PREVIOUS image - for j in range(half_frames): - output_img = create_cursor_overlay( - prev_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Second half uses CURRENT image - for j in range(half_frames, FRAMES_PER_CLICK): - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = 
create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - else: - # If no previous frame, use current for full animation - for j in range(FRAMES_PER_CLICK): - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type=action_type, - animation_frame=j, - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - else: - # Regular frame with normal cursor - output_img = create_cursor_overlay( - current_img, current_cursor_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - - # Store current frame as previous for next iteration - prev_img = current_img - prev_cursor_pos = current_cursor_pos - - # Add position interpolation frames if we have both current and next turn data - current_turn_index = turns.index((turn_path, agent_response_path, screenshot_path)) - if current_turn_index < len(turns) - 1: - # Get next turn data - next_turn_path, next_agent_response_path, next_screenshot_path = turns[current_turn_index + 1] - if next_screenshot_path: # Only if next turn has a screenshot - # Get next position - next_agent_response = parse_agent_response(next_turn_path) - next_cursor_pos = next_agent_response["cursor_positions"][0] if next_agent_response["cursor_positions"] else None - - # Only interpolate if both positions are valid and different - if current_cursor_pos is not None and next_cursor_pos is not None and current_cursor_pos != next_cursor_pos: - for j in range(1, FRAMES_PER_MOVE): - progress = j / FRAMES_PER_MOVE - interp_x = current_cursor_pos[0] + (next_cursor_pos[0] - current_cursor_pos[0]) * progress - interp_y = current_cursor_pos[1] + (next_cursor_pos[1] - current_cursor_pos[1]) * progress - interp_pos = (int(interp_x), int(interp_y)) - - # Create interpolated movement frame - output_img = create_cursor_overlay( - current_img, interp_pos, cursors, - thought_text=current_thought, - cursor_type="normal", - frame_index=frame_index - ) - # Apply animated vignette effect - output_img = create_animated_vignette(output_img, frame_index) - output_img.save(os.path.join(output_dir, f"frame_{frame_index:04d}.png")) - frame_index += 1 - -def main(): - """Main function to process the trajectory and create video frames.""" - parser = argparse.ArgumentParser(description='Create a video from a trajectory folder.') - parser.add_argument('trajectory_dir', type=str, nargs='?', help='Path to the trajectory folder') - parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Output directory for video frames') - parser.add_argument('--fps', type=int, default=24, help='Frames per second for the output video') - parser.add_argument('--output_video', type=str, default='output_video.mp4', help='Path to output video file') - parser.add_argument('--skip_ffmpeg', action='store_true', help='Skip running ffmpeg to create video') - args = parser.parse_args() - - trajectory_dir = args.trajectory_dir - - # If trajectory_dir is not provided, find the latest folder in './trajectories' - if trajectory_dir is None: - trajectories_base_dir = 
"./trajectories" - if os.path.exists(trajectories_base_dir) and os.path.isdir(trajectories_base_dir): - # Get all directories in the trajectories folder - trajectory_folders = [os.path.join(trajectories_base_dir, d) for d in os.listdir(trajectories_base_dir) - if os.path.isdir(os.path.join(trajectories_base_dir, d))] - - if trajectory_folders: - # Sort folders by modification time, most recent last - trajectory_folders.sort(key=lambda x: os.path.getmtime(x)) - # Use the most recent folder - trajectory_dir = trajectory_folders[-1] - print(f"No trajectory directory specified, using latest: {trajectory_dir}") - else: - print(f"No trajectory folders found in {trajectories_base_dir}") - return - else: - print(f"Trajectories directory {trajectories_base_dir} does not exist") - return - - output_dir = args.output_dir - fps = args.fps - output_video = args.output_video - skip_ffmpeg = args.skip_ffmpeg - - # Check if trajectory directory exists - if not os.path.exists(trajectory_dir): - print(f"Trajectory directory {trajectory_dir} does not exist") - return - - # Clean output directory if it exists - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - - # Create output directory - os.makedirs(output_dir, exist_ok=True) - - # Load cursor images - print("Loading cursor images...") - cursors = load_cursor_images() - - # Process the trajectory - print(f"Processing trajectory from {trajectory_dir}...") - process_trajectory(trajectory_dir, output_dir, cursors) - - print(f"Processing complete. Frames saved to {output_dir}") - - # Run ffmpeg to create the video - if not skip_ffmpeg: - print(f"Running ffmpeg to create video: {output_video}") - ffmpeg_cmd = f"ffmpeg -y -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}" - try: - import subprocess - result = subprocess.run(ffmpeg_cmd, shell=True, check=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True) - print(f"Video created successfully: {output_video}") - except subprocess.CalledProcessError as e: - print(f"Error running ffmpeg: {e}") - print(f"ffmpeg output:\n{e.stdout}\n{e.stderr}") - print("\nYou can create a video manually with this command:") - print(ffmpeg_cmd) - else: - print("Skipping ffmpeg. You can create a video from these frames using ffmpeg with this command:") - print(f"ffmpeg -framerate {fps} -i {output_dir}/frame_%04d.png -c:v libx264 -pix_fmt yuv420p {output_video}") - -if __name__ == "__main__": - main()