diff --git a/libs/python/agent/agent/core/agent.py b/libs/python/agent/agent/core/agent.py index e9d3b866..d6a400fa 100644 --- a/libs/python/agent/agent/core/agent.py +++ b/libs/python/agent/agent/core/agent.py @@ -29,6 +29,7 @@ class ComputerAgent: trajectory_dir: str = "trajectories", only_n_most_recent_images: Optional[int] = None, verbosity: int = logging.INFO, + disable_response_storage: bool = False, ): """Initialize the ComputerAgent. @@ -45,6 +46,7 @@ class ComputerAgent: trajectory_dir: Directory to save the trajectory. only_n_most_recent_images: Maximum number of recent screenshots to include in API requests. verbosity: Logging level. + disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy. """ # Basic agent configuration self.max_retries = max_retries @@ -55,6 +57,7 @@ class ComputerAgent: self._retry_count = 0 self._initialized = False self._in_context = False + self.disable_response_storage = disable_response_storage # Set logging level logger.setLevel(verbosity) @@ -105,6 +108,7 @@ class ComputerAgent: trajectory_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, provider_base_url=self.provider_base_url, + disable_response_storage=disable_response_storage, ) except ValueError as e: logger.error(f"Failed to create loop: {str(e)}") diff --git a/libs/python/agent/agent/core/base.py b/libs/python/agent/agent/core/base.py index fe0f07ad..8e40313f 100644 --- a/libs/python/agent/agent/core/base.py +++ b/libs/python/agent/agent/core/base.py @@ -29,6 +29,7 @@ class BaseLoop(ABC): save_trajectory: bool = True, only_n_most_recent_images: Optional[int] = 2, callback_handlers: Optional[List[CallbackHandler]] = None, + disable_response_storage: bool = False, **kwargs, ): """Initialize base agent loop. 
@@ -43,6 +44,7 @@ class BaseLoop(ABC): base_dir: Base directory for saving experiment data save_trajectory: Whether to save trajectory data only_n_most_recent_images: Maximum number of recent screenshots to include in API requests + disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy. **kwargs: Additional provider-specific arguments """ self.computer = computer @@ -54,6 +56,7 @@ class BaseLoop(ABC): self.base_dir = base_dir self.save_trajectory = save_trajectory self.only_n_most_recent_images = only_n_most_recent_images + self.disable_response_storage = disable_response_storage self._kwargs = kwargs # Initialize message manager diff --git a/libs/python/agent/agent/core/factory.py b/libs/python/agent/agent/core/factory.py index f0c6046e..9d1ef476 100644 --- a/libs/python/agent/agent/core/factory.py +++ b/libs/python/agent/agent/core/factory.py @@ -30,6 +30,7 @@ class LoopFactory: only_n_most_recent_images: Optional[int] = None, acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None, provider_base_url: Optional[str] = None, + disable_response_storage: bool = False, ) -> BaseLoop: """Create and return an appropriate loop instance based on type.""" if loop_type == AgentLoop.ANTHROPIC: @@ -49,6 +50,7 @@ class LoopFactory: save_trajectory=save_trajectory, base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, + disable_response_storage=disable_response_storage, ) elif loop_type == AgentLoop.OPENAI: # Lazy import OpenAILoop only when needed @@ -68,6 +70,7 @@ class LoopFactory: base_dir=trajectory_dir, only_n_most_recent_images=only_n_most_recent_images, acknowledge_safety_check_callback=acknowledge_safety_check_callback, + disable_response_storage=disable_response_storage, ) elif loop_type == AgentLoop.OMNI: # Lazy import OmniLoop and related classes only when needed @@ -97,6 +100,7 @@ class LoopFactory: 
def add_openai_response(self, response: Dict[str, Any]) -> None:
    """Record the assistant-side items of an OpenAI response in the history.

    Used when conversation state is managed manually (no
    ``previous_response_id`` chaining): the relevant items of
    ``response["output"]`` are collected into a single assistant message.

    Handled output item types:
        * ``message`` - every nested ``output_text`` part becomes a
          ``{"type": "text", "text": ...}`` entry. This is the shape the
          Responses API uses for assistant text.
        * ``output_text`` - a flat text item, kept for backward compatibility.
        * ``computer_call`` - appended unchanged so tool execution can be
          tracked.

    Any other item (including non-dict entries) is ignored. If nothing
    usable is found, no message is added.

    Args:
        response: OpenAI API response dictionary containing an ``output`` list.
    """
    if not isinstance(response, dict) or "output" not in response:
        logger.warning("Invalid OpenAI response format for adding to message history")
        return

    output_items = response["output"]
    if not isinstance(output_items, list):
        logger.warning("OpenAI response output is not a list")
        return

    assistant_content: List[Dict[str, Any]] = []
    for item in output_items:
        if not isinstance(item, dict):
            continue

        item_type = item.get("type")
        if item_type == "message":
            # Assistant text arrives wrapped in a message item; unwrap its
            # output_text parts so the text is not lost from the history.
            parts = item.get("content")
            if isinstance(parts, list):
                assistant_content.extend(
                    {"type": "text", "text": part.get("text", "")}
                    for part in parts
                    if isinstance(part, dict) and part.get("type") == "output_text"
                )
        elif item_type == "output_text":
            assistant_content.append({"type": "text", "text": item.get("text", "")})
        elif item_type == "computer_call":
            # Keep computer calls as-is for tool execution tracking.
            assistant_content.append(item)
        # NOTE(review): "reasoning" items are not persisted here. Confirm the
        # request builder does not need them resent alongside computer_call
        # items when previous_response_id is not used.

    # Only add an assistant message when there is something to record.
    if assistant_content:
        self.add_assistant_message(assistant_content)
Args: loop: OpenAI loop instance + disable_response_storage: Whether to disable response storage """ self.loop = loop self.api_key = os.getenv("OPENAI_API_KEY") @@ -45,7 +46,7 @@ class OpenAIAPIHandler: display_width: str, display_height: str, previous_response_id: Optional[str] = None, - os_type: str, + os_type: str = "mac", ) -> Dict[str, Any]: """Send an initial request to the OpenAI API with a screenshot. @@ -61,10 +62,7 @@ class OpenAIAPIHandler: # Convert from our internal OS types to the ones OpenAI expects if os_type == "macos": os_type = "mac" - elif os_type == "linux": - os_type = "ubuntu" - - if os_type not in ["mac", "windows", "ubuntu", "browser"]: + if os_type not in ["mac", "windows", "linux", "browser"]: raise ValueError(f"Invalid OS type: {os_type}") # Convert display dimensions to integers @@ -143,7 +141,7 @@ class OpenAIAPIHandler: ], "input": input_array, "reasoning": { - "generate_summary": "concise", + "summary": "concise", }, "truncation": "auto", } @@ -207,10 +205,8 @@ class OpenAIAPIHandler: # Convert from our internal OS types to the ones OpenAI expects if os_type == "macos": os_type = "mac" - elif os_type == "linux": - os_type = "ubuntu" - if os_type not in ["mac", "windows", "ubuntu", "browser"]: + if os_type not in ["mac", "windows", "linux", "browser"]: raise ValueError(f"Invalid OS type: {os_type}") # Convert display dimensions to integers @@ -289,6 +285,9 @@ class OpenAIAPIHandler: }, } ], + "reasoning": { + "summary": "concise", + }, "truncation": "auto", } diff --git a/libs/python/agent/agent/providers/openai/loop.py b/libs/python/agent/agent/providers/openai/loop.py index cc9a07db..cb9fdbe7 100644 --- a/libs/python/agent/agent/providers/openai/loop.py +++ b/libs/python/agent/agent/providers/openai/loop.py @@ -40,6 +40,7 @@ class OpenAILoop(BaseLoop): retry_delay: float = 1.0, save_trajectory: bool = True, acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None, + disable_response_storage: bool = False, 
**kwargs, ): """Initialize the OpenAI loop. @@ -54,6 +55,7 @@ class OpenAILoop(BaseLoop): retry_delay: Delay between retries in seconds save_trajectory: Whether to save trajectory data acknowledge_safety_check_callback: Optional callback for safety check acknowledgment + disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy. **kwargs: Additional provider-specific arguments """ # Always use computer-use-preview model @@ -72,6 +74,7 @@ class OpenAILoop(BaseLoop): base_dir=base_dir, save_trajectory=save_trajectory, only_n_most_recent_images=only_n_most_recent_images, + disable_response_storage=disable_response_storage, **kwargs, ) @@ -90,7 +93,7 @@ class OpenAILoop(BaseLoop): self.loop_task = None # Store the loop task for cancellation # Initialize handlers - self.api_handler = OpenAIAPIHandler(self) + self.api_handler = OpenAIAPIHandler(self, self.disable_response_storage) self.response_handler = OpenAIResponseHandler(self) # Initialize tool manager with callback @@ -275,24 +278,47 @@ class OpenAILoop(BaseLoop): # Call API screen_size = await self.computer.interface.get_screen_size() - response = await self.api_handler.send_initial_request( - messages=self.message_manager.get_messages(), # Apply image retention policy - display_width=str(screen_size["width"]), - display_height=str(screen_size["height"]), - previous_response_id=self.last_response_id, - os_type=self.computer.os_type, - ) - - # Store response ID for next request - # OpenAI API response structure: the ID is in the response dictionary - if isinstance(response, dict) and "id" in response: - self.last_response_id = response["id"] # Update instance variable - logger.info(f"Received response with ID: {self.last_response_id}") - else: - logger.warning( - f"Could not find response ID in OpenAI response: {type(response)}" + + # Choose API call method based on disable_response_storage setting + if 
self.disable_response_storage: + # Manual conversation state management - always send full message history + response = await self.api_handler.send_initial_request( + messages=self.message_manager.get_messages(), # Apply image retention policy + display_width=str(screen_size["width"]), + display_height=str(screen_size["height"]), + previous_response_id=None, # Don't use response chaining + os_type=self.computer.os_type, ) - # Don't reset last_response_id to None - keep the previous value if available + else: + # Use OpenAI's response storage with previous_response_id + response = await self.api_handler.send_initial_request( + messages=self.message_manager.get_messages(), # Apply image retention policy + display_width=str(screen_size["width"]), + display_height=str(screen_size["height"]), + previous_response_id=self.last_response_id, + os_type=self.computer.os_type, + ) + + from pprint import pprint + + print("========== send_initial_request ===========") + pprint(response) + print("===========================================") + + if self.disable_response_storage: + # Manual conversation state management - add response to message history + self.message_manager.add_openai_response(response) + else: + # Store response ID for next request + # OpenAI API response structure: the ID is in the response dictionary + if isinstance(response, dict) and "id" in response: + self.last_response_id = response["id"] # Update instance variable + logger.info(f"Received response with ID: {self.last_response_id}") + else: + logger.warning( + f"Could not find response ID in OpenAI response: {type(response)}" + ) + # Don't reset last_response_id to None - keep the previous value if available # Log standardized response for ease of parsing @@ -393,27 +419,54 @@ class OpenAILoop(BaseLoop): ) self.message_manager.add_user_message([computer_call_output]) - # For follow-up requests with previous_response_id, we only need to send - # the computer_call_output, not the full message history - # 
The API handler will extract this from the message history - if isinstance(self.last_response_id, str): - response = await self.api_handler.send_computer_call_request( + # Choose API call method based on disable_response_storage setting + if self.disable_response_storage: + # Manual conversation state management - send full message history + response = await self.api_handler.send_initial_request( messages=self.message_manager.get_messages(), # Apply image retention policy display_width=str(screen_size["width"]), display_height=str(screen_size["height"]), - previous_response_id=self.last_response_id, # Use instance variable + previous_response_id=None, # Don't use response chaining os_type=self.computer.os_type, ) - - # Store response ID for next request - if isinstance(response, dict) and "id" in response: - self.last_response_id = response["id"] # Update instance variable - logger.info(f"Received response with ID: {self.last_response_id}") + + from pprint import pprint + + print("========== send_initial_request (manual mode) ===========") + pprint(response) + print("========================================================") + + # Add response to message history for manual state management + self.message_manager.add_openai_response(response) else: - logger.warning( - f"Could not find response ID in OpenAI response: {type(response)}" - ) - # Keep using the previous response ID if we can't find a new one + # Use OpenAI's response storage with previous_response_id + # For follow-up requests with previous_response_id, we only need to send + # the computer_call_output, not the full message history + # The API handler will extract this from the message history + if isinstance(self.last_response_id, str): + response = await self.api_handler.send_computer_call_request( + messages=self.message_manager.get_messages(), # Apply image retention policy + display_width=str(screen_size["width"]), + display_height=str(screen_size["height"]), + 
previous_response_id=self.last_response_id, # Use instance variable + os_type=self.computer.os_type, + ) + + from pprint import pprint + + print("========== send_computer_call_request ===========") + pprint(response) + print("============================================") + + # Store response ID for next request + if isinstance(response, dict) and "id" in response: + self.last_response_id = response["id"] # Update instance variable + logger.info(f"Received response with ID: {self.last_response_id}") + else: + logger.warning( + f"Could not find response ID in OpenAI response: {type(response)}" + ) + # Keep using the previous response ID if we can't find a new one # Process the response # await self.response_handler.process_response(response, queue) @@ -455,20 +508,3 @@ class OpenAILoop(BaseLoop): } ) await queue.put(None) # Signal that we're done - - def get_last_response_id(self) -> Optional[str]: - """Get the last response ID. - - Returns: - The last response ID or None if no response has been received - """ - return self.last_response_id - - def set_last_response_id(self, response_id: str) -> None: - """Set the last response ID. 
#!/bin/bash
#
# Development bootstrap using uv.
#
# Ensures uv is available (installing it when missing), loads .env.local,
# wipes stale build artefacts and virtual environments, creates a Python 3.12
# virtual environment in .venv, installs every package under libs/python in
# editable mode (in dependency order), and writes a .env file with PYTHONPATH
# for editors.

# Exit on error
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print a step/progress message.
print_step() {
    echo -e "${BLUE}==> $1${NC}"
}

# Print a success message.
print_success() {
    echo -e "${GREEN}==> Success: $1${NC}"
}

# Print an error message to stderr.
print_error() {
    echo -e "${RED}==> Error: $1${NC}" >&2
}

# Print a warning message.
print_warning() {
    echo -e "${YELLOW}==> Warning: $1${NC}"
}

# Return 0 (and print the version) when uv is on PATH, 1 otherwise.
check_uv() {
    if command -v uv &> /dev/null; then
        print_success "UV is already installed"
        uv --version
        return 0
    else
        return 1
    fi
}

# Install uv with the official installer on Linux/macOS; exit with
# instructions on platforms the script cannot install on.
install_uv() {
    print_step "UV not found. Installing UV..."

    # Detect OS
    if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "darwin"* ]]; then
        print_step "Installing UV for Unix-like system..."
        curl -LsSf https://astral.sh/uv/install.sh | sh

        # Add UV to PATH for the current session. Current installers place
        # the binary in ~/.local/bin; older releases used ~/.cargo/bin, so
        # both are added to keep the post-install check working.
        export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"

        # Check if installation was successful
        if command -v uv &> /dev/null; then
            print_success "UV installed successfully"
            uv --version
        else
            print_error "UV installation failed"
            print_step "Please restart your terminal and try again, or install manually:"
            echo "  curl -LsSf https://astral.sh/uv/install.sh | sh"
            exit 1
        fi
    elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then
        print_error "For Windows, please use PowerShell and run:"
        echo "  powershell -ExecutionPolicy ByPass -c \"irm https://astral.sh/uv/install.ps1 | iex\""
        exit 1
    else
        print_error "Unsupported operating system: $OSTYPE"
        print_step "Please install UV manually from: https://docs.astral.sh/uv/getting-started/installation/"
        exit 1
    fi
}

# Get the script's directory
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." && pwd )"

# Change to project root
cd "$PROJECT_ROOT"

# Check if UV is installed, install if not
if ! check_uv; then
    install_uv
fi

# Load environment variables from .env.local
if [ -f .env.local ]; then
    print_step "Loading environment variables from .env.local..."
    # set -a exports every variable assigned while the file is sourced
    set -a
    source .env.local
    set +a
    print_success "Environment variables loaded"
else
    print_error ".env.local file not found"
    exit 1
fi

# Clean up existing environments and cache
print_step "Cleaning up existing environments..."
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name "dist" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name ".venv" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true
print_success "Environment cleanup complete"

# Install Python 3.12 using UV
print_step "Installing Python 3.12 using UV..."
uv python install 3.12
print_success "Python 3.12 installed"

# Create virtual environment using UV
print_step "Creating virtual environment with UV..."
uv venv .venv --python 3.12
print_success "Virtual environment created"

# Activate virtual environment
print_step "Activating virtual environment..."
source .venv/bin/activate
print_success "Virtual environment activated"

# Install a package in editable mode using UV.
#   $1 - package directory, relative to the project root
#   $2 - package name (used for messages only)
#   $3 - optional extras, comma separated (e.g. "all")
# Returns 1 when the directory has no pyproject.toml.
install_package() {
    local package_dir=$1
    local package_name=$2
    local extras=$3
    print_step "Installing ${package_name} with UV..."

    # Validate before changing directory so a failure never leaves the
    # caller outside the project root.
    if [ ! -f "${package_dir}/pyproject.toml" ]; then
        print_error "No pyproject.toml found in ${package_dir}"
        return 1
    fi

    cd "$package_dir"
    if [ -n "$extras" ]; then
        uv pip install -e ".[${extras}]"
    else
        uv pip install -e .
    fi
    cd "$PROJECT_ROOT"
}

# Install packages in order of dependency
print_step "Installing packages in development mode with UV..."

# Install core first (base package with telemetry support)
install_package "libs/python/core" "core"

# Install pylume (base dependency)
install_package "libs/python/pylume" "pylume"

# Install computer with all its dependencies and extras
install_package "libs/python/computer" "computer" "all"

# Install omniparser
install_package "libs/python/som" "som"

# Install agent with all its dependencies and extras
install_package "libs/python/agent" "agent" "all"

# Install computer-server
install_package "libs/python/computer-server" "computer-server"

# Install mcp-server
install_package "libs/python/mcp-server" "mcp-server"

# Install development tools from root project
print_step "Installing development dependencies with UV..."
uv pip install -e ".[dev,test,docs]"

# Create a .env file for VS Code to use the virtual environment
print_step "Creating .env file for VS Code..."
echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env

print_success "All packages installed successfully with UV!"
print_step "Your virtual environment is ready. To activate it:"
echo "  source .venv/bin/activate"
print_step "UV provides fast dependency resolution and installation."
print_step "You can also use 'uv run' to run commands in the virtual environment without activation."