Added disable_response_storage

This commit is contained in:
Dillon DuPont
2025-07-21 12:07:54 -04:00
parent 96fd9cb98e
commit 9068ec32d8
8 changed files with 331 additions and 60 deletions
+4
View File
@@ -29,6 +29,7 @@ class ComputerAgent:
trajectory_dir: str = "trajectories",
only_n_most_recent_images: Optional[int] = None,
verbosity: int = logging.INFO,
disable_response_storage: bool = False,
):
"""Initialize the ComputerAgent.
@@ -45,6 +46,7 @@ class ComputerAgent:
trajectory_dir: Directory to save the trajectory.
only_n_most_recent_images: Maximum number of recent screenshots to include in API requests.
verbosity: Logging level.
disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy.
"""
# Basic agent configuration
self.max_retries = max_retries
@@ -55,6 +57,7 @@ class ComputerAgent:
self._retry_count = 0
self._initialized = False
self._in_context = False
self.disable_response_storage = disable_response_storage
# Set logging level
logger.setLevel(verbosity)
@@ -105,6 +108,7 @@ class ComputerAgent:
trajectory_dir=trajectory_dir,
only_n_most_recent_images=only_n_most_recent_images,
provider_base_url=self.provider_base_url,
disable_response_storage=disable_response_storage,
)
except ValueError as e:
logger.error(f"Failed to create loop: {str(e)}")
+3
View File
@@ -29,6 +29,7 @@ class BaseLoop(ABC):
save_trajectory: bool = True,
only_n_most_recent_images: Optional[int] = 2,
callback_handlers: Optional[List[CallbackHandler]] = None,
disable_response_storage: bool = False,
**kwargs,
):
"""Initialize base agent loop.
@@ -43,6 +44,7 @@ class BaseLoop(ABC):
base_dir: Base directory for saving experiment data
save_trajectory: Whether to save trajectory data
only_n_most_recent_images: Maximum number of recent screenshots to include in API requests
disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy.
**kwargs: Additional provider-specific arguments
"""
self.computer = computer
@@ -54,6 +56,7 @@ class BaseLoop(ABC):
self.base_dir = base_dir
self.save_trajectory = save_trajectory
self.only_n_most_recent_images = only_n_most_recent_images
self.disable_response_storage = disable_response_storage
self._kwargs = kwargs
# Initialize message manager
+5
View File
@@ -30,6 +30,7 @@ class LoopFactory:
only_n_most_recent_images: Optional[int] = None,
acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
provider_base_url: Optional[str] = None,
disable_response_storage: bool = False,
) -> BaseLoop:
"""Create and return an appropriate loop instance based on type."""
if loop_type == AgentLoop.ANTHROPIC:
@@ -49,6 +50,7 @@ class LoopFactory:
save_trajectory=save_trajectory,
base_dir=trajectory_dir,
only_n_most_recent_images=only_n_most_recent_images,
disable_response_storage=disable_response_storage,
)
elif loop_type == AgentLoop.OPENAI:
# Lazy import OpenAILoop only when needed
@@ -68,6 +70,7 @@ class LoopFactory:
base_dir=trajectory_dir,
only_n_most_recent_images=only_n_most_recent_images,
acknowledge_safety_check_callback=acknowledge_safety_check_callback,
disable_response_storage=disable_response_storage,
)
elif loop_type == AgentLoop.OMNI:
# Lazy import OmniLoop and related classes only when needed
@@ -97,6 +100,7 @@ class LoopFactory:
only_n_most_recent_images=only_n_most_recent_images,
parser=OmniParser(),
provider_base_url=provider_base_url,
disable_response_storage=disable_response_storage,
)
elif loop_type == AgentLoop.UITARS:
# Lazy import UITARSLoop only when needed
@@ -117,6 +121,7 @@ class LoopFactory:
only_n_most_recent_images=only_n_most_recent_images,
provider_base_url=provider_base_url,
provider=provider,
disable_response_storage=disable_response_storage,
)
else:
raise ValueError(f"Unsupported loop type: {loop_type}")
+38
View File
@@ -69,6 +69,44 @@ class StandardMessageManager:
return self._apply_image_retention(self.messages)
return self.messages
def add_openai_response(self, response: Dict[str, Any]) -> None:
    """Record the output of an OpenAI response as an assistant message.

    Used when provider-side response storage is disabled and conversation
    state must be maintained client-side: the response's output items are
    converted into assistant-message content and appended to the history.

    Args:
        response: Raw OpenAI API response dict; its "output" list is consumed.
    """
    if not (isinstance(response, dict) and "output" in response):
        logger.warning("Invalid OpenAI response format for adding to message history")
        return

    output_items = response.get("output", [])
    if not isinstance(output_items, list):
        logger.warning("OpenAI response output is not a list")
        return

    # NOTE(review): only "output_text" and "computer_call" items are kept;
    # any other item types in the output list are silently dropped — confirm
    # that is intended for manual state management.
    content: List[Dict[str, Any]] = []
    for item in output_items:
        if not isinstance(item, dict):
            continue
        kind = item.get("type")
        if kind == "output_text":
            content.append({"type": "text", "text": item.get("text", "")})
        elif kind == "computer_call":
            # Preserve computer calls verbatim so tool execution can be tracked.
            content.append(item)

    # Only append a message when there is something to record.
    if content:
        self.add_assistant_message(content)
def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Apply image retention policy to messages.
@@ -15,11 +15,12 @@ logger = logging.getLogger(__name__)
class OpenAIAPIHandler:
"""Handler for OpenAI API interactions."""
def __init__(self, loop: "OpenAILoop"):
def __init__(self, loop: "OpenAILoop", disable_response_storage: bool = False):
"""Initialize the API handler.
Args:
loop: OpenAI loop instance
disable_response_storage: Whether to disable response storage
"""
self.loop = loop
self.api_key = os.getenv("OPENAI_API_KEY")
@@ -45,7 +46,7 @@ class OpenAIAPIHandler:
display_width: str,
display_height: str,
previous_response_id: Optional[str] = None,
os_type: str,
os_type: str = "mac",
) -> Dict[str, Any]:
"""Send an initial request to the OpenAI API with a screenshot.
@@ -61,10 +62,7 @@ class OpenAIAPIHandler:
# Convert from our internal OS types to the ones OpenAI expects
if os_type == "macos":
os_type = "mac"
elif os_type == "linux":
os_type = "ubuntu"
if os_type not in ["mac", "windows", "ubuntu", "browser"]:
if os_type not in ["mac", "windows", "linux", "browser"]:
raise ValueError(f"Invalid OS type: {os_type}")
# Convert display dimensions to integers
@@ -143,7 +141,7 @@ class OpenAIAPIHandler:
],
"input": input_array,
"reasoning": {
"generate_summary": "concise",
"summary": "concise",
},
"truncation": "auto",
}
@@ -207,10 +205,8 @@ class OpenAIAPIHandler:
# Convert from our internal OS types to the ones OpenAI expects
if os_type == "macos":
os_type = "mac"
elif os_type == "linux":
os_type = "ubuntu"
if os_type not in ["mac", "windows", "ubuntu", "browser"]:
if os_type not in ["mac", "windows", "linux", "browser"]:
raise ValueError(f"Invalid OS type: {os_type}")
# Convert display dimensions to integers
@@ -289,6 +285,9 @@ class OpenAIAPIHandler:
},
}
],
"reasoning": {
"summary": "concise",
},
"truncation": "auto",
}
@@ -40,6 +40,7 @@ class OpenAILoop(BaseLoop):
retry_delay: float = 1.0,
save_trajectory: bool = True,
acknowledge_safety_check_callback: Optional[Callable[[str], Awaitable[bool]]] = None,
disable_response_storage: bool = False,
**kwargs,
):
"""Initialize the OpenAI loop.
@@ -54,6 +55,7 @@ class OpenAILoop(BaseLoop):
retry_delay: Delay between retries in seconds
save_trajectory: Whether to save trajectory data
acknowledge_safety_check_callback: Optional callback for safety check acknowledgment
disable_response_storage: Whether to disable response storage on the provider side. Turn this on if you are participating in a Zero Data Retention policy.
**kwargs: Additional provider-specific arguments
"""
# Always use computer-use-preview model
@@ -72,6 +74,7 @@ class OpenAILoop(BaseLoop):
base_dir=base_dir,
save_trajectory=save_trajectory,
only_n_most_recent_images=only_n_most_recent_images,
disable_response_storage=disable_response_storage,
**kwargs,
)
@@ -90,7 +93,7 @@ class OpenAILoop(BaseLoop):
self.loop_task = None # Store the loop task for cancellation
# Initialize handlers
self.api_handler = OpenAIAPIHandler(self)
self.api_handler = OpenAIAPIHandler(self, self.disable_response_storage)
self.response_handler = OpenAIResponseHandler(self)
# Initialize tool manager with callback
@@ -275,24 +278,47 @@ class OpenAILoop(BaseLoop):
# Call API
screen_size = await self.computer.interface.get_screen_size()
response = await self.api_handler.send_initial_request(
messages=self.message_manager.get_messages(), # Apply image retention policy
display_width=str(screen_size["width"]),
display_height=str(screen_size["height"]),
previous_response_id=self.last_response_id,
os_type=self.computer.os_type,
)
# Store response ID for next request
# OpenAI API response structure: the ID is in the response dictionary
if isinstance(response, dict) and "id" in response:
self.last_response_id = response["id"] # Update instance variable
logger.info(f"Received response with ID: {self.last_response_id}")
else:
logger.warning(
f"Could not find response ID in OpenAI response: {type(response)}"
# Choose API call method based on disable_response_storage setting
if self.disable_response_storage:
# Manual conversation state management - always send full message history
response = await self.api_handler.send_initial_request(
messages=self.message_manager.get_messages(), # Apply image retention policy
display_width=str(screen_size["width"]),
display_height=str(screen_size["height"]),
previous_response_id=None, # Don't use response chaining
os_type=self.computer.os_type,
)
# Don't reset last_response_id to None - keep the previous value if available
else:
# Use OpenAI's response storage with previous_response_id
response = await self.api_handler.send_initial_request(
messages=self.message_manager.get_messages(), # Apply image retention policy
display_width=str(screen_size["width"]),
display_height=str(screen_size["height"]),
previous_response_id=self.last_response_id,
os_type=self.computer.os_type,
)
from pprint import pprint
print("========== send_initial_request ===========")
pprint(response)
print("===========================================")
if self.disable_response_storage:
# Manual conversation state management - add response to message history
self.message_manager.add_openai_response(response)
else:
# Store response ID for next request
# OpenAI API response structure: the ID is in the response dictionary
if isinstance(response, dict) and "id" in response:
self.last_response_id = response["id"] # Update instance variable
logger.info(f"Received response with ID: {self.last_response_id}")
else:
logger.warning(
f"Could not find response ID in OpenAI response: {type(response)}"
)
# Don't reset last_response_id to None - keep the previous value if available
# Log standardized response for ease of parsing
@@ -393,27 +419,54 @@ class OpenAILoop(BaseLoop):
)
self.message_manager.add_user_message([computer_call_output])
# For follow-up requests with previous_response_id, we only need to send
# the computer_call_output, not the full message history
# The API handler will extract this from the message history
if isinstance(self.last_response_id, str):
response = await self.api_handler.send_computer_call_request(
# Choose API call method based on disable_response_storage setting
if self.disable_response_storage:
# Manual conversation state management - send full message history
response = await self.api_handler.send_initial_request(
messages=self.message_manager.get_messages(), # Apply image retention policy
display_width=str(screen_size["width"]),
display_height=str(screen_size["height"]),
previous_response_id=self.last_response_id, # Use instance variable
previous_response_id=None, # Don't use response chaining
os_type=self.computer.os_type,
)
# Store response ID for next request
if isinstance(response, dict) and "id" in response:
self.last_response_id = response["id"] # Update instance variable
logger.info(f"Received response with ID: {self.last_response_id}")
from pprint import pprint
print("========== send_initial_request (manual mode) ===========")
pprint(response)
print("========================================================")
# Add response to message history for manual state management
self.message_manager.add_openai_response(response)
else:
logger.warning(
f"Could not find response ID in OpenAI response: {type(response)}"
)
# Keep using the previous response ID if we can't find a new one
# Use OpenAI's response storage with previous_response_id
# For follow-up requests with previous_response_id, we only need to send
# the computer_call_output, not the full message history
# The API handler will extract this from the message history
if isinstance(self.last_response_id, str):
response = await self.api_handler.send_computer_call_request(
messages=self.message_manager.get_messages(), # Apply image retention policy
display_width=str(screen_size["width"]),
display_height=str(screen_size["height"]),
previous_response_id=self.last_response_id, # Use instance variable
os_type=self.computer.os_type,
)
from pprint import pprint
print("========== send_computer_call_request ===========")
pprint(response)
print("============================================")
# Store response ID for next request
if isinstance(response, dict) and "id" in response:
self.last_response_id = response["id"] # Update instance variable
logger.info(f"Received response with ID: {self.last_response_id}")
else:
logger.warning(
f"Could not find response ID in OpenAI response: {type(response)}"
)
# Keep using the previous response ID if we can't find a new one
# Process the response
# await self.response_handler.process_response(response, queue)
@@ -455,20 +508,3 @@ class OpenAILoop(BaseLoop):
}
)
await queue.put(None) # Signal that we're done
def get_last_response_id(self) -> Optional[str]:
"""Get the last response ID.
Returns:
The last response ID or None if no response has been received
"""
return self.last_response_id
def set_last_response_id(self, response_id: str) -> None:
"""Set the last response ID.
Args:
response_id: OpenAI response ID to set
"""
self.last_response_id = response_id
logger.info(f"Manually set response ID to: {self.last_response_id}")
+3
View File
@@ -91,6 +91,9 @@ all = [
"mlx-vlm>=0.1.27; sys_platform == 'darwin'"
]
[tool.uv]
constraint-dependencies = ["fastrtc>0.43.0", "mlx-audio>0.2.3"]
[tool.pdm]
distribution = true
+183
View File
@@ -0,0 +1,183 @@
#!/bin/bash
# Exit on error
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# --- Pretty-printing helpers -------------------------------------------------
# Each prints a colored "==> ..." banner; printf '%b' expands the escape
# sequences stored in the color variables exactly like `echo -e` does.

# Informational step banner (blue).
print_step() {
    printf '%b\n' "${BLUE}==> $1${NC}"
}

# Success banner (green).
print_success() {
    printf '%b\n' "${GREEN}==> Success: $1${NC}"
}

# Error banner (red), written to stderr.
print_error() {
    printf '%b\n' "${RED}==> Error: $1${NC}" >&2
}

# Warning banner (yellow).
print_warning() {
    printf '%b\n' "${YELLOW}==> Warning: $1${NC}"
}
# Succeed (status 0) and report the version when `uv` is on PATH; fail otherwise.
check_uv() {
    if ! command -v uv &> /dev/null; then
        return 1
    fi
    print_success "UV is already installed"
    uv --version
    return 0
}
# Install UV via the official installer script (Unix-like systems only);
# on Windows shells or unknown platforms, print manual instructions and exit.
install_uv() {
    print_step "UV not found. Installing UV..."

    # Detect OS
    if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ "$OSTYPE" == "darwin"* ]]; then
        print_step "Installing UV for Unix-like system..."
        curl -LsSf https://astral.sh/uv/install.sh | sh

        # Make the freshly installed binary visible in this session.
        # Recent uv installers place the binary in ~/.local/bin; older
        # releases used ~/.cargo/bin — add both so the check below works
        # regardless of installer version.
        export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"

        # Check if installation was successful
        if command -v uv &> /dev/null; then
            print_success "UV installed successfully"
            uv --version
        else
            print_error "UV installation failed"
            print_step "Please restart your terminal and try again, or install manually:"
            echo "  curl -LsSf https://astral.sh/uv/install.sh | sh"
            exit 1
        fi
    elif [[ "$OSTYPE" == "msys" ]] || [[ "$OSTYPE" == "cygwin" ]]; then
        print_error "For Windows, please use PowerShell and run:"
        echo "  powershell -ExecutionPolicy ByPass -c \"irm https://astral.sh/uv/install.ps1 | iex\""
        exit 1
    else
        print_error "Unsupported operating system: $OSTYPE"
        print_step "Please install UV manually from: https://docs.astral.sh/uv/getting-started/installation/"
        exit 1
    fi
}
# Get the script's directory (resolves symlink-free absolute paths)
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
PROJECT_ROOT="$( cd "${SCRIPT_DIR}/.." && pwd )"

# Change to project root so all relative paths below are rooted there
cd "$PROJECT_ROOT"

# Check if UV is installed, install if not
if ! check_uv; then
    install_uv
fi

# Load environment variables from .env.local
# `set -a` auto-exports every variable assigned while sourcing, so child
# processes (uv, build backends) inherit them; `set +a` restores normal mode.
if [ -f .env.local ]; then
    print_step "Loading environment variables from .env.local..."
    set -a
    source .env.local
    set +a
    print_success "Environment variables loaded"
else
    print_error ".env.local file not found"
    exit 1
fi

# Clean up existing environments and cache
# `|| true` keeps `set -e` from aborting when a find/rm pass matches nothing.
print_step "Cleaning up existing environments..."
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name ".pytest_cache" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name "dist" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name ".venv" -exec rm -rf {} + 2>/dev/null || true
find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true
print_success "Environment cleanup complete"

# Install Python 3.12 using UV
print_step "Installing Python 3.12 using UV..."
uv python install 3.12
print_success "Python 3.12 installed"

# Create virtual environment using UV
print_step "Creating virtual environment with UV..."
uv venv .venv --python 3.12
print_success "Virtual environment created"

# Activate virtual environment (affects this script's subsequent uv/pip calls)
print_step "Activating virtual environment..."
source .venv/bin/activate
print_success "Virtual environment activated"
# Editable-install one local package with UV.
#   $1 = package directory, $2 = display name, $3 = optional extras list
# Fails (and, under `set -e`, aborts the script) if the directory has no
# pyproject.toml.
install_package() {
    local package_dir=$1
    local package_name=$2
    local extras=$3

    print_step "Installing ${package_name} with UV..."
    cd "$package_dir"

    # Guard: refuse directories that are not UV/PEP 621 projects.
    if [ ! -f "pyproject.toml" ]; then
        print_error "No pyproject.toml found in ${package_dir}"
        return 1
    fi

    if [ -n "$extras" ]; then
        uv pip install -e ".[${extras}]"
    else
        uv pip install -e .
    fi

    cd "$PROJECT_ROOT"
}
# Install packages in order of dependency
# (each later package depends on one or more of the earlier ones, so the
# order below must be preserved)
print_step "Installing packages in development mode with UV..."

# Install core first (base package with telemetry support)
install_package "libs/python/core" "core"

# Install pylume (base dependency)
install_package "libs/python/pylume" "pylume"

# Install computer with all its dependencies and extras
install_package "libs/python/computer" "computer" "all"

# Install omniparser
install_package "libs/python/som" "som"

# Install agent with all its dependencies and extras
install_package "libs/python/agent" "agent" "all"

# Install computer-server
install_package "libs/python/computer-server" "computer-server"

# Install mcp-server
install_package "libs/python/mcp-server" "mcp-server"

# Install development tools from root project
print_step "Installing development dependencies with UV..."
uv pip install -e ".[dev,test,docs]"

# Create a .env file for VS Code to use the virtual environment
# (the Python extension reads PYTHONPATH from .env so intra-repo imports
# resolve without installation)
print_step "Creating .env file for VS Code..."
echo "PYTHONPATH=${PROJECT_ROOT}/libs/python/core:${PROJECT_ROOT}/libs/python/computer:${PROJECT_ROOT}/libs/python/agent:${PROJECT_ROOT}/libs/python/som:${PROJECT_ROOT}/libs/python/pylume:${PROJECT_ROOT}/libs/python/computer-server:${PROJECT_ROOT}/libs/python/mcp-server" > .env

print_success "All packages installed successfully with UV!"
print_step "Your virtual environment is ready. To activate it:"
echo " source .venv/bin/activate"
print_step "UV provides fast dependency resolution and installation."
print_step "You can also use 'uv run' to run commands in the virtual environment without activation."