# computer/libs/python/agent/agent/computer_handler.py

"""
Computer handler implementation for OpenAI computer-use-preview protocol.
"""

import base64
from typing import Dict, List, Any, Literal
from .types import Computer


class OpenAIComputerHandler:
    """Computer handler that implements the Computer protocol using the computer interface.

    Each action method forwards to an awaitable method of the wrapped
    ``computer_interface`` object (``screenshot``, ``left_click``,
    ``right_click``, ``double_click``, ``move_cursor``, ``scroll``,
    ``type_text``, ``press_key``, ``hotkey``, ``mouse_down``, ``mouse_up``,
    ``get_screen_size``). Exceptions raised by the interface propagate to the
    caller.
    """

    def __init__(self, computer_interface):
        """Store the computer interface that all actions are delegated to."""
        self.interface = computer_interface

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Return the environment type.

        No detection is performed yet: the value is hard-coded to ``"windows"``.
        """
        return "windows"

    async def get_dimensions(self) -> tuple[int, int]:
        """Return the screen dimensions as ``(width, height)``.

        The values are read from the ``"width"`` and ``"height"`` keys of the
        mapping returned by ``interface.get_screen_size()``.
        """
        size = await self.interface.get_screen_size()
        return size["width"], size["height"]

    async def screenshot(self) -> str:
        """Take a screenshot and return its bytes encoded as a base64 string."""
        raw = await self.interface.screenshot()
        return base64.b64encode(raw).decode("utf-8")

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at ``(x, y)`` with the given button.

        Only ``"right"`` is dispatched to a right click. ``"left"`` and any
        unrecognised button name fall back to a left click.
        """
        if button == "right":
            await self.interface.right_click(x, y)
        else:
            # "left" and every unknown button deliberately share this path.
            await self.interface.left_click(x, y)

    async def double_click(self, x: int, y: int) -> None:
        """Double click at ``(x, y)``."""
        await self.interface.double_click(x, y)

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Move the cursor to ``(x, y)``, then scroll by ``(scroll_x, scroll_y)``."""
        await self.interface.move_cursor(x, y)
        await self.interface.scroll(scroll_x, scroll_y)

    async def type(self, text: str) -> None:
        """Type ``text`` through the interface."""
        await self.interface.type_text(text)

    async def wait(self, ms: int = 1000) -> None:
        """Sleep for ``ms`` milliseconds without blocking the event loop."""
        import asyncio

        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        """Move the cursor to ``(x, y)``."""
        await self.interface.move_cursor(x, y)

    async def keypress(self, keys: List[str]) -> None:
        """Press a single key or a key combination.

        An empty ``keys`` list is a no-op, a single key goes through
        ``press_key`` and several keys are sent together through ``hotkey``.
        """
        if not keys:
            # Nothing to press: do not call hotkey() with zero keys.
            return
        if len(keys) == 1:
            await self.interface.press_key(keys[0])
        else:
            await self.interface.hotkey(*keys)

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along ``path``, a list of ``{"x": ..., "y": ...}`` points.

        The button is pressed at the first point, the cursor is moved through
        the remaining points in order, and the button is released at the last
        point. An empty path is a no-op. If moving the cursor fails part-way,
        the button is still released (at the last point reached) before the
        exception propagates, so the mouse is not left held down.
        """
        if not path:
            return

        start = path[0]
        await self.interface.mouse_down(start["x"], start["y"])

        # Tracks where the cursor actually is; equals path[-1] on success.
        release_at = start
        try:
            for point in path[1:]:
                await self.interface.move_cursor(point["x"], point["y"])
                release_at = point
        finally:
            await self.interface.mouse_up(release_at["x"], release_at["y"])

    async def get_current_url(self) -> str:
        """Return the current URL (for browser environments).

        Not implemented yet: always returns an empty string.
        """
        return ""


def acknowledge_safety_check_callback(message: str) -> bool:
    """Ask the user on the console whether to acknowledge a safety check.

    The warning ``message`` is shown in a prompt read with ``input``. Returns
    True only when the reply, lower-cased and stripped of surrounding
    whitespace, is exactly ``"y"``; any other reply returns False.
    """
    prompt = (
        f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
    )
    answer = input(prompt)
    normalized = answer.lower().strip()
    return normalized == "y"


def check_blocklisted_url(url: str) -> None:
    """Placeholder hook for blocklist checks on ``url``.

    No checking is implemented yet: the function ignores ``url`` and always
    returns None without raising.
    """
    # Real URL checking logic would go here.
    return None