From 500465883ddd9cf41cb18e32df1b32072b015773 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Sat, 31 May 2025 12:19:51 -0400 Subject: [PATCH] Added experiment flag for now --- .../computer_server/diorama/diorama.py | 14 ++++++++++++++ .../computer_server/diorama/macos.py | 3 ++- .../computer_server/handlers/factory.py | 4 ++-- libs/computer/computer/computer.py | 7 ++++++- libs/computer/computer/diorama_computer.py | 14 +++++++++----- libs/computer/computer/interface/macos.py | 2 +- 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/libs/computer-server/computer_server/diorama/diorama.py b/libs/computer-server/computer_server/diorama/diorama.py index 7ee5ef84..bf30a018 100644 --- a/libs/computer-server/computer_server/diorama/diorama.py +++ b/libs/computer-server/computer_server/diorama/diorama.py @@ -97,6 +97,14 @@ class Diorama: await automation_handler.drag_to(x, y, duration=duration) if future: future.set_result(None) + elif action in ["scroll_up", "scroll_down"]: + clicks = args.get("clicks", 1) + if action == "scroll_up": + await automation_handler.scroll_up(clicks) + else: + await automation_handler.scroll_down(clicks) + if future: + future.set_result(None) # Keyboard actions elif action == "type_text": text = args.get("text") @@ -198,6 +206,12 @@ class Diorama: async def hotkey(self, keys): await self._send_cmd("hotkey", {"keys": list(keys)}) + async def scroll_up(self, clicks: int = 1): + await self._send_cmd("scroll_up", {"clicks": clicks}) + + async def scroll_down(self, clicks: int = 1): + await self._send_cmd("scroll_down", {"clicks": clicks}) + async def get_screen_size(self) -> dict[str, int]: if not self._scene_size: await self.screenshot() diff --git a/libs/computer-server/computer_server/diorama/macos.py b/libs/computer-server/computer_server/diorama/macos.py index 6ced04d7..be266cae 100644 --- a/libs/computer-server/computer_server/diorama/macos.py +++ b/libs/computer-server/computer_server/diorama/macos.py @@ -4,10 +4,11 @@ import platform import inspect from computer_server.diorama.diorama import Diorama from computer_server.diorama.base import BaseDioramaHandler +from typing import Optional class MacOSDioramaHandler(BaseDioramaHandler): """Handler for Diorama commands on macOS, using local diorama module.""" - async def diorama_cmd(self, action: str, arguments: dict = None) -> dict: + async def diorama_cmd(self, action: str, arguments: Optional[dict] = None) -> dict: if platform.system().lower() != "darwin": return {"success": False, "error": "Diorama is only supported on macOS."} try: diff --git a/libs/computer-server/computer_server/handlers/factory.py b/libs/computer-server/computer_server/handlers/factory.py index fecd9bbc..4c076b80 100644 --- a/libs/computer-server/computer_server/handlers/factory.py +++ b/libs/computer-server/computer_server/handlers/factory.py @@ -55,7 +55,7 @@ class HandlerFactory: if os_type == 'darwin': return MacOSAccessibilityHandler(), MacOSAutomationHandler(), MacOSDioramaHandler() - else: + elif os_type == 'linux': return LinuxAccessibilityHandler(), LinuxAutomationHandler(), BaseDioramaHandler() - + else: raise NotImplementedError(f"OS '{os_type}' is not supported") \ No newline at end of file diff --git a/libs/computer/computer/computer.py b/libs/computer/computer/computer.py index b20922ce..a1868cc4 100644 --- a/libs/computer/computer/computer.py +++ b/libs/computer/computer/computer.py @@ -31,6 +31,7 @@ class Computer: Returns: DioramaComputer: A proxy object with the Diorama interface, but using diorama_cmds. """ + assert "app-use" in self.experiments, "App Usage is an experimental feature. Enable it by passing experiments=['app-use'] to Computer()" from .diorama_computer import DioramaComputer return DioramaComputer(self, apps) @@ -52,7 +53,8 @@ class Computer: host: str = os.environ.get("PYLUME_HOST", "localhost"), storage: Optional[str] = None, ephemeral: bool = False, - api_key: Optional[str] = None + api_key: Optional[str] = None, + experiments: Optional[List[str]] = None ): """Initialize a new Computer instance. @@ -78,6 +80,8 @@ class Computer: host: Host to use for VM provider connections (e.g. "localhost", "host.docker.internal") storage: Optional path for persistent VM storage (Lumier provider) ephemeral: Whether to use ephemeral storage + api_key: Optional API key for cloud providers + experiments: Optional list of experimental features to enable (e.g. ["app-use"]) """ self.logger = Logger("cua.computer", verbosity) @@ -93,6 +97,7 @@ class Computer: self.ephemeral = ephemeral self.api_key = api_key + self.experiments = experiments or [] # The default is currently to use non-ephemeral storage if storage and ephemeral and storage != "ephemeral": diff --git a/libs/computer/computer/diorama_computer.py b/libs/computer/computer/diorama_computer.py index 5e37af45..608e6721 100644 --- a/libs/computer/computer/diorama_computer.py +++ b/libs/computer/computer/diorama_computer.py @@ -26,7 +26,6 @@ class DioramaComputerInterface: def __init__(self, computer, apps): self.computer = computer self.apps = apps - self._scene_hitboxes = [] self._scene_size = None async def _send_cmd(self, action, arguments=None): @@ -43,11 +42,10 @@ class DioramaComputerInterface: async def screenshot(self, as_bytes=True): from PIL import Image + import base64 result = await self._send_cmd("screenshot") - img_bytes = result.get("image_bytes") - hitboxes = result.get("hitboxes", []) - self._scene_hitboxes = hitboxes - # Assume server returns PNG bytes + # assume result is a b64 string of an image + img_bytes = base64.b64decode(result) import io img = Image.open(io.BytesIO(img_bytes)) self._scene_size = img.size @@ -70,6 +68,12 @@ class DioramaComputerInterface: async def double_click(self, x=None, y=None): await self._send_cmd("double_click", {"x": x, "y": y}) + async def scroll_up(self, clicks=1): + await self._send_cmd("scroll_up", {"clicks": clicks}) + + async def scroll_down(self, clicks=1): + await self._send_cmd("scroll_down", {"clicks": clicks}) + async def drag_to(self, x, y, duration=0.5): await self._send_cmd("drag_to", {"x": x, "y": y, "duration": duration}) diff --git a/libs/computer/computer/interface/macos.py b/libs/computer/computer/interface/macos.py index cb0deed1..a96c44d1 100644 --- a/libs/computer/computer/interface/macos.py +++ b/libs/computer/computer/interface/macos.py @@ -346,7 +346,7 @@ class MacOSComputerInterface(BaseComputerInterface): asyncio.create_task(self._ws.close()) self._ws = None - async def diorama_cmd(self, action: str, arguments: dict = None) -> dict: + async def diorama_cmd(self, action: str, arguments: Optional[dict] = None) -> dict: """Send a diorama command to the server (macOS only).""" return await self._send_command("diorama_cmd", {"action": action, "arguments": arguments or {}})