pyautogui cleanup

2026-01-05 20:09:56 -06:00 · 2025-12-28 21:25:49 -05:00
parent 76c0ca4302
commit 545c2180f8
12 changed files with 2931 additions and 2951 deletions
--- a/libs/python/agent/agent/loops/opencua.py
+++ b/libs/python/agent/agent/loops/opencua.py
@@ -21,11 +21,15 @@ from ..types import AgentCapability, AgentResponse, Messages, Tools
 from .composed_grounded import ComposedGroundedConfig


-def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
-    """Extract coordinates from pyautogui.click(x=..., y=...) format."""
+def extract_coordinates_from_click(text: str) -> Optional[Tuple[int, int]]:
+    """Extract coordinates from click(x=..., y=...) or pyautogui.click(x=..., y=...) format.
+    
+    This function supports parsing both generic click() and legacy pyautogui.click() formats
+    for backwards compatibility with models that may still output pyautogui format.
+    """
    try:
-        # Look for pyautogui.click(x=1443, y=343) pattern
-        pattern = r"pyautogui\.click\(x=(\d+),\s*y=(\d+)\)"
+        # Look for click(x=1443, y=343) or pyautogui.click(x=1443, y=343) pattern
+        pattern = r"(?:pyautogui\.)?click\(x=(\d+),\s*y=(\d+)\)"
        match = re.search(pattern, text)
        if match:
            x, y = int(match.group(1)), int(match.group(2))
@@ -90,7 +94,7 @@ class OpenCUAConfig(ComposedGroundedConfig):
        # Prepare system message
        system_prompt = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
-            "You need to perform a series of pyautogui actions to complete the task."
+            "You need to perform a series of click actions to complete the task."
        )

        system_message = {"role": "system", "content": system_prompt}
@@ -120,8 +124,8 @@ class OpenCUAConfig(ComposedGroundedConfig):
        output_text = response.choices[0].message.content
        # print(output_text)

-        # Extract coordinates from pyautogui format
-        coordinates = extract_coordinates_from_pyautogui(output_text)
+        # Extract coordinates from click format (supports both click() and pyautogui.click() for backwards compatibility)
+        coordinates = extract_coordinates_from_click(output_text)

        return coordinates

--- a/libs/python/agent/benchmarks/utils.py
+++ b/libs/python/agent/benchmarks/utils.py
@@ -432,12 +432,12 @@ def take_screenshot() -> Image.Image:
        PIL Image of the screenshot
    """
    try:
-        import pyautogui
+        from PIL import ImageGrab

-        screenshot = pyautogui.screenshot()
+        screenshot = ImageGrab.grab()
        return screenshot
    except ImportError:
-        print("pyautogui not installed. Please install it with: pip install pyautogui")
+        print("PIL/Pillow not installed. Please install it with: pip install pillow")
        raise
    except Exception as e:
        print(f"Error taking screenshot: {e}")
--- a/libs/python/computer-server/README.md
+++ b/libs/python/computer-server/README.md
@@ -16,7 +16,7 @@
 </h1>
 </div>

-**Computer Server** is the server component for the Computer-Use Interface (CUI) framework powering Cua for interacting with local macOS and Linux sandboxes, PyAutoGUI-compatible, and pluggable with any AI agent systems (Cua, Langchain, CrewAI, AutoGen).
+**Computer Server** is the server component for the Computer-Use Interface (CUI) framework powering Cua for interacting with local macOS and Linux sandboxes. It is automation-compatible and pluggable with any AI agent system (Cua, Langchain, CrewAI, AutoGen).

 ## Features

--- a/libs/python/computer-server/computer_server/handlers/linux.py
+++ b/libs/python/computer-server/computer_server/handlers/linux.py
@@ -22,7 +22,6 @@ from PIL import Image, ImageGrab
 # Configure logger
 logger = logging.getLogger(__name__)

-# pyautogui removed in favor of pynput

 from pynput.keyboard import Controller as KeyboardController
 from pynput.keyboard import Key
@@ -81,7 +80,7 @@ class LinuxAccessibilityHandler(BaseAccessibilityHandler):

        Returns:
            Tuple[int, int]: The x and y coordinates of the cursor position.
-                           Returns (0, 0) if pyautogui is not available.
+                           Returns (0, 0) if cursor position cannot be determined.
        """
        try:
            # Use pynput mouse controller
@@ -98,7 +97,7 @@ class LinuxAccessibilityHandler(BaseAccessibilityHandler):

        Returns:
            Tuple[int, int]: The width and height of the screen in pixels.
-                           Returns (1920, 1080) if pyautogui is not available.
+                           Returns (1920, 1080) if screen size cannot be determined.
        """
        try:
            img = ImageGrab.grab()
--- a/libs/python/computer-server/computer_server/handlers/windows.py
+++ b/libs/python/computer-server/computer_server/handlers/windows.py
@@ -504,7 +504,7 @@ class WindowsAutomationHandler(BaseAutomationHandler):
        """Scroll vertically at the current cursor position.

        Args:
-            x (int): Horizontal scroll amount (not used in pyautogui implementation).
+            x (int): Horizontal scroll amount.
            y (int): Vertical scroll amount. Positive values scroll up, negative values scroll down.

        Returns:
--- a/libs/python/computer/README.md
+++ b/libs/python/computer/README.md
@@ -16,7 +16,7 @@
 </h1>
 </div>

-**cua-computer** is a Computer-Use Interface (CUI) framework powering Cua for interacting with local macOS and Linux sandboxes, PyAutoGUI-compatible, and pluggable with any AI agent systems (Cua, Langchain, CrewAI, AutoGen). Computer relies on [Lume](https://github.com/trycua/lume) for creating and managing sandbox environments.
+**cua-computer** is a Computer-Use Interface (CUI) framework powering Cua for interacting with local macOS and Linux sandboxes. It is automation-compatible and pluggable with any AI agent system (Cua, Langchain, CrewAI, AutoGen). Computer relies on [Lume](https://github.com/trycua/lume) for creating and managing sandbox environments.

 ### Get started with Computer

--- a/libs/python/computer/computer/interface/models.py
+++ b/libs/python/computer/computer/interface/models.py
@@ -31,7 +31,7 @@ FunctionKey = Literal["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10
 class Key(Enum):
    """Keyboard keys that can be used with press_key.

-    These key names map to PyAutoGUI's expected key names.
+    These key names follow a consistent cross-platform keyboard key naming convention.
    """

    # Navigation