add global and local delay control (#308)

This commit is contained in:
Dillon DuPont
2025-07-02 11:41:48 -04:00
parent 394ebaac3c
commit e1216bcd61
3 changed files with 261 additions and 63 deletions

View File

@@ -24,6 +24,9 @@ class BaseComputerInterface(ABC):
self.api_key = api_key
self.vm_name = vm_name
self.logger = Logger("cua.interface", LogLevel.NORMAL)
# Optional default delay time between commands (in seconds)
self.delay: float = 0.0
@abstractmethod
async def wait_for_ready(self, timeout: int = 60) -> None:
@@ -52,37 +55,75 @@ class BaseComputerInterface(ABC):
# Mouse Actions
@abstractmethod
async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: "MouseButton" = "left") -> None:
"""Press and hold a mouse button."""
async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: "MouseButton" = "left", delay: Optional[float] = None) -> None:
"""Press and hold a mouse button.
Args:
x: X coordinate to press at. If None, uses current cursor position.
y: Y coordinate to press at. If None, uses current cursor position.
button: Mouse button to press ('left', 'middle', 'right').
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: "MouseButton" = "left") -> None:
"""Release a mouse button."""
async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: "MouseButton" = "left", delay: Optional[float] = None) -> None:
"""Release a mouse button.
Args:
x: X coordinate to release at. If None, uses current cursor position.
y: Y coordinate to release at. If None, uses current cursor position.
button: Mouse button to release ('left', 'middle', 'right').
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Perform a left click."""
async def left_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
"""Perform a left mouse button click.
Args:
x: X coordinate to click at. If None, uses current cursor position.
y: Y coordinate to click at. If None, uses current cursor position.
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Perform a right click."""
async def right_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
"""Perform a right mouse button click.
Args:
x: X coordinate to click at. If None, uses current cursor position.
y: Y coordinate to click at. If None, uses current cursor position.
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Perform a double click."""
async def double_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
"""Perform a double left mouse button click.
Args:
x: X coordinate to double-click at. If None, uses current cursor position.
y: Y coordinate to double-click at. If None, uses current cursor position.
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def move_cursor(self, x: int, y: int) -> None:
"""Move the cursor to specified position."""
async def move_cursor(self, x: int, y: int, delay: Optional[float] = None) -> None:
"""Move the cursor to the specified screen coordinates.
Args:
x: X coordinate to move cursor to.
y: Y coordinate to move cursor to.
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5) -> None:
async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5, delay: Optional[float] = None) -> None:
"""Drag from current position to specified coordinates.
Args:
@@ -90,60 +131,103 @@ class BaseComputerInterface(ABC):
y: The y coordinate to drag to
button: The mouse button to use ('left', 'middle', 'right')
duration: How long the drag should take in seconds
delay: Optional delay in seconds after the action
"""
pass
@abstractmethod
async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> None:
async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5, delay: Optional[float] = None) -> None:
"""Drag the cursor along a path of coordinates.
Args:
path: List of (x, y) coordinate tuples defining the drag path
button: The mouse button to use ('left', 'middle', 'right')
duration: Total time in seconds that the drag operation should take
delay: Optional delay in seconds after the action
"""
pass
# Keyboard Actions
@abstractmethod
async def key_down(self, key: str) -> None:
"""Press and hold a key."""
async def key_down(self, key: str, delay: Optional[float] = None) -> None:
"""Press and hold a key.
Args:
key: The key to press and hold (e.g., 'a', 'shift', 'ctrl').
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def key_up(self, key: str) -> None:
"""Release a key."""
async def key_up(self, key: str, delay: Optional[float] = None) -> None:
"""Release a previously pressed key.
Args:
key: The key to release (e.g., 'a', 'shift', 'ctrl').
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def type_text(self, text: str) -> None:
"""Type the specified text."""
async def type_text(self, text: str, delay: Optional[float] = None) -> None:
"""Type the specified text string.
Args:
text: The text string to type.
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def press_key(self, key: str) -> None:
"""Press a single key."""
async def press_key(self, key: str, delay: Optional[float] = None) -> None:
"""Press and release a single key.
Args:
key: The key to press (e.g., 'a', 'enter', 'escape').
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def hotkey(self, *keys: str) -> None:
"""Press multiple keys simultaneously."""
async def hotkey(self, *keys: str, delay: Optional[float] = None) -> None:
"""Press multiple keys simultaneously (keyboard shortcut).
Args:
*keys: Variable number of keys to press together (e.g., 'ctrl', 'c').
delay: Optional delay in seconds after the action.
"""
pass
# Scrolling Actions
@abstractmethod
async def scroll(self, x: int, y: int) -> None:
"""Scroll the mouse wheel."""
async def scroll(self, x: int, y: int, delay: Optional[float] = None) -> None:
"""Scroll the mouse wheel by specified amounts.
Args:
x: Horizontal scroll amount (positive = right, negative = left).
y: Vertical scroll amount (positive = up, negative = down).
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def scroll_down(self, clicks: int = 1) -> None:
"""Scroll down."""
async def scroll_down(self, clicks: int = 1, delay: Optional[float] = None) -> None:
"""Scroll down by the specified number of clicks.
Args:
clicks: Number of scroll clicks to perform downward.
delay: Optional delay in seconds after the action.
"""
pass
@abstractmethod
async def scroll_up(self, clicks: int = 1) -> None:
"""Scroll up."""
async def scroll_up(self, clicks: int = 1, delay: Optional[float] = None) -> None:
"""Scroll up by the specified number of clicks.
Args:
clicks: Number of scroll clicks to perform upward.
delay: Optional delay in seconds after the action.
"""
pass
# Screen Actions
@@ -167,44 +251,89 @@ class BaseComputerInterface(ABC):
@abstractmethod
async def get_cursor_position(self) -> Dict[str, int]:
"""Get current cursor position."""
"""Get the current cursor position on screen.
Returns:
Dict with 'x' and 'y' keys containing cursor coordinates.
"""
pass
# Clipboard Actions
@abstractmethod
async def copy_to_clipboard(self) -> str:
"""Get clipboard content."""
"""Get the current clipboard content.
Returns:
The text content currently stored in the clipboard.
"""
pass
@abstractmethod
async def set_clipboard(self, text: str) -> None:
"""Set clipboard content."""
"""Set the clipboard content to the specified text.
Args:
text: The text to store in the clipboard.
"""
pass
# File System Actions
@abstractmethod
async def file_exists(self, path: str) -> bool:
"""Check if file exists."""
"""Check if a file exists at the specified path.
Args:
path: The file path to check.
Returns:
True if the file exists, False otherwise.
"""
pass
@abstractmethod
async def directory_exists(self, path: str) -> bool:
"""Check if directory exists."""
"""Check if a directory exists at the specified path.
Args:
path: The directory path to check.
Returns:
True if the directory exists, False otherwise.
"""
pass
@abstractmethod
async def list_dir(self, path: str) -> List[str]:
"""List directory contents."""
"""List the contents of a directory.
Args:
path: The directory path to list.
Returns:
List of file and directory names in the specified directory.
"""
pass
@abstractmethod
async def read_text(self, path: str) -> str:
"""Read file text contents."""
"""Read the text contents of a file.
Args:
path: The file path to read from.
Returns:
The text content of the file.
"""
pass
@abstractmethod
async def write_text(self, path: str, content: str) -> None:
"""Write file text contents."""
"""Write text content to a file.
Args:
path: The file path to write to.
content: The text content to write.
"""
pass
@abstractmethod
@@ -220,27 +349,51 @@ class BaseComputerInterface(ABC):
@abstractmethod
async def write_bytes(self, path: str, content: bytes) -> None:
"""Write file binary contents."""
"""Write binary content to a file.
Args:
path: The file path to write to.
content: The binary content to write.
"""
pass
@abstractmethod
async def delete_file(self, path: str) -> None:
"""Delete file."""
"""Delete a file at the specified path.
Args:
path: The file path to delete.
"""
pass
@abstractmethod
async def create_dir(self, path: str) -> None:
"""Create directory."""
"""Create a directory at the specified path.
Args:
path: The directory path to create.
"""
pass
@abstractmethod
async def delete_dir(self, path: str) -> None:
"""Delete directory."""
"""Delete a directory at the specified path.
Args:
path: The directory path to delete.
"""
pass
@abstractmethod
async def get_file_size(self, path: str) -> int:
"""Get the size of a file in bytes."""
"""Get the size of a file in bytes.
Args:
path: The file path to get the size of.
Returns:
The size of the file in bytes.
"""
pass
@abstractmethod
@@ -274,7 +427,11 @@ class BaseComputerInterface(ABC):
# Accessibility Actions
@abstractmethod
async def get_accessibility_tree(self) -> Dict:
"""Get the accessibility tree of the current screen."""
"""Get the accessibility tree of the current screen.
Returns:
Dict containing the hierarchical accessibility information of screen elements.
"""
pass
@abstractmethod

View File

@@ -32,6 +32,21 @@ class GenericComputerInterface(BaseComputerInterface):
# Set logger name for the interface
self.logger = Logger(logger_name, LogLevel.NORMAL)
# Optional default delay time between commands (in seconds)
self.delay = 0.0
async def _handle_delay(self, delay: Optional[float] = None):
    """Pause after a command, using a per-call or interface-wide delay.

    Args:
        delay: Seconds to wait after the command. ``None`` selects the
            interface-wide default ``self.delay``; any explicit value
            (including ``0``) takes precedence over that default.

    Only positive numbers trigger a sleep. Both ``int`` and ``float`` are
    accepted, so ``delay=1`` or ``self.delay = 2`` waits as expected instead
    of being skipped by a float-only type check. Non-numeric values are
    ignored rather than raising.
    """
    seconds = self.delay if delay is None else delay
    # bool is a subclass of int; exclude it so True is not read as "1 second".
    if isinstance(seconds, bool) or not isinstance(seconds, (int, float)):
        return
    # NaN compares False here, so it never reaches asyncio.sleep.
    if seconds > 0:
        await asyncio.sleep(seconds)
@property
def ws_uri(self) -> str:
"""Get the WebSocket URI using the current IP address.
@@ -44,42 +59,52 @@ class GenericComputerInterface(BaseComputerInterface):
return f"{protocol}://{self.ip_address}:{port}/ws"
# Mouse actions
async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> None:
async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left", delay: Optional[float] = None) -> None:
    """Send a ``mouse_down`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload (may be None).
        y: Y coordinate forwarded in the command payload (may be None).
        button: Mouse button name forwarded in the payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    payload = {"x": x, "y": y, "button": button}
    await self._send_command("mouse_down", payload)
    await self._handle_delay(delay)
async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> None:
async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left", delay: Optional[float] = None) -> None:
    """Send a ``mouse_up`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload (may be None).
        y: Y coordinate forwarded in the command payload (may be None).
        button: Mouse button name forwarded in the payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    release_args = dict(x=x, y=y, button=button)
    await self._send_command("mouse_up", release_args)
    await self._handle_delay(delay)
async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
async def left_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
    """Send a ``left_click`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload (may be None).
        y: Y coordinate forwarded in the command payload (may be None).
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    target = {"x": x, "y": y}
    await self._send_command("left_click", target)
    await self._handle_delay(delay)
async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
async def right_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
    """Send a ``right_click`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload (may be None).
        y: Y coordinate forwarded in the command payload (may be None).
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    target = dict(x=x, y=y)
    await self._send_command("right_click", target)
    await self._handle_delay(delay)
async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
async def double_click(self, x: Optional[int] = None, y: Optional[int] = None, delay: Optional[float] = None) -> None:
    """Send a ``double_click`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload (may be None).
        y: Y coordinate forwarded in the command payload (may be None).
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    click_point = {"x": x, "y": y}
    await self._send_command("double_click", click_point)
    await self._handle_delay(delay)
async def move_cursor(self, x: int, y: int) -> None:
async def move_cursor(self, x: int, y: int, delay: Optional[float] = None) -> None:
    """Send a ``move_cursor`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload.
        y: Y coordinate forwarded in the command payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    destination = {"x": x, "y": y}
    await self._send_command("move_cursor", destination)
    await self._handle_delay(delay)
async def drag_to(self, x: int, y: int, button: "MouseButton" = "left", duration: float = 0.5) -> None:
async def drag_to(self, x: int, y: int, button: "MouseButton" = "left", duration: float = 0.5, delay: Optional[float] = None) -> None:
    """Send a ``drag_to`` command, then apply the post-action delay.

    Args:
        x: X coordinate forwarded in the command payload.
        y: Y coordinate forwarded in the command payload.
        button: Mouse button forwarded in the payload.
        duration: Drag duration value forwarded in the payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    drag_args = dict(x=x, y=y, button=button, duration=duration)
    await self._send_command("drag_to", drag_args)
    await self._handle_delay(delay)
async def drag(self, path: List[Tuple[int, int]], button: "MouseButton" = "left", duration: float = 0.5) -> None:
async def drag(self, path: List[Tuple[int, int]], button: "MouseButton" = "left", duration: float = 0.5, delay: Optional[float] = None) -> None:
    """Send a ``drag`` command, then apply the post-action delay.

    Args:
        path: Sequence of (x, y) points forwarded unchanged in the payload.
        button: Mouse button forwarded in the payload.
        duration: Drag duration value forwarded in the payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    drag_args = dict(path=path, button=button, duration=duration)
    await self._send_command("drag", drag_args)
    await self._handle_delay(delay)
# Keyboard Actions
async def key_down(self, key: "KeyType") -> None:
async def key_down(self, key: "KeyType", delay: Optional[float] = None) -> None:
    """Send a ``key_down`` command, then apply the post-action delay.

    Args:
        key: Key value forwarded unchanged in the command payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    payload = dict(key=key)
    await self._send_command("key_down", payload)
    await self._handle_delay(delay)
async def key_up(self, key: "KeyType") -> None:
async def key_up(self, key: "KeyType", delay: Optional[float] = None) -> None:
    """Send a ``key_up`` command, then apply the post-action delay.

    Args:
        key: Key value forwarded unchanged in the command payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    payload = dict(key=key)
    await self._send_command("key_up", payload)
    await self._handle_delay(delay)
async def type_text(self, text: str) -> None:
async def type_text(self, text: str, delay: Optional[float] = None) -> None:
# Temporary fix for https://github.com/trycua/cua/issues/165
# Check if text contains Unicode characters
if any(ord(char) > 127 for char in text):
@@ -89,8 +114,9 @@ class GenericComputerInterface(BaseComputerInterface):
else:
# For ASCII text, use the regular typing method
await self._send_command("type_text", {"text": text})
await self._handle_delay(delay)
async def press(self, key: "KeyType") -> None:
async def press(self, key: "KeyType", delay: Optional[float] = None) -> None:
"""Press a single key.
Args:
@@ -126,16 +152,17 @@ class GenericComputerInterface(BaseComputerInterface):
raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.")
await self._send_command("press_key", {"key": actual_key})
await self._handle_delay(delay)
async def press_key(self, key: "KeyType") -> None:
async def press_key(self, key: "KeyType", delay: Optional[float] = None) -> None:
"""DEPRECATED: Use press() instead.
This method is kept for backward compatibility but will be removed in a future version.
Please use the press() method instead.
"""
await self.press(key)
await self.press(key, delay)
async def hotkey(self, *keys: "KeyType") -> None:
async def hotkey(self, *keys: "KeyType", delay: Optional[float] = None) -> None:
"""Press multiple keys simultaneously.
Args:
@@ -169,16 +196,20 @@ class GenericComputerInterface(BaseComputerInterface):
raise ValueError(f"Invalid key type: {type(key)}. Must be Key enum or string.")
await self._send_command("hotkey", {"keys": actual_keys})
await self._handle_delay(delay)
# Scrolling Actions
async def scroll(self, x: int, y: int) -> None:
async def scroll(self, x: int, y: int, delay: Optional[float] = None) -> None:
    """Send a ``scroll`` command, then apply the post-action delay.

    Args:
        x: Horizontal value forwarded in the command payload.
        y: Vertical value forwarded in the command payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    amounts = dict(x=x, y=y)
    await self._send_command("scroll", amounts)
    await self._handle_delay(delay)
async def scroll_down(self, clicks: int = 1) -> None:
async def scroll_down(self, clicks: int = 1, delay: Optional[float] = None) -> None:
await self._send_command("scroll_down", {"clicks": clicks})
async def scroll_up(self, clicks: int = 1) -> None:
await self._handle_delay(delay)
async def scroll_up(self, clicks: int = 1, delay: Optional[float] = None) -> None:
    """Send a ``scroll_up`` command, then apply the post-action delay.

    Args:
        clicks: Click count forwarded in the command payload.
        delay: Passed to ``_handle_delay`` once the command has been sent.
    """
    payload = dict(clicks=clicks)
    await self._send_command("scroll_up", payload)
    await self._handle_delay(delay)
# Screen actions
async def screenshot(