From a5b5bad05cd282c87fdd8346c2d351612a3b7f4f Mon Sep 17 00:00:00 2001 From: f-trycua Date: Sat, 10 May 2025 22:15:30 -0700 Subject: [PATCH 1/5] Add clipboard and audio device --- .../VMVirtualizationService.swift | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/libs/lume/src/Virtualization/VMVirtualizationService.swift b/libs/lume/src/Virtualization/VMVirtualizationService.swift index 93cb4db0a..b358659b2 100644 --- a/libs/lume/src/Virtualization/VMVirtualizationService.swift +++ b/libs/lume/src/Virtualization/VMVirtualizationService.swift @@ -246,6 +246,27 @@ final class DarwinVirtualizationService: BaseVirtualizationService { ] vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()] vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()] + + // Audio configuration + let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration() + let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration() + let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration() + + inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource() + outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink() + + soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration] + vzConfig.audioDevices = [soundDeviceConfiguration] + + // Clipboard sharing via Spice agent + let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration() + let spiceAgentPort = VZVirtioConsolePortConfiguration() + spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName + let spiceAgentPortAttachment = VZSpiceAgentPortAttachment() + spiceAgentPortAttachment.sharesClipboard = true + spiceAgentPort.attachment = spiceAgentPortAttachment + spiceAgentConsoleDevice.ports[0] = spiceAgentPort + vzConfig.consoleDevices.append(spiceAgentConsoleDevice) // Directory sharing let directorySharingDevices = createDirectorySharingDevices( @@ -376,6 +397,27 @@ final class LinuxVirtualizationService: BaseVirtualizationService { ] vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()] vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()] + + // Audio configuration + let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration() + let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration() + let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration() + + inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource() + outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink() + + soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration] + vzConfig.audioDevices = [soundDeviceConfiguration] + + // Clipboard sharing via Spice agent + let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration() + let spiceAgentPort = VZVirtioConsolePortConfiguration() + spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName + let spiceAgentPortAttachment = VZSpiceAgentPortAttachment() + spiceAgentPortAttachment.sharesClipboard = true + spiceAgentPort.attachment = spiceAgentPortAttachment + spiceAgentConsoleDevice.ports[0] = spiceAgentPort + vzConfig.consoleDevices.append(spiceAgentConsoleDevice) // Directory sharing var directorySharingDevices = createDirectorySharingDevices( From e51bbe2c0f54987391646ab45c57b3e3af6be146 Mon Sep 17 00:00:00 2001 From: Francesco Bonacci Date: Sun, 11 May 2025 09:51:08 -0700 Subject: [PATCH 2/5] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dafef93da..c11134d96 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,9 @@ **c/ua** (pronounced "koo-ah") enables AI agents to control full operating systems in high-performance virtual containers with near-native speed on Apple Silicon. - -
-
+ + # 🚀 Quick Start From ee7784e2ddc111f47ab82ab8aacf882854013c37 Mon Sep 17 00:00:00 2001 From: ddupont <3820588+ddupont808@users.noreply.github.com> Date: Sun, 11 May 2025 21:13:53 -0400 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c11134d96..61ac0b870 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ This script will: - [UITARS-1.5](https://github.com/trycua/cua/blob/main/libs/agent/README.md#agent-loops) - Run locally on Apple Silicon with MLX, or use cloud providers - [OpenAI CUA](https://github.com/trycua/cua/blob/main/libs/agent/README.md#agent-loops) - Use OpenAI's Computer-Use Preview model - [Anthropic CUA](https://github.com/trycua/cua/blob/main/libs/agent/README.md#agent-loops) - Use Anthropic's Computer-Use capabilities -- [OmniParser](https://github.com/trycua/cua/blob/main/libs/agent/README.md#agent-loops) - Control UI with [Set-of-Marks prompting](https://som-gpt4v.github.io/) using any vision model +- [OmniParser-v2.0](https://github.com/trycua/cua/blob/main/libs/agent/README.md#agent-loops) - Control UI with [Set-of-Marks prompting](https://som-gpt4v.github.io/) using any vision model ### System Requirements From aa0132222038539e88934aee01820dbd3e5384bc Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 12 May 2025 08:54:28 -0400 Subject: [PATCH 4/5] Fixes issue #172 --- libs/agent/agent/providers/openai/loop.py | 8 +- libs/agent/agent/providers/uitars/loop.py | 193 ++++++++++++---------- 2 files changed, 114 insertions(+), 87 deletions(-) diff --git a/libs/agent/agent/providers/openai/loop.py b/libs/agent/agent/providers/openai/loop.py index 87719d1b1..e791b8c98 100644 --- a/libs/agent/agent/providers/openai/loop.py +++ b/libs/agent/agent/providers/openai/loop.py @@ -133,22 +133,22 @@ class OpenAILoop(BaseLoop): logger.info("Starting OpenAI loop run") # Create queue for response streaming - queue = asyncio.Queue() + self.queue = asyncio.Queue() # Ensure tool manager is initialized await self.tool_manager.initialize() # Start loop in background task - self.loop_task = asyncio.create_task(self._run_loop(queue, messages)) + self.loop_task = asyncio.create_task(self._run_loop(self.queue, messages)) # Process and yield messages as they arrive while True: try: - item = await queue.get() + item = await self.queue.get() if item is None: # Stop signal break yield item - queue.task_done() + self.queue.task_done() except Exception as e: logger.error(f"Error processing queue item: {str(e)}") continue diff --git a/libs/agent/agent/providers/uitars/loop.py b/libs/agent/agent/providers/uitars/loop.py index 3766cd92d..133a3b83a 100644 --- a/libs/agent/agent/providers/uitars/loop.py +++ b/libs/agent/agent/providers/uitars/loop.py @@ -463,17 +463,40 @@ class UITARSLoop(BaseLoop): Yields: Agent response format """ - # Initialize the message manager with the provided messages - self.message_manager.messages = messages.copy() - logger.info(f"Starting UITARSLoop run with {len(self.message_manager.messages)} messages") - - # Create a task to run the loop - self.loop_task = asyncio.create_task(self._run_loop(messages)) - - # Yield from the loop task try: - async for response in self.loop_task: - yield response + logger.info(f"Starting UITARSLoop run with {len(messages)} messages") + + # Initialize the message manager with the provided messages + self.message_manager.messages = messages.copy() + + # Create queue for response streaming + queue = asyncio.Queue() + + # Start loop in background task + self.loop_task = asyncio.create_task(self._run_loop(queue, messages)) + + # Process and yield messages as they arrive + while True: + try: + item = await queue.get() + if item is None: # Stop signal + break + yield item + queue.task_done() + except Exception as e: + logger.error(f"Error processing queue item: {str(e)}") + continue + + # Wait for loop to complete + await self.loop_task + + # Send completion message + yield { + "role": "assistant", + "content": "Task completed successfully.", + "metadata": {"title": "✅ Complete"}, + } + except Exception as e: logger.error(f"Error in run method: {str(e)}") yield { @@ -482,14 +505,12 @@ class UITARSLoop(BaseLoop): "metadata": {"title": "❌ Error"}, } - async def _run_loop(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]: + async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None: """Internal method to run the agent loop with provided messages. Args: + queue: Queue to put responses into messages: List of messages in standard OpenAI format - - Yields: - Agent response format """ # Continue running until explicitly told to stop running = True @@ -500,88 +521,94 @@ class UITARSLoop(BaseLoop): attempt = 0 max_attempts = 3 - while running and attempt < max_attempts: - try: - # Create a new turn directory if it's not already created - if not turn_created: - self._create_turn_dir() - turn_created = True + try: + while running and attempt < max_attempts: + try: + # Create a new turn directory if it's not already created + if not turn_created: + self._create_turn_dir() + turn_created = True - # Ensure client is initialized - if self.client is None: - logger.info("Initializing client...") - await self.initialize_client() + # Ensure client is initialized if self.client is None: - raise RuntimeError("Failed to initialize client") - logger.info("Client initialized successfully") + logger.info("Initializing client...") + await self.initialize_client() + if self.client is None: + raise RuntimeError("Failed to initialize client") + logger.info("Client initialized successfully") - # Get current screen - base64_screenshot = await self._get_current_screen() - - # Add screenshot to message history - self.message_manager.add_user_message( - [ - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{base64_screenshot}"}, - } - ] - ) - logger.info("Added screenshot to message history") + # Get current screen + base64_screenshot = await self._get_current_screen() + + # Add screenshot to message history + self.message_manager.add_user_message( + [ + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{base64_screenshot}"}, + } + ] + ) + logger.info("Added screenshot to message history") - # Get system prompt - system_prompt = self._get_system_prompt() + # Get system prompt + system_prompt = self._get_system_prompt() - # Make API call with retries - response = await self._make_api_call( - self.message_manager.messages, system_prompt - ) + # Make API call with retries + response = await self._make_api_call( + self.message_manager.messages, system_prompt + ) - # Handle the response (may execute actions) - # Returns: (should_continue, action_screenshot_saved) - should_continue, new_screenshot_saved = await self._handle_response( - response, self.message_manager.messages - ) + # Handle the response (may execute actions) + # Returns: (should_continue, action_screenshot_saved) + should_continue, new_screenshot_saved = await self._handle_response( + response, self.message_manager.messages + ) - # Update whether an action screenshot was saved this turn - action_screenshot_saved = action_screenshot_saved or new_screenshot_saved - - agent_response = await to_agent_response_format( - response, - messages, - model=self.model, - ) - # Log standardized response for ease of parsing - self._log_api_call("agent_response", request=None, response=agent_response) - yield agent_response - - # Check if we should continue this conversation - running = should_continue + # Update whether an action screenshot was saved this turn + action_screenshot_saved = action_screenshot_saved or new_screenshot_saved + + agent_response = await to_agent_response_format( + response, + messages, + model=self.model, + ) + # Log standardized response for ease of parsing + self._log_api_call("agent_response", request=None, response=agent_response) + + # Put the response in the queue + await queue.put(agent_response) + + # Check if we should continue this conversation + running = should_continue - # Create a new turn directory if we're continuing - if running: - turn_created = False + # Create a new turn directory if we're continuing + if running: + turn_created = False - # Reset attempt counter on success - attempt = 0 + # Reset attempt counter on success + attempt = 0 - except Exception as e: - attempt += 1 - error_msg = f"Error in run method (attempt {attempt}/{max_attempts}): {str(e)}" - logger.error(error_msg) + except Exception as e: + attempt += 1 + error_msg = f"Error in run method (attempt {attempt}/{max_attempts}): {str(e)}" + logger.error(error_msg) - # If this is our last attempt, provide more info about the error - if attempt >= max_attempts: - logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}") + # If this is our last attempt, provide more info about the error + if attempt >= max_attempts: + logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}") - yield { - "role": "assistant", - "content": f"Error: {str(e)}", - "metadata": {"title": "❌ Error"}, - } + await queue.put({ + "role": "assistant", + "content": f"Error: {str(e)}", + "metadata": {"title": "❌ Error"}, + }) - # Create a brief delay before retrying - await asyncio.sleep(1) + # Create a brief delay before retrying + await asyncio.sleep(1) + finally: + # Signal that we're done + await queue.put(None) async def cancel(self) -> None: """Cancel the currently running agent loop task. From 1b1eb813741e00647649f774a58500bf4a1bd6cf Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 12 May 2025 08:55:13 -0400 Subject: [PATCH 5/5] Fixes issue #172 --- libs/agent/agent/providers/omni/loop.py | 60 +++++++++++++++++-------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/libs/agent/agent/providers/omni/loop.py b/libs/agent/agent/providers/omni/loop.py index b77194500..840b29166 100644 --- a/libs/agent/agent/providers/omni/loop.py +++ b/libs/agent/agent/providers/omni/loop.py @@ -581,17 +581,40 @@ class OmniLoop(BaseLoop): Yields: Agent response format """ - # Initialize the message manager with the provided messages - self.message_manager.messages = messages.copy() - logger.info(f"Starting OmniLoop run with {len(self.message_manager.messages)} messages") - - # Create a task to run the loop - self.loop_task = asyncio.create_task(self._run_loop(messages)) - - # Yield from the loop task try: - async for response in self.loop_task: - yield response + logger.info(f"Starting OmniLoop run with {len(messages)} messages") + + # Initialize the message manager with the provided messages + self.message_manager.messages = messages.copy() + + # Create queue for response streaming + queue = asyncio.Queue() + + # Start loop in background task + self.loop_task = asyncio.create_task(self._run_loop(queue, messages)) + + # Process and yield messages as they arrive + while True: + try: + item = await queue.get() + if item is None: # Stop signal + break + yield item + queue.task_done() + except Exception as e: + logger.error(f"Error processing queue item: {str(e)}") + continue + + # Wait for loop to complete + await self.loop_task + + # Send completion message + yield { + "role": "assistant", + "content": "Task completed successfully.", + "metadata": {"title": "✅ Complete"}, + } + except Exception as e: logger.error(f"Error in run method: {str(e)}") yield { @@ -600,14 +623,12 @@ class OmniLoop(BaseLoop): "metadata": {"title": "❌ Error"}, } - async def _run_loop(self, messages: List[Dict[str, Any]]) -> AsyncGenerator[AgentResponse, None]: + async def _run_loop(self, queue: asyncio.Queue, messages: List[Dict[str, Any]]) -> None: """Internal method to run the agent loop with provided messages. Args: + queue: Queue to put responses into messages: List of messages in standard OpenAI format - - Yields: - Agent response format """ # Continue running until explicitly told to stop running = True @@ -698,8 +719,8 @@ class OmniLoop(BaseLoop): # Log standardized response for ease of parsing self._log_api_call("agent_response", request=None, response=openai_compatible_response) - # Yield the response to the caller - yield openai_compatible_response + # Put the response in the queue + await queue.put(openai_compatible_response) # Check if we should continue this conversation running = should_continue @@ -720,14 +741,17 @@ class OmniLoop(BaseLoop): if attempt >= max_attempts: logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}") - yield { + await queue.put({ "role": "assistant", "content": f"Error: {str(e)}", "metadata": {"title": "❌ Error"}, - } + }) # Create a brief delay before retrying await asyncio.sleep(1) + finally: + # Signal that we're done + await queue.put(None) async def cancel(self) -> None: """Cancel the currently running agent loop task.