From 8f15c21df96f9576f0979504db412ea514a2926b Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Fri, 8 Aug 2025 18:15:56 -0400
Subject: [PATCH] added run_job

---
 .../agent-sdk/benchmarks/osworld-verified.mdx |  33 ++---
 .../docs/agent-sdk/integrations/hud.mdx       |  36 ++---
 .../agent/agent/integrations/hud/__init__.py  |  34 ++++-
 .../agent/agent/integrations/hud/agent.py     | 131 +++++++++++-------
 4 files changed, 144 insertions(+), 90 deletions(-)

diff --git a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
index 5284b11b..1bfc79f2 100644
--- a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
+++ b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
@@ -63,26 +63,23 @@ print(f"Success: {result.get('success', False)}")
 Run all tasks in parallel using `run_job`:
 
 ```python
-from hud import run_job
-from agent.integrations.hud import ComputerAgent
-import logging
+from agent.integrations.hud import run_job
+from hud import load_taskset
 
-logging.basicConfig(level=logging.INFO)
+# Load taskset
+taskset = await load_taskset("OSWorld-Verified")
 
-# Load full taskset
-taskset = await load_taskset("OSWorld-Verified")
-
-# Run parallel job
+# Run benchmark job
 job = await run_job(
-    ComputerAgent,
-    taskset,
-    "osworld-computeragent",
-    max_steps_per_task=100,
-    max_concurrent_tasks=20,
-    auto_reply_question=True,
-    agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"}
+    model="anthropic/claude-3-5-sonnet-20241022",
+    task_or_taskset=taskset,
+    job_name="test-computeragent-job",
+    # Any extra ComputerAgent kwargs:
+    # verbosity=logging.INFO, # Enable logging
+    # trajectory_dir=".." # Save trajectories locally
 )
 
-# Get analytics
-analytics = await job.get_analytics()
-```
+# Get results OR view them at app.hud.so
+print(await job.get_analytics())
+print(f"View results at: https://app.hud.so/jobs/{job.id}")
+```
\ No newline at end of file
diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx
index 786e45b5..114c4c92 100644
--- a/docs/content/docs/agent-sdk/integrations/hud.mdx
+++ b/docs/content/docs/agent-sdk/integrations/hud.mdx
@@ -16,28 +16,28 @@ pip install "cua-agent[hud]"
 ## Usage
 
 ```python
-from agent.integrations.hud import ComputerAgent
+from agent.integrations.hud import run_job
+from hud import load_taskset
 
-# Create agent with any ComputerAgent model
-agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022", # or any model string
-    environment="linux"
+# Load taskset
+taskset = await load_taskset("OSWorld-Verified")
+
+# Run benchmark job
+job = await run_job(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    task_or_taskset=taskset,
+    job_name="test-computeragent-job",
+    # Any extra ComputerAgent kwargs:
+    # verbosity=logging.INFO, # Enable logging
+    # trajectory_dir=".." # Save trajectories locally
 )
 
-# Use exactly like other HUD agents
-action, done = await agent.predict(observation)
+# Get results OR view them at app.hud.so
+print(await job.get_analytics())
+print(f"View results at: https://app.hud.so/jobs/{job.id}")
 ```
 
-## Environment Variables
-
-Set these environment variables:
-
-- `HUD_API_KEY` - Your HUD API key
-- `ANTHROPIC_API_KEY` - For Claude models
-- `OPENAI_API_KEY` - For OpenAI models
-
-## Example Benchmarks
-
-1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution
+**Available Benchmarks:**
+1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks
 
 See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
\ No newline at end of file
diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py
index 6459048d..993a3a76 100644
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -1,7 +1,39 @@
 """HUD integration for ComputerAgent."""
 
+from typing import Any, Optional, Dict
+from hud import run_job as hud_run_job
+
 from .agent import ComputerAgent
 from .adapter import ComputerAgentAdapter
 from .computer_handler import HUDComputerHandler
 
-__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
+
+async def run_job(
+    model: str,
+    task_or_taskset: Any,
+    job_name: str,
+    job_kwargs: Optional[Dict[str, Any]] = None,
+    **agent_kwargs: Any
+) -> Any:
+    """
+    Run a job using ComputerAgent with the specified model.
+
+    Args:
+        model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
+        task_or_taskset: Task or TaskSet to run
+        job_name: Name for the job
+        job_kwargs: Optional extra kwargs passed through to hud.run_job (e.g., max_concurrent_tasks)
+        **agent_kwargs: Additional kwargs to pass to ComputerAgent
+
+    Returns:
+        Job instance from HUD
+    """
+    return await hud_run_job(
+        agent_cls=ComputerAgent,
+        agent_kwargs={"model": model, **agent_kwargs},
+        task_or_taskset=task_or_taskset,
+        job_name=job_name,
+        **(job_kwargs or {})
+    )
+
+
+__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
\ No newline at end of file
diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
index 6f246c20..9156cf4a 100644
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -16,6 +16,19 @@ from .computer_handler import HUDComputerHandler
 
 logger = logging.getLogger(__name__)
 
+BASE_SYSTEM_PROMPT = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
+2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
+3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
+4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
+5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
+6. Trust that the user wants you to complete the entire task they've requested.
+7. You must say "Task completed" when the task is complete.
+
+Remember: You have been given permission to complete the requested task autonomously.
+""".strip()
 
 class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
     """
@@ -88,25 +101,16 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
         self.initial_prompt: Optional[str] = None
 
         # System prompt for computer use tasks
-        self.base_system_prompt = """
-        You are an autonomous computer-using agent. Follow these guidelines:
-
-        1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
-        2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
-        3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
-        4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
-        5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
-        6. Trust that the user wants you to complete the entire task they've requested.
-
-        Remember: You have been given permission to complete the requested task autonomously.
-        """
+        self.base_system_prompt = BASE_SYSTEM_PROMPT
 
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from ComputerAgent based on the observation.
 
         Args:
-            observation: The preprocessed observation
+            observation: The preprocessed observation, with attributes:
+                screenshot: Base64-encoded PNG string of the screen
+                text: Text observation, if available
 
         Returns:
             tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
@@ -140,9 +144,39 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
             self.conversation_history.append({"role": "user", "content": message})
 
         else:
-            # Subsequent interactions - add context about the current state
-            message = "Continue with the task based on the current screen state."
-            self.conversation_history.append({"role": "user", "content": message})
+            # Subsequent interactions - check if last action was computer_call
+            # If so, add computer_call_output with screenshot instead of user message
+            last_computer_calls = []
+            for msg in reversed(self.conversation_history):
+                if msg.get("type") == "computer_call" and msg.get("status") == "completed":
+                    call_id = msg.get("call_id")
+                    if call_id:
+                        # Check if this call_id already has a computer_call_output
+                        has_output = any(
+                            m.get("type") == "computer_call_output" and m.get("call_id") == call_id
+                            for m in self.conversation_history
+                        )
+                        if not has_output:
+                            last_computer_calls.append(call_id)
+                elif msg.get("role") == "user":
+                    # Stop at the last user message
+                    break
+
+            if last_computer_calls and observation.screenshot:
+                # Add computer_call_output for each unresponded computer_call
+                for call_id in reversed(last_computer_calls):  # Maintain chronological order
+                    self.conversation_history.append({
+                        "type": "computer_call_output",
+                        "call_id": call_id,
+                        "output": {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{observation.screenshot}"
+                        }
+                    })
+            else:
+                # No unanswered computer_call (or no screenshot), add a regular user message
+                message = "Continue with the task based on the current screen state."
+ self.conversation_history.append({"role": "user", "content": message}) # Run ComputerAgent try: @@ -150,7 +184,8 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): async for result in self.computer_agent.run(self.conversation_history, stream=False): # Update conversation history with the output self.conversation_history += result["output"] - + break + # Check if we captured any actions if captured_actions: # Extract reasoning from the conversation history @@ -171,44 +206,34 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): action["reasoning"] = reasoning action["logs"] = {"conversation_length": len(self.conversation_history)} - # Check if task is done by looking for assistant message indicating completion - done = False - for msg in reversed(self.conversation_history): - if msg.get("type") == "message" and msg.get("role") == "assistant": - content = msg.get("content", []) - for c in content: - if c.get("type") == "output_text" and "task completed" in c.get("text", "").lower(): - done = True - break - break + return captured_actions, False - return captured_actions, done - else: - # No actions captured, task is likely complete - response_text = "Task completed." - for msg in reversed(self.conversation_history): - if msg.get("type") == "message" and msg.get("role") == "assistant": - content = msg.get("content", []) - for c in content: - if c.get("type") == "output_text": - response_text = c.get("text", response_text) - break - break - - response_action = { - "type": "response", - "text": response_text, - "reasoning": response_text, - "logs": {"conversation_length": len(self.conversation_history)} - } - - # Check if this indicates task completion or failure + # Check if the last message is "Task completed" + response_text = "" + for msg in reversed(self.conversation_history): + if msg.get("type") == "message" and msg.get("role") == "assistant": + content = msg.get("content", []) + for c in content: + if c.get("type") == "output_text": + response_text = c.get("text", response_text) + break + break + + done = "task completed" in response_text.lower() + + response_action = { + "type": "response", + "text": response_text, + "reasoning": response_text, + "logs": {"conversation_length": len(self.conversation_history)} + } + + # Check if this indicates task completion or failure + if "task is infeasible" in response_text.lower(): + response_action = {"type": "custom", "action": "FAIL"} done = True - if "task is infeasible" in response_text.lower(): - response_action = {"type": "custom", "action": "FAIL"} - - return [response_action], done - + + return [response_action], done except Exception as e: logger.error(f"Error running ComputerAgent: {e}") # Return an error response