From 8f15c21df96f9576f0979504db412ea514a2926b Mon Sep 17 00:00:00 2001
From: Dillon DuPont
Date: Fri, 8 Aug 2025 18:15:56 -0400
Subject: [PATCH] added run_job

---
 .../agent-sdk/benchmarks/osworld-verified.mdx |  33 ++---
 .../docs/agent-sdk/integrations/hud.mdx       |  36 ++---
 .../agent/agent/integrations/hud/__init__.py  |  34 ++++-
 .../agent/agent/integrations/hud/agent.py     | 131 +++++++++++-------
 4 files changed, 144 insertions(+), 90 deletions(-)

diff --git a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
index 5284b11b..1bfc79f2 100644
--- a/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
+++ b/docs/content/docs/agent-sdk/benchmarks/osworld-verified.mdx
@@ -63,26 +63,23 @@ print(f"Success: {result.get('success', False)}")
 Run all tasks in parallel using `run_job`:
 
 ```python
-from hud import run_job
-from agent.integrations.hud import ComputerAgent
-import logging
+from agent.integrations.hud import run_job
+from hud import load_taskset
 
-logging.basicConfig(level=logging.INFO)
+# Load taskset
+taskset = await load_taskset("OSWorld-Verified")
 
-# Load full taskset
-taskset = await load_taskset("OSWorld-Verified")
-
-# Run parallel job
+# Run benchmark job
 job = await run_job(
-    ComputerAgent,
-    taskset,
-    "osworld-computeragent",
-    max_steps_per_task=100,
-    max_concurrent_tasks=20,
-    auto_reply_question=True,
-    agent_kwargs={"model": "anthropic/claude-3-5-sonnet-20241022"}
+    model="anthropic/claude-3-5-sonnet-20241022",
+    task_or_taskset=taskset,
+    job_name="test-computeragent-job",
+    # Any extra ComputerAgent kwargs:
+    # verbosity=logging.INFO, # Enable logging
+    # trajectory_dir=".." # Save trajectories locally
 )
 
-# Get analytics
-analytics = await job.get_analytics()
-```
+# Get results OR view them at app.hud.so
+print(await job.get_analytics())
+print(f"View results at: https://app.hud.so/jobs/{job.id}")
+```
\ No newline at end of file
diff --git a/docs/content/docs/agent-sdk/integrations/hud.mdx b/docs/content/docs/agent-sdk/integrations/hud.mdx
index 786e45b5..114c4c92 100644
--- a/docs/content/docs/agent-sdk/integrations/hud.mdx
+++ b/docs/content/docs/agent-sdk/integrations/hud.mdx
@@ -16,28 +16,28 @@ pip install "cua-agent[hud]"
 ## Usage
 
 ```python
-from agent.integrations.hud import ComputerAgent
+from agent.integrations.hud import run_job
+from hud import load_taskset
 
-# Create agent with any ComputerAgent model
-agent = ComputerAgent(
-    model="anthropic/claude-3-5-sonnet-20241022", # or any model string
-    environment="linux"
+# Load taskset
+taskset = await load_taskset("OSWorld-Verified")
+
+# Run benchmark job
+job = await run_job(
+    model="anthropic/claude-3-5-sonnet-20241022",
+    task_or_taskset=taskset,
+    job_name="test-computeragent-job",
+    # Any extra ComputerAgent kwargs:
+    # verbosity=logging.INFO, # Enable logging
+    # trajectory_dir=".." # Save trajectories locally
 )
 
-# Use exactly like other HUD agents
-action, done = await agent.predict(observation)
+# Get results OR view them at app.hud.so
+print(await job.get_analytics())
+print(f"View results at: https://app.hud.so/jobs/{job.id}")
 ```
 
-## Environment Variables
-
-Set these environment variables:
-
-- `HUD_API_KEY` - Your HUD API key
-- `ANTHROPIC_API_KEY` - For Claude models
-- `OPENAI_API_KEY` - For OpenAI models
-
-## Example Benchmarks
-
-1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks with parallel execution
+**Available Benchmarks:**
+1. [OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified) - Benchmark on OSWorld tasks
 
 See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
\ No newline at end of file
diff --git a/libs/python/agent/agent/integrations/hud/__init__.py b/libs/python/agent/agent/integrations/hud/__init__.py
index 6459048d..993a3a76 100644
--- a/libs/python/agent/agent/integrations/hud/__init__.py
+++ b/libs/python/agent/agent/integrations/hud/__init__.py
@@ -1,7 +1,39 @@
 """HUD integration for ComputerAgent."""
 
+from typing import Any, Optional, Dict
+from hud import run_job as hud_run_job
+
 from .agent import ComputerAgent
 from .adapter import ComputerAgentAdapter
 from .computer_handler import HUDComputerHandler
 
-__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler"]
+
+async def run_job(
+    model: str,
+    task_or_taskset: Any,
+    job_name: str,
+    job_kwargs: Optional[Dict[str, Any]] = None,
+    **agent_kwargs: Any
+) -> Any:
+    """
+    Run a job using ComputerAgent with the specified model.
+
+    Args:
+        model: Model string for ComputerAgent (e.g., "anthropic/claude-3-5-sonnet-20241022")
+        task_or_taskset: Task or TaskSet to run
+        job_name: Name for the job
+        job_kwargs: Optional extra kwargs passed through to hud.run_job (e.g., max_concurrent_tasks)
+        **agent_kwargs: Additional kwargs to pass to ComputerAgent
+
+    Returns:
+        Job instance from HUD
+    """
+    return await hud_run_job(
+        agent_cls=ComputerAgent,
+        agent_kwargs={"model": model, **agent_kwargs},
+        task_or_taskset=task_or_taskset,
+        job_name=job_name,
+        **(job_kwargs or {})
+    )
+
+
+__all__ = ["ComputerAgent", "ComputerAgentAdapter", "HUDComputerHandler", "run_job"]
\ No newline at end of file
diff --git a/libs/python/agent/agent/integrations/hud/agent.py b/libs/python/agent/agent/integrations/hud/agent.py
index 6f246c20..9156cf4a 100644
--- a/libs/python/agent/agent/integrations/hud/agent.py
+++ b/libs/python/agent/agent/integrations/hud/agent.py
@@ -16,6 +16,19 @@ from .computer_handler import HUDComputerHandler
 
 logger = logging.getLogger(__name__)
 
+BASE_SYSTEM_PROMPT = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
+2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
+3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
+4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
+5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
+6. Trust that the user wants you to complete the entire task they've requested.
+7. You must say "Task completed" when the task is complete.
+
+Remember: You have been given permission to complete the requested task autonomously.
+""".strip()
 
 class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
     """
@@ -88,25 +101,16 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
         self.initial_prompt: Optional[str] = None
 
         # System prompt for computer use tasks
-        self.base_system_prompt = """
-        You are an autonomous computer-using agent. Follow these guidelines:
-
-        1. Be decisive and complete tasks without asking for confirmation unless absolutely necessary.
-        2. If you need user confirmation for safety-critical actions, use the formal safety check mechanism.
-        3. Do NOT ask questions like "Should I proceed?" or "Would you like me to continue?" - just proceed with the task.
-        4. When you find what you're looking for (e.g., a file to upload), proceed with the action directly.
-        5. Only stop when the task is fully complete or if you encounter an error that prevents completion.
-        6. Trust that the user wants you to complete the entire task they've requested.
-
-        Remember: You have been given permission to complete the requested task autonomously.
-        """
+        self.base_system_prompt = BASE_SYSTEM_PROMPT
 
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from ComputerAgent based on the observation.
 
         Args:
-            observation: The preprocessed observation
+            observation: The preprocessed observation, with attributes:
+                screenshot: Base64-encoded PNG string of the screen
+                text: Text observation, if available
 
         Returns:
             tuple[list[dict[str, Any]], bool, list[LogType] | None]: A tuple containing the list of raw actions,
@@ -140,9 +144,39 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]):
             self.conversation_history.append({"role": "user", "content": message})
 
         else:
-            # Subsequent interactions - add context about the current state
-            message = "Continue with the task based on the current screen state."
-            self.conversation_history.append({"role": "user", "content": message})
+            # Subsequent interactions - check if last action was computer_call
+            # If so, add computer_call_output with screenshot instead of user message
+            last_computer_calls = []
+            for msg in reversed(self.conversation_history):
+                if msg.get("type") == "computer_call" and msg.get("status") == "completed":
+                    call_id = msg.get("call_id")
+                    if call_id:
+                        # Check if this call_id already has a computer_call_output
+                        has_output = any(
+                            m.get("type") == "computer_call_output" and m.get("call_id") == call_id
+                            for m in self.conversation_history
+                        )
+                        if not has_output:
+                            last_computer_calls.append(call_id)
+                elif msg.get("role") == "user":
+                    # Stop at the last user message
+                    break
+
+            if last_computer_calls and observation.screenshot:
+                # Add computer_call_output for each unresponded computer_call
+                for call_id in reversed(last_computer_calls):  # Maintain chronological order
+                    self.conversation_history.append({
+                        "type": "computer_call_output",
+                        "call_id": call_id,
+                        "output": {
+                            "type": "input_image",
+                            "image_url": f"data:image/png;base64,{observation.screenshot}"
+                        }
+                    })
+            else:
+                # No unanswered computer_call (or no screenshot), add a regular user message
+                message = "Continue with the task based on the current screen state."
+ self.conversation_history.append({"role": "user", "content": message}) # Run ComputerAgent try: @@ -150,7 +184,8 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): async for result in self.computer_agent.run(self.conversation_history, stream=False): # Update conversation history with the output self.conversation_history += result["output"] - + break + # Check if we captured any actions if captured_actions: # Extract reasoning from the conversation history @@ -171,44 +206,34 @@ class ComputerAgent(Agent[BaseComputerAgent, dict[str, Any]]): action["reasoning"] = reasoning action["logs"] = {"conversation_length": len(self.conversation_history)} - # Check if task is done by looking for assistant message indicating completion - done = False - for msg in reversed(self.conversation_history): - if msg.get("type") == "message" and msg.get("role") == "assistant": - content = msg.get("content", []) - for c in content: - if c.get("type") == "output_text" and "task completed" in c.get("text", "").lower(): - done = True - break - break + return captured_actions, False - return captured_actions, done - else: - # No actions captured, task is likely complete - response_text = "Task completed." - for msg in reversed(self.conversation_history): - if msg.get("type") == "message" and msg.get("role") == "assistant": - content = msg.get("content", []) - for c in content: - if c.get("type") == "output_text": - response_text = c.get("text", response_text) - break - break - - response_action = { - "type": "response", - "text": response_text, - "reasoning": response_text, - "logs": {"conversation_length": len(self.conversation_history)} - } - - # Check if this indicates task completion or failure + # Check if the last message is "Task completed" + response_text = "" + for msg in reversed(self.conversation_history): + if msg.get("type") == "message" and msg.get("role") == "assistant": + content = msg.get("content", []) + for c in content: + if c.get("type") == "output_text": + response_text = c.get("text", response_text) + break + break + + done = "task completed" in response_text.lower() + + response_action = { + "type": "response", + "text": response_text, + "reasoning": response_text, + "logs": {"conversation_length": len(self.conversation_history)} + } + + # Check if this indicates task completion or failure + if "task is infeasible" in response_text.lower(): + response_action = {"type": "custom", "action": "FAIL"} done = True - if "task is infeasible" in response_text.lower(): - response_action = {"type": "custom", "action": "FAIL"} - - return [response_action], done - + + return [response_action], done except Exception as e: logger.error(f"Error running ComputerAgent: {e}") # Return an error response