Improved display of AgentResponse objects in the Gradio UI, and standardized UI-TARS agent output

This commit is contained in:
Dillon DuPont
2025-04-30 14:47:27 -07:00
parent 7981000820
commit e55e649cd6
3 changed files with 179 additions and 106 deletions

View File

@@ -17,7 +17,7 @@ from ...core.types import AgentResponse, LLMProvider
from ...core.visualization import VisualizationHelper
from computer import Computer
from .utils import add_box_token, parse_actions, parse_action_parameters
from .utils import add_box_token, parse_actions, parse_action_parameters, to_agent_response_format
from .tools.manager import ToolManager
from .tools.computer import ToolResult
from .prompts import COMPUTER_USE, SYSTEM_PROMPT, MAC_SPECIFIC_NOTES
@@ -507,41 +507,14 @@ class UITARSLoop(BaseLoop):
# Update whether an action screenshot was saved this turn
action_screenshot_saved = action_screenshot_saved or new_screenshot_saved
# Parse actions from the raw response
raw_response = response["choices"][0]["message"]["content"]
parsed_actions = parse_actions(raw_response)
# Extract thought content if available
thought = ""
if "Thought:" in raw_response:
thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", raw_response, re.DOTALL)
if thought_match:
thought = thought_match.group(1).strip()
agent_response = await to_agent_response_format(
response,
messages,
model=self.model,
)
yield agent_response
# Create standardized thought response format
thought_response = {
"role": "assistant",
"content": thought or raw_response,
"metadata": {
"title": "🧠 UI-TARS Thoughts"
}
}
# Create action response format
action_response = {
"role": "assistant",
"content": str(parsed_actions),
"metadata": {
"title": "🖱️ UI-TARS Actions",
}
}
# Yield both responses to the caller (thoughts first, then actions)
yield thought_response
if parsed_actions:
yield action_response
# Check if we should continue this conversation
running = should_continue
@@ -562,7 +535,8 @@ class UITARSLoop(BaseLoop):
logger.error(f"Maximum retry attempts reached. Last error was: {str(e)}")
yield {
"error": str(e),
"role": "assistant",
"content": f"Error: {str(e)}",
"metadata": {"title": "❌ Error"},
}

View File

@@ -4,9 +4,114 @@ import logging
import base64
import re
from typing import Any, Dict, List, Optional, Union, Tuple
from datetime import datetime
logger = logging.getLogger(__name__)
from ...core.types import AgentResponse
async def to_agent_response_format(
    response: Dict[str, Any],
    messages: List[Dict[str, Any]],
    model: Optional[str] = None,
) -> AgentResponse:
    """Convert a raw UI-TARS completion into the standardized AgentResponse format.

    Args:
        response: Raw UI-TARS response (chat-completion-shaped dict with
            ``choices``, ``model`` and ``usage`` keys).
        messages: Conversation history in standard message format (kept for
            interface parity; not consumed here).
        model: Optional model-name override; falls back to ``response["model"]``.

    Returns:
        AgentResponse: Standardized response whose ``output`` holds reasoning,
        message and computer_call items extracted from the raw text.
    """
    # Create unique IDs for this response; id(response) disambiguates
    # responses created within the same second.
    response_id = f"resp_{datetime.now().strftime('%Y%m%d%H%M%S')}_{id(response)}"
    reasoning_id = f"rs_{response_id}"
    action_id = f"cu_{response_id}"

    # Parse actions from the raw response text
    content = response["choices"][0]["message"]["content"]
    actions = parse_actions(content)

    # Extract the "Thought: ..." segment (everything up to "Action:"), if present
    reasoning_text = ""
    if "Thought:" in content:
        thought_match = re.search(r"Thought: (.*?)(?=\s*Action:|$)", content, re.DOTALL)
        if thought_match:
            reasoning_text = thought_match.group(1).strip()

    # Build the output items: optional reasoning first, then one item per action
    output_items = []
    if reasoning_text:
        output_items.append({
            "type": "reasoning",
            "id": reasoning_id,
            "text": reasoning_text
        })
    if actions:
        for i, action in enumerate(actions):
            action_name, tool_args = parse_action_parameters(action)
            if action_name == "finished":
                # "finished" becomes a terminal assistant message. Use .get():
                # parse_action_parameters returns {} when the action carried no
                # content=... payload, so indexing "content" would KeyError.
                output_items.append({
                    "type": "message",
                    "role": "assistant",
                    "content": [{
                        "type": "output_text",
                        "text": tool_args.get("content", "")
                    }],
                    "id": f"action_{i}_{action_id}",
                    "status": "completed"
                })
            else:
                # Avoid duplicating the action name inside the args payload
                if tool_args.get("action") == action_name:
                    del tool_args["action"]
                output_items.append({
                    "type": "computer_call",
                    "id": f"{action}_{i}_{action_id}",
                    "call_id": f"call_{i}_{action_id}",
                    "action": {"type": action_name, **tool_args},
                    "pending_safety_checks": [],
                    "status": "completed"
                })

    # Assemble the standardized response envelope. Sampling/tool fields are
    # fixed defaults describing the UI-TARS configuration, not per-call values.
    agent_response = AgentResponse(
        id=response_id,
        object="response",
        created_at=int(datetime.now().timestamp()),
        status="completed",
        error=None,
        incomplete_details=None,
        instructions=None,
        max_output_tokens=None,
        model=model or response["model"],
        output=output_items,
        parallel_tool_calls=True,
        previous_response_id=None,
        reasoning={"effort": "medium"},
        store=True,
        temperature=0.0,
        top_p=0.7,
        text={"format": {"type": "text"}},
        tool_choice="auto",
        tools=[
            {
                "type": "computer_use_preview",
                "display_height": 768,
                "display_width": 1024,
                "environment": "mac",
            }
        ],
        truncation="auto",
        usage=response["usage"],
        user=None,
        metadata={},
        # Keep the raw provider response attached for debugging/inspection
        response=response
    )
    return agent_response
def add_box_token(input_string: str) -> str:
"""Add box tokens to the coordinates in the model response.
@@ -74,7 +179,13 @@ def parse_action_parameters(action: str) -> Tuple[str, Dict[str, Any]]:
"""
# Handle "finished" action
if action.startswith("finished"):
return "finished", {}
# Parse content if it exists
content_match = re.search(r"content='([^']*)'", action)
if content_match:
content = content_match.group(1)
return "finished", {"content": content}
else:
return "finished", {}
# Parse action parameters
action_match = re.match(r'(\w+)\((.*)\)', action)

View File

@@ -35,6 +35,7 @@ from pathlib import Path
from typing import Dict, List, Optional, AsyncGenerator, Any, Tuple, Union
import gradio as gr
from gradio.components.chatbot import MetadataDict
from typing import cast
# Import from agent package
from agent.core.types import AgentResponse
@@ -447,66 +448,6 @@ def create_agent(
return global_agent
def process_agent_result(result: Union[AgentResponse, Dict[str, Any]]) -> Tuple[str, MetadataDict]:
    """Process agent results for the Gradio UI.

    Extracts a display string and a MetadataDict (title + "done" status)
    from an agent result, handling both the OpenAI Computer-Use Agent
    text shape and plain dict results.
    """
    text_field = result.get("text", {})
    raw_meta = result.get("metadata", {})

    # Normalize metadata into gradio's MetadataDict shape
    meta = MetadataDict()
    meta["title"] = raw_meta.get("title", "")
    meta["status"] = "done"

    # OpenAI's Computer-Use Agent: text is an object carrying only a
    # "format" property; the displayable text must be synthesized.
    is_cua_text = (
        bool(text_field)
        and isinstance(text_field, dict)
        and "format" in text_field
        and not text_field.get("value", "")
    )
    if is_cua_text:
        content, meta = extract_synthesized_text(result)
    else:
        # Fall back to scanning the result itself when no text field exists
        source = text_field if text_field else result
        if isinstance(source, dict):
            for key in ("value", "text", "content"):
                if key in source:
                    content = source[key]
                    break
            else:
                content = ""
        else:
            content = str(source) if source else ""

    # Still empty? Summarize the first usable output item instead.
    if not content and result.get("output"):
        for item in result["output"]:
            kind = item.get("type")
            if kind == "reasoning":
                content = item.get("content", "")
                if content:
                    break
            elif kind == "computer_call":
                action_type = item.get("action", {}).get("type", "")
                if action_type:
                    content = f"Performing action: {action_type}"
                    break

    # Guarantee the return value is a string
    if not isinstance(content, str):
        content = str(content) if content else ""
    return content, meta
def create_gradio_ui(
provider_name: str = "openai",
model_name: str = "gpt-4o",
@@ -907,17 +848,64 @@ def create_gradio_ui(
# Stream responses from the agent
async for result in global_agent.run(last_user_message):
# Process result
content, metadata = process_agent_result(result)
# Skip empty content
if content or metadata.get("title"):
history.append(
gr.ChatMessage(
role="assistant", content=content, metadata=metadata
print(f"DEBUG - Agent response ------- START")
from pprint import pprint
pprint(result)
print(f"DEBUG - Agent response ------- END")
def generate_gradio_messages():
    """Yield gr.ChatMessage objects for the current agent ``result``.

    Handles both the simple {"content": ...} shape and the structured
    {"output": [...]} shape (message / reasoning / computer_call items).
    Closes over ``result`` from the enclosing streaming loop.
    """
    if result.get("content"):
        # Simple shape: one assistant message with optional metadata
        yield gr.ChatMessage(
            role="assistant",
            content=result.get("content", ""),
            metadata=cast(MetadataDict, result.get("metadata", {}))
        )
        return
    for output in result.get("output", []):
        output_type = output.get("type")
        if output_type == "message":
            # One chat message per non-empty text part
            for content_part in output.get("content", []):
                if content_part.get("text"):
                    yield gr.ChatMessage(
                        role=output.get("role", "assistant"),
                        content=content_part.get("text", ""),
                        metadata=content_part.get("metadata", {})
                    )
        elif output_type == "reasoning":
            # If it's OpenAI, we only have access to a summary of the reasoning
            summary_content = output.get("summary", [])
            if summary_content:
                for summary_part in summary_content:
                    if summary_part.get("type") == "summary_text":
                        yield gr.ChatMessage(
                            role="assistant",
                            content=summary_part.get("text", "")
                        )
            else:
                # Other providers put the reasoning text directly on the item
                reasoning_text = output.get("text", "")
                if reasoning_text:
                    yield gr.ChatMessage(
                        role="assistant",
                        content=reasoning_text,
                    )
        elif output_type == "computer_call":
            action = output.get("action", {})
            action_type = action.get("type", "")
            if action_type:
                action_title = f"🛠️ Performing {action_type}"
                # Compare against None, not truthiness: coordinate 0 is a
                # valid position and must still appear in the title.
                if action.get("x") is not None and action.get("y") is not None:
                    action_title += f" at ({action['x']}, {action['y']})"
                yield gr.ChatMessage(
                    role="assistant",
                    content=f"```json\n{json.dumps(action)}\n```",
                    metadata={"title": action_title}
                )
for message in generate_gradio_messages():
history.append(message)
yield history
except Exception as e:
import traceback