Merge pull request #94 from trycua/fix/agent/anthropic-format

Fix anthropic format in omni loop
f-trycua
2025-04-06 13:24:18 -07:00
committed by GitHub
4 changed files with 60 additions and 20 deletions

View File

@@ -32,9 +32,9 @@ async def run_agent_example():
# loop=AgentLoop.ANTHROPIC,
loop=AgentLoop.OMNI,
# model=LLM(provider=LLMProvider.OPENAI), # No model name for Operator CUA
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
# model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
# model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"),
model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
# model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
# model=LLM(
# provider=LLMProvider.OAICOMPAT,
# name="qwen2.5-vl-7b-instruct",

View File

@@ -109,6 +109,33 @@ export ANTHROPIC_API_KEY=your_anthropic_key_here
OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
```
### Using Local Models
You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1` which works with LM Studio.
If you're using a different local model server:
- vLLM: `http://localhost:8000/v1`
- LocalAI: `http://localhost:8080/v1`
- Ollama with its OpenAI-compatible API: `http://localhost:11434/v1`
To change the URL, modify the `provider_base_url` in your launcher script:
```python
# In your launcher script
from agent.ui.gradio.app import create_gradio_ui
from agent import LLM, LLMProvider
# Create a custom model with a specific URL
custom_model = LLM(
provider=LLMProvider.OAICOMPAT,
name="your-model-name",
provider_base_url="http://localhost:8000/v1" # Change to your server URL
)
app = create_gradio_ui(custom_model=custom_model)
app.launch()
```
Without the API key environment variables above, the UI will show "No models available" for the corresponding cloud providers, but you can still use local models with the OMNI loop provider.
The Gradio UI provides:
@@ -123,14 +150,8 @@ You can also embed the Gradio UI in your own application:
# Import directly in your application
from agent.ui.gradio.app import create_gradio_ui
# Create the UI with advanced features
demo = create_gradio_ui()
demo.launch()
# Or for a simpler interface
from agent.ui.gradio import registry
demo = registry(name='cua:gpt-4o')
demo.launch()
app = create_gradio_ui()
app.launch()
```
## Agent Loops
@@ -141,7 +162,7 @@ The `cua-agent` package provides three agent loop variations, based on differen
| Agent Loop | Supported Models | Description | Set-Of-Marks |
|:-----------|:-----------------|:------------|:-------------|
| `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
| `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
## AgentResponse
The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
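To make the shape concrete, here is a hedged sketch of consuming agent turns; the `output` list and the item `type` values mirror the OpenAI Responses format and are assumptions, not a confirmed `cua-agent` schema (the response is treated as a dict for illustration):

```python
# Hypothetical consumer: inspect each AgentResponse-style payload per turn.
async def print_turns(agent, task: str) -> None:
    async for result in agent.run(task):
        for item in result.get("output", []):  # "output" key is an assumption
            if item.get("type") == "reasoning":
                print("reasoning:", item.get("summary"))
            elif item.get("type") == "message":
                print("message:", item.get("content"))
```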

View File

@@ -443,6 +443,8 @@ class OmniLoop(BaseLoop):
except (json.JSONDecodeError, IndexError):
try:
# Look for JSON object pattern
import re # Local import to ensure availability
json_pattern = r"\{[^}]+\}"
json_match = re.search(json_pattern, raw_text)
if json_match:
@@ -453,8 +455,20 @@ class OmniLoop(BaseLoop):
logger.error(f"No JSON found in content")
return True, action_screenshot_saved
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON from text: {str(e)}")
return True, action_screenshot_saved
# Try to sanitize the JSON string and retry
try:
# Remove or replace invalid control characters
import re # Local import to ensure availability
sanitized_text = re.sub(r"[\x00-\x1F\x7F]", "", raw_text)
# Try parsing again with sanitized text
parsed_content = json.loads(sanitized_text)
logger.info(
"Successfully parsed JSON after sanitizing control characters"
)
except json.JSONDecodeError:
logger.error(f"Failed to parse JSON from text: {str(e)}")
return True, action_screenshot_saved
# Step 4: Process the parsed content if available
if parsed_content:
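The fallback chain added here (direct parse, then a regex-extracted `{...}` span, then control-character sanitization and a retry) can be exercised in isolation. A standalone sketch of the same pattern with a made-up payload; the helper name is hypothetical:

```python
import json
import re


def parse_model_json(raw_text: str) -> dict | None:
    """Best-effort JSON extraction mirroring the loop's fallback chain."""
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        pass
    # Fall back to the first {...} span in the text
    match = re.search(r"\{[^}]+\}", raw_text)
    if not match:
        return None
    candidate = match.group(0)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Strip ASCII control characters (e.g. a stray BEL or raw newline) and retry
        sanitized = re.sub(r"[\x00-\x1F\x7F]", "", candidate)
        try:
            return json.loads(sanitized)
        except json.JSONDecodeError:
            return None


# A payload with an embedded control character parses only after sanitizing
print(parse_model_json('{"action": "click",\x07 "x": 100, "y": 200}'))
```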

View File

@@ -271,16 +271,19 @@ def create_agent(
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
# Create LLM model object with appropriate parameters
provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None
provider_base_url = "http://localhost:1234/v1" if use_oaicompat else None
if use_oaicompat:
# Special handling for OAICOMPAT - use OPENAI provider with custom base URL
print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
# Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
print(
f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {provider_base_url}"
)
llm = LLM(
provider=provider, # Already set to OPENAI
provider=LLMProvider.OAICOMPAT, # Set to OAICOMPAT instead of using original provider
name=model_name,
provider_base_url=provider_base_url,
)
print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
# Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
elif provider == LLMProvider.OAICOMPAT:
# This path is unlikely to be taken with our current approach
@@ -461,8 +464,10 @@ def respond(
# Special handling for OAICOMPAT to bypass provider-specific errors
# Passes the OAICOMPAT provider through with the custom model name and provider base URL
is_oaicompat = str(provider) == "oaicompat"
if is_oaicompat:
provider = LLMProvider.OPENAI
# Don't override the provider for OAICOMPAT - instead pass it through
# if is_oaicompat:
# provider = LLMProvider.OPENAI
# Get API key based on provider
if provider == LLMProvider.OPENAI:
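Taken together, these changes let a custom endpoint flow through as OAICOMPAT end to end instead of being silently rewritten to OPENAI. A hedged sketch of what a caller sees after the fix; the model name is illustrative and the URL is the LM Studio default introduced above:

```python
from agent import LLM, LLMProvider

llm = LLM(
    provider=LLMProvider.OAICOMPAT,
    name="qwen2.5-vl-7b-instruct",  # illustrative; any model your server exposes
    provider_base_url="http://localhost:1234/v1",  # LM Studio default from this diff
)
# The provider is no longer overridden, so the base URL reaches the loop intact
assert llm.provider == LLMProvider.OAICOMPAT
```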