Merge pull request #94 from trycua/fix/agent/anthropic-format
Fix anthropic format in omni loop
@@ -32,9 +32,9 @@ async def run_agent_example():
         # loop=AgentLoop.ANTHROPIC,
         loop=AgentLoop.OMNI,
         # model=LLM(provider=LLMProvider.OPENAI), # No model name for Operator CUA
-        # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4.5-preview"),
-        # model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
-        model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
+        # model=LLM(provider=LLMProvider.OPENAI, name="gpt-4o"),
+        model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
+        # model=LLM(provider=LLMProvider.OLLAMA, name="gemma3:4b-it-q4_K_M"),
         # model=LLM(
         #     provider=LLMProvider.OAICOMPAT,
         #     name="qwen2.5-vl-7b-instruct",
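The hunk above only shows the keyword arguments being toggled; wiring them into a runnable script looks roughly like this (a minimal sketch, assuming the `Computer` and `ComputerAgent` entry points from the `cua-computer` and `cua-agent` packages; only the `loop`/`model` arguments are taken from the diff, and the task string is hypothetical):

```python
import asyncio

from computer import Computer  # assumed import path
from agent import AgentLoop, ComputerAgent, LLM, LLMProvider  # assumed import path


async def run_agent_example():
    async with Computer() as macos_computer:
        agent = ComputerAgent(
            computer=macos_computer,
            loop=AgentLoop.OMNI,
            # Default after this commit: Anthropic via the OMNI loop
            model=LLM(provider=LLMProvider.ANTHROPIC, name="claude-3-7-sonnet-20250219"),
        )
        # Hypothetical task; agent.run is assumed to yield per-turn results
        async for result in agent.run("Open the browser and search for 'trycua'"):
            print(result)


if __name__ == "__main__":
    asyncio.run(run_agent_example())
```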
@@ -109,6 +109,33 @@ export ANTHROPIC_API_KEY=your_anthropic_key_here
 OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
 ```
 
+### Using Local Models
+
+You can use local models with the OMNI loop provider by selecting "Custom model..." from the dropdown. The default provider URL is set to `http://localhost:1234/v1`, which works with LM Studio.
+
+If you're using a different local model server:
+- vLLM: `http://localhost:8000/v1`
+- LocalAI: `http://localhost:8080/v1`
+- Ollama with the OpenAI-compatible API: `http://localhost:11434/v1`
+
+To change the URL, modify the `provider_base_url` in your launcher script:
+
+```python
+# In your launcher script
+from agent.ui.gradio.app import create_gradio_ui
+from agent import LLM, LLMProvider
+
+# Create a custom model with a specific URL
+custom_model = LLM(
+    provider=LLMProvider.OAICOMPAT,
+    name="your-model-name",
+    provider_base_url="http://localhost:8000/v1"  # Change to your server URL
+)
+
+app = create_gradio_ui(custom_model=custom_model)
+app.launch()
+```
+
 Without these environment variables, the UI will show "No models available" for the corresponding providers, but you can still use local models with the OMNI loop provider.
 
 The Gradio UI provides:
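As a concrete variation on the launcher snippet in the hunk above, pointing it at Ollama's OpenAI-compatible endpoint only changes the URL and model name (same imports as above; the model name is illustrative, taken from the example script):

```python
custom_model = LLM(
    provider=LLMProvider.OAICOMPAT,
    name="gemma3:4b-it-q4_K_M",  # illustrative; any model served by Ollama
    provider_base_url="http://localhost:11434/v1",  # Ollama's OpenAI-compatible API
)

app = create_gradio_ui(custom_model=custom_model)
app.launch()
```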
@@ -123,14 +150,8 @@ You can also embed the Gradio UI in your own application:
 # Import directly in your application
 from agent.ui.gradio.app import create_gradio_ui
 
-# Create the UI with advanced features
-demo = create_gradio_ui()
-demo.launch()
-
-# Or for a simpler interface
-from agent.ui.gradio import registry
-demo = registry(name='cua:gpt-4o')
-demo.launch()
+app = create_gradio_ui()
+app.launch()
 ```
## Agent Loops

@@ -141,7 +162,7 @@ The `cua-agent` package provides three agent loop variations, based on different CUA models:
 |:-----------|:-----------------|:------------|:-------------|
 | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA model | Not Required |
 | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use | Not Required |
-| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
+| `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |

## AgentResponse

The `AgentResponse` class represents the structured output returned after each agent turn. It contains the agent's response, reasoning, tool usage, and other metadata. The response format aligns with the new [OpenAI Agent SDK specification](https://platform.openai.com/docs/api-reference/responses) for better consistency across different agent loops.
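Consuming the response therefore looks roughly like this (a hedged sketch: the iteration pattern and field names such as `output` and `type` are assumptions based on the OpenAI responses format the paragraph cites, not code from this diff):

```python
async for response in agent.run("Open Safari"):  # hypothetical task
    # Top-level metadata follows the OpenAI responses shape
    print(response.get("id"), response.get("status"))

    # Each output item describes reasoning or a computer action from this turn
    for item in response.get("output", []):
        if item.get("type") == "reasoning":
            print("reasoning:", item.get("summary"))
        elif item.get("type") == "computer_call":
            print("action:", item.get("action"))
```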
@@ -443,6 +443,8 @@ class OmniLoop(BaseLoop):
                 except (json.JSONDecodeError, IndexError):
                     try:
                         # Look for JSON object pattern
+                        import re  # Local import to ensure availability
+
                         json_pattern = r"\{[^}]+\}"
                         json_match = re.search(json_pattern, raw_text)
                         if json_match:
@@ -453,8 +455,20 @@ class OmniLoop(BaseLoop):
                                 logger.error(f"No JSON found in content")
                                 return True, action_screenshot_saved
                     except json.JSONDecodeError as e:
-                        logger.error(f"Failed to parse JSON from text: {str(e)}")
-                        return True, action_screenshot_saved
+                        # Try to sanitize the JSON string and retry
+                        try:
+                            # Remove or replace invalid control characters
+                            import re  # Local import to ensure availability
+
+                            sanitized_text = re.sub(r"[\x00-\x1F\x7F]", "", raw_text)
+                            # Try parsing again with sanitized text
+                            parsed_content = json.loads(sanitized_text)
+                            logger.info(
+                                "Successfully parsed JSON after sanitizing control characters"
+                            )
+                        except json.JSONDecodeError:
+                            logger.error(f"Failed to parse JSON from text: {str(e)}")
+                            return True, action_screenshot_saved
 
         # Step 4: Process the parsed content if available
         if parsed_content:
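The sanitize-and-retry pattern added here can be illustrated standalone (a minimal sketch using only the standard library, not the loop's actual code; `parse_model_json` is a hypothetical helper):

```python
import json
import re


def parse_model_json(raw_text: str) -> dict:
    """Parse JSON from model output, retrying after stripping control characters."""
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        # Remove ASCII control characters (0x00-0x1F) and DEL (0x7F), which
        # models occasionally emit inside string values, making the JSON invalid
        sanitized = re.sub(r"[\x00-\x1F\x7F]", "", raw_text)
        return json.loads(sanitized)  # may still raise; the caller decides how to fail


# A NUL byte inside a string value makes the first json.loads attempt fail
raw = '{"action": "click", "text": "ok\x00"}'
print(parse_model_json(raw))  # -> {'action': 'click', 'text': 'ok'}
```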
@@ -271,16 +271,19 @@ def create_agent(
         api_key = os.environ.get("ANTHROPIC_API_KEY", "")
 
     # Create LLM model object with appropriate parameters
-    provider_base_url = "http://localhost:8000/v1" if use_oaicompat else None
+    provider_base_url = "http://localhost:1234/v1" if use_oaicompat else None
 
     if use_oaicompat:
-        # Special handling for OAICOMPAT - use OPENAI provider with custom base URL
-        print(f"DEBUG - Creating OAICOMPAT agent with model: {model_name}")
+        # Special handling for OAICOMPAT - use OAICOMPAT provider with custom base URL
+        print(
+            f"DEBUG - Creating OAICOMPAT agent with model: {model_name}, URL: {provider_base_url}"
+        )
         llm = LLM(
-            provider=provider,  # Already set to OPENAI
+            provider=LLMProvider.OAICOMPAT,  # Set to OAICOMPAT instead of using original provider
             name=model_name,
             provider_base_url=provider_base_url,
         )
+        print(f"DEBUG - LLM provider is now: {llm.provider}, base URL: {llm.provider_base_url}")
         # Note: Don't pass use_oaicompat to the agent, as it doesn't accept this parameter
     elif provider == LLMProvider.OAICOMPAT:
         # This path is unlikely to be taken with our current approach
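Under the hood, OAICOMPAT works because servers like LM Studio (port 1234), vLLM, and Ollama expose the OpenAI chat-completions API, so any OpenAI-style client pointed at the custom base URL can drive them. A rough sketch of that mechanism using the `openai` package (illustrative only; this is not this repository's client code, and the model name is borrowed from the example above):

```python
from openai import OpenAI

# Any OpenAI-compatible server works; LM Studio's default port is shown here
client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="qwen2.5-vl-7b-instruct",  # illustrative local model name
    messages=[{"role": "user", "content": "Describe the current screen."}],
)
print(response.choices[0].message.content)
```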
@@ -461,8 +464,10 @@ def respond(
     # Special handling for OAICOMPAT to bypass provider-specific errors
     # Creates the agent with OPENAI provider but using custom model name and provider base URL
     is_oaicompat = str(provider) == "oaicompat"
-    if is_oaicompat:
-        provider = LLMProvider.OPENAI
+
+    # Don't override the provider for OAICOMPAT - instead pass it through
+    # if is_oaicompat:
+    #     provider = LLMProvider.OPENAI
 
     # Get API key based on provider
     if provider == LLMProvider.OPENAI: