mirror of
https://github.com/trycua/computer.git
synced 2025-12-31 02:19:58 -06:00
updated eval to use sandboxed decorator
This commit is contained in:
@@ -1,13 +1,63 @@
|
||||
import os
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import traceback
|
||||
import time
|
||||
from functools import wraps
|
||||
import urllib.request
|
||||
import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
# Wikirace prompt template
|
||||
# Placeholders filled with str.format(): {browser}, {start_page}, {target_page}.
WIKIRACE_PROMPT_TEMPLATE = """
You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}"
by clicking only on Wikipedia links within articles.

Rules:
1. Only click on links within Wikipedia articles (blue underlined text)
2. No using search, back button, or typing URLs
3. You MAY use cmd+f (or ctrl+f) to find text on the current page
4. Do NOT click any search icon or type into any search box unless it's a browser command
5. Try to find the shortest path possible
6. Current target: {target_page}
7. Do not maximize the window or use any other application
8. Avoid wasting actions by scrolling
9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps

Look at the current page and click on a link that might lead you closer to {target_page}.
"""
|
||||
|
||||
# Store original print function
|
||||
_print = print

# Define log file path
project_root = Path(__file__).parent.parent
log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"


# Custom print function that also logs to file
def print(*args, **kwargs):  # noqa: A001 - deliberately shadows the builtin
    """Print like the builtin and append a timestamped copy to ``log_file``.

    All positional and keyword arguments are forwarded unchanged to the
    original ``print``. The logged text is rebuilt from ``args`` using the
    same ``sep`` and ``end`` the caller passed (defaulting to a space and a
    newline, as the builtin does), prefixed with ``[YYYY-MM-DD HH:MM:SS]``.

    Args:
        *args: Objects to print; each is converted with ``str()``.
        **kwargs: Keyword arguments accepted by the builtin ``print``.
    """
    # Call the original print function
    _print(*args, **kwargs)

    # Mirror the builtin's handling of sep/end: None means "use the default".
    sep = kwargs.get("sep")
    if sep is None:
        sep = " "
    end = kwargs.get("end")
    if end is None:
        end = "\n"
    output = sep.join(str(arg) for arg in args) + end

    # Add timestamp
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {output}"

    # Append to log file. UTF-8 is explicit because messages in this file
    # contain emoji and arrows that a platform-default codec may not encode.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(log_entry)
|
||||
|
||||
# Load environment variables from .env file
|
||||
project_root = Path(__file__).parent.parent
|
||||
env_file = project_root / ".env"
|
||||
print(f"Loading environment from: {env_file}")
|
||||
from dotenv import load_dotenv
|
||||
@@ -29,104 +79,283 @@ from computer.helpers import sandboxed
|
||||
# Assuming these exist based on your request
|
||||
from agent import ComputerAgent, LLM, AgentLoop, LLMProvider
|
||||
|
||||
async def main():
|
||||
# Candidate article titles for the races, one title per line in the file.
articles = []

# Load from file
articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt"
with open(articles_file, "r", encoding="utf-8") as f:
    # Drop blank lines so random.choice(articles) can never yield an empty title.
    articles = [title for title in (line.strip() for line in f) if title]
|
||||
|
||||
|
||||
def get_article_links(article_title, timeout=10):
    """Get the titles of articles linked from a Wikipedia article.

    Issues one MediaWiki ``action=query&prop=links`` request (up to 500 links;
    continuation pages are not followed) and keeps only titles that contain no
    colon, are pure ASCII and are shorter than 50 characters.

    Args:
        article_title (str): Title of the article whose links are wanted.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        list: Linked article titles; empty if the page has no usable links or
        the request/parsing fails (the error is printed, not raised).
    """
    try:
        # Get the article content
        url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json"

        # A timeout keeps a stalled connection from hanging the whole evaluation.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            data = json.loads(response.read().decode())

        pages = data.get('query', {}).get('pages', {})
        if not pages:
            return []

        # Get the first (and only) page
        page = next(iter(pages.values()))
        links = page.get('links', [])

        # Keep only main-namespace articles: a colon in the title indicates
        # special pages, files, categories, etc.
        candidate_titles = (link.get('title', '') for link in links)
        return [
            title
            for title in candidate_titles
            if ':' not in title and title.isascii() and len(title) < 50
        ]

    except Exception as e:
        # Best effort by design: callers treat an empty list as "dead end".
        print(f"Error fetching links for {article_title}: {e}")
        return []
|
||||
|
||||
def wikipedia_random_walk(start_article, depth=5):
    """
    Walk randomly from article to article by following Wikipedia links.

    Args:
        start_article (str): Title of the article the walk begins at
        depth (int): Maximum number of hops to take

    Returns:
        list: Titles visited, in order, starting with ``start_article``.
        The walk stops early when an article yields no usable links.
    """
    visited = [start_article]

    for step_number in range(1, depth + 1):
        here = visited[-1]
        print(f"Step {step_number}: Currently at '{here}'")

        # Ask for the outgoing links of the article we are standing on
        candidates = get_article_links(here)
        if not candidates:
            print(f"No valid links found in '{here}'. Ending walk.")
            return visited

        # Hop to a uniformly chosen neighbour
        chosen = random.choice(candidates)
        visited.append(chosen)
        print(f" -> Moving to '{chosen}'")

    return visited
|
||||
|
||||
def get_article_pair(depth=5):
    """Pick a (start, target) pair of distinct article titles.

    The start is drawn at random from the module-level ``articles`` list and
    the target is the last article of a random walk of ``depth`` hops from it.
    Pairs whose walk ends where it began are discarded and redrawn.
    """
    while True:
        origin = random.choice(articles)
        destination = wikipedia_random_walk(origin, depth)[-1]
        if destination != origin:
            return origin, destination
|
||||
|
||||
async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
    """Set up a computer and execute one named evaluation scenario.

    Args:
        scenario_name: "messy_desktop" or "parallel_agents"; anything else is
            reported as unknown after the computer has been prepared.
        use_app_use: When truthy, the computer is created with the "app-use"
            experiment enabled.
        agent_configs: Passed through to the scenario function.
        max_steps: Passed through to the scenario function.

    Any exception raised during setup or the scenario is printed together
    with its traceback and not re-raised.
    """
    print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===")

    # Build the computer with or without the app-use experiment
    if use_app_use:
        enabled_experiments = ["app-use"]
    else:
        enabled_experiments = []
    machine = Computer(experiments=enabled_experiments)

    try:
        # Start the computer, then provision the sandbox the helpers run in
        await machine.run()
        await machine.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])

        # Dispatch to the matching scenario implementation
        known_scenarios = (
            ("messy_desktop", run_messy_desktop_scenario),
            ("parallel_agents", run_parallel_agents_scenario),
        )
        for known_name, scenario_runner in known_scenarios:
            if scenario_name == known_name:
                await scenario_runner(machine, agent_configs, max_steps)
                break
        else:
            print(f"Unknown scenario: {scenario_name}")

    except Exception as error:
        print(f"Error in scenario {scenario_name}: {error}")
        traceback.print_exc()
    # The computer is intentionally left running (stop() is not called here).
|
||||
|
||||
|
||||
@sandboxed("eval_env")
def close_all_windows():
    """Close all open windows.

    Every window reported by ``pywinctl.getAllWindows()`` is asked to close.
    Failures on individual windows are ignored so one stubborn window does
    not stop the rest from being closed.
    """
    # Imported here, not at module level, because pywinctl is installed in
    # the "eval_env" environment this function is decorated to run in.
    import pywinctl

    for window in pywinctl.getAllWindows():
        try:
            window.close()
        except Exception:
            # Some windows might not be closeable or may have already closed.
            # Only ordinary errors are ignored; KeyboardInterrupt/SystemExit
            # still propagate.
            pass
|
||||
|
||||
|
||||
@sandboxed("eval_env")
def get_current_wiki_page(app_name=None):
    """Return the article title shown by the first Wikipedia window found.

    Args:
        app_name: Optional application name (e.g., 'Safari', 'Firefox').
            When given, only windows whose app name contains it
            (case-insensitively) are considered.

    Returns:
        The window title up to " - Wikipedia", or None when no window title
        contains "Wikipedia".
    """
    import pywinctl

    candidates = pywinctl.getAllWindows()

    # Narrow down to the requested application, if any
    if app_name:
        wanted = app_name.lower()
        narrowed = []
        for window in candidates:
            owner = window.getAppName()
            if owner and wanted in owner.lower():
                narrowed.append(window)
        candidates = narrowed

    # The first window whose title mentions Wikipedia wins
    for window in candidates:
        window_title = window.title
        if window_title and "Wikipedia" in window_title:
            return window_title.split(" - Wikipedia")[0]

    return None
|
||||
|
||||
|
||||
@sandboxed("eval_env")
def get_open_app_names():
    """Return the app name of every open window that reports one.

    One entry is produced per window, so an application with several
    windows appears several times.
    """
    import pywinctl

    app_names = []
    for window in pywinctl.getAllWindows():
        owner = window.getAppName()
        if owner:
            app_names.append(owner)
    return app_names
|
||||
|
||||
def _computer():
    """Return ``computer.helpers._default_computer`` as it is right now.

    The attribute is looked up on every call rather than captured once at
    import time, so the caller sees whatever value it currently holds.
    """
    import computer.helpers as helpers

    return helpers._default_computer
|
||||
|
||||
async def open_app(app_name):
    """Open a specific application via the macOS ``open -a`` command.

    Args:
        app_name: Application name, e.g. "Notes" or "System Settings".
    """
    import shlex

    # shlex.quote handles spaces and embedded quotes, which the previous
    # hand-written '...' wrapping did not.
    # NOTE(review): assumes run_command hands the string to a POSIX shell.
    await _computer().interface.run_command(f"open -a {shlex.quote(app_name)}")
    await asyncio.sleep(2)  # Wait for app to open
|
||||
|
||||
|
||||
async def open_wiki(page, app_name="Safari"):
    """Open a specific Wikipedia page in the given application.

    Args:
        page: Article title; spaces are converted to underscores and the
            result is percent-encoded for use in the URL path.
        app_name: Application to open the URL with (default "Safari").
    """
    import shlex
    from urllib.parse import quote as url_quote

    # Titles come from the Wikipedia API / a data file and may contain
    # characters such as & ( ) ' ; that are special to the shell or invalid
    # in a URL, so encode the path and quote both arguments.
    # NOTE(review): assumes run_command hands the string to a POSIX shell.
    url = f"https://en.wikipedia.org/wiki/{url_quote(page.replace(' ', '_'))}"
    await _computer().interface.run_command(
        f"open -a {shlex.quote(app_name)} {shlex.quote(url)}"
    )
    await asyncio.sleep(2)  # Wait for page to load
|
||||
|
||||
|
||||
async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
|
||||
"""Run the messy desktop scenario with a single agent"""
|
||||
# Get popular wiki articles
|
||||
global articles
|
||||
start_page, target_page = get_article_pair(depth=1)
|
||||
|
||||
print(f"Wiki race: {start_page} → {target_page}")
|
||||
|
||||
# Close all windows first
|
||||
await close_all_windows()
|
||||
|
||||
# Open starting Wikipedia page
|
||||
await open_wiki(start_page)
|
||||
|
||||
# Open 3 random apps to create a messy desktop
|
||||
apps_to_open = ["Notes", "Terminal", "System Settings"]
|
||||
for app in apps_to_open:
|
||||
await open_app(app)
|
||||
|
||||
# Verify apps are open
|
||||
open_apps = await get_open_app_names()
|
||||
print(f"Open applications: {open_apps}")
|
||||
|
||||
# Create the agent's computer interface
|
||||
# If app-use is enabled, create a desktop limited to Safari/Firefox
|
||||
if "app-use" in (computer.experiments or []):
|
||||
browser_desktop = computer.create_desktop_from_apps(["Safari"])
|
||||
agent_computer = browser_desktop
|
||||
else:
|
||||
agent_computer = computer
|
||||
|
||||
# Run each agent configuration
|
||||
for config_name, loop_provider, model_provider in agent_configs:
|
||||
print(f"\n--- Testing Agent: {config_name} ---")
|
||||
|
||||
# Create agent with the specified configuration
|
||||
agent = ComputerAgent(
|
||||
computer=agent_computer,
|
||||
loop=loop_provider,
|
||||
model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
|
||||
)
|
||||
|
||||
# Run the wikirace
|
||||
steps = 0
|
||||
success = False
|
||||
start_time = time.time()
|
||||
|
||||
# Use the template with formatting for this scenario
|
||||
prompt = WIKIRACE_PROMPT_TEMPLATE.format(
|
||||
browser="Safari",
|
||||
start_page=start_page,
|
||||
target_page=target_page
|
||||
)
|
||||
|
||||
try:
|
||||
# Run the computer with default parameters
|
||||
await computer.run()
|
||||
|
||||
# Install required packages
|
||||
await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
|
||||
|
||||
# Helper functions for wikirace
|
||||
async def open_wiki(page):
|
||||
await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
|
||||
await asyncio.sleep(2) # Wait for page to load
|
||||
|
||||
# Remote functions for wikirace - using @sandboxed decorator
|
||||
@sandboxed("eval_env")
|
||||
def close_all_windows():
|
||||
import pywinctl
|
||||
windows = pywinctl.getAllWindows()
|
||||
for window in windows:
|
||||
try:
|
||||
window.close()
|
||||
except:
|
||||
# Some windows might not be closeable or may have already closed
|
||||
pass
|
||||
|
||||
@sandboxed("eval_env")
|
||||
def get_current_wiki_page():
|
||||
import pywinctl
|
||||
titles = pywinctl.getAllTitles()
|
||||
wiki_titles = [title for title in titles if "Wikipedia" in title]
|
||||
if wiki_titles:
|
||||
return wiki_titles[0].split(" - Wikipedia")[0]
|
||||
return None
|
||||
|
||||
# Wikirace setup
|
||||
max_steps = 2
|
||||
start_page = "Albert Einstein"
|
||||
target_page = "Pizza"
|
||||
|
||||
print(f"\nStarting Wikirace: {start_page} → {target_page}")
|
||||
|
||||
# Close all windows
|
||||
await close_all_windows()
|
||||
|
||||
# Open starting page
|
||||
await open_wiki(start_page)
|
||||
|
||||
# Check current page using decorated function
|
||||
current_page = await get_current_wiki_page()
|
||||
print(f"Starting page: {current_page}")
|
||||
assert current_page == start_page, f"Expected {start_page}, got {current_page}"
|
||||
|
||||
# Create agent
|
||||
agent = ComputerAgent(
|
||||
computer=computer,
|
||||
loop=AgentLoop.OPENAI,
|
||||
model=LLM(LLMProvider.OPENAI)
|
||||
)
|
||||
|
||||
# Run the wikirace
|
||||
steps = 0
|
||||
success = False
|
||||
start_time = time.time()
|
||||
|
||||
prompt = f"""
|
||||
You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}"
|
||||
by clicking only on Wikipedia links within articles.
|
||||
|
||||
Rules:
|
||||
1. Only click on links within Wikipedia articles (blue underlined text)
|
||||
2. No using search, back button, or typing URLs
|
||||
3. Try to find the shortest path possible
|
||||
4. Current target: {target_page}
|
||||
|
||||
Look at the current page and click on a link that might lead you closer to {target_page}.
|
||||
"""
|
||||
|
||||
try:
|
||||
while steps < max_steps and not success:
|
||||
async for result in agent.run(prompt):
|
||||
steps += 1
|
||||
print(f"Step {steps}: {result}")
|
||||
print(f"Step {steps}")
|
||||
|
||||
# Check again
|
||||
current_page = await get_current_wiki_page()
|
||||
def process_result():
|
||||
if result.get("content"):
|
||||
print(f"Agent: {result.get('content', '')}")
|
||||
|
||||
else:
|
||||
outputs = result.get("output", [])
|
||||
for output in outputs:
|
||||
if output.get("type") == "message":
|
||||
content = output.get("content", [])
|
||||
for content_part in content:
|
||||
if content_part.get("text"):
|
||||
print(f"Agent: {content_part.get('text', '')}")
|
||||
|
||||
elif output.get("type") == "reasoning":
|
||||
# if it's openAI, we only have access to a summary of the reasoning
|
||||
summary_content = output.get("summary", [])
|
||||
if summary_content:
|
||||
for summary_part in summary_content:
|
||||
if summary_part.get("type") == "summary_text":
|
||||
print(f"Agent: {summary_part.get('text', '')}")
|
||||
|
||||
else:
|
||||
summary_content = output.get("text", "")
|
||||
if summary_content:
|
||||
print(f"Agent: {summary_content}")
|
||||
|
||||
elif output.get("type") == "computer_call":
|
||||
action = output.get("action", {})
|
||||
action_type = action.get("type", "")
|
||||
if action_type:
|
||||
action_title = f"🛠️ Performing {action_type}"
|
||||
if action.get("x") and action.get("y"):
|
||||
action_title += f" at ({action['x']}, {action['y']})"
|
||||
print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
|
||||
|
||||
|
||||
# Process and print the result
|
||||
process_result()
|
||||
|
||||
# Check current page
|
||||
current_page = await get_current_wiki_page("Safari")
|
||||
print(f"Current page: {current_page}")
|
||||
print(f"Target: {target_page}")
|
||||
|
||||
# Check if we reached the target
|
||||
if current_page and target_page.lower() in current_page.lower():
|
||||
@@ -140,29 +369,298 @@ async def main():
|
||||
print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
|
||||
await agent._loop.cancel()
|
||||
break
|
||||
except asyncio.CancelledError:
|
||||
print("Agent stopped")
|
||||
|
||||
end_time = time.time()
|
||||
duration = end_time - start_time
|
||||
await asyncio.sleep(2) # Wait for agent to finish
|
||||
|
||||
# Results
|
||||
print(f"\n=== WIKIRACE RESULTS: {config_name} ===")
|
||||
print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
|
||||
print(f"Start: {start_page}")
|
||||
print(f"Target: {target_page}")
|
||||
print(f"Steps taken: {steps}")
|
||||
print(f"Success: {success}")
|
||||
print(f"Duration: {duration:.2f} seconds")
|
||||
|
||||
|
||||
def _print_agent_result(result, label):
    """Print the human-readable parts of one agent result, prefixed with ``label``."""
    prefix = f"{label} Agent: "

    if result.get("content"):
        print(f"{prefix}{result.get('content', '')}")
        return

    for output in result.get("output", []):
        output_type = output.get("type")

        if output_type == "message":
            for content_part in output.get("content", []):
                if content_part.get("text"):
                    print(f"{prefix}{content_part.get('text', '')}")

        elif output_type == "reasoning":
            # if it's openAI, we only have access to a summary of the reasoning
            summary_content = output.get("summary", [])
            if summary_content:
                for summary_part in summary_content:
                    if summary_part.get("type") == "summary_text":
                        print(f"{prefix}{summary_part.get('text', '')}")
            else:
                reasoning_text = output.get("text", "")
                if reasoning_text:
                    print(f"{prefix}{reasoning_text}")

        elif output_type == "computer_call":
            action = output.get("action", {})
            action_type = action.get("type", "")
            if action_type:
                action_title = f"🛠️ Performing {action_type}"
                # Compare against None so a coordinate of 0 is still reported.
                if action.get("x") is not None and action.get("y") is not None:
                    action_title += f" at ({action['x']}, {action['y']})"
                print(f"{prefix}{action_title}\n```json\n{json.dumps(action)}\n```")


async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
    """Run two agents in parallel, one using Safari and one using Firefox.

    Each browser gets its own random (start, target) article pair. For every
    entry of ``agent_configs`` two agents are created, a screenshot of each
    agent's desktop is saved, both agents race concurrently and their step
    count, success flag and duration are printed.

    Args:
        computer: Computer instance; when its ``experiments`` contain
            "app-use", each agent gets a desktop limited to its browser,
            otherwise both agents share ``computer``.
        agent_configs: Iterable of ``(config_name, loop_provider, model_provider)``.
        max_steps: Maximum number of agent results processed per agent.
    """
    # Get popular wiki articles
    safari_start, safari_target = get_article_pair(depth=1)
    firefox_start, firefox_target = get_article_pair(depth=1)

    print(f"Safari Wiki race: {safari_start} → {safari_target}")
    print(f"Firefox Wiki race: {firefox_start} → {firefox_target}")

    # Close all windows first
    await close_all_windows()

    # Open Safari with starting page
    await open_wiki(safari_start, "Safari")
    await asyncio.sleep(2)

    # Open Firefox with starting page
    await open_wiki(firefox_start, "Firefox")
    await asyncio.sleep(2)

    # Function to run a single agent
    async def run_agent(agent, prompt, browser, start_page, target_page, results):
        try:
            while results["steps"] < max_steps and not results["success"]:
                async for result in agent.run(prompt):
                    results["steps"] += 1
                    print(f"{browser} Step {results['steps']}")

                    # Process and print the result
                    _print_agent_result(result, browser)

                    # Check current page
                    current_page = await get_current_wiki_page(browser)
                    print(f"{browser} current page: {current_page}")
                    print(f"{browser} target: {target_page}")

                    # Check if we reached the target
                    if current_page and target_page.lower() in current_page.lower():
                        results["success"] = True
                        print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!")
                        await agent._loop.cancel()
                        break

                    # Check if we reached the maximum steps
                    if results["steps"] >= max_steps:
                        print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})")
                        await agent._loop.cancel()
                        break
        except asyncio.CancelledError:
            # The duration is derived from results["end_time"] (set below);
            # the handler previously computed it from an undefined start_time.
            await asyncio.sleep(2)  # Wait for agent to finish
            print(f"{browser} agent stopped")
        finally:
            results["end_time"] = time.time()

    # Create agent configurations
    for config_name, loop_provider, model_provider in agent_configs:
        print(f"\n--- Testing Parallel Agents: {config_name} ---")

        # Create the agent interfaces
        if "app-use" in (computer.experiments or []):
            safari_desktop = computer.create_desktop_from_apps(["Safari"])
            firefox_desktop = computer.create_desktop_from_apps(["Firefox"])
        else:
            safari_desktop = computer
            firefox_desktop = computer

        # Save screenshots
        screenshot_dir = project_root / "examples" / "evals" / "screenshots"
        screenshot_dir.mkdir(parents=True, exist_ok=True)
        safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png"
        firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png"
        safari_screenshot_path.write_bytes(await safari_desktop.interface.screenshot())
        firefox_screenshot_path.write_bytes(await firefox_desktop.interface.screenshot())

        # Create agents
        model = LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
        safari_agent = ComputerAgent(
            computer=safari_desktop,
            loop=loop_provider,
            model=model,
        )
        firefox_agent = ComputerAgent(
            computer=firefox_desktop,
            loop=loop_provider,
            model=model,
        )

        # Create prompts using the template
        safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
            browser="Safari",
            start_page=safari_start,
            target_page=safari_target,
        )
        firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
            browser="Firefox",
            start_page=firefox_start,
            target_page=firefox_target,
        )

        # Track results
        safari_results = {
            "steps": 0,
            "success": False,
            "start_time": time.time(),
            "end_time": None,
        }
        firefox_results = {
            "steps": 0,
            "success": False,
            "start_time": time.time(),
            "end_time": None,
        }

        # Run both agents in parallel
        await asyncio.gather(
            run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results),
            run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results),
        )

        # Wait for agents to finish
        await asyncio.sleep(2)

        # Print results
        print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===")
        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")

        summaries = (
            ("Safari", safari_start, safari_target, safari_results),
            ("Firefox", firefox_start, firefox_target, firefox_results),
        )
        for label, start_page, target_page, results in summaries:
            print(f"\n{label} Results:")
            print(f"Start: {start_page}")
            print(f"Target: {target_page}")
            print(f"Steps taken: {results['steps']}")
            print(f"Success: {results['success']}")
            print(f"Duration: {results['end_time'] - results['start_time']:.2f} seconds")
|
||||
|
||||
|
||||
async def main():
    """Run every evaluation scenario, with and without App-Use, several times.

    Each of the ``runs`` rounds executes the messy-desktop and parallel-agents
    scenarios, first without and then with the "app-use" experiment. Any
    exception escaping a round is printed with its traceback and ends the run.
    """
    try:
        # Define agent configurations to test
        agent_configs = [
            ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
            ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
            # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
        ]

        # # Run the test scenario without any agents
        # print("Running test scenario for sandboxed functions")
        # await run_test_scenario()

        # Set maximum steps for each agent run
        max_steps = 50
        runs = 5

        # run all scenarios
        for _ in range(runs):
            # Scenario 1: Messy desktop without App-Use
            await run_scenario("messy_desktop", False, agent_configs, max_steps)

            # Scenario 1: Messy desktop with App-Use
            await run_scenario("messy_desktop", True, agent_configs, max_steps)

            # Scenario 2: Parallel agents without App-Use
            await run_scenario("parallel_agents", False, agent_configs, max_steps)

            # Scenario 2: Parallel agents with App-Use
            await run_scenario("parallel_agents", True, agent_configs, max_steps)

    except Exception as e:
        print(f"Error in main: {e}")
        traceback.print_exc()
|
||||
|
||||
|
||||
async def run_test_scenario(max_iterations=5):
    """Check the sandboxed helpers by loading identical pages in Safari and Firefox.

    ``max_iterations`` articles are sampled from the module-level ``articles``
    list. Each one is opened in both browsers, and the title reported by
    ``get_current_wiki_page`` for Safari is compared with the one reported
    for Firefox; the outcome of every comparison is printed.
    """
    browsers = ("Safari", "Firefox")

    # Bring up a computer instance
    machine = Computer()
    await machine.run()

    # Draw the articles to test with
    chosen_articles = random.sample(articles, max_iterations)

    print(f"\n--- Running Test Scenario for {max_iterations} iterations ---")

    # Start from a clean desktop
    await close_all_windows()

    # Launch both browsers
    for browser in browsers:
        await open_app(browser)

    # Report what is running
    running_apps = await get_open_app_names()
    print(f"Open applications: {running_apps}")

    for iteration, article in enumerate(chosen_articles, start=1):
        print(f"\nIteration {iteration}/{max_iterations}: Testing with article '{article}'")

        # Load the same article in each browser
        for browser in browsers:
            await open_wiki(article, browser)
        await asyncio.sleep(3)  # Give a bit more time for both pages to load

        # Read back what each browser reports
        safari_page = await get_current_wiki_page("Safari")
        firefox_page = await get_current_wiki_page("Firefox")

        print(f"Safari page: {safari_page}")
        print(f"Firefox page: {firefox_page}")

        if safari_page != firefox_page:
            print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'")
        else:
            print(f"✅ MATCH: Both browsers show '{safari_page}'")

        await asyncio.sleep(1)  # Brief pause between iterations

    print("\n--- Test Scenario Completed ---")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the evaluation exactly once; a second asyncio.run(main()) here
    # would repeat every scenario from scratch.
    asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user