From 86d052d88278b4d58e1832d93a6d7fd50eaf6cf4 Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Tue, 3 Jun 2025 21:19:46 -0400 Subject: [PATCH] updated eval to use sandboxed decorator --- examples/eval_examples.py | 714 ++++++++++++++++++++++++++++++++------ 1 file changed, 606 insertions(+), 108 deletions(-) diff --git a/examples/eval_examples.py b/examples/eval_examples.py index 48da31be..1978a897 100644 --- a/examples/eval_examples.py +++ b/examples/eval_examples.py @@ -1,13 +1,63 @@ import os import asyncio +import json +import random from pathlib import Path import sys import traceback import time from functools import wraps +import urllib.request +import datetime +from urllib.parse import quote + +# Wikirace prompt template +WIKIRACE_PROMPT_TEMPLATE = """ +You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" +by clicking only on Wikipedia links within articles. + +Rules: +1. Only click on links within Wikipedia articles (blue underlined text) +2. No using search, back button, or typing URLs +3. You MAY use cmd+f (or ctrl+f) to find text on the current page +4. Do NOT click any search icon or type into any search box unless it's a browser command +5. Try to find the shortest path possible +6. Current target: {target_page} +7. Do not maximize the window or use any other application +8. Avoid wasting actions by scrolling +9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps + +Look at the current page and click on a link that might lead you closer to {target_page}. +""" + +# Store original print function +_print = print + +# Define log file path +project_root = Path(__file__).parent.parent +log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt" + +# Custom print function that also logs to file +def print(*args, **kwargs): + # Call the original print function + _print(*args, **kwargs) + + # Format the output as a string + output = " ".join(str(arg) for arg in args) + if kwargs.get("end") is not None: + output += kwargs["end"] + else: + output += "\n" + + # Add timestamp + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + log_entry = f"[{timestamp}] {output}" + + # Append to log file + with open(log_file, "a") as f: + f.write(log_entry) # Load environment variables from .env file -project_root = Path(__file__).parent.parent env_file = project_root / ".env" print(f"Loading environment from: {env_file}") from dotenv import load_dotenv @@ -29,104 +79,283 @@ from computer.helpers import sandboxed # Assuming these exist based on your request from agent import ComputerAgent, LLM, AgentLoop, LLMProvider -async def main(): +articles = [] + +# Load from file +articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt" +with open(articles_file, "r") as f: + articles = [line.strip() for line in f] + + +def get_article_links(article_title): + """Get all links from a Wikipedia article's content""" try: - print("\n=== Using cloud container ===") - # # Create a remote Linux computer with CUA - # computer = Computer( - # os_type="linux", - # api_key=os.getenv("CUA_API_KEY"), - # name=str(os.getenv("CUA_CONTAINER_NAME")), - # provider_type=VMProviderType.CLOUD, - # ) + # Get the article content + url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json" - # Connect to local macOS computer - computer = Computer() + with urllib.request.urlopen(url) as response: + data = json.loads(response.read().decode()) + + pages = data.get('query', {}).get('pages', {}) + if not pages: + return [] + + # Get the first (and only) page + page = next(iter(pages.values())) + links = page.get('links', []) + + # Filter links to keep only main namespace articles (no special pages, files, etc.) + article_links = [] + for link in links: + title = link.get('title', '') + # Skip if title contains colons (indicates special pages, files, categories, etc.) + if ':' not in title and title.isascii() and len(title) < 50: + article_links.append(title) + + return article_links + + except Exception as e: + print(f"Error fetching links for {article_title}: {e}") + return [] + +def wikipedia_random_walk(start_article, depth=5): + """ + Perform a random walk through Wikipedia articles + + Args: + start_article (str): The article title to start from + depth (int): How many steps to take in the random walk + + Returns: + list: Path of article titles visited during the walk + """ + path = [start_article] + current_article = start_article + + for step in range(depth): + print(f"Step {step + 1}: Currently at '{current_article}'") + + # Get links from current article + links = get_article_links(current_article) + + if not links: + print(f"No valid links found in '{current_article}'. Ending walk.") + break + + # Randomly select next article + next_article = random.choice(links) + path.append(next_article) + current_article = next_article + + print(f" -> Moving to '{next_article}'") + + return path + +def get_article_pair(depth=5): + global articles + start_article = random.choice(articles) + target_article = wikipedia_random_walk(start_article, depth)[-1] + while target_article == start_article: + start_article = random.choice(articles) + target_article = wikipedia_random_walk(start_article, depth)[-1] + return start_article, target_article + +async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30): + """Run a specific evaluation scenario""" + + print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===") + + # Create computer instance with or without app-use experiment + experiments = ["app-use"] if use_app_use else [] + computer = Computer(experiments=experiments) + + try: + # Run the computer + await computer.run() + + # Install required packages + await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) + + # Run the specific scenario + if scenario_name == "messy_desktop": + await run_messy_desktop_scenario(computer, agent_configs, max_steps) + elif scenario_name == "parallel_agents": + await run_parallel_agents_scenario(computer, agent_configs, max_steps) + else: + print(f"Unknown scenario: {scenario_name}") + + except Exception as e: + print(f"Error in scenario {scenario_name}: {e}") + traceback.print_exc() + finally: + # Important to clean up resources + # await computer.stop() + pass + + +@sandboxed("eval_env") +def close_all_windows(): + """Close all open windows""" + import pywinctl + windows = pywinctl.getAllWindows() + for window in windows: + try: + window.close() + except: + # Some windows might not be closeable or may have already closed + pass + + +@sandboxed("eval_env") +def get_current_wiki_page(app_name=None): + """Get the title of the current Wikipedia page + + Args: + app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox') + """ + import pywinctl + windows = pywinctl.getAllWindows() + + # Filter windows by app name if provided + if app_name: + windows = [w for w in windows if w.getAppName() and app_name.lower() in w.getAppName().lower()] + + # Get titles from filtered windows + titles = [w.title for w in windows if w.title] + wiki_titles = [title for title in titles if "Wikipedia" in title] + + if wiki_titles: + return wiki_titles[0].split(" - Wikipedia")[0] + return None + + +@sandboxed("eval_env") +def get_open_app_names(): + """Get names of all open applications""" + import pywinctl + windows = pywinctl.getAllWindows() + return [window.getAppName() for window in windows if window.getAppName()] + +def _computer(): + """Get the default computer instance""" + from computer.helpers import _default_computer + return _default_computer + +async def open_app(app_name): + """Open a specific application""" + await _computer().interface.run_command(f"open -a '{app_name}'") + await asyncio.sleep(2) # Wait for app to open + + +async def open_wiki(page, app_name="Safari"): + """Open a specific Wikipedia page""" + await _computer().interface.run_command(f"open -a {app_name} https://en.wikipedia.org/wiki/{page.replace(' ', '_')}") + await asyncio.sleep(2) # Wait for page to load + + +async def run_messy_desktop_scenario(computer, agent_configs, max_steps): + """Run the messy desktop scenario with a single agent""" + # Get popular wiki articles + global articles + start_page, target_page = get_article_pair(depth=1) + + print(f"Wiki race: {start_page} → {target_page}") + + # Close all windows first + await close_all_windows() + + # Open starting Wikipedia page + await open_wiki(start_page) + + # Open 3 random apps to create a messy desktop + apps_to_open = ["Notes", "Terminal", "System Settings"] + for app in apps_to_open: + await open_app(app) + + # Verify apps are open + open_apps = await get_open_app_names() + print(f"Open applications: {open_apps}") + + # Create the agent's computer interface + # If app-use is enabled, create a desktop limited to Safari/Firefox + if "app-use" in (computer.experiments or []): + browser_desktop = computer.create_desktop_from_apps(["Safari"]) + agent_computer = browser_desktop + else: + agent_computer = computer + + # Run each agent configuration + for config_name, loop_provider, model_provider in agent_configs: + print(f"\n--- Testing Agent: {config_name} ---") + + # Create agent with the specified configuration + agent = ComputerAgent( + computer=agent_computer, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + # Run the wikirace + steps = 0 + success = False + start_time = time.time() + + # Use the template with formatting for this scenario + prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Safari", + start_page=start_page, + target_page=target_page + ) try: - # Run the computer with default parameters - await computer.run() - - # Install required packages - await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"]) - - # Helper functions for wikirace - async def open_wiki(page): - await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &") - await asyncio.sleep(2) # Wait for page to load - - # Remote functions for wikirace - using @sandboxed decorator - @sandboxed("eval_env") - def close_all_windows(): - import pywinctl - windows = pywinctl.getAllWindows() - for window in windows: - try: - window.close() - except: - # Some windows might not be closeable or may have already closed - pass - - @sandboxed("eval_env") - def get_current_wiki_page(): - import pywinctl - titles = pywinctl.getAllTitles() - wiki_titles = [title for title in titles if "Wikipedia" in title] - if wiki_titles: - return wiki_titles[0].split(" - Wikipedia")[0] - return None - - # Wikirace setup - max_steps = 2 - start_page = "Albert Einstein" - target_page = "Pizza" - - print(f"\nStarting Wikirace: {start_page} → {target_page}") - - # Close all windows - await close_all_windows() - - # Open starting page - await open_wiki(start_page) - - # Check current page using decorated function - current_page = await get_current_wiki_page() - print(f"Starting page: {current_page}") - assert current_page == start_page, f"Expected {start_page}, got {current_page}" - - # Create agent - agent = ComputerAgent( - computer=computer, - loop=AgentLoop.OPENAI, - model=LLM(LLMProvider.OPENAI) - ) - - # Run the wikirace - steps = 0 - success = False - start_time = time.time() - - prompt = f""" - You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" - by clicking only on Wikipedia links within articles. - - Rules: - 1. Only click on links within Wikipedia articles (blue underlined text) - 2. No using search, back button, or typing URLs - 3. Try to find the shortest path possible - 4. Current target: {target_page} - - Look at the current page and click on a link that might lead you closer to {target_page}. - """ - - try: + while steps < max_steps and not success: async for result in agent.run(prompt): steps += 1 - print(f"Step {steps}: {result}") + print(f"Step {steps}") - # Check again - current_page = await get_current_wiki_page() + def process_result(): + if result.get("content"): + print(f"Agent: {result.get('content', '')}") + + else: + outputs = result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + print(f"Agent: {content_part.get('text', '')}") + + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + print(f"Agent: {summary_part.get('text', '')}") + + else: + summary_content = output.get("text", "") + if summary_content: + print(f"Agent: {summary_content}") + + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```") + + + # Process and print the result + process_result() + + # Check current page + current_page = await get_current_wiki_page("Safari") print(f"Current page: {current_page}") + print(f"Target: {target_page}") # Check if we reached the target if current_page and target_page.lower() in current_page.lower(): @@ -140,29 +369,298 @@ async def main(): print(f"❌ Stopping agent: Reached maximum steps ({max_steps})") await agent._loop.cancel() break + except asyncio.CancelledError: + print("Agent stopped") + + end_time = time.time() + duration = end_time - start_time + await asyncio.sleep(2) # Wait for agent to finish + + # Results + print(f"\n=== WIKIRACE RESULTS: {config_name} ===") + print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") + print(f"Start: {start_page}") + print(f"Target: {target_page}") + print(f"Steps taken: {steps}") + print(f"Success: {success}") + print(f"Duration: {duration:.2f} seconds") + + +async def run_parallel_agents_scenario(computer, agent_configs, max_steps): + + """Run two agents in parallel, one using Safari and one using Firefox""" + # Get popular wiki articles + global articles + safari_start, safari_target = get_article_pair(depth=1) + firefox_start, firefox_target = get_article_pair(depth=1) + + print(f"Safari Wiki race: {safari_start} → {safari_target}") + print(f"Firefox Wiki race: {firefox_start} → {firefox_target}") + + # Close all windows first + await close_all_windows() + + # Open Safari with starting page + await open_wiki(safari_start, "Safari") + await asyncio.sleep(2) + + # Open Firefox with starting page + await open_wiki(firefox_start, "Firefox") + await asyncio.sleep(2) + + # Create agent configurations + for config_name, loop_provider, model_provider in agent_configs: + print(f"\n--- Testing Parallel Agents: {config_name} ---") + + # Create the agent interfaces + if "app-use" in (computer.experiments or []): + safari_desktop = computer.create_desktop_from_apps(["Safari"]) + firefox_desktop = computer.create_desktop_from_apps(["Firefox"]) + else: + safari_desktop = computer + firefox_desktop = computer + + # Save screenshots + screenshot_dir = project_root / "examples" / "evals" / "screenshots" + screenshot_dir.mkdir(exist_ok=True) + safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png" + firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png" + screenshot_bytes = await safari_desktop.interface.screenshot() + with open(safari_screenshot_path, "wb") as f: + f.write(screenshot_bytes) + screenshot_bytes = await firefox_desktop.interface.screenshot() + with open(firefox_screenshot_path, "wb") as f: + f.write(screenshot_bytes) + + # Create agents + safari_agent = ComputerAgent( + computer=safari_desktop, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + firefox_agent = ComputerAgent( + computer=firefox_desktop, + loop=loop_provider, + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + ) + + # Create prompts using the template + safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Safari", + start_page=safari_start, + target_page=safari_target + ) + + firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format( + browser="Firefox", + start_page=firefox_start, + target_page=firefox_target + ) + + # Track results + safari_results = { + "steps": 0, + "success": False, + "start_time": time.time(), + "end_time": None + } + + firefox_results = { + "steps": 0, + "success": False, + "start_time": time.time(), + "end_time": None + } + + # Function to run a single agent + async def run_agent(agent, prompt, browser, start_page, target_page, results): + try: + while results["steps"] < max_steps and not results["success"]: + async for result in agent.run(prompt): + results["steps"] += 1 + print(f"{browser} Step {results['steps']}") + + def process_result(): + if result.get("content"): + print(f"{browser} Agent: {result.get('content', '')}") + + else: + outputs = result.get("output", []) + for output in outputs: + if output.get("type") == "message": + content = output.get("content", []) + for content_part in content: + if content_part.get("text"): + print(f"{browser} Agent: {content_part.get('text', '')}") + + elif output.get("type") == "reasoning": + # if it's openAI, we only have access to a summary of the reasoning + summary_content = output.get("summary", []) + if summary_content: + for summary_part in summary_content: + if summary_part.get("type") == "summary_text": + print(f"{browser} Agent: {summary_part.get('text', '')}") + + else: + summary_content = output.get("text", "") + if summary_content: + print(f"{browser} Agent: {summary_content}") + + elif output.get("type") == "computer_call": + action = output.get("action", {}) + action_type = action.get("type", "") + if action_type: + action_title = f"🛠️ Performing {action_type}" + if action.get("x") and action.get("y"): + action_title += f" at ({action['x']}, {action['y']})" + print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```") + + + # Process and print the result + process_result() + + # Check current page + current_page = await get_current_wiki_page(browser) + print(f"{browser} current page: {current_page}") + print(f"{browser} target: {target_page}") + + # Check if we reached the target + if current_page and target_page.lower() in current_page.lower(): + results["success"] = True + print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!") + await agent._loop.cancel() + break + + # Check if we reached the maximum steps + if results["steps"] >= max_steps: + print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})") + await agent._loop.cancel() + break except asyncio.CancelledError: - print("Agent stopped") - - end_time = time.time() - duration = end_time - start_time - await asyncio.sleep(2) # Wait for agent to finish + print(f"{browser} agent stopped") + finally: + results["end_time"] = time.time() + + # Run both agents in parallel + await asyncio.gather( + run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results), + run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results) + ) + + # Wait for agents to finish + await asyncio.sleep(2) + + # Print results + print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===") + print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}") + + print(f"\nSafari Results:") + print(f"Start: {safari_start}") + print(f"Target: {safari_target}") + print(f"Steps taken: {safari_results['steps']}") + print(f"Success: {safari_results['success']}") + print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds") + + print(f"\nFirefox Results:") + print(f"Start: {firefox_start}") + print(f"Target: {firefox_target}") + print(f"Steps taken: {firefox_results['steps']}") + print(f"Success: {firefox_results['success']}") + print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds") + + +async def main(): + try: + + # Define agent configurations to test + agent_configs = [ + ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), + ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), + # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) + ] + + # # Run the test scenario without any agents + # print("Running test scenario for sandboxed functions") + # await run_test_scenario() + + # Set maximum steps for each agent run + max_steps = 50 + runs = 5 + + # run all scenarios + for _ in range(runs): + # Scenario 1: Messy desktop without App-Use + await run_scenario("messy_desktop", False, agent_configs, max_steps) - # Results - print(f"\n=== WIKIRACE RESULTS ===") - print(f"Start: {start_page}") - print(f"Target: {target_page}") - print(f"Steps taken: {steps}") - print(f"Success: {success}") - print(f"Duration: {duration:.2f} seconds") - finally: - # Important to clean up resources - # await computer.stop() - pass + # Scenario 1: Messy desktop with App-Use + await run_scenario("messy_desktop", True, agent_configs, max_steps) + + # Scenario 2: Parallel agents without App-Use + await run_scenario("parallel_agents", False, agent_configs, max_steps) + + # Scenario 2: Parallel agents with App-Use + await run_scenario("parallel_agents", True, agent_configs, max_steps) except Exception as e: print(f"Error in main: {e}") traceback.print_exc() +async def run_test_scenario(max_iterations=5): + """Test sandboxed functions by opening the same pages in Safari and Firefox and checking if they match + + This function opens the same Wikipedia pages in both browsers and verifies that + the get_current_wiki_page function returns the same result for both browsers. + It does this for the specified number of iterations. + """ + + # Create computer instance + computer = Computer() + await computer.run() + + # Get popular wiki articles + global articles + selected_articles = random.sample(articles, max_iterations) + + print(f"\n--- Running Test Scenario for {max_iterations} iterations ---") + + # Close all windows first + await close_all_windows() + + # Open both browsers + await open_app("Safari") + await open_app("Firefox") + + # Verify browsers are open + open_apps = await get_open_app_names() + print(f"Open applications: {open_apps}") + + # Run test iterations + for i, article in enumerate(selected_articles): + print(f"\nIteration {i+1}/{max_iterations}: Testing with article '{article}'") + + # Open the same Wikipedia page in both browsers + await open_wiki(article, "Safari") + await open_wiki(article, "Firefox") + await asyncio.sleep(3) # Give a bit more time for both pages to load + + # Check if both browsers show the same page + safari_page = await get_current_wiki_page("Safari") + firefox_page = await get_current_wiki_page("Firefox") + + print(f"Safari page: {safari_page}") + print(f"Firefox page: {firefox_page}") + + if safari_page == firefox_page: + print(f"✅ MATCH: Both browsers show '{safari_page}'") + else: + print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'") + + await asyncio.sleep(1) # Brief pause between iterations + + print("\n--- Test Scenario Completed ---") + + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main())