def _markdown_cell(value):
    """Render *value* as text that is safe inside a single markdown table cell."""
    # A raw "|" would open a new column and a line break would end the row,
    # so escape the former and flatten the latter.
    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def save_results_to_markdown():
    """Append every entry of ``all_results`` to ``results_file`` as a markdown table.

    Reads the module-level ``all_results`` list and ``results_file`` path.
    Each entry must be a dict with the keys ``scenario``, ``app_use``,
    ``browser``, ``config``, ``start``, ``target``, ``steps``, ``success`` and
    ``duration`` (a number, rendered with two decimals). An optional
    ``timestamp`` key is used for the row's first column when present;
    otherwise the time of this call is used.

    The file is opened in append mode and the whole list is written on every
    call, under a new "Results Update" heading, so earlier snapshots are kept.
    Prints "No results to save" and writes nothing when the list is empty.

    Raises:
        KeyError: if an entry lacks one of the required keys.
        OSError: if the results file cannot be created or written.
    """
    if not all_results:
        print("No results to save")
        return

    # Take the clock once so the heading and every fallback row timestamp agree.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |"
    separator = "|---|---|---|---|---|---|---|---|---|---|"

    rows = []
    for result in all_results:
        cells = [
            result.get("timestamp") or now,
            result["scenario"],
            result["app_use"],
            result["browser"],
            result["config"],
            result["start"],
            result["target"],
            result["steps"],
            result["success"],
            f"{result['duration']:.2f}",
        ]
        rows.append("| " + " | ".join(_markdown_cell(cell) for cell in cells) + " |")

    table = "\n".join([header, separator, *rows])

    # The evals directory may not exist yet on a fresh checkout.
    results_file.parent.mkdir(parents=True, exist_ok=True)

    # Article titles are frequently non-ASCII; do not rely on the locale default.
    with open(results_file, "a", encoding="utf-8") as f:
        f.write(f"\n\n## Results Update - {now}\n\n")
        f.write(table)

    print(f"Results saved to {results_file}")
""" @@ -36,6 +40,7 @@ _print = print # Define log file path project_root = Path(__file__).parent.parent log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt" +results_file = project_root / "examples" / "evals" / "eval_appuse_results.md" # Custom print function that also logs to file def print(*args, **kwargs): @@ -160,6 +165,36 @@ def get_article_pair(depth=5): target_article = wikipedia_random_walk(start_article, depth)[-1] return start_article, target_article + +def save_results_to_markdown(): + """Save all results to a markdown table""" + global all_results + + if not all_results: + print("No results to save") + return + + # Create header for the markdown table + header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |" + separator = "|---|---|---|---|---|---|---|---|---|---|" + + # Create rows for each result + rows = [] + for result in all_results: + timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + row = f"| {timestamp} | {result['scenario']} | {result['app_use']} | {result['browser']} | {result['config']} | {result['start']} | {result['target']} | {result['steps']} | {result['success']} | {result['duration']:.2f} |" + rows.append(row) + + # Combine header, separator, and rows + table = "\n".join([header, separator] + rows) + + # Write to file (append mode) + with open(results_file, "a") as f: + f.write(f"\n\n## Results Update - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") + f.write(table) + + print(f"Results saved to {results_file}") + async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30): """Run a specific evaluation scenario""" @@ -254,6 +289,7 @@ async def open_wiki(page, app_name="Safari"): async def run_messy_desktop_scenario(computer, agent_configs, max_steps): + global all_results """Run the messy desktop scenario with a single agent""" # Get popular wiki articles global articles @@ -292,7 +328,8 @@ async def 
run_messy_desktop_scenario(computer, agent_configs, max_steps): agent = ComputerAgent( computer=agent_computer, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_appuse" ) # Run the wikirace @@ -387,6 +424,7 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps): async def run_parallel_agents_scenario(computer, agent_configs, max_steps): + global all_results """Run two agents in parallel, one using Safari and one using Firefox""" # Get popular wiki articles @@ -436,13 +474,15 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps): safari_agent = ComputerAgent( computer=safari_desktop, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_parallel_safari" ) firefox_agent = ComputerAgent( computer=firefox_desktop, loop=loop_provider, - model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider + model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider, + trajectory_dir="examples/evals/trajectories/eval_parallel_firefox" ) # Create prompts using the template @@ -525,6 +565,24 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps): print(f"{browser} current page: {current_page}") print(f"{browser} target: {target_page}") + # Add result to global tracking + global all_results + current_result = { + 'scenario': 'parallel_agents', + 'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No', + 'browser': browser, + 'config': config_name, + 'start': start_page, + 'target': target_page, + 'steps': results['steps'], + 'success': results['success'], + 'duration': 
time.time() - results['start_time'] + } + all_results.append(current_result) + + # Save results after each step + save_results_to_markdown() + # Check if we reached the target if current_page and target_page.lower() in current_page.lower(): results["success"] = True @@ -575,9 +633,9 @@ async def main(): # Define agent configurations to test agent_configs = [ - ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), - ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), - # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) + # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI), + # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC), + ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL"))) ] # # Run the test scenario without any agents @@ -585,7 +643,7 @@ async def main(): # await run_test_scenario() # Set maximum steps for each agent run - max_steps = 50 + max_steps = 15 runs = 5 # run all scenarios