From c2302eb6c607f074916051f411b3eeaec54aa38f Mon Sep 17 00:00:00 2001
From: Dillon DuPont <ddupont@mit.edu>
Date: Wed, 4 Jun 2025 09:46:09 -0400
Subject: [PATCH] Added results table

---
 examples/eval_examples.py | 74 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/examples/eval_examples.py b/examples/eval_examples.py
index 1978a897..b3d163ca 100644
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -11,13 +11,16 @@ import urllib.request
 import datetime
 from urllib.parse import quote
 
+# Global variable to track all results
+all_results = []
+
 # Wikirace prompt template
 WIKIRACE_PROMPT_TEMPLATE = """
 You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" 
 by clicking only on Wikipedia links within articles.
 
 Rules:
-1. Only click on links within Wikipedia articles (blue underlined text)
+1. Only click on links within Wikipedia articles (blue text)
 2. No using search, back button, or typing URLs
 3. You MAY use cmd+f (or ctrl+f) to find text on the current page
 4. Do NOT click any search icon or type into any search box unless it's a browser command
@@ -26,6 +29,7 @@ Rules:
 7. Do not maximize the window or use any other application
 8. Avoid wasting actions by scrolling
 9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps
+10. Stay on the English Wikipedia
 
 Look at the current page and click on a link that might lead you closer to {target_page}.
 """
@@ -36,6 +40,7 @@ _print = print
 # Define log file path
 project_root = Path(__file__).parent.parent
 log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"
+results_file = project_root / "examples" / "evals" / "eval_appuse_results.md"
 
 # Custom print function that also logs to file
 def print(*args, **kwargs):
@@ -160,6 +165,36 @@ def get_article_pair(depth=5):
         target_article = wikipedia_random_walk(start_article, depth)[-1]
     return start_article, target_article
 
+
+def save_results_to_markdown():
+    """Save all results to a markdown table"""
+    global all_results
+    
+    if not all_results:
+        print("No results to save")
+        return
+    
+    # Create header for the markdown table
+    header = "| Timestamp | Scenario | App-Use | Browser | Config | Start | Target | Steps | Success | Duration (s) |"
+    separator = "|---|---|---|---|---|---|---|---|---|---|"
+    
+    # Create rows for each result
+    rows = []
+    for result in all_results:
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        row = f"| {timestamp} | {result['scenario']} | {result['app_use']} | {result['browser']} | {result['config']} | {result['start']} | {result['target']} | {result['steps']} | {result['success']} | {result['duration']:.2f} |"
+        rows.append(row)
+    
+    # Combine header, separator, and rows
+    table = "\n".join([header, separator] + rows)
+    
+    # Write to file (append mode)
+    with open(results_file, "a") as f:
+        f.write(f"\n\n## Results Update - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+        f.write(table)
+    
+    print(f"Results saved to {results_file}")
+
 async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
     """Run a specific evaluation scenario"""
     
@@ -254,6 +289,7 @@ async def open_wiki(page, app_name="Safari"):
 
 
 async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
+    global all_results
     """Run the messy desktop scenario with a single agent"""
     # Get popular wiki articles
     global articles
@@ -292,7 +328,8 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
         agent = ComputerAgent(
             computer=agent_computer,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_appuse"
         )
         
         # Run the wikirace
@@ -387,6 +424,7 @@ async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
 
 
 async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
+    global all_results
     
     """Run two agents in parallel, one using Safari and one using Firefox"""
     # Get popular wiki articles
@@ -436,13 +474,15 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
         safari_agent = ComputerAgent(
             computer=safari_desktop,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_parallel_safari"
         )
         
         firefox_agent = ComputerAgent(
             computer=firefox_desktop,
             loop=loop_provider,
-            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider,
+            trajectory_dir="examples/evals/trajectories/eval_parallel_firefox"
         )
         
         # Create prompts using the template
@@ -525,6 +565,24 @@ async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
                         print(f"{browser} current page: {current_page}")
                         print(f"{browser} target: {target_page}") 
                         
+                        # Add result to global tracking
+                        global all_results
+                        current_result = {
+                            'scenario': 'parallel_agents',
+                            'app_use': 'Yes' if 'app-use' in (computer.experiments or []) else 'No',
+                            'browser': browser,
+                            'config': config_name,
+                            'start': start_page,
+                            'target': target_page,
+                            'steps': results['steps'],
+                            'success': results['success'],
+                            'duration': time.time() - results['start_time']
+                        }
+                        all_results.append(current_result)
+                        
+                        # Save results after each step
+                        save_results_to_markdown()
+                        
                         # Check if we reached the target
                         if current_page and target_page.lower() in current_page.lower():
                             results["success"] = True
@@ -575,9 +633,9 @@ async def main():
         
         # Define agent configurations to test
         agent_configs = [
-            ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
-            ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
-            # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
+            # ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
+            # ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
+            ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
         ]
         
         # # Run the test scenario without any agents
@@ -585,7 +643,7 @@ async def main():
         # await run_test_scenario()
         
         # Set maximum steps for each agent run
-        max_steps = 50
+        max_steps = 15
         runs = 5
 
         # run all scenarios