updated eval to use sandboxed decorator

2026-02-14 18:39:56 -06:00 · 2025-06-03 21:19:46 -04:00
parent a7e56ce64a
commit 86d052d882
1 changed files with 606 additions and 108 deletions
--- a/examples/eval_examples.py
+++ b/examples/eval_examples.py
@@ -1,13 +1,63 @@
 import os
 import asyncio
+import json
+import random
 from pathlib import Path
 import sys
 import traceback
 import time
 from functools import wraps
+import urllib.request
+import datetime
+from urllib.parse import quote
+
+# Wikirace prompt template
+WIKIRACE_PROMPT_TEMPLATE = """
+You are playing Wikirace in {browser}! Your goal is to navigate from "{start_page}" to "{target_page}" 
+by clicking only on Wikipedia links within articles.
+
+Rules:
+1. Only click on links within Wikipedia articles (blue underlined text)
+2. No using search, back button, or typing URLs
+3. You MAY use cmd+f (or ctrl+f) to find text on the current page
+4. Do NOT click any search icon or type into any search box unless it's a browser command
+5. Try to find the shortest path possible
+6. Current target: {target_page}
+7. Do not maximize the window or use any other application
+8. Avoid wasting actions by scrolling
+9. Try using cmd+f and quickly clicking through relevant links in the page as you have a limited number of steps
+
+Look at the current page and click on a link that might lead you closer to {target_page}.
+"""
+
+# Store original print function
+_print = print
+
+# Define log file path
+project_root = Path(__file__).parent.parent
+log_file = project_root / "examples" / "evals" / "eval_appuse_log.txt"
+
+# Custom print function that also logs to file
+def print(*args, **kwargs):
+    # Call the original print function
+    _print(*args, **kwargs)
+    
+    # Format the output as a string
+    output = " ".join(str(arg) for arg in args)
+    if kwargs.get("end") is not None:
+        output += kwargs["end"]
+    else:
+        output += "\n"
+    
+    # Add timestamp
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    log_entry = f"[{timestamp}] {output}"
+    
+    # Append to log file
+    with open(log_file, "a") as f:
+        f.write(log_entry)

 # Load environment variables from .env file
-project_root = Path(__file__).parent.parent
 env_file = project_root / ".env"
 print(f"Loading environment from: {env_file}")
 from dotenv import load_dotenv
@@ -29,104 +79,283 @@ from computer.helpers import sandboxed
 # Assuming these exist based on your request
 from agent import ComputerAgent, LLM, AgentLoop, LLMProvider

-async def main():    
+articles = []
+
+# Load from file
+articles_file = project_root / "examples" / "evals" / "wikipedia_most_linked.txt"
+with open(articles_file, "r") as f:
+    articles = [line.strip() for line in f]
+
+
+def get_article_links(article_title):
+    """Get up to 500 link titles from a Wikipedia article, keeping only colon-free ASCII titles under 50 characters"""
    try:
-        print("\n=== Using cloud container ===")
-        # # Create a remote Linux computer with CUA
-        # computer = Computer(
-        #     os_type="linux",
-        #     api_key=os.getenv("CUA_API_KEY"),
-        #     name=str(os.getenv("CUA_CONTAINER_NAME")),
-        #     provider_type=VMProviderType.CLOUD,
-        # )
+        # Query the MediaWiki API for the article's outgoing links (up to 500 per request)
+        url = f"https://en.wikipedia.org/w/api.php?action=query&titles={quote(article_title)}&prop=links&pllimit=500&format=json"
        
-        # Connect to local macOS computer
-        computer = Computer()
+        with urllib.request.urlopen(url) as response:
+            data = json.loads(response.read().decode())
+            
+        pages = data.get('query', {}).get('pages', {})
+        if not pages:
+            return []
+        
+        # Get the first (and only) page
+        page = next(iter(pages.values()))
+        links = page.get('links', [])
+        
+        # Filter links to keep only main namespace articles (no special pages, files, etc.)
+        article_links = []
+        for link in links:
+            title = link.get('title', '')
+            # Keep only colon-free titles (a colon indicates special pages, files, categories, etc.) that are ASCII and under 50 characters
+            if ':' not in title and title.isascii() and len(title) < 50:
+                article_links.append(title)
+        
+        return article_links
+    
+    except Exception as e:
+        print(f"Error fetching links for {article_title}: {e}")
+        return []
+
+def wikipedia_random_walk(start_article, depth=5):
+    """
+    Perform a random walk through Wikipedia articles
+    
+    Args:
+        start_article (str): The article title to start from
+        depth (int): How many steps to take in the random walk
+    
+    Returns:
+        list: Path of article titles visited during the walk
+    """
+    path = [start_article]
+    current_article = start_article
+    
+    for step in range(depth):
+        print(f"Step {step + 1}: Currently at '{current_article}'")
+        
+        # Get links from current article
+        links = get_article_links(current_article)
+        
+        if not links:
+            print(f"No valid links found in '{current_article}'. Ending walk.")
+            break
+        
+        # Randomly select next article
+        next_article = random.choice(links)
+        path.append(next_article)
+        current_article = next_article
+        
+        print(f"  -> Moving to '{next_article}'")
+    
+    return path
+
+def get_article_pair(depth=5):
+    global articles
+    start_article = random.choice(articles)
+    target_article = wikipedia_random_walk(start_article, depth)[-1]
+    while target_article == start_article:
+        start_article = random.choice(articles)
+        target_article = wikipedia_random_walk(start_article, depth)[-1]
+    return start_article, target_article
+
+async def run_scenario(scenario_name, use_app_use, agent_configs, max_steps=30):
+    """Run a specific evaluation scenario"""
+    
+    print(f"\n=== Running Scenario: {scenario_name} (App-Use: {use_app_use}) ===")
+    
+    # Create computer instance with or without app-use experiment
+    experiments = ["app-use"] if use_app_use else []
+    computer = Computer(experiments=experiments)
+    
+    try:
+        # Run the computer
+        await computer.run()
+        
+        # Install required packages
+        await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
+        
+        # Run the specific scenario
+        if scenario_name == "messy_desktop":
+            await run_messy_desktop_scenario(computer, agent_configs, max_steps)
+        elif scenario_name == "parallel_agents":
+            await run_parallel_agents_scenario(computer, agent_configs, max_steps)
+        else:
+            print(f"Unknown scenario: {scenario_name}")
+    
+    except Exception as e:
+        print(f"Error in scenario {scenario_name}: {e}")
+        traceback.print_exc()
+    finally:
+        # NOTE: resource cleanup is currently disabled (computer.stop() is commented out below)
+        # await computer.stop()
+        pass
+
+
+@sandboxed("eval_env")
+def close_all_windows():
+    """Close all open windows"""
+    import pywinctl
+    windows = pywinctl.getAllWindows()
+    for window in windows:
+        try:
+            window.close()
+        except:
+            # Some windows might not be closeable or may have already closed
+            pass
+
+
+@sandboxed("eval_env")
+def get_current_wiki_page(app_name=None):
+    """Get the title of the current Wikipedia page
+    
+    Args:
+        app_name: Optional name of the app to check (e.g., 'Safari', 'Firefox')
+    """
+    import pywinctl
+    windows = pywinctl.getAllWindows()
+    
+    # Filter windows by app name if provided
+    if app_name:
+        windows = [w for w in windows if w.getAppName() and app_name.lower() in w.getAppName().lower()]
+    
+    # Get titles from filtered windows
+    titles = [w.title for w in windows if w.title]
+    wiki_titles = [title for title in titles if "Wikipedia" in title]
+    
+    if wiki_titles:
+        return wiki_titles[0].split(" - Wikipedia")[0]
+    return None
+
+
+@sandboxed("eval_env")
+def get_open_app_names():
+    """Get names of all open applications"""
+    import pywinctl
+    windows = pywinctl.getAllWindows()
+    return [window.getAppName() for window in windows if window.getAppName()]
+
+def _computer():
+    """Get the default computer instance"""
+    from computer.helpers import _default_computer
+    return _default_computer
+
+async def open_app(app_name):
+    """Open a specific application"""
+    await _computer().interface.run_command(f"open -a '{app_name}'")
+    await asyncio.sleep(2)  # Wait for app to open
+
+
+async def open_wiki(page, app_name="Safari"):
+    """Open a specific Wikipedia page"""
+    await _computer().interface.run_command(f"open -a {app_name} https://en.wikipedia.org/wiki/{page.replace(' ', '_')}")
+    await asyncio.sleep(2)  # Wait for page to load
+
+
+async def run_messy_desktop_scenario(computer, agent_configs, max_steps):
+    """Run the messy desktop scenario with a single agent"""
+    # Pick a start article and a target one random link hop away (depth=1)
+    global articles
+    start_page, target_page = get_article_pair(depth=1)
+    
+    print(f"Wiki race: {start_page} → {target_page}")
+    
+    # Close all windows first
+    await close_all_windows()
+    
+    # Open starting Wikipedia page
+    await open_wiki(start_page)
+    
+    # Open three fixed apps to create a messy desktop
+    apps_to_open = ["Notes", "Terminal", "System Settings"]
+    for app in apps_to_open:
+        await open_app(app)
+    
+    # Verify apps are open
+    open_apps = await get_open_app_names()
+    print(f"Open applications: {open_apps}")
+    
+    # Create the agent's computer interface
+    # If app-use is enabled, create a desktop limited to Safari
+    if "app-use" in (computer.experiments or []):
+        browser_desktop = computer.create_desktop_from_apps(["Safari"])
+        agent_computer = browser_desktop
+    else:
+        agent_computer = computer
+    
+    # Run each agent configuration
+    for config_name, loop_provider, model_provider in agent_configs:
+        print(f"\n--- Testing Agent: {config_name} ---")
+        
+        # Create agent with the specified configuration
+        agent = ComputerAgent(
+            computer=agent_computer,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+        
+        # Run the wikirace
+        steps = 0
+        success = False
+        start_time = time.time()
+        
+        # Use the template with formatting for this scenario
+        prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Safari",
+            start_page=start_page,
+            target_page=target_page
+        )
        
        try:
-            # Run the computer with default parameters
-            await computer.run()
-            
-            # Install required packages
-            await computer.venv_install("eval_env", ["pywinctl", "selenium", "beautifulsoup4"])
-
-            # Helper functions for wikirace
-            async def open_wiki(page):
-                await computer.interface.run_command(f"open https://en.wikipedia.org/wiki/{page.replace(' ', '_')} &")
-                await asyncio.sleep(2)  # Wait for page to load
-
-            # Remote functions for wikirace - using @sandboxed decorator
-            @sandboxed("eval_env")
-            def close_all_windows():
-                import pywinctl
-                windows = pywinctl.getAllWindows()
-                for window in windows:
-                    try:
-                        window.close()
-                    except:
-                        # Some windows might not be closeable or may have already closed
-                        pass
-
-            @sandboxed("eval_env")
-            def get_current_wiki_page():
-                import pywinctl
-                titles = pywinctl.getAllTitles()
-                wiki_titles = [title for title in titles if "Wikipedia" in title]
-                if wiki_titles:
-                    return wiki_titles[0].split(" - Wikipedia")[0]
-                return None
-
-            # Wikirace setup
-            max_steps = 2
-            start_page = "Albert Einstein"
-            target_page = "Pizza"
-            
-            print(f"\nStarting Wikirace: {start_page} → {target_page}")
-            
-            # Close all windows
-            await close_all_windows()
-            
-            # Open starting page
-            await open_wiki(start_page)
-            
-            # Check current page using decorated function
-            current_page = await get_current_wiki_page()
-            print(f"Starting page: {current_page}")
-            assert current_page == start_page, f"Expected {start_page}, got {current_page}"
-            
-            # Create agent
-            agent = ComputerAgent(
-                computer=computer,
-                loop=AgentLoop.OPENAI,
-                model=LLM(LLMProvider.OPENAI)
-            )
-            
-            # Run the wikirace
-            steps = 0
-            success = False
-            start_time = time.time()
-            
-            prompt = f"""
-            You are playing Wikirace! Your goal is to navigate from "{start_page}" to "{target_page}" 
-            by clicking only on Wikipedia links within articles.
-            
-            Rules:
-            1. Only click on links within Wikipedia articles (blue underlined text)
-            2. No using search, back button, or typing URLs
-            3. Try to find the shortest path possible
-            4. Current target: {target_page}
-            
-            Look at the current page and click on a link that might lead you closer to {target_page}.
-            """
-            
-            try: 
+            while steps < max_steps and not success: 
                async for result in agent.run(prompt):    
                    steps += 1
-                    print(f"Step {steps}: {result}")
+                    print(f"Step {steps}")
                    
-                    # Check again
-                    current_page = await get_current_wiki_page()
+                    def process_result():
+                        if result.get("content"):
+                            print(f"Agent: {result.get('content', '')}")
+
+                        else:
+                            outputs = result.get("output", [])
+                            for output in outputs:
+                                if output.get("type") == "message":
+                                    content = output.get("content", [])
+                                    for content_part in content:
+                                        if content_part.get("text"):
+                                            print(f"Agent: {content_part.get('text', '')}")
+
+                                elif output.get("type") == "reasoning":
+                                    # If it's OpenAI, we only have access to a summary of the reasoning
+                                    summary_content = output.get("summary", [])
+                                    if summary_content:
+                                        for summary_part in summary_content:
+                                            if summary_part.get("type") == "summary_text":
+                                                print(f"Agent: {summary_part.get('text', '')}")
+
+                                    else:
+                                        summary_content = output.get("text", "")
+                                        if summary_content:
+                                            print(f"Agent: {summary_content}")
+
+                                elif output.get("type") == "computer_call":
+                                    action = output.get("action", {})
+                                    action_type = action.get("type", "")
+                                    if action_type:
+                                        action_title = f"🛠️ Performing {action_type}"
+                                        if action.get("x") and action.get("y"):
+                                            action_title += f" at ({action['x']}, {action['y']})"
+                                        print(f"Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
+
+                    
+                    # Process and print the result
+                    process_result()
+                    
+                    # Check current page
+                    current_page = await get_current_wiki_page("Safari")
                    print(f"Current page: {current_page}")
+                    print(f"Target: {target_page}")
                    
                    # Check if we reached the target
                    if current_page and target_page.lower() in current_page.lower():
@@ -140,29 +369,298 @@ async def main():
                        print(f"❌ Stopping agent: Reached maximum steps ({max_steps})")
                        await agent._loop.cancel()
                        break
+        except asyncio.CancelledError:
+            print("Agent stopped")
+                        
+        end_time = time.time()
+        duration = end_time - start_time
+        await asyncio.sleep(2)  # Wait for agent to finish
+        
+        # Results
+        print(f"\n=== WIKIRACE RESULTS: {config_name} ===")
+        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
+        print(f"Start: {start_page}")
+        print(f"Target: {target_page}")
+        print(f"Steps taken: {steps}")
+        print(f"Success: {success}")
+        print(f"Duration: {duration:.2f} seconds")
+
+
+async def run_parallel_agents_scenario(computer, agent_configs, max_steps):
+    
+    """Run two agents in parallel, one using Safari and one using Firefox"""
+    # Pick an independent start/target article pair for each browser (target is one random link hop away)
+    global articles
+    safari_start, safari_target = get_article_pair(depth=1)
+    firefox_start, firefox_target = get_article_pair(depth=1)
+    
+    print(f"Safari Wiki race: {safari_start} → {safari_target}")
+    print(f"Firefox Wiki race: {firefox_start} → {firefox_target}")
+    
+    # Close all windows first
+    await close_all_windows()
+    
+    # Open Safari with starting page
+    await open_wiki(safari_start, "Safari")
+    await asyncio.sleep(2)
+    
+    # Open Firefox with starting page
+    await open_wiki(firefox_start, "Firefox")
+    await asyncio.sleep(2)
+    
+    # Run each agent configuration
+    for config_name, loop_provider, model_provider in agent_configs:
+        print(f"\n--- Testing Parallel Agents: {config_name} ---")
+        
+        # Create the agent interfaces
+        if "app-use" in (computer.experiments or []):
+            safari_desktop = computer.create_desktop_from_apps(["Safari"])
+            firefox_desktop = computer.create_desktop_from_apps(["Firefox"])
+        else:
+            safari_desktop = computer
+            firefox_desktop = computer
+        
+        # Save screenshots
+        screenshot_dir = project_root / "examples" / "evals" / "screenshots"
+        screenshot_dir.mkdir(exist_ok=True)
+        safari_screenshot_path = screenshot_dir / f"safari_{config_name}.png"
+        firefox_screenshot_path = screenshot_dir / f"firefox_{config_name}.png"
+        screenshot_bytes = await safari_desktop.interface.screenshot()
+        with open(safari_screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        screenshot_bytes = await firefox_desktop.interface.screenshot()
+        with open(firefox_screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+        
+        # Create agents
+        safari_agent = ComputerAgent(
+            computer=safari_desktop,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+        
+        firefox_agent = ComputerAgent(
+            computer=firefox_desktop,
+            loop=loop_provider,
+            model=LLM(model_provider) if not isinstance(model_provider, LLM) else model_provider
+        )
+        
+        # Create prompts using the template
+        safari_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Safari",
+            start_page=safari_start,
+            target_page=safari_target
+        )
+        
+        firefox_prompt = WIKIRACE_PROMPT_TEMPLATE.format(
+            browser="Firefox",
+            start_page=firefox_start,
+            target_page=firefox_target
+        )
+        
+        # Track results
+        safari_results = {
+            "steps": 0,
+            "success": False,
+            "start_time": time.time(),
+            "end_time": None
+        }
+        
+        firefox_results = {
+            "steps": 0,
+            "success": False,
+            "start_time": time.time(),
+            "end_time": None
+        }
+        
+        # Function to run a single agent
+        async def run_agent(agent, prompt, browser, start_page, target_page, results):
+            try:
+                while results["steps"] < max_steps and not results["success"]:
+                    async for result in agent.run(prompt):
+                        results["steps"] += 1
+                        print(f"{browser} Step {results['steps']}")
+                        
+                        def process_result():
+                            if result.get("content"):
+                                print(f"{browser} Agent: {result.get('content', '')}")
+
+                            else:
+                                outputs = result.get("output", [])
+                                for output in outputs:
+                                    if output.get("type") == "message":
+                                        content = output.get("content", [])
+                                        for content_part in content:
+                                            if content_part.get("text"):
+                                                print(f"{browser} Agent: {content_part.get('text', '')}")
+
+                                    elif output.get("type") == "reasoning":
+                                        # If it's OpenAI, we only have access to a summary of the reasoning
+                                        summary_content = output.get("summary", [])
+                                        if summary_content:
+                                            for summary_part in summary_content:
+                                                if summary_part.get("type") == "summary_text":
+                                                    print(f"{browser} Agent: {summary_part.get('text', '')}")
+
+                                        else:
+                                            summary_content = output.get("text", "")
+                                            if summary_content:
+                                                print(f"{browser} Agent: {summary_content}")
+
+                                    elif output.get("type") == "computer_call":
+                                        action = output.get("action", {})
+                                        action_type = action.get("type", "")
+                                        if action_type:
+                                            action_title = f"🛠️ Performing {action_type}"
+                                            if action.get("x") and action.get("y"):
+                                                action_title += f" at ({action['x']}, {action['y']})"
+                                            print(f"{browser} Agent: {action_title}\n```json\n{json.dumps(action)}\n```")
+
+                        
+                        # Process and print the result
+                        process_result()
+                        
+                        # Check current page
+                        current_page = await get_current_wiki_page(browser)
+                        print(f"{browser} current page: {current_page}")
+                        print(f"{browser} target: {target_page}") 
+                        
+                        # Check if we reached the target
+                        if current_page and target_page.lower() in current_page.lower():
+                            results["success"] = True
+                            print(f"🎉 {browser} SUCCESS! Reached {target_page} in {results['steps']} steps!")
+                            await agent._loop.cancel()
+                            break
+                        
+                        # Check if we reached the maximum steps
+                        if results["steps"] >= max_steps:
+                            print(f"❌ Stopping {browser} agent: Reached maximum steps ({max_steps})")
+                            await agent._loop.cancel()
+                            break
            except asyncio.CancelledError:
-                print("Agent stopped")
-                            
-            end_time = time.time()
-            duration = end_time - start_time
-            await asyncio.sleep(2) # Wait for agent to finish
+                print(f"{browser} agent stopped")
+            finally:
+                results["end_time"] = time.time()
+        
+        # Run both agents in parallel
+        await asyncio.gather(
+            run_agent(safari_agent, safari_prompt, "Safari", safari_start, safari_target, safari_results),
+            run_agent(firefox_agent, firefox_prompt, "Firefox", firefox_start, firefox_target, firefox_results)
+        )
+        
+        # Wait for agents to finish
+        await asyncio.sleep(2)
+        
+        # Print results
+        print(f"\n=== PARALLEL AGENTS RESULTS: {config_name} ===")
+        print(f"App-Use Enabled: {'Yes' if 'app-use' in (computer.experiments or []) else 'No'}")
+        
+        print(f"\nSafari Results:")
+        print(f"Start: {safari_start}")
+        print(f"Target: {safari_target}")
+        print(f"Steps taken: {safari_results['steps']}")
+        print(f"Success: {safari_results['success']}")
+        print(f"Duration: {safari_results['end_time'] - safari_results['start_time']:.2f} seconds")
+        
+        print(f"\nFirefox Results:")
+        print(f"Start: {firefox_start}")
+        print(f"Target: {firefox_target}")
+        print(f"Steps taken: {firefox_results['steps']}")
+        print(f"Success: {firefox_results['success']}")
+        print(f"Duration: {firefox_results['end_time'] - firefox_results['start_time']:.2f} seconds")
+
+
+async def main():
+    try:
+        
+        # Define agent configurations to test
+        agent_configs = [
+            ("OpenAI", AgentLoop.OPENAI, LLMProvider.OPENAI),
+            ("Anthropic", AgentLoop.ANTHROPIC, LLMProvider.ANTHROPIC),
+            # ("UITARS", AgentLoop.UITARS, LLM(LLMProvider.OAICOMPAT, name="tgi", provider_base_url=os.getenv("UITARS_BASE_URL")))
+        ]
+        
+        # # Run the test scenario without any agents
+        # print("Running test scenario for sandboxed functions")
+        # await run_test_scenario()
+        
+        # Set maximum steps for each agent run
+        max_steps = 50
+        runs = 5
+
+        # run all scenarios
+        for _ in range(runs):
+            # Scenario 1: Messy desktop without App-Use
+            await run_scenario("messy_desktop", False, agent_configs, max_steps)
            
-            # Results
-            print(f"\n=== WIKIRACE RESULTS ===")
-            print(f"Start: {start_page}")
-            print(f"Target: {target_page}")
-            print(f"Steps taken: {steps}")
-            print(f"Success: {success}")
-            print(f"Duration: {duration:.2f} seconds")
-        finally:
-            # Important to clean up resources
-            # await computer.stop()
-            pass
+            # Scenario 1: Messy desktop with App-Use
+            await run_scenario("messy_desktop", True, agent_configs, max_steps)
+            
+            # Scenario 2: Parallel agents without App-Use
+            await run_scenario("parallel_agents", False, agent_configs, max_steps)
+            
+            # Scenario 2: Parallel agents with App-Use
+            await run_scenario("parallel_agents", True, agent_configs, max_steps)
            
    except Exception as e:
        print(f"Error in main: {e}")
        traceback.print_exc()


+async def run_test_scenario(max_iterations=5):
+    """Test sandboxed functions by opening the same pages in Safari and Firefox and checking if they match
+    
+    This function opens the same Wikipedia pages in both browsers and verifies that
+    the get_current_wiki_page function returns the same result for both browsers.
+    It does this for the specified number of iterations.
+    """
+    
+    # Create computer instance
+    computer = Computer()
+    await computer.run()
+    
+    # Sample max_iterations random articles from the loaded list
+    global articles
+    selected_articles = random.sample(articles, max_iterations)
+    
+    print(f"\n--- Running Test Scenario for {max_iterations} iterations ---")
+    
+    # Close all windows first
+    await close_all_windows()
+    
+    # Open both browsers
+    await open_app("Safari")
+    await open_app("Firefox")
+    
+    # Verify browsers are open
+    open_apps = await get_open_app_names()
+    print(f"Open applications: {open_apps}")
+    
+    # Run test iterations
+    for i, article in enumerate(selected_articles):
+        print(f"\nIteration {i+1}/{max_iterations}: Testing with article '{article}'")
+        
+        # Open the same Wikipedia page in both browsers
+        await open_wiki(article, "Safari")
+        await open_wiki(article, "Firefox")
+        await asyncio.sleep(3)  # Give a bit more time for both pages to load
+        
+        # Check if both browsers show the same page
+        safari_page = await get_current_wiki_page("Safari")
+        firefox_page = await get_current_wiki_page("Firefox")
+        
+        print(f"Safari page: {safari_page}")
+        print(f"Firefox page: {firefox_page}")
+        
+        if safari_page == firefox_page:
+            print(f"✅ MATCH: Both browsers show '{safari_page}'")
+        else:
+            print(f"❌ MISMATCH: Safari shows '{safari_page}', Firefox shows '{firefox_page}'")
+        
+        await asyncio.sleep(1)  # Brief pause between iterations
+    
+    print("\n--- Test Scenario Completed ---")
+
+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())