"""
Watchdog Recovery Tests
Tests for the watchdog functionality to ensure server recovery after hanging commands.
Required environment variables:
- CUA_API_KEY: API key for C/ua cloud provider
- CUA_CONTAINER_NAME: Name of the container to use
"""

import asyncio
import os
import sys
import time
import traceback
from pathlib import Path

import pytest
from dotenv import load_dotenv

# Load environment variables from the .env file at the project root.
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
load_dotenv(env_file)

# Make every PYTHONPATH entry importable. os.pathsep (":" on POSIX, ";" on
# Windows) is used so drive-letter paths such as "C:\\pkg" are not split apart.
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(os.pathsep):
    if path and path not in sys.path:
        sys.path.insert(0, path)  # Insert at beginning to prioritize
        print(f"Added to sys.path: {path}")

# Must stay below the sys.path setup above: the package may only be reachable
# through one of the PYTHONPATH entries that were just added.
from computer import Computer, VMProviderType  # noqa: E402

# Timing parameters of the recovery scenario, kept together so the ping
# schedule and the overall deadline cannot drift apart.
HANG_COMMAND_SECONDS = 300  # duration of the `sleep` that blocks the server
PING_COUNT = 8  # number of echo probes sent while the server is blocked
PING_INTERVAL_SECONDS = 30  # pause between two consecutive probes
PING_TIMEOUT_SECONDS = 10.0  # a probe slower than this counts as a failure
OVERALL_TIMEOUT_SECONDS = 300  # hard deadline for both concurrent tasks


@pytest.fixture(scope="session")
async def computer():
    """Shared Computer instance for all test cases.

    Creates a remote Linux computer on the cloud provider, starts it, yields
    it to the tests and disconnects afterwards. The tests are skipped when
    CUA_API_KEY or CUA_CONTAINER_NAME is not set, instead of trying to connect
    to a container literally named "None".
    """
    # NOTE(review): this is a session-scoped async fixture consumed by
    # `@pytest.mark.asyncio` tests; depending on the pytest-asyncio version and
    # its asyncio mode / loop scope settings this may need
    # `pytest_asyncio.fixture` or a session-scoped loop - confirm against the
    # project's pytest configuration.
    api_key = os.getenv("CUA_API_KEY")
    container_name = os.getenv("CUA_CONTAINER_NAME")
    required = (("CUA_API_KEY", api_key), ("CUA_CONTAINER_NAME", container_name))
    missing = [name for name, value in required if not value]
    if missing:
        pytest.skip(f"Missing required environment variable(s): {', '.join(missing)}")

    remote = Computer(
        os_type="linux",
        api_key=api_key,
        name=container_name,
        provider_type=VMProviderType.CLOUD,
    )

    try:
        await remote.run()
        yield remote
    finally:
        await remote.disconnect()


@pytest.mark.asyncio
async def test_simple_server_ping(computer):
    """
    Simple test to verify server connectivity before running watchdog tests.
    """
    print("Testing basic server connectivity...")

    # Only the remote call is guarded; the assertion lives outside the `try`
    # so pytest reports it with its normal assertion introspection.
    try:
        result = await computer.interface.run_command("echo 'Server ping test'")
    except Exception as e:
        print(f"❌ Server ping failed: {e}")
        pytest.fail(f"Basic server connectivity test failed: {e}")

    print(f"Ping successful: {result}")
    assert result is not None, "Server ping returned None"
    print("✅ Server connectivity test passed")


@pytest.mark.asyncio
async def test_watchdog_recovery_after_hanging_command(computer):
    """
    Test that the watchdog can recover the server after a hanging command.

    This test runs two concurrent tasks:
    1. A long-running command that hangs the server (sleep 300 = 5 minutes)
    2. Periodic ping commands every 30 seconds to test server responsiveness

    The watchdog should detect the unresponsive server and restart it. The
    test passes only if at least one ping succeeded AND the final ping
    succeeded, i.e. the server was responsive again at the end of the run.
    """
    print("Starting watchdog recovery test...")

    async def hanging_command():
        """Run the long `sleep` command; return its result, or None on error."""
        try:
            print(f"Starting hanging command (sleep {HANG_COMMAND_SECONDS})...")
            result = await computer.interface.run_command(f"sleep {HANG_COMMAND_SECONDS}")
            print(f"Hanging command completed: {result}")
            return result
        except Exception as e:
            # An error here is the expected outcome when the watchdog restarts
            # the server underneath the running command.
            print(f"Hanging command failed (expected if watchdog restarts): {e}")
            return None

    async def ping_server():
        """Send PING_COUNT echo probes; return one bool per probe, in order."""
        outcomes = []

        for attempt in range(1, PING_COUNT + 1):
            print(f"Ping #{attempt}: Sending echo command...")
            start_time = time.time()
            try:
                result = await asyncio.wait_for(
                    computer.interface.run_command(
                        f"echo 'Ping {attempt} at {int(start_time)}'"
                    ),
                    timeout=PING_TIMEOUT_SECONDS,
                )
            except asyncio.TimeoutError:
                print(f"Ping #{attempt} timed out (server may be unresponsive)")
                outcomes.append(False)
            except Exception as e:
                print(f"Ping #{attempt} failed with exception: {e}")
                outcomes.append(False)
            else:
                elapsed = time.time() - start_time
                print(f"Ping #{attempt} successful in {elapsed:.2f}s: {result}")
                outcomes.append(True)

            # Don't wait after the last ping.
            if attempt < PING_COUNT:
                print(f"Waiting {PING_INTERVAL_SECONDS} seconds before next ping...")
                await asyncio.sleep(PING_INTERVAL_SECONDS)

        successes = sum(outcomes)
        print(f"Ping summary: {successes} successful, {len(outcomes) - successes} failed")
        return outcomes

    # Run both tasks concurrently
    print("Starting concurrent tasks: hanging command and ping monitoring...")
    hanging_task = asyncio.create_task(hanging_command())
    ping_task = asyncio.create_task(ping_server())

    try:
        # asyncio.wait (not gather) so a deadline can be applied to both tasks.
        done, _pending = await asyncio.wait(
            {hanging_task, ping_task},
            timeout=OVERALL_TIMEOUT_SECONDS,
            return_when=asyncio.ALL_COMPLETED,
        )
    finally:
        # Never leave a task running past the test, whether the wait timed
        # out or was interrupted; then drain both so nothing is left pending.
        for task in (hanging_task, ping_task):
            if not task.done():
                task.cancel()
        await asyncio.gather(hanging_task, ping_task, return_exceptions=True)

    if hanging_task in done:
        try:
            print(f"Hanging task completed with result: {hanging_task.result()}")
        except Exception as e:
            print(f"Error getting hanging task result: {e}")
            traceback.print_exc()

    if ping_task not in done:
        pytest.fail("Ping task did not complete - unable to assess server recovery")

    outcomes = ping_task.result()
    successful_pings = sum(outcomes)
    failed_pings = len(outcomes) - successful_pings

    # At least one ping must have succeeded at all ...
    assert successful_pings > 0, "No successful pings detected. Server may not have recovered."
    # ... and the last one must have succeeded: an early success followed only
    # by failures means the server went down and never came back.
    assert outcomes[-1], (
        "Final ping failed - server was still unresponsive at the end of the test."
    )

    if failed_pings > 0:
        print("✅ SUCCESS: Watchdog recovery detected - server became unresponsive then recovered")
    else:
        print("✅ SUCCESS: Server remained responsive throughout test")

    print(f"Test completed: {successful_pings} successful pings, {failed_pings} failed pings")


if __name__ == "__main__":
    # Run tests directly and propagate pytest's exit status to the shell.
    sys.exit(pytest.main([__file__, "-v"]))