Added server recovery test

2026-05-08 08:09:43 -05:00 · 2025-07-01 12:40:38 -04:00
parent cd5c0fbb2a
commit 62832f7bea
1 changed files with 204 additions and 0 deletions
@@ -0,0 +1,204 @@
+"""
+Watchdog Recovery Tests
+Tests for the watchdog functionality to ensure server recovery after hanging commands.
+Required environment variables:
+- CUA_API_KEY: API key for C/ua cloud provider
+- CUA_CONTAINER_NAME: Name of the container to use
+"""
+
+import os
+import asyncio
+import pytest
+from pathlib import Path
+import sys
+import traceback
+import time
+
+# Load environment variables from the .env file that sits in the directory
+# above this file's directory.
+project_root = Path(__file__).parent.parent
+env_file = project_root / ".env"
+print(f"Loading environment from: {env_file}")
+from dotenv import load_dotenv
+
+load_dotenv(env_file)
+
+# Prepend every PYTHONPATH entry that is not already on sys.path.
+# Split on os.pathsep rather than a literal ":" so entries survive on
+# platforms whose separator differs (e.g. ";" with drive-letter paths).
+pythonpath = os.environ.get("PYTHONPATH", "")
+for path in pythonpath.split(os.pathsep):
+    if path and path not in sys.path:
+        sys.path.insert(0, path)  # Insert at beginning to prioritize
+        print(f"Added to sys.path: {path}")
+
+from computer import Computer, VMProviderType
+
+@pytest.fixture(scope="session")
+async def computer():
+    """Provide one cloud Linux Computer shared by all test cases.
+
+    Reads CUA_API_KEY and CUA_CONTAINER_NAME from the environment. When either
+    is unset or empty the dependent tests are skipped, instead of building a
+    Computer whose name is the literal string "None" (the result of
+    ``str(None)``).
+
+    Yields:
+        The Computer instance after ``run()`` has been awaited.
+        ``disconnect()`` is awaited on teardown.
+    """
+    # NOTE(review): this async generator is registered with plain
+    # @pytest.fixture; presumably the project runs pytest-asyncio in a mode
+    # that accepts that - confirm against the pytest configuration.
+    api_key = os.getenv("CUA_API_KEY")
+    container_name = os.getenv("CUA_CONTAINER_NAME")
+
+    required = (("CUA_API_KEY", api_key), ("CUA_CONTAINER_NAME", container_name))
+    missing = [var_name for var_name, value in required if not value]
+    if missing:
+        pytest.skip(f"Missing required environment variable(s): {', '.join(missing)}")
+
+    # Create a remote Linux computer with C/ua
+    instance = Computer(
+        os_type="linux",
+        api_key=api_key,
+        name=container_name,
+        provider_type=VMProviderType.CLOUD,
+    )
+
+    try:
+        await instance.run()
+        yield instance
+    finally:
+        # Runs on teardown and also when run() raises.
+        await instance.disconnect()
+
+
+@pytest.mark.asyncio
+async def test_simple_server_ping(computer):
+    """
+    Smoke-check that the server answers a trivial echo command.
+
+    Any exception raised while sending the command or checking its result is
+    reported through ``pytest.fail``.
+    """
+    print("Testing basic server connectivity...")
+
+    try:
+        reply = await computer.interface.run_command("echo 'Server ping test'")
+        print(f"Ping successful: {reply}")
+        assert reply is not None, "Server ping returned None"
+    except Exception as err:
+        print(f"❌ Server ping failed: {err}")
+        pytest.fail(f"Basic server connectivity test failed: {err}")
+    else:
+        print("✅ Server connectivity test passed")
+
+
+@pytest.mark.asyncio
+async def test_watchdog_recovery_after_hanging_command(computer):
+    """
+    Test that the server answers pings while a long-running command is in flight.
+
+    This test runs two concurrent tasks:
+    1. A long-running command (sleep 300 = 5 minutes)
+    2. Periodic echo "pings", 8 in total at 30-second intervals, each limited
+       to a 10-second timeout
+
+    Both tasks are given at most 5 minutes; whatever is still pending after
+    that is cancelled. The test fails when the ping task did not finish or
+    when no ping succeeded.
+
+    NOTE(review): the assertion only requires one successful ping, so a server
+    that answers once and then stays unresponsive still passes. Consider also
+    requiring the final ping to succeed if recovery must be proven.
+    """
+    # Schedule parameters, named so the loop bound and the
+    # "skip the wait after the last ping" check cannot drift apart.
+    ping_total = 8
+    ping_interval = 30  # seconds between pings
+    ping_timeout = 10.0  # seconds allowed for each ping
+    overall_timeout = 300  # seconds allowed for both tasks together
+
+    print("Starting watchdog recovery test...")
+
+    async def hanging_command():
+        """Execute a command that takes 5 minutes to complete.
+
+        Returns the command result, or None when the command raised.
+        """
+        try:
+            print("Starting hanging command (sleep 300)...")
+            result = await computer.interface.run_command("sleep 300")
+            print(f"Hanging command completed: {result}")
+            return result
+        except Exception as e:
+            print(f"Hanging command failed (expected if watchdog restarts): {e}")
+            return None
+
+    async def ping_server():
+        """Send the scheduled echo pings and count the outcomes.
+
+        Returns a ``(successful_pings, failed_pings)`` tuple, including when
+        an unexpected error interrupts the schedule.
+        """
+        successful_pings = 0
+        failed_pings = 0
+
+        try:
+            for ping_count in range(1, ping_total + 1):
+                try:
+                    print(f"Ping #{ping_count}: Sending echo command...")
+
+                    start_time = time.time()
+                    result = await asyncio.wait_for(
+                        computer.interface.run_command(f"echo 'Ping {ping_count} at {int(start_time)}'"),
+                        timeout=ping_timeout,
+                    )
+                    end_time = time.time()
+
+                    print(f"Ping #{ping_count} successful in {end_time - start_time:.2f}s: {result}")
+                    successful_pings += 1
+
+                except asyncio.TimeoutError:
+                    print(f"Ping #{ping_count} timed out (server may be unresponsive)")
+                    failed_pings += 1
+                except Exception as e:
+                    print(f"Ping #{ping_count} failed with exception: {e}")
+                    failed_pings += 1
+
+                # Wait before the next ping, but not after the last one
+                if ping_count < ping_total:
+                    print(f"Waiting {ping_interval} seconds before next ping...")
+                    await asyncio.sleep(ping_interval)
+
+            print(f"Ping summary: {successful_pings} successful, {failed_pings} failed")
+            return successful_pings, failed_pings
+
+        except Exception as e:
+            print(f"Ping server function failed with critical error: {e}")
+            traceback.print_exc()
+            return successful_pings, failed_pings
+
+    # Run both tasks concurrently
+    print("Starting concurrent tasks: hanging command and ping monitoring...")
+
+    try:
+        # Schedule both coroutines as tasks so they run concurrently
+        hanging_task = asyncio.create_task(hanging_command())
+        ping_task = asyncio.create_task(ping_server())
+
+        # Wait for both tasks to complete or time out after 5 minutes
+        done, pending = await asyncio.wait(
+            [hanging_task, ping_task],
+            timeout=overall_timeout,
+            return_when=asyncio.ALL_COMPLETED,
+        )
+
+        # Cancel any pending tasks and wait for the cancellation to land
+        for task in pending:
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass
+
+        # Get results from completed tasks
+        ping_result = None
+        hanging_result = None
+
+        if ping_task in done:
+            try:
+                ping_result = await ping_task
+                print(f"Ping task completed with result: {ping_result}")
+            except Exception as e:
+                print(f"Error getting ping task result: {e}")
+                traceback.print_exc()
+
+        if hanging_task in done:
+            try:
+                hanging_result = await hanging_task
+                print(f"Hanging task completed with result: {hanging_result}")
+            except Exception as e:
+                print(f"Error getting hanging task result: {e}")
+                traceback.print_exc()
+
+        # Analyze results. Compare against None explicitly: the result is a
+        # tuple of counters, not a flag.
+        if ping_result is None:
+            pytest.fail("Ping task did not complete - unable to assess server recovery")
+
+        successful_pings, failed_pings = ping_result
+
+        # Test passes if we had some successful pings
+        assert successful_pings > 0, "No successful pings detected. Server may not have recovered."
+
+        # Failures alongside successes suggest the server was unresponsive for a while
+        if failed_pings > 0:
+            print("✅ SUCCESS: Watchdog recovery detected - server became unresponsive then recovered")
+        else:
+            print("✅ SUCCESS: Server remained responsive throughout test")
+
+        print(f"Test completed: {successful_pings} successful pings, {failed_pings} failed pings")
+
+    except Exception as e:
+        print(f"Test failed with exception: {e}")
+        traceback.print_exc()
+        pytest.fail(f"Watchdog recovery test failed: {e}")
+
+
+if __name__ == "__main__":
+    # Run the tests in this file directly and hand pytest's return code to
+    # sys.exit, so a failing run does not exit with status 0.
+    sys.exit(pytest.main([__file__, "-v"]))