mirror of
https://github.com/trycua/computer.git
synced 2026-05-08 08:09:43 -05:00
Added server recovery test
This commit is contained in:
@@ -0,0 +1,204 @@
|
||||
"""
|
||||
Watchdog Recovery Tests
|
||||
Tests for the watchdog functionality to ensure server recovery after hanging commands.
|
||||
Required environment variables:
|
||||
- CUA_API_KEY: API key for C/ua cloud provider
|
||||
- CUA_CONTAINER_NAME: Name of the container to use
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import traceback
|
||||
import time
|
||||
|
||||
# Load environment variables from .env file
|
||||
project_root = Path(__file__).parent.parent
|
||||
env_file = project_root / ".env"
|
||||
print(f"Loading environment from: {env_file}")
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(env_file)
|
||||
|
||||
# Add paths to sys.path if needed
|
||||
pythonpath = os.environ.get("PYTHONPATH", "")
|
||||
for path in pythonpath.split(":"):
|
||||
if path and path not in sys.path:
|
||||
sys.path.insert(0, path) # Insert at beginning to prioritize
|
||||
print(f"Added to sys.path: {path}")
|
||||
|
||||
from computer import Computer, VMProviderType
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
async def computer():
|
||||
"""Shared Computer instance for all test cases."""
|
||||
# Create a remote Linux computer with C/ua
|
||||
computer = Computer(
|
||||
os_type="linux",
|
||||
api_key=os.getenv("CUA_API_KEY"),
|
||||
name=str(os.getenv("CUA_CONTAINER_NAME")),
|
||||
provider_type=VMProviderType.CLOUD,
|
||||
)
|
||||
|
||||
try:
|
||||
await computer.run()
|
||||
yield computer
|
||||
finally:
|
||||
await computer.disconnect()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_server_ping(computer):
|
||||
"""
|
||||
Simple test to verify server connectivity before running watchdog tests.
|
||||
"""
|
||||
print("Testing basic server connectivity...")
|
||||
|
||||
try:
|
||||
result = await computer.interface.run_command("echo 'Server ping test'")
|
||||
print(f"Ping successful: {result}")
|
||||
assert result is not None, "Server ping returned None"
|
||||
print("✅ Server connectivity test passed")
|
||||
except Exception as e:
|
||||
print(f"❌ Server ping failed: {e}")
|
||||
pytest.fail(f"Basic server connectivity test failed: {e}")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_watchdog_recovery_after_hanging_command(computer):
|
||||
"""
|
||||
Test that the watchdog can recover the server after a hanging command.
|
||||
|
||||
This test runs two concurrent tasks:
|
||||
1. A long-running command that hangs the server (sleep 300 = 5 minutes)
|
||||
2. Periodic ping commands every 30 seconds to test server responsiveness
|
||||
|
||||
The watchdog should detect the unresponsive server and restart it.
|
||||
"""
|
||||
print("Starting watchdog recovery test...")
|
||||
|
||||
async def hanging_command():
|
||||
"""Execute a command that takes 5 minutes to complete."""
|
||||
try:
|
||||
print("Starting hanging command (sleep 300)...")
|
||||
result = await computer.interface.run_command("sleep 300")
|
||||
print(f"Hanging command completed: {result}")
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"Hanging command failed (expected if watchdog restarts): {e}")
|
||||
return None
|
||||
|
||||
async def ping_server():
|
||||
"""Ping the server every 30 seconds with echo commands."""
|
||||
ping_count = 0
|
||||
successful_pings = 0
|
||||
failed_pings = 0
|
||||
|
||||
try:
|
||||
# Run pings for up to 4 minutes (8 pings at 30-second intervals)
|
||||
for i in range(8):
|
||||
try:
|
||||
ping_count += 1
|
||||
print(f"Ping #{ping_count}: Sending echo command...")
|
||||
|
||||
start_time = time.time()
|
||||
result = await asyncio.wait_for(
|
||||
computer.interface.run_command(f"echo 'Ping {ping_count} at {int(start_time)}'"),
|
||||
timeout=10.0 # 10 second timeout for each ping
|
||||
)
|
||||
end_time = time.time()
|
||||
|
||||
print(f"Ping #{ping_count} successful in {end_time - start_time:.2f}s: {result}")
|
||||
successful_pings += 1
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
print(f"Ping #{ping_count} timed out (server may be unresponsive)")
|
||||
failed_pings += 1
|
||||
except Exception as e:
|
||||
print(f"Ping #{ping_count} failed with exception: {e}")
|
||||
failed_pings += 1
|
||||
|
||||
# Wait 30 seconds before next ping
|
||||
if i < 7: # Don't wait after the last ping
|
||||
print(f"Waiting 30 seconds before next ping...")
|
||||
await asyncio.sleep(30)
|
||||
|
||||
print(f"Ping summary: {successful_pings} successful, {failed_pings} failed")
|
||||
return successful_pings, failed_pings
|
||||
|
||||
except Exception as e:
|
||||
print(f"Ping server function failed with critical error: {e}")
|
||||
traceback.print_exc()
|
||||
return successful_pings, failed_pings
|
||||
|
||||
# Run both tasks concurrently
|
||||
print("Starting concurrent tasks: hanging command and ping monitoring...")
|
||||
|
||||
try:
|
||||
# Use asyncio.gather to run both tasks concurrently
|
||||
hanging_task = asyncio.create_task(hanging_command())
|
||||
ping_task = asyncio.create_task(ping_server())
|
||||
|
||||
# Wait for both tasks to complete or timeout after 5 minutes
|
||||
done, pending = await asyncio.wait(
|
||||
[hanging_task, ping_task],
|
||||
timeout=300, # 5 minute timeout
|
||||
return_when=asyncio.ALL_COMPLETED
|
||||
)
|
||||
|
||||
# Cancel any pending tasks
|
||||
for task in pending:
|
||||
task.cancel()
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# Get results from completed tasks
|
||||
ping_result = None
|
||||
hanging_result = None
|
||||
|
||||
if ping_task in done:
|
||||
try:
|
||||
ping_result = await ping_task
|
||||
print(f"Ping task completed with result: {ping_result}")
|
||||
except Exception as e:
|
||||
print(f"Error getting ping task result: {e}")
|
||||
traceback.print_exc()
|
||||
|
||||
if hanging_task in done:
|
||||
try:
|
||||
hanging_result = await hanging_task
|
||||
print(f"Hanging task completed with result: {hanging_result}")
|
||||
except Exception as e:
|
||||
print(f"Error getting hanging task result: {e}")
|
||||
traceback.print_exc()
|
||||
|
||||
# Analyze results
|
||||
if ping_result:
|
||||
successful_pings, failed_pings = ping_result
|
||||
|
||||
# Test passes if we had some successful pings, indicating recovery
|
||||
assert successful_pings > 0, f"No successful pings detected. Server may not have recovered."
|
||||
|
||||
# If we had failures followed by successes, that indicates watchdog recovery
|
||||
if failed_pings > 0 and successful_pings > 0:
|
||||
print("✅ SUCCESS: Watchdog recovery detected - server became unresponsive then recovered")
|
||||
elif successful_pings > 0 and failed_pings == 0:
|
||||
print("✅ SUCCESS: Server remained responsive throughout test")
|
||||
|
||||
print(f"Test completed: {successful_pings} successful pings, {failed_pings} failed pings")
|
||||
else:
|
||||
pytest.fail("Ping task did not complete - unable to assess server recovery")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Test failed with exception: {e}")
|
||||
traceback.print_exc()
|
||||
pytest.fail(f"Watchdog recovery test failed: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run tests directly
|
||||
pytest.main([__file__, "-v"])
|
||||
Reference in New Issue
Block a user