mirror of
https://github.com/trycua/computer.git
synced 2026-01-02 11:30:12 -06:00
215 lines
8.3 KiB
Python
215 lines
8.3 KiB
Python
"""
|
|
Watchdog Recovery Tests
|
|
Tests for the watchdog functionality to ensure server recovery after hanging commands.
|
|
Required environment variables:
|
|
- CUA_API_KEY: API key for C/ua cloud provider
|
|
- CUA_CONTAINER_NAME: Name of the container to use
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import pytest
|
|
from pathlib import Path
|
|
import sys
|
|
import traceback
|
|
import time
|
|
|
|
# Load environment variables from the .env file that sits one directory
# above the directory containing this file.
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from dotenv import load_dotenv

load_dotenv(env_file)

# Add every PYTHONPATH entry to sys.path if it is not already there.
# Entries are separated with os.pathsep (":" on POSIX, ";" on Windows);
# splitting on a literal ":" would cut Windows paths such as "C:\\src" apart.
# NOTE(review): presumably needed because PYTHONPATH may come from the .env
# file loaded above, i.e. after interpreter start-up - confirm.
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(os.pathsep):
    if path and path not in sys.path:
        sys.path.insert(0, path)  # Insert at beginning to prioritize
        print(f"Added to sys.path: {path}")

# Deliberately imported after sys.path has been extended.
from computer import Computer, VMProviderType
|
|
|
|
@pytest.fixture(scope="session")
async def computer():
    """Shared Computer instance for all test cases.

    Reads CUA_API_KEY and CUA_CONTAINER_NAME from the environment, starts a
    cloud-hosted Linux computer, yields it to the tests, and disconnects it
    during teardown.

    If either variable is missing or empty the dependent tests are skipped
    with an explicit message, instead of passing ``None`` as the API key or
    the literal string ``"None"`` (the result of ``str(None)``) as the
    container name.
    """
    api_key = os.getenv("CUA_API_KEY")
    container_name = os.getenv("CUA_CONTAINER_NAME")

    required = (("CUA_API_KEY", api_key), ("CUA_CONTAINER_NAME", container_name))
    missing = [var for var, value in required if not value]
    if missing:
        pytest.skip(f"Missing required environment variables: {', '.join(missing)}")

    # Create a remote Linux computer with C/ua
    instance = Computer(
        os_type="linux",
        api_key=api_key,
        name=container_name,
        provider_type=VMProviderType.CLOUD,
    )

    try:
        await instance.run()
        yield instance
    finally:
        # Runs even when run() raised, so a partially opened connection is
        # still released.
        await instance.disconnect()
|
|
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
async def test_simple_server_ping(computer):
    """
    Sanity check: confirm the server answers a trivial echo command before
    the watchdog tests run.
    """
    print("Testing basic server connectivity...")

    ping_command = "echo 'Server ping test'"
    try:
        reply = await computer.interface.run_command(ping_command)
        print(f"Ping successful: {reply}")
        # Kept inside the try block so that a None reply is reported through
        # the same failure path as any other error.
        assert reply is not None, "Server ping returned None"
        print("✅ Server connectivity test passed")
    except Exception as error:
        print(f"❌ Server ping failed: {error}")
        pytest.fail(f"Basic server connectivity test failed: {error}")
|
|
|
|
|
|
@pytest.mark.asyncio(loop_scope="session")
async def test_watchdog_recovery_after_hanging_command(computer):
    """
    Test that the watchdog can recover the server after a hanging command.

    This test runs two concurrent tasks:
    1. A long-running command that hangs the server (sleep 999999)
    2. Periodic ping commands every 30 seconds to test server responsiveness

    The watchdog should detect the unresponsive server and restart it.

    The test fails when no ping succeeds, when the ping task does not finish
    within the overall timeout, or when pings failed and later succeeded but
    the hanging command nevertheless ran to completion.
    """
    # Timing budget: the ping task needs at most 7 * 30s of waiting plus
    # 8 * 10s of ping timeouts = 290s, which fits in the 300s overall limit.
    ping_total = 8
    ping_interval = 30  # seconds between pings
    ping_timeout = 10.0  # seconds allowed for each ping
    overall_timeout = 300  # seconds allowed for both tasks together

    # The three possible fates of the hanging command. Keeping "cancelled by
    # this test at the timeout" separate from "interrupted" avoids reporting a
    # watchdog restart that was never observed.
    hang_completed = "completed naturally"
    hang_interrupted = "interrupted"
    hang_timed_out = "still running at test timeout"

    print("Starting watchdog recovery test...")

    async def hanging_command():
        """Run a command that sleeps far longer than the test and report its fate."""
        try:
            print("Starting hanging command (sleep 999999)...")
            # Use a very long sleep that should never complete naturally
            result = await computer.interface.run_command("sleep 999999")
            print(f"Hanging command completed unexpectedly: {result}")
            return hang_completed  # Should never reach here if watchdog works
        except Exception as e:
            print(f"Hanging command interrupted (expected if watchdog restarts): {e}")
            return hang_interrupted

    async def ping_server():
        """Ping the server every 30 seconds; return (successful, failed) counts."""
        successful_pings = 0
        failed_pings = 0

        try:
            for ping_count in range(1, ping_total + 1):
                try:
                    print(f"Ping #{ping_count}: Sending echo command...")

                    start_time = time.time()
                    result = await asyncio.wait_for(
                        computer.interface.run_command(
                            f"echo 'Ping {ping_count} at {int(start_time)}'"
                        ),
                        timeout=ping_timeout,
                    )
                    end_time = time.time()

                    print(
                        f"Ping #{ping_count} successful in "
                        f"{end_time - start_time:.2f}s: {result}"
                    )
                    successful_pings += 1

                except asyncio.TimeoutError:
                    print(f"Ping #{ping_count} timed out (server may be unresponsive)")
                    failed_pings += 1
                except Exception as e:
                    print(f"Ping #{ping_count} failed with exception: {e}")
                    failed_pings += 1

                # Don't wait after the last ping
                if ping_count < ping_total:
                    print(f"Waiting {ping_interval} seconds before next ping...")
                    await asyncio.sleep(ping_interval)

            print(f"Ping summary: {successful_pings} successful, {failed_pings} failed")
            return successful_pings, failed_pings

        except Exception as e:
            # Still hand back whatever was counted so the caller can assess it.
            print(f"Ping server function failed with critical error: {e}")
            traceback.print_exc()
            return successful_pings, failed_pings

    print("Starting concurrent tasks: hanging command and ping monitoring...")

    try:
        # Schedule both coroutines as tasks so they run concurrently
        hanging_task = asyncio.create_task(hanging_command())
        ping_task = asyncio.create_task(ping_server())

        # Wait for both tasks to complete or time out after 5 minutes
        done, pending = await asyncio.wait(
            [hanging_task, ping_task],
            timeout=overall_timeout,
            return_when=asyncio.ALL_COMPLETED,
        )

        # Cancel any pending tasks and wait for the cancellation to land
        for task in pending:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        # Collect the ping counts (None means the ping task never finished)
        ping_result = None
        if ping_task in done:
            try:
                ping_result = await ping_task
                print(f"Ping task completed with result: {ping_result}")
            except Exception as e:
                print(f"Error getting ping task result: {e}")
                traceback.print_exc()

        # A hanging task that is still pending was cancelled by this test,
        # not by the watchdog, so it gets its own outcome.
        if hanging_task in pending:
            hanging_outcome = hang_timed_out
        else:
            hanging_outcome = hang_interrupted
        if hanging_task in done:
            try:
                hanging_outcome = await hanging_task
                print(f"Hanging task completed with result: {hanging_outcome}")
            except Exception as e:
                print(f"Error getting hanging task result: {e}")
                traceback.print_exc()

        if ping_result is None:
            pytest.fail("Ping task did not complete - unable to assess server recovery")

        successful_pings, failed_pings = ping_result

        # Test passes if we had some successful pings, indicating recovery
        assert successful_pings > 0, "No successful pings detected. Server may not have recovered."

        # Report what happened to the hanging command
        if hanging_outcome == hang_interrupted:
            print("✅ SUCCESS: Hanging command was killed - watchdog restart detected")
        elif hanging_outcome == hang_completed:
            print("⚠️ WARNING: Hanging command completed naturally - watchdog may not have restarted")
        else:
            print(
                "⚠️ WARNING: Hanging command was still running at the test timeout and was "
                "cancelled by the test - no evidence that the watchdog killed it"
            )

        # If we had failures followed by successes, that indicates watchdog recovery
        if failed_pings > 0 and successful_pings > 0:
            print("✅ SUCCESS: Watchdog recovery detected - server became unresponsive then recovered")
            # The hanging command must not have run to completion.
            # NOTE(review): a command cancelled at the test timeout is still
            # accepted here, matching the previous pass/fail behaviour -
            # confirm whether that case should fail instead.
            assert hanging_outcome != hang_completed, (
                "Expected hanging command to be killed by watchdog restart"
            )
        elif successful_pings > 0 and failed_pings == 0:
            print("✅ SUCCESS: Server remained responsive throughout test")

        print(f"Test completed: {successful_pings} successful pings, {failed_pings} failed pings")
        print(f"Hanging command result: {hanging_outcome}")

    except Exception as e:
        print(f"Test failed with exception: {e}")
        traceback.print_exc()
        pytest.fail(f"Watchdog recovery test failed: {e}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run tests directly and propagate pytest's exit code, so that running
    # this file as a script exits non-zero when a test fails.
    sys.exit(pytest.main([__file__, "-v"]))
|