From 0fee673acb9788e10142e0e96db5cd095537ba4f Mon Sep 17 00:00:00 2001 From: Admin9705 <9705@duck.com> Date: Sun, 22 Jun 2025 20:39:19 -0400 Subject: [PATCH] Add health check and graceful shutdown support - Implemented health check endpoint for Docker and orchestration systems. - Added graceful shutdown configuration in Docker Compose and application code. - Enhanced shutdown handling in main application and background tasks for improved diagnostics. - Updated Dockerfile to include health check command. - Introduced readiness check endpoint for Kubernetes-style orchestration. --- Dockerfile | 4 ++ docker-compose.yml | 10 ++++ main.py | 48 ++++++++++++++++-- src/primary/background.py | 28 ++++++++--- src/primary/routes/common.py | 94 ++++++++++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5379b2b0..77dc10c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,5 +30,9 @@ ENV TZ=UTC # Expose port EXPOSE 9705 +# Add health check for Docker +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:9705/health || exit 1 + # Run the main application using the new entry point CMD ["python3", "main.py"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 8fcae13c..3b2f6a7d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,16 @@ services: - TZ=${TZ:-UTC} - BASE_URL=${BASE_URL:-} restart: unless-stopped + # Graceful shutdown configuration + stop_signal: SIGTERM + stop_grace_period: 30s + # Health check configuration + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9705/health"] + interval: 30s + timeout: 10s + start_period: 40s + retries: 3 volumes: huntarr-config: diff --git a/main.py b/main.py index ab70532d..c6d486ea 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,7 @@ import signal import logging # Use standard logging for initial setup import atexit import time +import time # Import path configuration early to set up environment try: @@ -144,6 +145,14 @@ except Exception as e: waitress_server = None shutdown_requested = threading.Event() +# Global shutdown flag for health checks +_global_shutdown_flag = False + +def is_shutting_down(): + """Check if the application is shutting down""" + global _global_shutdown_flag + return _global_shutdown_flag or shutdown_requested.is_set() or stop_event.is_set() + def refresh_sponsors_on_startup(): """Refresh sponsors database from manifest.json on startup""" import os @@ -317,7 +326,15 @@ def run_web_server(): def main_shutdown_handler(signum, frame): """Gracefully shut down the application.""" - huntarr_logger.info(f"Received signal {signum}. Initiating graceful shutdown...") + global _global_shutdown_flag + _global_shutdown_flag = True # Set global shutdown flag immediately + + signal_name = "SIGINT" if signum == signal.SIGINT else "SIGTERM" if signum == signal.SIGTERM else f"Signal {signum}" + huntarr_logger.info(f"Received {signal_name}. Initiating graceful shutdown...") + + # Set a reasonable timeout for shutdown operations + shutdown_start_time = time.time() + shutdown_timeout = 30 # 30 seconds total shutdown timeout # Immediate database checkpoint to prevent corruption try: @@ -360,12 +377,19 @@ def main_shutdown_handler(signum, frame): waitress_server.close() except Exception as e: huntarr_logger.warning(f"Error closing Waitress server: {e}") + + # Force exit if shutdown takes too long (Docker container update scenario) + elapsed_time = time.time() - shutdown_start_time + if elapsed_time > shutdown_timeout: + huntarr_logger.warning(f"Shutdown timeout exceeded ({shutdown_timeout}s). Forcing exit with code 0.") + os._exit(0) # Clean exit for Docker updates def cleanup_handler(): """Cleanup function called at exit""" + cleanup_start_time = time.time() huntarr_logger.info("Exit cleanup handler called") - # Shutdown databases gracefully + # Shutdown databases gracefully with timeout try: from primary.utils.database import get_database, get_logs_database @@ -377,7 +401,8 @@ def cleanup_handler(): try: with main_db.get_connection() as conn: conn.execute("PRAGMA wal_checkpoint(TRUNCATE)") # Flush WAL to main database - conn.execute("VACUUM") # Optimize database before shutdown + # Skip VACUUM for faster shutdown during updates + huntarr_logger.debug("Main database WAL checkpoint completed") except Exception as db_error: huntarr_logger.warning(f"Error during main database cleanup: {db_error}") @@ -388,7 +413,8 @@ def cleanup_handler(): try: with logs_db.get_logs_connection() as conn: conn.execute("PRAGMA wal_checkpoint(TRUNCATE)") # Flush WAL to logs database - conn.execute("VACUUM") # Optimize logs database before shutdown + # Skip VACUUM for faster shutdown during updates + huntarr_logger.debug("Logs database WAL checkpoint completed") except Exception as logs_error: huntarr_logger.warning(f"Error during logs database cleanup: {logs_error}") @@ -397,10 +423,15 @@ def cleanup_handler(): except Exception as e: huntarr_logger.warning(f"Error during database shutdown: {e}") + # Ensure stop events are set if not stop_event.is_set(): stop_event.set() if not shutdown_requested.is_set(): shutdown_requested.set() + + # Log cleanup timing for Docker update diagnostics + cleanup_duration = time.time() - cleanup_start_time + huntarr_logger.info(f"Cleanup completed in {cleanup_duration:.2f} seconds") def main(): """Main entry point function for Huntarr application. @@ -511,7 +542,14 @@ def main(): # shutdown_threads() # Uncomment if primary.main.shutdown_threads() does more cleanup huntarr_logger.info("--- Huntarr Main Process Exiting ---") - return 0 # Success exit code + + # Return appropriate exit code based on shutdown reason + if shutdown_requested.is_set() or stop_event.is_set(): + huntarr_logger.info("Clean shutdown completed - Exit code 0") + return 0 # Clean shutdown + else: + huntarr_logger.warning("Unexpected shutdown - Exit code 1") + return 1 # Unexpected shutdown if __name__ == '__main__': diff --git a/src/primary/background.py b/src/primary/background.py index 305b6524..067179ad 100644 --- a/src/primary/background.py +++ b/src/primary/background.py @@ -651,11 +651,17 @@ def check_and_restart_threads(): def shutdown_handler(signum, frame): """Handle termination signals (SIGINT, SIGTERM).""" - logger.info(f"Received signal {signum}. Initiating shutdown...") + signal_name = "SIGINT" if signum == signal.SIGINT else "SIGTERM" if signum == signal.SIGTERM else f"Signal {signum}" + logger.info(f"Received {signal_name}. Initiating background tasks shutdown...") stop_event.set() # Signal all threads to stop + + # Log shutdown progress for Docker diagnostics + logger.info("Background shutdown initiated - threads will stop gracefully") def shutdown_threads(): """Wait for all threads to finish.""" + import time + shutdown_start = time.time() logger.info("Waiting for all app threads to stop...") # Stop the hourly API cap scheduler @@ -688,12 +694,22 @@ def shutdown_threads(): except Exception as e: logger.error(f"Error stopping schedule action engine: {e}") - # Wait for all threads to terminate - for thread in app_threads.values(): - if thread.is_alive(): - thread.join(timeout=10.0) + # Wait for all app threads to terminate + active_threads = [name for name, thread in app_threads.items() if thread.is_alive()] + if active_threads: + logger.info(f"Waiting for {len(active_threads)} app threads to stop: {', '.join(active_threads)}") + + for name, thread in app_threads.items(): + if thread.is_alive(): + logger.debug(f"Waiting for {name} thread to stop...") + thread.join(timeout=10.0) + if thread.is_alive(): + logger.warning(f"{name} thread did not stop gracefully within 10 seconds") + else: + logger.debug(f"{name} thread stopped successfully") - logger.info("All app threads stopped.") + shutdown_duration = time.time() - shutdown_start + logger.info(f"All app threads stopped. Shutdown completed in {shutdown_duration:.2f} seconds") def hourly_cap_scheduler_loop(): """Main loop for the hourly API cap scheduler thread diff --git a/src/primary/routes/common.py b/src/primary/routes/common.py index e723a00f..a1973775 100644 --- a/src/primary/routes/common.py +++ b/src/primary/routes/common.py @@ -44,6 +44,100 @@ def logo_files(filename): # --- API Routes --- # +@common_bp.route('/health', methods=['GET']) +def health_check(): + """Health check endpoint for Docker and orchestration systems""" + try: + # Check if shutdown is in progress using multiple methods + from src.primary.background import stop_event + + # Also check the global shutdown flag from main.py + try: + import main + is_shutting_down = main.is_shutting_down() + except: + is_shutting_down = stop_event.is_set() + + if is_shutting_down: + return jsonify({ + "status": "shutting_down", + "message": "Application is shutting down", + "ready": False + }), 503 # Service Unavailable + + # Basic database connectivity check + from src.primary.utils.database import get_database + db = get_database() + + # Quick database health check + with db.get_connection() as conn: + conn.execute("SELECT 1") + + return jsonify({ + "status": "healthy", + "message": "Application is running normally", + "ready": True, + "timestamp": datetime.utcnow().isoformat() + }), 200 + + except Exception as e: + logger.error(f"Health check failed: {e}") + return jsonify({ + "status": "unhealthy", + "message": f"Health check failed: {str(e)}", + "ready": False + }), 503 # Service Unavailable + +@common_bp.route('/ready', methods=['GET']) +def readiness_check(): + """Readiness check endpoint for Kubernetes-style orchestration""" + try: + # Check if the application is ready to serve traffic + from src.primary.background import stop_event + + # Also check the global shutdown flag from main.py + try: + import main + is_shutting_down = main.is_shutting_down() + except: + is_shutting_down = stop_event.is_set() + + if is_shutting_down: + return jsonify({ + "ready": False, + "message": "Application is shutting down" + }), 503 + + # Check if setup is complete + from src.primary.utils.database import get_database + db = get_database() + + if db.is_setup_in_progress(): + return jsonify({ + "ready": False, + "message": "Application setup in progress" + }), 503 + + # Check if user exists (setup complete) + from ..auth import user_exists + if not user_exists(): + return jsonify({ + "ready": False, + "message": "Application requires initial setup" + }), 503 + + return jsonify({ + "ready": True, + "message": "Application is ready to serve traffic" + }), 200 + + except Exception as e: + logger.error(f"Readiness check failed: {e}") + return jsonify({ + "ready": False, + "message": f"Readiness check failed: {str(e)}" + }), 503 + @common_bp.route('/api/sleep.json', methods=['GET']) def api_get_sleep_json(): """API endpoint to serve sleep/cycle data from the database for frontend access"""