Add health check and graceful shutdown support

- Implemented health check endpoint for Docker and orchestration systems.
- Added graceful shutdown configuration in Docker Compose and application code.
- Enhanced shutdown handling in main application and background tasks for improved diagnostics.
- Updated Dockerfile to include health check command.
- Introduced readiness check endpoint for Kubernetes-style orchestration.
2025-12-16 20:04:16 -06:00 · 2025-06-22 20:39:19 -04:00
parent 8c3533991b
commit 0fee673acb
5 changed files with 173 additions and 11 deletions
--- a/4
+++ b/4
@@ -30,5 +30,9 @@ ENV TZ=UTC
 # Expose port
 EXPOSE 9705

+# Container health check: probes /health with curl, so curl must be installed in the image
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD curl -f http://localhost:9705/health || exit 1
+
 # Run the main application using the new entry point
 CMD ["python3", "main.py"]
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,6 +12,16 @@ services:
      - TZ=${TZ:-UTC}
      - BASE_URL=${BASE_URL:-}
    restart: unless-stopped
+    # Graceful shutdown configuration
+    stop_signal: SIGTERM
+    stop_grace_period: 30s
+    # Health check configuration (overrides the image's HEALTHCHECK; curl must exist in the container)
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9705/health"]
+      interval: 30s
+      timeout: 10s
+      start_period: 40s
+      retries: 3

 volumes:
  huntarr-config:
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@ import signal
 import logging # Use standard logging for initial setup
 import atexit
 import time
+import time

 # Import path configuration early to set up environment
 try:
@@ -144,6 +145,14 @@ except Exception as e:
 waitress_server = None
 shutdown_requested = threading.Event()

+# Global shutdown flag for health checks
+_global_shutdown_flag = False
+
+def is_shutting_down():
+    """Return True once any shutdown indicator is raised: the module flag or either stop event."""
+    # Read-only access to the module-level flag, so no ``global`` declaration is required.
+    return _global_shutdown_flag or shutdown_requested.is_set() or stop_event.is_set()
+
 def refresh_sponsors_on_startup():
    """Refresh sponsors database from manifest.json on startup"""
    import os
@@ -317,7 +326,15 @@ def run_web_server():

 def main_shutdown_handler(signum, frame):
    """Gracefully shut down the application."""
-    huntarr_logger.info(f"Received signal {signum}. Initiating graceful shutdown...")
+    global _global_shutdown_flag
+    _global_shutdown_flag = True  # Set global shutdown flag immediately
+    
+    signal_name = "SIGINT" if signum == signal.SIGINT else "SIGTERM" if signum == signal.SIGTERM else f"Signal {signum}"
+    huntarr_logger.info(f"Received {signal_name}. Initiating graceful shutdown...")
+    
+    # Set a reasonable timeout for shutdown operations
+    shutdown_start_time = time.time()
+    shutdown_timeout = 30  # 30 seconds total shutdown timeout
    
    # Immediate database checkpoint to prevent corruption
    try:
@@ -360,12 +377,19 @@ def main_shutdown_handler(signum, frame):
            waitress_server.close()
        except Exception as e:
            huntarr_logger.warning(f"Error closing Waitress server: {e}")
+    
+    # Force exit if shutdown takes too long (Docker container update scenario)
+    elapsed_time = time.time() - shutdown_start_time
+    if elapsed_time > shutdown_timeout:
+        huntarr_logger.warning(f"Shutdown timeout exceeded ({shutdown_timeout}s). Forcing exit with code 0.")
+        os._exit(0)  # Clean exit for Docker updates

 def cleanup_handler():
    """Cleanup function called at exit"""
+    cleanup_start_time = time.time()
    huntarr_logger.info("Exit cleanup handler called")
    
-    # Shutdown databases gracefully
+    # Shutdown databases gracefully with timeout
    try:
        from primary.utils.database import get_database, get_logs_database
        
@@ -377,7 +401,8 @@ def cleanup_handler():
            try:
                with main_db.get_connection() as conn:
                    conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")  # Flush WAL to main database
-                    conn.execute("VACUUM")  # Optimize database before shutdown
+                    # Skip VACUUM for faster shutdown during updates
+                    huntarr_logger.debug("Main database WAL checkpoint completed")
            except Exception as db_error:
                huntarr_logger.warning(f"Error during main database cleanup: {db_error}")
        
@@ -388,7 +413,8 @@ def cleanup_handler():
            try:
                with logs_db.get_logs_connection() as conn:
                    conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")  # Flush WAL to logs database
-                    conn.execute("VACUUM")  # Optimize logs database before shutdown
+                    # Skip VACUUM for faster shutdown during updates
+                    huntarr_logger.debug("Logs database WAL checkpoint completed")
            except Exception as logs_error:
                huntarr_logger.warning(f"Error during logs database cleanup: {logs_error}")
                
@@ -397,10 +423,15 @@ def cleanup_handler():
    except Exception as e:
        huntarr_logger.warning(f"Error during database shutdown: {e}")
    
+    # Ensure stop events are set
    if not stop_event.is_set():
        stop_event.set()
    if not shutdown_requested.is_set():
        shutdown_requested.set()
+    
+    # Log cleanup timing for Docker update diagnostics
+    cleanup_duration = time.time() - cleanup_start_time
+    huntarr_logger.info(f"Cleanup completed in {cleanup_duration:.2f} seconds")

 def main():
    """Main entry point function for Huntarr application.
@@ -511,7 +542,14 @@ def main():
        # shutdown_threads() # Uncomment if primary.main.shutdown_threads() does more cleanup

        huntarr_logger.info("--- Huntarr Main Process Exiting ---")
-        return 0  # Success exit code
+        
+        # Return appropriate exit code based on shutdown reason
+        if shutdown_requested.is_set() or stop_event.is_set():
+            huntarr_logger.info("Clean shutdown completed - Exit code 0")
+            return 0  # Clean shutdown
+        else:
+            huntarr_logger.warning("Unexpected shutdown - Exit code 1")
+            return 1  # Unexpected shutdown


 if __name__ == '__main__':
--- a/src/primary/background.py
+++ b/src/primary/background.py
@@ -651,11 +651,17 @@ def check_and_restart_threads():

 def shutdown_handler(signum, frame):
     """Handle termination signals (SIGINT, SIGTERM)."""
-    logger.info(f"Received signal {signum}. Initiating shutdown...")
+    signal_name = "SIGINT" if signum == signal.SIGINT else "SIGTERM" if signum == signal.SIGTERM else f"Signal {signum}"
+    logger.info(f"Received {signal_name}. Initiating background tasks shutdown...")
     stop_event.set() # Signal all threads to stop
+    
+    # Confirm in the log that the stop request was issued (Docker shutdown diagnostics)
+    logger.info("Background shutdown initiated - threads will stop gracefully")

 def shutdown_threads():
    """Wait for all threads to finish."""
+    import time
+    shutdown_start = time.time()
    logger.info("Waiting for all app threads to stop...")
    
    # Stop the hourly API cap scheduler
@@ -688,12 +694,22 @@ def shutdown_threads():
    except Exception as e:
        logger.error(f"Error stopping schedule action engine: {e}")
    
-    # Wait for all threads to terminate
-    for thread in app_threads.values():
-        if thread.is_alive():
-            thread.join(timeout=10.0)
+    # Wait for all app threads to terminate
+    active_threads = [name for name, thread in app_threads.items() if thread.is_alive()]
+    if active_threads:
+        logger.info(f"Waiting for {len(active_threads)} app threads to stop: {', '.join(active_threads)}")
+        
+        for name, thread in app_threads.items():
+            if thread.is_alive():
+                logger.debug(f"Waiting for {name} thread to stop...")
+                thread.join(timeout=10.0)
+                if thread.is_alive():
+                    logger.warning(f"{name} thread did not stop gracefully within 10 seconds")
+                else:
+                    logger.debug(f"{name} thread stopped successfully")
    
-    logger.info("All app threads stopped.")
+    shutdown_duration = time.time() - shutdown_start
+    logger.info(f"All app threads stopped. Shutdown completed in {shutdown_duration:.2f} seconds")

 def hourly_cap_scheduler_loop():
    """Main loop for the hourly API cap scheduler thread
--- a/src/primary/routes/common.py
+++ b/src/primary/routes/common.py
@@ -44,6 +44,100 @@ def logo_files(filename):

 # --- API Routes --- #

+@common_bp.route('/health', methods=['GET'])
+def health_check():
+    """Liveness probe: 200 when the DB answers, 503 while shutting down or when any check raises."""
+    try:
+        # Prefer the entry module's combined shutdown state; stop_event alone is the fallback
+        from src.primary.background import stop_event
+        try:
+            import __main__ as entry  # "python3 main.py" registers the entry module as __main__
+            if not hasattr(entry, 'is_shutting_down'):
+                import main as entry  # launched another way: use the importable module
+            is_shutting_down = entry.is_shutting_down()
+        except Exception:  # not a bare except: that would also trap SystemExit/KeyboardInterrupt
+            is_shutting_down = stop_event.is_set()
+
+        if is_shutting_down:
+            return jsonify({
+                "status": "shutting_down",
+                "message": "Application is shutting down",
+                "ready": False
+            }), 503  # Service Unavailable
+
+        # Basic database connectivity check
+        from src.primary.utils.database import get_database
+        db = get_database()
+
+        # Quick database health check: a trivial query proves a connection can be opened and used
+        with db.get_connection() as conn:
+            conn.execute("SELECT 1")
+
+        return jsonify({
+            "status": "healthy",
+            "message": "Application is running normally",
+            "ready": True,
+            "timestamp": datetime.utcnow().isoformat()
+        }), 200
+
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        return jsonify({
+            "status": "unhealthy",
+            "message": f"Health check failed: {str(e)}",
+            "ready": False
+        }), 503  # Service Unavailable
+
+@common_bp.route('/ready', methods=['GET'])
+def readiness_check():
+    """Readiness probe: 200 only when not shutting down, setup is finished and a user exists; else 503."""
+    try:
+        # Prefer the entry module's combined shutdown state; stop_event alone is the fallback
+        from src.primary.background import stop_event
+        try:
+            import __main__ as entry  # "python3 main.py" registers the entry module as __main__
+            if not hasattr(entry, 'is_shutting_down'):
+                import main as entry  # launched another way: use the importable module
+            is_shutting_down = entry.is_shutting_down()
+        except Exception:  # not a bare except: that would also trap SystemExit/KeyboardInterrupt
+            is_shutting_down = stop_event.is_set()
+
+        if is_shutting_down:
+            return jsonify({
+                "ready": False,
+                "message": "Application is shutting down"
+            }), 503
+
+        # Check if setup is complete
+        from src.primary.utils.database import get_database
+        db = get_database()
+
+        if db.is_setup_in_progress():
+            return jsonify({
+                "ready": False,
+                "message": "Application setup in progress"
+            }), 503
+
+        # Check if user exists (setup complete)
+        from ..auth import user_exists
+        if not user_exists():
+            return jsonify({
+                "ready": False,
+                "message": "Application requires initial setup"
+            }), 503
+
+        return jsonify({
+            "ready": True,
+            "message": "Application is ready to serve traffic"
+        }), 200
+
+    except Exception as e:
+        logger.error(f"Readiness check failed: {e}")
+        return jsonify({
+            "ready": False,
+            "message": f"Readiness check failed: {str(e)}"
+        }), 503
+
@common_bp.route('/api/sleep.json', methods=['GET'])
 def api_get_sleep_json():
    """API endpoint to serve sleep/cycle data from the database for frontend access"""