From 0fee673acb9788e10142e0e96db5cd095537ba4f Mon Sep 17 00:00:00 2001
From: Admin9705 <9705@duck.com>
Date: Sun, 22 Jun 2025 20:39:19 -0400
Subject: [PATCH] Add health check and graceful shutdown support

- Implemented health check endpoint for Docker and orchestration systems.
- Added graceful shutdown configuration in Docker Compose and application code.
- Enhanced shutdown handling in main application and background tasks for improved diagnostics.
- Updated Dockerfile to include health check command.
- Introduced readiness check endpoint for Kubernetes-style orchestration.
---
 Dockerfile                   |  4 ++
 docker-compose.yml           | 10 ++++
 main.py                      | 48 ++++++++++++++++--
 src/primary/background.py    | 28 ++++++++---
 src/primary/routes/common.py | 94 ++++++++++++++++++++++++++++++++++++
 5 files changed, 173 insertions(+), 11 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 5379b2b0..77dc10c6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,5 +30,9 @@ ENV TZ=UTC
 # Expose port
 EXPOSE 9705
 
+# Health check for Docker (the probe below requires curl to be present in the image)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD curl -f http://localhost:9705/health || exit 1
+
 # Run the main application using the new entry point
 CMD ["python3", "main.py"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 8fcae13c..3b2f6a7d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,6 +12,16 @@ services:
       - TZ=${TZ:-UTC}
       - BASE_URL=${BASE_URL:-}
     restart: unless-stopped
+    # Graceful shutdown configuration
+    stop_signal: SIGTERM
+    stop_grace_period: 30s
+    # Health check configuration (overrides the image's HEALTHCHECK; needs curl in the container)
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9705/health"]
+      interval: 30s
+      timeout: 10s
+      start_period: 40s
+      retries: 3
 
 volumes:
   huntarr-config:
diff --git a/main.py b/main.py
index ab70532d..c6d486ea 100644
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@ import signal
 import logging # Use standard logging for initial setup
 import atexit
 import time
+import time
 
 # Import path configuration early to set up environment
 try:
@@ -144,6 +145,14 @@ except Exception as e:
 waitress_server = None
 shutdown_requested = threading.Event()
 
+# Global shutdown flag for health checks
+_global_shutdown_flag = False
+
+def is_shutting_down():
+    """Report whether shutdown has begun (flag raised or either event set)."""
+    # Plain read of the module-level flag; no ``global`` statement required.
+    return _global_shutdown_flag or shutdown_requested.is_set() or stop_event.is_set()
+
 def refresh_sponsors_on_startup():
     """Refresh sponsors database from manifest.json on startup"""
     import os
@@ -317,7 +326,15 @@ def run_web_server():
 
 def main_shutdown_handler(signum, frame):
     """Gracefully shut down the application."""
-    huntarr_logger.info(f"Received signal {signum}. Initiating graceful shutdown...")
+    global _global_shutdown_flag
+    _global_shutdown_flag = True  # Set global shutdown flag immediately
+    
+    signal_name = "SIGINT" if signum == signal.SIGINT else "SIGTERM" if signum == signal.SIGTERM else f"Signal {signum}"
+    huntarr_logger.info(f"Received {signal_name}. Initiating graceful shutdown...")
+    
+    # Set a reasonable timeout for shutdown operations
+    shutdown_start_time = time.time()
+    shutdown_timeout = 30  # 30 seconds total shutdown timeout
     
     # Immediate database checkpoint to prevent corruption
     try:
@@ -360,12 +377,19 @@ def main_shutdown_handler(signum, frame):
             waitress_server.close()
         except Exception as e:
             huntarr_logger.warning(f"Error closing Waitress server: {e}")
+    
+    # Force exit if shutdown took too long (Docker update scenario); only checked after the steps above return
+    elapsed_time = time.time() - shutdown_start_time
+    if elapsed_time > shutdown_timeout:
+        huntarr_logger.warning(f"Shutdown timeout exceeded ({shutdown_timeout}s). Forcing exit with code 0.")
+        os._exit(0)  # Clean exit for Docker updates
 
 def cleanup_handler():
     """Cleanup function called at exit"""
+    cleanup_start_time = time.time()
     huntarr_logger.info("Exit cleanup handler called")
     
-    # Shutdown databases gracefully
+    # Shutdown databases gracefully with timeout
     try:
         from primary.utils.database import get_database, get_logs_database
         
@@ -377,7 +401,8 @@ def cleanup_handler():
             try:
                 with main_db.get_connection() as conn:
                     conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")  # Flush WAL to main database
-                    conn.execute("VACUUM")  # Optimize database before shutdown
+                    # Skip VACUUM for faster shutdown during updates
+                    huntarr_logger.debug("Main database WAL checkpoint completed")
             except Exception as db_error:
                 huntarr_logger.warning(f"Error during main database cleanup: {db_error}")
         
@@ -388,7 +413,8 @@ def cleanup_handler():
             try:
                 with logs_db.get_logs_connection() as conn:
                     conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")  # Flush WAL to logs database
-                    conn.execute("VACUUM")  # Optimize logs database before shutdown
+                    # Skip VACUUM for faster shutdown during updates
+                    huntarr_logger.debug("Logs database WAL checkpoint completed")
             except Exception as logs_error:
                 huntarr_logger.warning(f"Error during logs database cleanup: {logs_error}")
                 
@@ -397,10 +423,15 @@ def cleanup_handler():
     except Exception as e:
         huntarr_logger.warning(f"Error during database shutdown: {e}")
     
+    # Ensure stop events are set
     if not stop_event.is_set():
         stop_event.set()
     if not shutdown_requested.is_set():
         shutdown_requested.set()
+    
+    # Log cleanup timing for Docker update diagnostics
+    cleanup_duration = time.time() - cleanup_start_time
+    huntarr_logger.info(f"Cleanup completed in {cleanup_duration:.2f} seconds")
 
 def main():
     """Main entry point function for Huntarr application.
@@ -511,7 +542,14 @@ def main():
         # shutdown_threads() # Uncomment if primary.main.shutdown_threads() does more cleanup
 
         huntarr_logger.info("--- Huntarr Main Process Exiting ---")
-        return 0  # Success exit code
+        
+        # Return appropriate exit code based on shutdown reason
+        if shutdown_requested.is_set() or stop_event.is_set():
+            huntarr_logger.info("Clean shutdown completed - Exit code 0")
+            return 0  # Clean shutdown
+        else:
+            huntarr_logger.warning("Unexpected shutdown - Exit code 1")
+            return 1  # Unexpected shutdown
 
 
 if __name__ == '__main__':
diff --git a/src/primary/background.py b/src/primary/background.py
index 305b6524..067179ad 100644
--- a/src/primary/background.py
+++ b/src/primary/background.py
@@ -651,11 +651,17 @@ def check_and_restart_threads():
 
 def shutdown_handler(signum, frame):
     """Handle termination signals (SIGINT, SIGTERM)."""
-    logger.info(f"Received signal {signum}. Initiating shutdown...")
+    known_signals = {signal.SIGINT: "SIGINT", signal.SIGTERM: "SIGTERM"}
+    signal_name = known_signals.get(signum, f"Signal {signum}")  # fallback label for any other signal
+    logger.info(f"Received {signal_name}. Initiating background tasks shutdown...")
     stop_event.set() # Signal all threads to stop
+    # Extra progress line to aid Docker shutdown diagnostics
+    logger.info("Background shutdown initiated - threads will stop gracefully")
 
 def shutdown_threads():
     """Wait for all threads to finish."""
+    import time
+    shutdown_start = time.time()
     logger.info("Waiting for all app threads to stop...")
     
     # Stop the hourly API cap scheduler
@@ -688,12 +694,22 @@ def shutdown_threads():
     except Exception as e:
         logger.error(f"Error stopping schedule action engine: {e}")
     
-    # Wait for all threads to terminate
-    for thread in app_threads.values():
-        if thread.is_alive():
-            thread.join(timeout=10.0)
+    # Wait for all app threads to terminate
+    active_threads = [name for name, thread in app_threads.items() if thread.is_alive()]
+    if active_threads:
+        logger.info(f"Waiting for {len(active_threads)} app threads to stop: {', '.join(active_threads)}")
+        
+        for name, thread in app_threads.items():
+            if thread.is_alive():
+                logger.debug(f"Waiting for {name} thread to stop...")
+                thread.join(timeout=10.0)
+                if thread.is_alive():
+                    logger.warning(f"{name} thread did not stop gracefully within 10 seconds")
+                else:
+                    logger.debug(f"{name} thread stopped successfully")
     
-    logger.info("All app threads stopped.")
+    shutdown_duration = time.time() - shutdown_start
+    logger.info(f"All app threads stopped. Shutdown completed in {shutdown_duration:.2f} seconds")
 
 def hourly_cap_scheduler_loop():
     """Main loop for the hourly API cap scheduler thread
diff --git a/src/primary/routes/common.py b/src/primary/routes/common.py
index e723a00f..a1973775 100644
--- a/src/primary/routes/common.py
+++ b/src/primary/routes/common.py
@@ -44,6 +44,100 @@ def logo_files(filename):
 
 # --- API Routes --- #
 
+@common_bp.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint for Docker and orchestration systems.
+
+    Responds 200 "healthy" when the app is not shutting down and the database
+    from get_database() answers ``SELECT 1``; responds 503 otherwise.
+    """
+    try:
+        # Relative import (as with ``..auth`` in readiness_check) resolves inside
+        # the package this blueprint was loaded from, not a ``src.``-prefixed copy.
+        from ..background import stop_event
+
+        try:
+            # Prefer the running script: under ``python3 main.py`` a plain
+            # ``import main`` would load a second copy with its own unset flags.
+            import __main__ as entry
+            if not hasattr(entry, "is_shutting_down"):
+                import main as entry
+            is_shutting_down = entry.is_shutting_down()
+        except Exception:  # not bare: let SystemExit/KeyboardInterrupt through
+            is_shutting_down = stop_event.is_set()
+
+        if is_shutting_down:
+            body = {"status": "shutting_down", "message": "Application is shutting down", "ready": False}
+            return jsonify(body), 503  # Service Unavailable
+
+        # Quick connectivity probe against the database
+        from ..utils.database import get_database
+        with get_database().get_connection() as conn:
+            conn.execute("SELECT 1")
+
+        return jsonify({
+            "status": "healthy",
+            "message": "Application is running normally",
+            "ready": True,
+            "timestamp": datetime.utcnow().isoformat()
+        }), 200
+
+    except Exception as e:
+        # Any failure above (imports, DB connection, query) is reported as 503
+        logger.error(f"Health check failed: {e}")
+        body = {"status": "unhealthy", "message": f"Health check failed: {e}", "ready": False}
+        return jsonify(body), 503  # Service Unavailable
+
+@common_bp.route('/ready', methods=['GET'])
+def readiness_check():
+    """Readiness check endpoint for Kubernetes-style orchestration.
+
+    Responds 200 only when the app is not shutting down, setup is not in
+    progress, and a user exists; responds 503 otherwise (including errors).
+    """
+    try:
+        # Relative import (as with ``..auth`` below) resolves inside the package
+        # this blueprint was loaded from, not a ``src.``-prefixed copy.
+        from ..background import stop_event
+
+        try:
+            # Prefer the running script: under ``python3 main.py`` a plain
+            # ``import main`` would load a second copy with its own unset flags.
+            import __main__ as entry
+            if not hasattr(entry, "is_shutting_down"):
+                import main as entry
+            is_shutting_down = entry.is_shutting_down()
+        except Exception:  # not bare: let SystemExit/KeyboardInterrupt through
+            is_shutting_down = stop_event.is_set()
+
+        if is_shutting_down:
+            return jsonify({"ready": False, "message": "Application is shutting down"}), 503
+
+        # Setup still running: not ready for traffic yet
+        from ..utils.database import get_database
+        if get_database().is_setup_in_progress():
+            return jsonify({"ready": False, "message": "Application setup in progress"}), 503
+
+        # Check if user exists (setup complete)
+        from ..auth import user_exists
+        if not user_exists():
+            return jsonify({
+                "ready": False,
+                "message": "Application requires initial setup"
+            }), 503
+
+        return jsonify({
+            "ready": True,
+            "message": "Application is ready to serve traffic"
+        }), 200
+
+    except Exception as e:
+        logger.error(f"Readiness check failed: {e}")
+        return jsonify({
+            "ready": False,
+            "message": f"Readiness check failed: {e}"
+        }), 503
+
 @common_bp.route('/api/sleep.json', methods=['GET'])
 def api_get_sleep_json():
     """API endpoint to serve sleep/cycle data from the database for frontend access"""