Added diagnostics scripts and improved setup with more Redis DB server handling

This commit is contained in:
Muhammad Ibrahim
2025-10-24 21:25:15 +01:00
parent f42c53d34b
commit 2c5a35b6c2
3 changed files with 1686 additions and 102 deletions

787
setup.sh
View File

@@ -707,6 +707,10 @@ configure_redis() {
chown redis:redis /etc/redis/users.acl
chmod 640 /etc/redis/users.acl
print_status "Created Redis ACL file"
else
# Backup existing ACL file
cp /etc/redis/users.acl /etc/redis/users.acl.backup.$(date +%Y%m%d_%H%M%S) 2>/dev/null || true
print_info "Backed up existing ACL file"
fi
# Configure ACL file in redis.conf
@@ -727,8 +731,14 @@ configure_redis() {
print_status "Removed user definitions from redis.conf"
fi
# Create admin user in ACL file if it doesn't exist
if ! grep -q "^user admin" /etc/redis/users.acl; then
# Create or update admin user in ACL file
if grep -q "^user admin" /etc/redis/users.acl; then
print_info "Admin user already exists in ACL, updating password..."
# Remove old admin line and add new one
sed -i '/^user admin/d' /etc/redis/users.acl
echo "user admin on sanitize-payload >$REDIS_PASSWORD ~* &* +@all" >> /etc/redis/users.acl
print_status "Updated admin user password"
else
echo "user admin on sanitize-payload >$REDIS_PASSWORD ~* &* +@all" >> /etc/redis/users.acl
print_status "Added admin user to ACL file"
fi
@@ -737,65 +747,126 @@ configure_redis() {
print_info "Restarting Redis to apply ACL configuration..."
systemctl restart redis-server
# Wait for Redis to start
sleep 3
# Wait for Redis to start with retry logic
sleep 5
# Test admin connection
if ! redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ping > /dev/null 2>&1; then
print_error "Failed to configure Redis ACL authentication"
return 1
fi
# Test admin connection with retries
local max_retries=3
local retry=0
local admin_works=false
print_status "Redis ACL authentication configuration successful"
# Create Redis user with ACL
print_info "Creating Redis ACL user: $REDIS_USER"
# Create user with password and permissions - capture output for error handling
local acl_result
acl_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SETUSER "$REDIS_USER" on ">${REDIS_USER_PASSWORD}" ~* +@all 2>&1)
if [ "$acl_result" = "OK" ]; then
print_status "Redis user '$REDIS_USER' created successfully"
# Save ACL users to file to persist across restarts
local save_result
save_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SAVE 2>&1)
if [ "$save_result" = "OK" ]; then
print_status "Redis ACL users saved to file"
else
print_warning "Failed to save ACL users to file: $save_result"
while [ $retry -lt $max_retries ]; do
if redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ping > /dev/null 2>&1; then
admin_works=true
break
fi
print_info "Waiting for Redis to be ready... (attempt $((retry + 1))/$max_retries)"
sleep 2
retry=$((retry + 1))
done
if [ "$admin_works" = false ]; then
print_error "Failed to verify admin connection after Redis restart"
print_error "Redis ACL configuration may have issues"
# Verify user was actually created
local verify_result
verify_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL GETUSER "$REDIS_USER" 2>&1)
# Try to fix by disabling ACL and using requirepass instead
print_warning "Attempting fallback: using requirepass instead of ACL..."
sed -i 's/^aclfile/# aclfile/' /etc/redis/redis.conf
sed -i "s/^# requirepass .*/requirepass $REDIS_PASSWORD/" /etc/redis/redis.conf
if ! grep -q "^requirepass" /etc/redis/redis.conf; then
echo "requirepass $REDIS_PASSWORD" >> /etc/redis/redis.conf
fi
systemctl restart redis-server
sleep 3
if [ "$verify_result" = "(nil)" ]; then
print_error "User creation reported OK but user does not exist"
# Test requirepass
if redis-cli -h 127.0.0.1 -p 6379 -a "$REDIS_PASSWORD" --no-auth-warning ping > /dev/null 2>&1; then
print_status "Fallback successful - using requirepass authentication"
# For requirepass mode, we'll set REDIS_USER empty later
print_info "Note: Using legacy requirepass mode instead of ACL"
else
print_error "Fallback also failed - Redis authentication is broken"
return 1
fi
else
print_error "Failed to create Redis user: $acl_result"
return 1
print_status "Redis ACL authentication configuration successful"
fi
# Test user connection
print_info "Testing Redis user connection..."
if redis-cli -h 127.0.0.1 -p 6379 --user "$REDIS_USER" --pass "$REDIS_USER_PASSWORD" --no-auth-warning -n "$REDIS_DB" ping > /dev/null 2>&1; then
print_status "Redis user connection test successful"
# Create Redis user with ACL (only if admin_works, meaning we're using ACL mode)
if [ "$admin_works" = true ]; then
print_info "Creating Redis ACL user: $REDIS_USER"
# Create user with password and permissions - capture output for error handling
local acl_result
acl_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SETUSER "$REDIS_USER" on ">${REDIS_USER_PASSWORD}" ~* +@all 2>&1)
if [ "$acl_result" = "OK" ]; then
print_status "Redis user '$REDIS_USER' created successfully"
# Save ACL users to file to persist across restarts
local save_result
save_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SAVE 2>&1)
if [ "$save_result" = "OK" ]; then
print_status "Redis ACL users saved to file"
else
print_warning "Failed to save ACL users to file: $save_result"
fi
# Verify user was actually created
local verify_result
verify_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL GETUSER "$REDIS_USER" 2>&1)
if [ "$verify_result" = "(nil)" ]; then
print_error "User creation reported OK but user does not exist"
return 1
fi
# Test user connection
print_info "Testing Redis user connection..."
if redis-cli -h 127.0.0.1 -p 6379 --user "$REDIS_USER" --pass "$REDIS_USER_PASSWORD" --no-auth-warning -n "$REDIS_DB" ping > /dev/null 2>&1; then
print_status "Redis user connection test successful"
else
print_error "Redis user connection test failed"
return 1
fi
# Mark the selected database as in-use
redis-cli -h 127.0.0.1 -p 6379 --user "$REDIS_USER" --pass "$REDIS_USER_PASSWORD" --no-auth-warning -n "$REDIS_DB" SET "patchmon:initialized" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > /dev/null
print_status "Marked Redis database $REDIS_DB as in-use"
else
print_error "Failed to create Redis user: $acl_result"
return 1
fi
else
print_error "Redis user connection test failed"
return 1
# Using requirepass mode - no per-user ACL
print_info "Using requirepass mode - testing connection..."
# For requirepass, we don't use username, just password
if redis-cli -h 127.0.0.1 -p 6379 -a "$REDIS_PASSWORD" --no-auth-warning -n "$REDIS_DB" ping > /dev/null 2>&1; then
print_status "Redis requirepass connection test successful"
# Mark the selected database as in-use
redis-cli -h 127.0.0.1 -p 6379 -a "$REDIS_PASSWORD" --no-auth-warning -n "$REDIS_DB" SET "patchmon:initialized" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > /dev/null
print_status "Marked Redis database $REDIS_DB as in-use"
# Set REDIS_USER to empty for requirepass mode
REDIS_USER=""
REDIS_USER_PASSWORD="$REDIS_PASSWORD"
else
print_error "Redis requirepass connection test failed"
return 1
fi
fi
# Mark the selected database as in-use
redis-cli -h 127.0.0.1 -p 6379 --user "$REDIS_USER" --pass "$REDIS_USER_PASSWORD" --no-auth-warning -n "$REDIS_DB" SET "patchmon:initialized" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > /dev/null
print_status "Marked Redis database $REDIS_DB as in-use"
# Note: Redis credentials will be written to .env by create_env_files() function
print_status "Redis user '$REDIS_USER' configured successfully"
print_status "Redis configured successfully"
if [ -n "$REDIS_USER" ]; then
print_info "Redis Mode: ACL with user '$REDIS_USER'"
else
print_info "Redis Mode: requirepass (legacy single-password auth)"
fi
print_info "Redis credentials will be saved to backend/.env"
return 0
@@ -1116,16 +1187,121 @@ EOF
print_status "Environment files created"
}
# Run database migrations
# Check and fix failed Prisma migrations
#
# Finds rows in _prisma_migrations that were started but never finished and
# removes them so the next "prisma migrate deploy" can apply them again.
#
# Arguments:
#   $1 - database name
#   $2 - database user
#   $3 - database password (passed to psql via PGPASSWORD)
#   $4 - database host (optional, default: localhost)
# Returns:
#   0 - nothing to fix, or every failed migration record was cleared
#   1 - the lookup failed, or at least one record could not be cleared
fix_failed_migrations() {
    local db_name="$1"
    local db_user="$2"
    local db_pass="$3"
    local db_host="${4:-localhost}"
    local failed_migrations migration escaped
    local failures=0

    print_info "Checking for failed migrations in database..."

    # Query for failed migrations (where started_at is set but finished_at is NULL).
    # A lookup that fails is reported to the caller instead of being mistaken
    # for "nothing to fix".
    if ! failed_migrations=$(PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -t -A -c \
        "SELECT migration_name FROM _prisma_migrations WHERE finished_at IS NULL AND started_at IS NOT NULL;" 2>/dev/null); then
        print_error "Could not query _prisma_migrations for failed migrations"
        return 1
    fi

    if [ -z "$failed_migrations" ]; then
        print_status "No failed migrations found"
        return 0
    fi

    print_warning "Found failed migration(s):"
    while read -r migration; do
        if [ -n "$migration" ]; then
            print_warning " - $migration"
        fi
    done <<EOF
$failed_migrations
EOF

    print_info "Attempting to resolve failed migrations..."

    # The loop reads from a here-document (not a pipe) so that the failure
    # counter is still visible after the loop ends.
    while read -r migration; do
        if [ -z "$migration" ]; then
            continue
        fi

        print_info "Processing failed migration: $migration"

        # Double any single quote so the name is safe inside a SQL string literal.
        escaped=$(printf '%s' "$migration" | sed "s/'/''/g")

        # Mark the migration as rolled back. Best effort only: the DELETE below
        # removes the same row, so this statement does not decide success.
        PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
            "UPDATE _prisma_migrations SET rolled_back_at = NOW() WHERE migration_name = '$escaped' AND finished_at IS NULL;" >/dev/null 2>&1 || true

        # Delete the failed migration record to allow retry
        if PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
            "DELETE FROM _prisma_migrations WHERE migration_name = '$escaped' AND finished_at IS NULL;" >/dev/null 2>&1; then
            print_status "Marked migration '$migration' for retry"
        else
            print_error "Could not clear failed migration '$migration'"
            failures=$((failures + 1))
        fi
    done <<EOF
$failed_migrations
EOF

    if [ "$failures" -gt 0 ]; then
        print_error "Could not clear $failures failed migration record(s)"
        return 1
    fi

    print_status "Failed migrations have been cleared for retry"
    return 0
}
# Run database migrations with self-healing
run_migrations() {
print_info "Running database migrations as user $INSTANCE_USER..."
cd "$APP_DIR/backend"
# Suppress Prisma CLI output (still logged to install log via tee)
run_as_user "$INSTANCE_USER" "cd $APP_DIR/backend && npx prisma migrate deploy" >/dev/null 2>&1 || true
local max_attempts=3
local attempt=1
local migration_success=false
while [ $attempt -le $max_attempts ]; do
print_info "Migration attempt $attempt of $max_attempts..."
# Try to run migrations
local migrate_output
migrate_output=$(run_as_user "$INSTANCE_USER" "cd $APP_DIR/backend && npx prisma migrate deploy 2>&1" || echo "MIGRATION_FAILED")
# Check if migration succeeded
if ! echo "$migrate_output" | grep -q "MIGRATION_FAILED\|Error:\|P3009"; then
print_status "Migrations completed successfully"
migration_success=true
break
fi
# Check specifically for P3009 (failed migrations found)
if echo "$migrate_output" | grep -q "P3009\|migrate found failed migrations"; then
print_warning "Detected failed migrations (P3009 error)"
# Extract the failed migration name if possible
local failed_migration
failed_migration=$(echo "$migrate_output" | grep -oP "The \`\K[^\`]+" | head -1 || echo "")
if [ -n "$failed_migration" ]; then
print_info "Failed migration identified: $failed_migration"
fi
# Attempt to fix failed migrations
print_info "Attempting to self-heal migration issues..."
if fix_failed_migrations "$DB_NAME" "$DB_USER" "$DB_PASS" "localhost"; then
print_status "Migration issues resolved, retrying..."
attempt=$((attempt + 1))
sleep 2
continue
else
print_error "Failed to resolve migration issues"
break
fi
else
# Other migration error
print_error "Migration failed with error:"
echo "$migrate_output" | grep -A 5 "Error:"
break
fi
done
if [ "$migration_success" = false ]; then
print_error "Migrations failed after $max_attempts attempts"
print_info "You may need to manually resolve migration issues"
print_info "Check migrations: cd $APP_DIR/backend && npx prisma migrate status"
return 1
fi
# Generate Prisma client
run_as_user "$INSTANCE_USER" "cd $APP_DIR/backend && npx prisma generate" >/dev/null 2>&1 || true
print_status "Database migrations completed as $INSTANCE_USER"
return 0
}
# Admin account creation removed - handled by application's first-time setup
@@ -1462,7 +1638,60 @@ start_services() {
print_status "PatchMon service started successfully"
else
print_error "Failed to start PatchMon service"
systemctl status "$SERVICE_NAME"
echo ""
# Show last 25 lines of service logs for debugging
print_warning "=== Last 25 lines of service logs ==="
journalctl -u "$SERVICE_NAME" -n 25 --no-pager || true
print_warning "==================================="
echo ""
# Check for specific error patterns
local logs=$(journalctl -u "$SERVICE_NAME" -n 50 --no-pager 2>/dev/null || echo "")
if echo "$logs" | grep -q "WRONGPASS\|NOAUTH"; then
print_error "❌ Detected Redis authentication error!"
print_info "The service cannot authenticate with Redis."
echo ""
print_info "Current Redis configuration in .env:"
grep "^REDIS_" "$APP_DIR/backend/.env" || true
echo ""
print_info "Debug steps:"
print_info " 1. Check Redis is running:"
print_info " systemctl status redis-server"
echo ""
print_info " 2. Check Redis ACL users:"
print_info " redis-cli ACL LIST"
echo ""
print_info " 3. Test Redis connection:"
local test_user=$(grep "^REDIS_USER=" "$APP_DIR/backend/.env" | cut -d'=' -f2)
local test_pass=$(grep "^REDIS_PASSWORD=" "$APP_DIR/backend/.env" | cut -d'=' -f2)
local test_db=$(grep "^REDIS_DB=" "$APP_DIR/backend/.env" | cut -d'=' -f2)
print_info " redis-cli --user $test_user --pass $test_pass -n ${test_db:-0} ping"
echo ""
print_info " 4. Check Redis configuration files:"
print_info " cat /etc/redis/redis.conf | grep aclfile"
print_info " cat /etc/redis/users.acl"
echo ""
elif echo "$logs" | grep -q "ECONNREFUSED.*postgresql\|Connection refused.*5432"; then
print_error "❌ Detected PostgreSQL connection error!"
print_info "Check if PostgreSQL is running:"
print_info " systemctl status postgresql"
elif echo "$logs" | grep -q "ECONNREFUSED.*redis\|Connection refused.*6379"; then
print_error "❌ Detected Redis connection error!"
print_info "Check if Redis is running:"
print_info " systemctl status redis-server"
elif echo "$logs" | grep -q "database.*does not exist"; then
print_error "❌ Database does not exist!"
print_info "Database: $DB_NAME"
elif echo "$logs" | grep -q "Error:"; then
print_error "❌ Application error detected in logs"
fi
echo ""
print_info "View full logs: journalctl -u $SERVICE_NAME -f"
print_info "Check service status: systemctl status $SERVICE_NAME"
return 1
fi
}
@@ -2012,6 +2241,65 @@ select_installation_to_update() {
done
}
# Repair/recreate Redis user with correct permissions
#
# Drops and re-creates an ACL user through the "admin" ACL account, saves the
# ACL, and then checks that the user can run PING and INFO.
#
# Arguments:
#   $1 - Redis ACL user to repair
#   $2 - password to assign to that user
#   $3 - database index used for the verification commands (default: 0)
# Returns:
#   0 - the user was re-created and both PING and INFO succeed
#   1 - bad arguments, no usable admin credentials, or creation/verification failed
#
# NOTE(review): the admin password is read from a ">password" entry in
# /etc/redis/users.acl. ACL SAVE appears to rewrite that file with hashed
# ("#...") passwords, in which case no such entry remains - confirm.
repair_redis_user() {
    local redis_user="$1"
    local redis_pass="$2"
    local redis_db="${3:-0}"

    # An empty name or password would delete/create a nameless user or set an
    # empty password, so reject it before touching Redis.
    if [ -z "$redis_user" ] || [ -z "$redis_pass" ]; then
        print_error "Cannot repair Redis user - user name and password are required"
        return 1
    fi

    # The admin account is the one performing the repair; deleting it would
    # leave no account able to re-create it.
    if [ "$redis_user" = "admin" ]; then
        print_error "Cannot repair Redis user - refusing to delete the admin account"
        return 1
    fi

    print_info "Attempting to repair Redis user: $redis_user"

    # Find admin password
    local admin_password=""
    if [ -f /etc/redis/users.acl ] && grep -q "^user admin" /etc/redis/users.acl; then
        # "|| true": an admin line without a ">password" entry must reach the
        # explicit error below rather than abort the script under errexit.
        admin_password=$(grep "^user admin" /etc/redis/users.acl | grep -oP '>\K[^ ]+' | head -1) || true
    fi

    if [ -z "$admin_password" ]; then
        print_error "Cannot repair Redis user - no admin credentials found"
        return 1
    fi

    # Test admin connection
    if ! redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning ping >/dev/null 2>&1; then
        print_error "Admin credentials don't work - cannot repair user"
        return 1
    fi

    print_status "Admin access confirmed"

    # Delete existing user if it exists (and is broken)
    print_info "Removing old user configuration..."
    redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning ACL DELUSER "$redis_user" >/dev/null 2>&1 || true

    # Create user with full permissions
    print_info "Creating user with full permissions..."
    local create_result
    create_result=$(redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning ACL SETUSER "$redis_user" on ">${redis_pass}" ~* +@all 2>&1) || true

    # Require a reply line that is exactly "OK" so an error text that merely
    # contains those letters is not mistaken for success.
    if ! echo "$create_result" | grep -qx "OK"; then
        print_error "Failed to create user: $create_result"
        return 1
    fi

    # Save ACL; a failure is reported but does not abort the repair.
    local save_result
    save_result=$(redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning ACL SAVE 2>&1) || true
    if ! echo "$save_result" | grep -qx "OK"; then
        print_warning "Failed to save ACL users to file: $save_result"
    fi

    # Verify the new user works
    if ! redis-cli -h localhost -p 6379 --user "$redis_user" --pass "$redis_pass" --no-auth-warning -n "$redis_db" ping >/dev/null 2>&1; then
        print_error "User created but PING command fails"
        return 1
    fi

    if ! redis-cli -h localhost -p 6379 --user "$redis_user" --pass "$redis_pass" --no-auth-warning -n "$redis_db" info >/dev/null 2>&1; then
        print_error "User created but INFO command still fails"
        return 1
    fi

    print_status "Redis user repaired successfully"
    return 0
}
# Check and update Redis configuration for existing installation
update_redis_configuration() {
print_info "Checking Redis configuration..."
@@ -2021,12 +2309,57 @@ update_redis_configuration() {
if grep -q "^REDIS_HOST=" "$instance_dir/backend/.env" && \
grep -q "^REDIS_PASSWORD=" "$instance_dir/backend/.env"; then
print_status "Redis configuration already exists in .env"
return 0
# Verify the credentials actually work
local redis_user=$(grep "^REDIS_USER=" "$instance_dir/backend/.env" | cut -d'=' -f2 | tr -d '"')
local redis_pass=$(grep "^REDIS_PASSWORD=" "$instance_dir/backend/.env" | cut -d'=' -f2 | tr -d '"')
local redis_db=$(grep "^REDIS_DB=" "$instance_dir/backend/.env" | cut -d'=' -f2 | tr -d '"')
if [ -n "$redis_user" ] && [ -n "$redis_pass" ]; then
# Test with username and password
local ping_works=false
local info_works=false
if redis-cli -h localhost -p 6379 --user "$redis_user" --pass "$redis_pass" --no-auth-warning -n "${redis_db:-0}" ping >/dev/null 2>&1; then
ping_works=true
fi
if redis-cli -h localhost -p 6379 --user "$redis_user" --pass "$redis_pass" --no-auth-warning -n "${redis_db:-0}" info >/dev/null 2>&1; then
info_works=true
fi
if [ "$ping_works" = true ] && [ "$info_works" = true ]; then
print_status "Redis credentials verified with redis-cli (tested ping and info commands)"
# Force refresh the Redis user during updates to ensure correct ACL permissions
# This prevents issues where redis-cli works but Node.js client doesn't
print_info "Refreshing Redis user permissions to ensure compatibility..."
if repair_redis_user "$redis_user" "$redis_pass" "$redis_db"; then
print_status "Redis user permissions refreshed successfully"
return 0
else
print_warning "Could not refresh Redis user, but credentials seem to work - continuing..."
return 0
fi
else
print_warning "Redis credentials not working properly (ping: $ping_works, info: $info_works)"
print_info "Attempting to repair Redis user..."
if repair_redis_user "$redis_user" "$redis_pass" "$redis_db"; then
print_status "Redis user repaired successfully"
return 0
else
print_warning "Could not repair Redis user, will reconfigure from scratch..."
fi
fi
else
print_warning "Redis credentials incomplete in .env (missing user or password)"
fi
fi
fi
print_warning "Redis configuration not found in .env - this is a legacy installation"
print_info "Setting up Redis for this instance..."
print_warning "Redis configuration not found or invalid in .env - setting up Redis for this instance..."
# Detect package manager if not already set
if [ -z "$PKG_INSTALL" ]; then
@@ -2054,6 +2387,39 @@ update_redis_configuration() {
REDIS_USER="patchmon_${DB_SAFE_NAME}"
REDIS_USER_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32)
# Test Redis connection to determine authentication status
print_info "Testing Redis authentication status..."
local needs_auth=false
local admin_password=""
# Try ping without auth
if redis-cli -h localhost -p 6379 ping >/dev/null 2>&1; then
print_info "Redis is accessible without authentication"
needs_auth=false
else
print_info "Redis requires authentication"
needs_auth=true
# Try to find existing admin password from ACL file
if [ -f /etc/redis/users.acl ] && grep -q "^user admin" /etc/redis/users.acl; then
# Extract password from ACL file (format: >password)
admin_password=$(grep "^user admin" /etc/redis/users.acl | grep -oP '>\K[^ ]+' | head -1)
if [ -n "$admin_password" ]; then
print_info "Found existing admin credentials in ACL file"
# Test admin credentials
if redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning ping >/dev/null 2>&1; then
print_status "Existing admin credentials work"
REDIS_PASSWORD="$admin_password"
else
print_warning "Existing admin credentials don't work, will create new configuration"
admin_password=""
fi
fi
fi
fi
# Find available Redis database
print_info "Finding available Redis database..."
local redis_db=0
@@ -2061,9 +2427,14 @@ update_redis_configuration() {
while [ $redis_db -lt $max_attempts ]; do
local key_count
key_count=$(redis-cli -h localhost -p 6379 -n "$redis_db" DBSIZE 2>&1 | grep -v "ERR" || echo "1")
if [ "$key_count" = "0" ] || [ "$key_count" = "(integer) 0" ]; then
if [ "$needs_auth" = true ] && [ -n "$admin_password" ]; then
key_count=$(redis-cli -h localhost -p 6379 --user admin --pass "$admin_password" --no-auth-warning -n "$redis_db" DBSIZE 2>&1 | grep -oP '\d+' || echo "1")
else
key_count=$(redis-cli -h localhost -p 6379 -n "$redis_db" DBSIZE 2>&1 | grep -oP '\d+' || echo "1")
fi
if [ "$key_count" = "0" ]; then
print_status "Found available Redis database: $redis_db"
REDIS_DB=$redis_db
break
@@ -2076,50 +2447,146 @@ update_redis_configuration() {
REDIS_DB=0
fi
# Generate admin password if not exists
REDIS_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32)
# Configure Redis with ACL if needed
print_info "Configuring Redis ACL..."
# Create ACL file if it doesn't exist
if [ ! -f /etc/redis/users.acl ]; then
touch /etc/redis/users.acl
chown redis:redis /etc/redis/users.acl
chmod 640 /etc/redis/users.acl
fi
# Configure ACL file in redis.conf
if ! grep -q "^aclfile" /etc/redis/redis.conf 2>/dev/null; then
echo "aclfile /etc/redis/users.acl" >> /etc/redis/redis.conf
fi
# Remove requirepass (incompatible with ACL)
if grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
sed -i 's/^requirepass.*/# &/' /etc/redis/redis.conf
fi
# Create admin user if it doesn't exist
if ! grep -q "^user admin" /etc/redis/users.acl; then
echo "user admin on sanitize-payload >$REDIS_PASSWORD ~* &* +@all" >> /etc/redis/users.acl
if [ "$needs_auth" = false ]; then
print_info "Configuring Redis ACL for security..."
# Generate new admin password
REDIS_PASSWORD=$(openssl rand -base64 32 | tr -d "=+/" | cut -c1-32)
# Backup redis.conf
if [ -f /etc/redis/redis.conf ]; then
cp /etc/redis/redis.conf /etc/redis/redis.conf.backup.$(date +%Y%m%d_%H%M%S) 2>/dev/null || true
fi
# Create ACL file if it doesn't exist
if [ ! -f /etc/redis/users.acl ]; then
touch /etc/redis/users.acl
chown redis:redis /etc/redis/users.acl
chmod 640 /etc/redis/users.acl
print_status "Created Redis ACL file"
else
# Backup existing ACL file
cp /etc/redis/users.acl /etc/redis/users.acl.backup.$(date +%Y%m%d_%H%M%S) 2>/dev/null || true
print_info "Backed up existing ACL file"
fi
# Configure ACL file in redis.conf
if ! grep -q "^aclfile" /etc/redis/redis.conf 2>/dev/null; then
echo "aclfile /etc/redis/users.acl" >> /etc/redis/redis.conf
print_status "Added ACL file configuration to redis.conf"
fi
# Remove requirepass (incompatible with ACL)
if grep -q "^requirepass" /etc/redis/redis.conf 2>/dev/null; then
sed -i 's/^requirepass.*/# &/' /etc/redis/redis.conf
print_status "Disabled requirepass (incompatible with ACL)"
fi
# Create or update admin user in ACL file
if grep -q "^user admin" /etc/redis/users.acl; then
print_info "Admin user already exists in ACL, updating password..."
# Remove old admin line and add new one
sed -i '/^user admin/d' /etc/redis/users.acl
echo "user admin on sanitize-payload >$REDIS_PASSWORD ~* &* +@all" >> /etc/redis/users.acl
print_status "Updated admin user password"
else
echo "user admin on sanitize-payload >$REDIS_PASSWORD ~* &* +@all" >> /etc/redis/users.acl
print_status "Created admin user in ACL"
fi
# Restart Redis to apply ACL
print_info "Restarting Redis to apply ACL configuration..."
systemctl restart redis-server
sleep 3
sleep 5
# Verify admin can connect
local max_retries=3
local retry=0
local admin_works=false
while [ $retry -lt $max_retries ]; do
if redis-cli -h localhost -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ping >/dev/null 2>&1; then
admin_works=true
break
fi
print_info "Waiting for Redis to be ready... (attempt $((retry + 1))/$max_retries)"
sleep 2
retry=$((retry + 1))
done
if [ "$admin_works" = false ]; then
print_error "Failed to verify admin connection after Redis restart"
print_error "Redis ACL configuration may have issues"
# Try to fix by disabling ACL and using requirepass instead
print_warning "Attempting fallback: using requirepass instead of ACL..."
sed -i 's/^aclfile/# aclfile/' /etc/redis/redis.conf
sed -i "s/^# requirepass .*/requirepass $REDIS_PASSWORD/" /etc/redis/redis.conf
if ! grep -q "^requirepass" /etc/redis/redis.conf; then
echo "requirepass $REDIS_PASSWORD" >> /etc/redis/redis.conf
fi
systemctl restart redis-server
sleep 3
# Test requirepass
if redis-cli -h localhost -p 6379 -a "$REDIS_PASSWORD" --no-auth-warning ping >/dev/null 2>&1; then
print_status "Fallback successful - using requirepass authentication"
# For requirepass, we don't use username
REDIS_USER=""
else
print_error "Fallback also failed - Redis authentication is broken"
return 1
fi
else
print_status "Redis ACL configuration successful"
fi
elif [ -z "$admin_password" ]; then
print_error "Redis requires authentication but no valid admin credentials found"
print_error "Please check /etc/redis/users.acl or /etc/redis/redis.conf"
print_info "Manual fix: Reset Redis authentication or provide admin credentials"
return 1
fi
# Create instance-specific Redis user
print_info "Creating Redis user: $REDIS_USER"
# Try to authenticate with admin (may already exist from another instance)
local acl_result
acl_result=$(redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SETUSER "$REDIS_USER" on ">${REDIS_USER_PASSWORD}" ~* +@all 2>&1)
if [ "$acl_result" = "OK" ] || echo "$acl_result" | grep -q "OK"; then
print_status "Redis user created successfully"
redis-cli -h 127.0.0.1 -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SAVE > /dev/null 2>&1
# Create instance-specific Redis user (only if using ACL)
if [ -n "$REDIS_USER" ]; then
print_info "Creating Redis user: $REDIS_USER"
local acl_result=""
if [ -n "$REDIS_PASSWORD" ]; then
# Try to create user with ACL
acl_result=$(redis-cli -h localhost -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SETUSER "$REDIS_USER" on ">${REDIS_USER_PASSWORD}" ~* +@all 2>&1)
else
# Try without authentication (for legacy setups)
acl_result=$(redis-cli -h localhost -p 6379 ACL SETUSER "$REDIS_USER" on ">${REDIS_USER_PASSWORD}" ~* +@all 2>&1)
fi
if echo "$acl_result" | grep -q "OK"; then
print_status "Redis user created successfully"
# Save ACL users
if [ -n "$REDIS_PASSWORD" ]; then
redis-cli -h localhost -p 6379 --user admin --pass "$REDIS_PASSWORD" --no-auth-warning ACL SAVE >/dev/null 2>&1
else
redis-cli -h localhost -p 6379 ACL SAVE >/dev/null 2>&1
fi
print_status "Redis ACL saved"
# Verify user can connect
if redis-cli -h localhost -p 6379 --user "$REDIS_USER" --pass "$REDIS_USER_PASSWORD" --no-auth-warning -n "$REDIS_DB" ping >/dev/null 2>&1; then
print_status "Redis user verified and working"
else
print_warning "Redis user created but verification failed"
fi
else
print_error "Failed to create Redis user: $acl_result"
print_warning "Will use requirepass mode instead of per-user ACL"
REDIS_USER=""
REDIS_USER_PASSWORD="$REDIS_PASSWORD"
fi
else
print_warning "Could not create Redis user with ACL, trying without authentication..."
# Fallback for systems without ACL configured
redis-cli -h 127.0.0.1 -p 6379 CONFIG SET requirepass "$REDIS_USER_PASSWORD" > /dev/null 2>&1 || true
print_info "Using requirepass authentication (single password, no user-specific ACL)"
REDIS_USER_PASSWORD="$REDIS_PASSWORD"
fi
# Backup existing .env
@@ -2128,18 +2595,27 @@ update_redis_configuration() {
# Add Redis configuration to .env
print_info "Adding Redis configuration to .env..."
# Use correct password variable
local redis_pass_for_env="${REDIS_USER_PASSWORD:-$REDIS_PASSWORD}"
cat >> "$instance_dir/backend/.env" << EOF
# Redis Configuration (added during update)
# Redis Configuration (added during update on $(date))
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_USER=$REDIS_USER
REDIS_PASSWORD=$REDIS_USER_PASSWORD
REDIS_PASSWORD=$redis_pass_for_env
REDIS_DB=$REDIS_DB
EOF
print_status "Redis configuration added to .env"
print_info "Redis User: $REDIS_USER"
if [ -n "$REDIS_USER" ]; then
print_info "Redis Mode: ACL with user '$REDIS_USER'"
else
print_info "Redis Mode: requirepass (legacy single-password auth)"
fi
print_info "Redis Database: $REDIS_DB"
return 0
@@ -2543,11 +3019,81 @@ update_installation() {
print_info "Building frontend..."
npm run build
# Run database migrations and generate Prisma client
# Run database migrations with self-healing
print_info "Running database migrations..."
cd "$instance_dir/backend"
# Generate Prisma client first
npx prisma generate
npx prisma migrate deploy
local max_attempts=3
local attempt=1
local migration_success=false
while [ $attempt -le $max_attempts ]; do
print_info "Migration attempt $attempt of $max_attempts..."
# Try to run migrations
local migrate_output
migrate_output=$(npx prisma migrate deploy 2>&1 || echo "MIGRATION_FAILED")
# Check if migration succeeded
if ! echo "$migrate_output" | grep -q "MIGRATION_FAILED\|Error:\|P3009"; then
print_status "Migrations completed successfully"
migration_success=true
break
fi
# Check specifically for P3009 (failed migrations found)
if echo "$migrate_output" | grep -q "P3009\|migrate found failed migrations"; then
print_warning "Detected failed migrations (P3009 error)"
# Extract the failed migration name if possible
local failed_migration
failed_migration=$(echo "$migrate_output" | grep -oP "The \`\K[^\`]+" | head -1 || echo "")
if [ -n "$failed_migration" ]; then
print_info "Failed migration identified: $failed_migration"
fi
# Attempt to fix failed migrations
print_info "Attempting to self-heal migration issues..."
if fix_failed_migrations "$DB_NAME" "$DB_USER" "$DB_PASS" "$DB_HOST"; then
print_status "Migration issues resolved, retrying..."
attempt=$((attempt + 1))
sleep 2
continue
else
print_error "Failed to resolve migration issues"
print_warning "Attempting alternative resolution method..."
# Alternative: Mark migration as completed if tables exist
print_info "Checking if migration changes are already applied..."
PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c \
"UPDATE _prisma_migrations SET finished_at = NOW(), logs = 'Manually resolved by update script' WHERE migration_name = '$failed_migration' AND finished_at IS NULL;" >/dev/null 2>&1
attempt=$((attempt + 1))
sleep 2
continue
fi
else
# Other migration error
print_error "Migration failed with error:"
echo "$migrate_output" | grep -A 10 "Error:"
# Show helpful information
print_info "Migration status:"
npx prisma migrate status 2>&1 || true
break
fi
done
if [ "$migration_success" = false ]; then
print_error "Migrations failed after $max_attempts attempts"
print_warning "The update will continue, but you may need to manually resolve migration issues"
print_info "Check migrations: cd $instance_dir/backend && npx prisma migrate status"
print_info "View failed migrations: PGPASSWORD=\"$DB_PASS\" psql -h \"$DB_HOST\" -U \"$DB_USER\" -d \"$DB_NAME\" -c \"SELECT * FROM _prisma_migrations WHERE finished_at IS NULL;\""
fi
# Check and update Redis configuration if needed (for legacy installations)
update_redis_configuration
@@ -2563,7 +3109,7 @@ update_installation() {
systemctl start "$service_name"
# Wait a moment and check status
sleep 3
sleep 5
if systemctl is-active --quiet "$service_name"; then
print_success "✅ Update completed successfully!"
@@ -2582,6 +3128,43 @@ update_installation() {
echo ""
else
print_error "Service failed to start after update"
echo ""
# Show last 25 lines of service logs for debugging
print_warning "=== Last 25 lines of service logs ==="
journalctl -u "$service_name" -n 25 --no-pager || true
print_warning "==================================="
echo ""
# Check for specific error patterns
local logs=$(journalctl -u "$service_name" -n 50 --no-pager 2>/dev/null || echo "")
if echo "$logs" | grep -q "WRONGPASS\|NOAUTH"; then
print_error "❌ Detected Redis authentication error!"
print_info "The service cannot authenticate with Redis."
echo ""
print_info "Current Redis configuration in .env:"
grep "^REDIS_" "$instance_dir/backend/.env" || true
echo ""
print_info "Quick fix - Try reconfiguring Redis:"
print_info " 1. Check Redis ACL users:"
print_info " redis-cli ACL LIST"
echo ""
print_info " 2. Test Redis connection with credentials from .env:"
local test_user=$(grep "^REDIS_USER=" "$instance_dir/backend/.env" | cut -d'=' -f2)
local test_pass=$(grep "^REDIS_PASSWORD=" "$instance_dir/backend/.env" | cut -d'=' -f2)
local test_db=$(grep "^REDIS_DB=" "$instance_dir/backend/.env" | cut -d'=' -f2)
print_info " redis-cli --user $test_user --pass $test_pass -n ${test_db:-0} ping"
echo ""
elif echo "$logs" | grep -q "ECONNREFUSED"; then
print_error "❌ Detected connection refused error!"
print_info "Check if required services are running:"
print_info " systemctl status postgresql"
print_info " systemctl status redis-server"
elif echo "$logs" | grep -q "Error:"; then
print_error "❌ Application error detected in logs"
fi
echo ""
print_warning "ROLLBACK INSTRUCTIONS:"
print_info "1. Restore code:"
@@ -2594,7 +3177,7 @@ update_installation() {
print_info "3. Restart service:"
print_info " sudo systemctl start $service_name"
echo ""
print_info "Check logs: journalctl -u $service_name -f"
print_info "View full logs: journalctl -u $service_name -f"
exit 1
fi
}

715
tools/diagnostics.sh Executable file
View File

@@ -0,0 +1,715 @@
#!/bin/bash
# PatchMon Diagnostics Collection Script
# Collects system information, logs, and configuration for troubleshooting
# Usage: sudo bash diagnostics.sh [instance-name]
# Note: Not using 'set -e' because we want to continue even if some commands fail
# With pipefail a pipeline reports failure if any stage fails, not only the
# last one; the `cmd | filter || echo fallback` idioms in this script rely on it.
set -o pipefail
# Colors for output
# ANSI escape sequences kept as literal backslash text; they are only turned
# into real escape characters when printed with `echo -e` / printf.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print functions
# Print a status message in green.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_status() {
    local message="$1"
    echo -e "${GREEN}${message}${NC}"
}
# Print an informational message in blue.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_info() {
    local message="$1"
    echo -e "${BLUE} ${message}${NC}"
}
# Print an error message in red (to stdout; callers redirect when needed).
# Arguments: $1 - message text (backslash escapes are interpreted)
print_error() {
    local message="$1"
    echo -e "${RED}${message}${NC}"
}
# Print a warning message in yellow, prefixed with a warning sign.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_warning() {
    local message="$1"
    echo -e "${YELLOW}⚠️ ${message}${NC}"
}
# Print a success message in green, prefixed with a celebration emoji.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_success() {
    local message="$1"
    echo -e "${GREEN}🎉 ${message}${NC}"
}
# Refuse to run without root privileges (effective UID 0)
if (( EUID != 0 )); then
    print_error "This script must be run as root"
    print_info "Please run: sudo bash $0"
    exit 1
fi
# Function to sanitize sensitive information
#
# Replaces credentials in a block of text with [REDACTED] and writes the
# result to stdout, one output line per input line.
#
# Arguments: $1 - text to sanitize (may contain multiple lines)
# Redacts:
#   - NAME=value / NAME="value" where NAME ends in PASSWORD, SECRET, TOKEN,
#     KEY or PASS (case-insensitive)
#   - JSON-style "password": "value" pairs (same keywords)
#   - 20+ character base64-like strings following '>' (Redis ACL passwords)
#   - the password part of postgresql://, postgres://, mysql://,
#     mongodb://, mongodb+srv://, redis:// and rediss:// URLs
# Note: the case-insensitive `i` flag on `s///` requires GNU sed.
sanitize_sensitive() {
    local input="$1"
    # printf rather than echo: echo would swallow input such as "-n" or "-e".
    # A single sed process applies the rules in order to every line.
    printf '%s\n' "$input" | sed -E \
        -e 's/(PASSWORD|SECRET|TOKEN|KEY|PASS)=[^"]*$/\1=[REDACTED]/gi' \
        -e 's/(PASSWORD|SECRET|TOKEN|KEY|PASS)="[^"]*"/\1="[REDACTED]"/gi' \
        -e 's/(password|secret|token|key|pass)": *"[^"]*"/\1": "[REDACTED]"/gi' \
        -e 's/(>)[a-zA-Z0-9+\/=]{20,}/\1[REDACTED]/g' \
        -e 's#(postgres(ql)?|mysql|mongodb(\+srv)?|rediss?)://([^:]*):([^@]+)@#\1://\4:[REDACTED]@#g'
}
# Function to detect PatchMon installations
#
# Scans the immediate subdirectories of a base directory and prints, on one
# space-separated line, the names of those that contain a
# backend/package.json mentioning "patchmon". Directories whose name
# contains ".backup." are ignored.
#
# Arguments: $1 - base directory to scan (optional, default: /opt)
# Outputs:   instance names on stdout; errors on stderr
# Returns:   1 if the base directory does not exist, 0 otherwise
detect_installations() {
    local base_dir="${1:-/opt}"
    local installations=()
    local dir name

    if [ ! -d "$base_dir" ]; then
        # stderr, because callers capture stdout as the list of instances
        print_error "$base_dir directory does not exist" >&2
        return 1
    fi

    for dir in "$base_dir"/*/; do
        # Skip if no directories found (unmatched glob stays literal)
        [ -d "$dir" ] || continue
        name=$(basename "$dir")
        # Skip backup directories
        if [[ "$name" =~ \.backup\. ]]; then
            continue
        fi
        # Check if it's a PatchMon installation
        if [ -f "$dir/backend/package.json" ] \
            && grep -q "patchmon" "$dir/backend/package.json" 2>/dev/null; then
            installations+=("$name")
        fi
    done

    echo "${installations[@]}"
}
# Function to select installation
#
# Prints the name of the PatchMon instance to operate on. Only the chosen
# name is written to stdout; the menu, prompt and errors go to stderr so the
# caller can capture the result with $(...).
#
# Arguments: $1 - optional instance name; must exactly match a detected one
# Behaviour: with $1, validate and echo it; with exactly one installation,
#            select it automatically; otherwise prompt on /dev/tty.
# Exits:     with status 1 when nothing is installed, the requested instance
#            is unknown, or the menu selection is invalid.
select_installation() {
    local detected candidate install selection status
    local installations=()
    local i=1
    declare -A install_map

    if ! detected=$(detect_installations); then
        # Forward whatever the detector printed before giving up
        [ -n "$detected" ] && printf '%s\n' "$detected" >&2
        print_error "No PatchMon installations found in /opt" >&2
        exit 1
    fi
    # Split on whitespace without subjecting the names to glob expansion
    read -r -a installations <<< "$detected"

    if [ ${#installations[@]} -eq 0 ]; then
        print_error "No PatchMon installations found in /opt" >&2
        exit 1
    fi

    if [ -n "$1" ]; then
        # Use provided instance name; compare element by element so an
        # argument containing spaces cannot match across several names
        for candidate in "${installations[@]}"; do
            if [ "$candidate" = "$1" ]; then
                echo "$1"
                return 0
            fi
        done
        print_error "Instance '$1' not found" >&2
        exit 1
    fi

    # Send status messages to stderr so they don't contaminate the return value
    print_info "Found ${#installations[@]} installation(s):" >&2
    echo "" >&2
    for install in "${installations[@]}"; do
        # Get service status (the systemd unit is named after the instance)
        status="unknown"
        if systemctl is-active --quiet "$install" 2>/dev/null; then
            status="${GREEN}running${NC}"
        elif systemctl is-enabled --quiet "$install" 2>/dev/null; then
            status="${RED}stopped${NC}"
        fi
        printf "%2d. %-30s (%b)\n" "$i" "$install" "$status" >&2
        install_map[$i]="$install"
        i=$((i + 1))
    done
    echo "" >&2

    # If only one installation, select it automatically
    if [ ${#installations[@]} -eq 1 ]; then
        print_info "Only one installation found, selecting automatically: ${installations[0]}" >&2
        echo "${installations[0]}"
        return 0
    fi

    # Multiple installations - prompt user. Read from the terminal because
    # stdout is captured by the caller; an empty answer selects entry 1.
    printf '%b' "${BLUE}Select installation number [1]: ${NC}" >&2
    read -r selection </dev/tty
    selection=${selection:-1}

    if [[ "$selection" =~ ^[0-9]+$ ]] && [ -n "${install_map[$selection]}" ]; then
        echo "${install_map[$selection]}"
        return 0
    else
        print_error "Invalid selection" >&2
        exit 1
    fi
}
# Main script
#
# Collects diagnostics for one PatchMon instance into a single text report
# written to the directory the script was started from, then prints a short
# summary.
#
# Arguments: $1 - optional instance name (a directory under /opt); when
#                 omitted, select_installation picks or prompts for one.
# Globals:   sets ORIGINAL_DIR, instance_name, instance_dir, timestamp,
#            diag_file and DB_USER/DB_PASS/DB_HOST/DB_PORT/DB_NAME, and
#            imports every variable defined in the instance's backend/.env.
# Outputs:   progress and summary on stdout; the report in $diag_file.
# Exits:     1 if no instance can be selected or the report cannot be created.
main() {
    local env_file backend_port captured
    local db_ready=false
    local -a psql_args=()
    local -a redis_args=()
    local redis_ok_label="Redis connection (no auth): SUCCESSFUL"
    local redis_fail_label="Redis connection: FAILED"

    # Capture the directory where script is run from at the very start
    ORIGINAL_DIR=$(pwd)

    echo -e "${BLUE}====================================================${NC}"
    echo -e "${BLUE} PatchMon Diagnostics Collection${NC}"
    echo -e "${BLUE}====================================================${NC}"
    echo ""

    # Select instance. select_installation runs in a command substitution,
    # so its `exit 1` only ends the subshell; this script does not use
    # `set -e`, hence the explicit check.
    instance_name=$(select_installation "$1") || exit 1
    if [ -z "$instance_name" ]; then
        print_error "No instance selected"
        exit 1
    fi
    instance_dir="/opt/$instance_name"
    env_file="$instance_dir/backend/.env"
    print_info "Selected instance: $instance_name"
    print_info "Directory: $instance_dir"
    echo ""

    # Create single diagnostics file in the original directory
    timestamp=$(date +%Y%m%d_%H%M%S)
    diag_file="${ORIGINAL_DIR}/patchmon_diagnostics_${instance_name}_${timestamp}.txt"
    print_info "Collecting diagnostics to: $diag_file"
    echo ""

    # Initialize the diagnostics file with header
    cat > "$diag_file" << EOF || { print_error "Cannot write to $diag_file"; exit 1; }
===================================================
PatchMon Diagnostics Report
===================================================
Instance: $instance_name
Generated: $(date)
Hostname: $(hostname)
Generated from: ${ORIGINAL_DIR}
===================================================
EOF

    # ========================================
    # 1. System Information
    # ========================================
    print_info "Collecting system information..."
    cat >> "$diag_file" << EOF
=== System Information ===
OS: $(cat /etc/os-release 2>/dev/null | grep PRETTY_NAME | cut -d'"' -f2 || echo "Unknown")
Kernel: $(uname -r)
Uptime: $(uptime)
=== CPU Information ===
$(lscpu | grep -E "Model name|CPU\(s\)|Thread|Core" || echo "Not available")
=== Memory Information ===
$(free -h)
=== Disk Usage ===
$(df -h | grep -E "Filesystem|/dev/|/opt")
=== Network Interfaces ===
$(ip -br addr)
===================================================
EOF

    # ========================================
    # 2. PatchMon Instance Information
    # ========================================
    print_info "Collecting instance information..."
    # deployment-info.txt is passed through the sanitizer because its
    # contents are not known to be free of credentials
    cat >> "$diag_file" << EOF
=== PatchMon Instance Information ===
=== Directory Structure ===
$(ls -lah "$instance_dir" 2>/dev/null || echo "Cannot access directory")
=== Backend Package Info ===
$(cat "$instance_dir/backend/package.json" 2>/dev/null | grep -E "name|version" || echo "Not found")
=== Frontend Package Info ===
$(cat "$instance_dir/frontend/package.json" 2>/dev/null | grep -E "name|version" || echo "Not found")
=== Deployment Info ===
$(sanitize_sensitive "$(cat "$instance_dir/deployment-info.txt" 2>/dev/null || echo "No deployment-info.txt found")")
===================================================
EOF

    # ========================================
    # 3. Environment Configuration (Sanitized)
    # ========================================
    print_info "Collecting environment configuration (sanitized)..."
    echo "" >> "$diag_file"
    echo "=== Backend Environment Configuration (Sanitized) ===" >> "$diag_file"
    if [ -f "$env_file" ]; then
        sanitize_sensitive "$(cat "$env_file")" >> "$diag_file"
    else
        echo "Backend .env file not found" >> "$diag_file"
    fi
    echo "" >> "$diag_file"

    # ========================================
    # 4. Service Status and Configuration
    # ========================================
    print_info "Collecting service information..."
    # The unit file may carry Environment= secrets, so it is sanitized too
    cat >> "$diag_file" << EOF
=== Service Status and Configuration ===
=== Service Status ===
$(systemctl status "$instance_name" 2>/dev/null || echo "Service not found")
=== Service File ===
$(sanitize_sensitive "$(cat "/etc/systemd/system/${instance_name}.service" 2>/dev/null || echo "Service file not found")")
=== Service is-enabled ===
$(systemctl is-enabled "$instance_name" 2>/dev/null || echo "unknown")
=== Service is-active ===
$(systemctl is-active "$instance_name" 2>/dev/null || echo "unknown")
===================================================
EOF

    # ========================================
    # 5. Service Logs
    # ========================================
    print_info "Collecting service logs..."
    echo "" >> "$diag_file"
    echo "=== Service Logs (last 500 lines) ===" >> "$diag_file"
    # Logs can contain connection strings; run them through the sanitizer
    if captured=$(journalctl -u "$instance_name" -n 500 --no-pager 2>&1); then
        sanitize_sensitive "$captured" >> "$diag_file"
    else
        echo "Could not retrieve service logs" >> "$diag_file"
    fi
    echo "" >> "$diag_file"

    # ========================================
    # 6. Nginx Configuration
    # ========================================
    print_info "Collecting nginx configuration..."
    cat >> "$diag_file" << EOF
=== Nginx Configuration ===
=== Nginx Status ===
$(systemctl status nginx 2>/dev/null | head -20 || echo "Nginx not found")
=== Site Configuration ===
$(cat "/etc/nginx/sites-available/$instance_name" 2>/dev/null || echo "Nginx config not found")
=== Nginx Error Log (last 100 lines) ===
$(tail -100 /var/log/nginx/error.log 2>/dev/null || echo "Error log not accessible")
=== Nginx Access Log (last 50 lines) ===
$(tail -50 /var/log/nginx/access.log 2>/dev/null || echo "Access log not accessible")
=== Nginx Test ===
$(nginx -t 2>&1 || echo "Nginx test failed")
===================================================
EOF

    # ========================================
    # 7. Database Connection Test
    # ========================================
    print_info "Testing database connection..."
    echo "" >> "$diag_file"
    echo "=== Database Information ===" >> "$diag_file"
    echo "" >> "$diag_file"
    if [ -f "$env_file" ]; then
        # Load .env once; `set -a` exports everything it defines, which the
        # Redis section below relies on as well
        set -a
        # shellcheck disable=SC1090
        source "$env_file"
        set +a

        # Parse DATABASE_URL
        if [ -n "$DATABASE_URL" ]; then
            DB_USER=$(echo "$DATABASE_URL" | sed -n 's|postgresql://\([^:]*\):.*|\1|p')
            DB_PASS=$(echo "$DATABASE_URL" | sed -n 's|postgresql://[^:]*:\([^@]*\)@.*|\1|p')
            DB_HOST=$(echo "$DATABASE_URL" | sed -n 's|.*@\([^:]*\):.*|\1|p')
            DB_PORT=$(echo "$DATABASE_URL" | sed -n 's|.*:\([0-9]*\)/.*|\1|p')
            DB_NAME=$(echo "$DATABASE_URL" | sed -n 's|.*/\([^?]*\).*|\1|p')
            # The host pattern above needs an explicit port; retry without one
            if [ -z "$DB_HOST" ]; then
                DB_HOST=$(echo "$DATABASE_URL" | sed -n 's|.*@\([^:/]*\).*|\1|p')
            fi
            # Shared psql arguments; the parsed port is passed explicitly so
            # non-default ports are honoured
            psql_args=(-h "${DB_HOST:-localhost}" -p "${DB_PORT:-5432}" -U "$DB_USER" -d "$DB_NAME")
            if [ -n "$DB_PASS" ] && [ -n "$DB_USER" ] && [ -n "$DB_NAME" ]; then
                db_ready=true
            fi

            cat >> "$diag_file" << EOF
=== Database Connection Details ===
Host: $DB_HOST
Port: ${DB_PORT:-5432}
Database: $DB_NAME
User: $DB_USER
=== PostgreSQL Status ===
$(systemctl status postgresql 2>/dev/null | head -20 || echo "PostgreSQL status not available")
=== Connection Test ===
EOF
            if PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -c "SELECT version();" >> "$diag_file" 2>&1; then
                echo "✅ Database connection: SUCCESSFUL" >> "$diag_file"
            else
                echo "❌ Database connection: FAILED" >> "$diag_file"
            fi

            echo "" >> "$diag_file"
            echo "=== Database Size ===" >> "$diag_file"
            # current_database() avoids interpolating the name into SQL
            PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -c "
                SELECT
                    pg_size_pretty(pg_database_size(current_database())) as database_size;
            " >> "$diag_file" 2>&1 || echo "Could not get database size" >> "$diag_file"

            echo "" >> "$diag_file"
            echo "=== Table Sizes ===" >> "$diag_file"
            PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -c "
                SELECT
                    schemaname,
                    tablename,
                    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
                FROM pg_tables
                WHERE schemaname = 'public'
                ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
                LIMIT 10;
            " >> "$diag_file" 2>&1 || echo "Could not get table sizes" >> "$diag_file"

            echo "" >> "$diag_file"
            echo "=== Migration Status ===" >> "$diag_file"
            # Subshell keeps the working directory change local
            (cd "$instance_dir/backend" && npx prisma migrate status) >> "$diag_file" 2>&1 \
                || echo "Could not get migration status" >> "$diag_file"
            echo "===================================================" >> "$diag_file"
        else
            echo "DATABASE_URL not found in .env" >> "$diag_file"
        fi
    else
        echo ".env file not found" >> "$diag_file"
    fi

    # Shared redis-cli arguments: connection plus whichever auth mode the
    # loaded .env implies (ACL user, requirepass, or none)
    redis_args=(-h "${REDIS_HOST:-localhost}" -p "${REDIS_PORT:-6379}")
    if [ -n "$REDIS_USER" ] && [ -n "$REDIS_PASSWORD" ]; then
        redis_args+=(--user "$REDIS_USER" --pass "$REDIS_PASSWORD" --no-auth-warning)
        redis_ok_label="Redis connection (with user): SUCCESSFUL"
        redis_fail_label="Redis connection (with user): FAILED"
    elif [ -n "$REDIS_PASSWORD" ]; then
        redis_args+=(-a "$REDIS_PASSWORD" --no-auth-warning)
        redis_ok_label="Redis connection (requirepass): SUCCESSFUL"
        redis_fail_label="Redis connection (requirepass): FAILED"
    fi

    # ========================================
    # 8. Redis Connection Test
    # ========================================
    print_info "Testing Redis connection..."
    if [ -f "$env_file" ]; then
        cat >> "$diag_file" << EOF
===================================================
Redis Information
===================================================
=== Redis Connection Details ===
Host: ${REDIS_HOST:-localhost}
Port: ${REDIS_PORT:-6379}
User: ${REDIS_USER:-(none)}
Database: ${REDIS_DB:-0}
=== Redis Status ===
$(systemctl status redis-server 2>/dev/null | head -20 || echo "Redis status not available")
=== Connection Test ===
EOF
        # Test connection
        if redis-cli "${redis_args[@]}" -n "${REDIS_DB:-0}" ping >> "$diag_file" 2>&1; then
            echo "✅ $redis_ok_label" >> "$diag_file"
            echo "" >> "$diag_file"
            echo "=== Redis INFO ===" >> "$diag_file"
            redis-cli "${redis_args[@]}" -n "${REDIS_DB:-0}" INFO >> "$diag_file" 2>&1
            echo "" >> "$diag_file"
            echo "=== Redis Database Size ===" >> "$diag_file"
            redis-cli "${redis_args[@]}" -n "${REDIS_DB:-0}" DBSIZE >> "$diag_file" 2>&1
        else
            echo "❌ $redis_fail_label" >> "$diag_file"
        fi

        echo "" >> "$diag_file"
        echo "=== Redis ACL Users ===" >> "$diag_file"
        if [ -n "$REDIS_PASSWORD" ]; then
            # ACL LIST prints each password as '#' + 64 hex digits (a hash);
            # mask those so the report does not carry them
            { redis-cli "${redis_args[@]}" ACL LIST 2>&1 || echo "Could not list Redis ACL users"; } \
                | sed -E 's/#[0-9a-fA-F]{64}/#[REDACTED]/g' >> "$diag_file"
        fi
        echo "===================================================" >> "$diag_file"
    else
        echo ".env file not found" >> "$diag_file"
    fi

    # ========================================
    # 9. Network and Port Information
    # ========================================
    print_info "Collecting network information..."
    # Get backend port from .env (falls back to 3000 when missing or empty)
    backend_port=$(grep '^PORT=' "$env_file" 2>/dev/null | cut -d'=' -f2 | tr -d ' ' || true)
    backend_port=${backend_port:-3000}
    # grep -c always prints exactly one count; with pipefail the former
    # `grep | wc -l || echo 0` printed "0" twice when nothing matched
    cat >> "$diag_file" << EOF
===================================================
Network and Port Information
===================================================
=== Listening Ports ===
$(ss -tlnp | grep -E "LISTEN|nginx|node|postgres|redis" || netstat -tlnp | grep -E "LISTEN|nginx|node|postgres|redis" || echo "Could not get port information")
=== Active Connections ===
$(ss -tn state established | head -20 || echo "Could not get connection information")
=== Backend Port Connections (Port $backend_port) ===
Total connections to backend: $(ss -tn 2>/dev/null | grep -c ":$backend_port" || true)
$(ss -tn | grep ":$backend_port" | head -10 || echo "No connections found")
=== PostgreSQL Connections ===
EOF

    # Get PostgreSQL connection count
    if [ "$db_ready" = true ]; then
        PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -c "
            SELECT
                count(*) as total_connections,
                count(*) FILTER (WHERE state = 'active') as active_connections,
                count(*) FILTER (WHERE state = 'idle') as idle_connections
            FROM pg_stat_activity
            WHERE datname = current_database();
        " >> "$diag_file" 2>&1 || echo "Could not get PostgreSQL connection stats" >> "$diag_file"

        echo "" >> "$diag_file"
        echo "=== PostgreSQL Connection Details ===" >> "$diag_file"
        PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -c "
            SELECT
                pid,
                usename,
                application_name,
                client_addr,
                state,
                query_start,
                state_change
            FROM pg_stat_activity
            WHERE datname = current_database()
            ORDER BY query_start DESC
            LIMIT 20;
        " >> "$diag_file" 2>&1 || echo "Could not get connection details" >> "$diag_file"
    else
        echo "Database credentials not available" >> "$diag_file"
    fi

    echo "" >> "$diag_file"
    echo "=== Redis Connections ===" >> "$diag_file"
    # Get Redis connection count
    redis-cli "${redis_args[@]}" -n "${REDIS_DB:-0}" INFO clients >> "$diag_file" 2>&1 \
        || echo "Could not get Redis connection info" >> "$diag_file"

    # stderr must be silenced on iptables itself, not on head
    cat >> "$diag_file" << EOF
=== Firewall Status (UFW) ===
$(ufw status 2>/dev/null || echo "UFW not available")
=== Firewall Status (iptables) ===
$(iptables -L -n 2>/dev/null | head -50 || echo "iptables not available")
===================================================
EOF

    # ========================================
    # 10. Process Information
    # ========================================
    print_info "Collecting process information..."
    cat >> "$diag_file" << EOF
===================================================
Process Information
===================================================
=== PatchMon Node Processes ===
$(ps aux | grep -E "node.*$instance_dir|PID" | grep -v grep || echo "No processes found")
=== Top Processes (CPU) ===
$(ps aux --sort=-%cpu | head -15)
=== Top Processes (Memory) ===
$(ps aux --sort=-%mem | head -15)
===================================================
EOF

    # ========================================
    # 11. SSL Certificate Information
    # ========================================
    print_info "Collecting SSL certificate information..."
    cat >> "$diag_file" << EOF
===================================================
SSL Certificate Information
===================================================
=== Certbot Certificates ===
$(certbot certificates 2>/dev/null || echo "Certbot not available or no certificates")
=== SSL Certificate Files ===
$(ls -lh /etc/letsencrypt/live/$instance_name/ 2>/dev/null || echo "No SSL certificates found for $instance_name")
===================================================
EOF

    # ========================================
    # 12. Recent System Logs
    # ========================================
    print_info "Collecting recent system logs..."
    echo "" >> "$diag_file"
    echo "=== Recent System Logs (last 200 lines) ===" >> "$diag_file"
    if captured=$(journalctl -n 200 --no-pager 2>&1); then
        sanitize_sensitive "$captured" >> "$diag_file"
    else
        echo "Could not retrieve system logs" >> "$diag_file"
    fi

    # ========================================
    # 13. Installation Log (if exists)
    # ========================================
    print_info "Collecting installation log..."
    echo "" >> "$diag_file"
    echo "=== Installation Log (last 200 lines) ===" >> "$diag_file"
    if [ -f "$instance_dir/patchmon-install.log" ]; then
        sanitize_sensitive "$(tail -200 "$instance_dir/patchmon-install.log" 2>&1)" >> "$diag_file"
    else
        echo "No installation log found" >> "$diag_file"
    fi
    echo "" >> "$diag_file"

    # ========================================
    # 14. Node.js and npm Information
    # ========================================
    print_info "Collecting Node.js information..."
    cat >> "$diag_file" << EOF
===================================================
Node.js and npm Information
===================================================
=== Node.js Version ===
$(node --version 2>/dev/null || echo "Node.js not found")
=== npm Version ===
$(npm --version 2>/dev/null || echo "npm not found")
=== Backend Dependencies ===
$(cd "$instance_dir/backend" && npm list --depth=0 2>/dev/null || echo "Could not list backend dependencies")
===================================================
EOF

    # ========================================
    # Finalize diagnostics file
    # ========================================
    print_info "Finalizing diagnostics file..."
    echo "" >> "$diag_file"
    echo "====================================================" >> "$diag_file"
    echo "END OF DIAGNOSTICS REPORT" >> "$diag_file"
    echo "====================================================" >> "$diag_file"
    echo "" >> "$diag_file"
    echo "IMPORTANT: Sensitive Information" >> "$diag_file"
    echo "Passwords, secrets, and tokens have been sanitized" >> "$diag_file"
    echo "and replaced with [REDACTED]. However, please review" >> "$diag_file"
    echo "before sharing to ensure no sensitive data is included." >> "$diag_file"
    echo "====================================================" >> "$diag_file"
    print_status "Diagnostics file created: $diag_file"

    # ========================================
    # Display summary
    # ========================================
    echo ""
    echo -e "${GREEN}====================================================${NC}"
    echo -e "${GREEN} Diagnostics Collection Complete!${NC}"
    echo -e "${GREEN}====================================================${NC}"
    echo ""

    # Get service statuses and file size
    local service_status=$(systemctl is-active "$instance_name" 2>/dev/null || echo "unknown")
    local nginx_status=$(systemctl is-active nginx 2>/dev/null || echo "unknown")
    local postgres_status=$(systemctl is-active postgresql 2>/dev/null || echo "unknown")
    local redis_status=$(systemctl is-active redis-server 2>/dev/null || echo "unknown")
    local file_size=$(du -h "$diag_file" 2>/dev/null | cut -f1 || echo "unknown")
    local line_count=$(wc -l < "$diag_file" 2>/dev/null || echo "unknown")

    # Get connection counts for summary
    local backend_conn_count=$(ss -tn 2>/dev/null | grep -c ":$backend_port" || true)
    local db_conn_count="N/A"
    if [ "$db_ready" = true ]; then
        db_conn_count=$(PGPASSWORD="$DB_PASS" psql "${psql_args[@]}" -t -A -c "SELECT count(*) FROM pg_stat_activity WHERE datname = current_database();" 2>/dev/null || echo "N/A")
    fi
    local redis_conn_count
    redis_conn_count=$(redis-cli "${redis_args[@]}" INFO clients 2>/dev/null | grep "connected_clients:" | cut -d':' -f2 | tr -d '\r' || true)
    redis_conn_count=${redis_conn_count:-N/A}

    # Compact, copyable summary
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}DIAGNOSTICS SUMMARY (copy-paste friendly)${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}"
    echo "Instance: $instance_name"
    echo "File: $diag_file"
    echo "Size: $file_size ($line_count lines)"
    echo "Generated: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "---"
    echo "Service Status: $service_status"
    echo "Nginx Status: $nginx_status"
    echo "PostgreSQL: $postgres_status"
    echo "Redis: $redis_status"
    echo "---"
    echo "Backend Port: $backend_port (Active Connections: $backend_conn_count)"
    echo "Database Connections: $db_conn_count"
    echo "Redis Connections: $redis_conn_count"
    echo "---"
    echo "View: cat $(basename "$diag_file")"
    echo "Or: less $(basename "$diag_file")"
    echo "Share: Send $(basename "$diag_file") to support"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}"
    echo ""
    print_warning "Review file before sharing - sensitive data has been sanitized"
    echo ""
    print_success "Done!"
}
# Run main function, forwarding all command-line arguments
# (main uses $1 as the optional instance name)
main "$@"

286
tools/fix-migrations.sh Executable file
View File

@@ -0,0 +1,286 @@
#!/bin/bash
# PatchMon Migration Fixer
# Standalone script to detect and fix failed Prisma migrations
# Usage: sudo bash fix-migrations.sh [instance-name]
# Abort the script as soon as an unchecked command fails
set -e
# Colors for output
# ANSI escape sequences kept as literal backslash text; they are only turned
# into real escape characters when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print functions
# Print a status message in green.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_status() {
    local message="$1"
    echo -e "${GREEN}${message}${NC}"
}
# Print an informational message in blue.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_info() {
    local message="$1"
    echo -e "${BLUE} ${message}${NC}"
}
# Print an error message in red (to stdout; callers redirect when needed).
# Arguments: $1 - message text (backslash escapes are interpreted)
print_error() {
    local message="$1"
    echo -e "${RED}${message}${NC}"
}
# Print a warning message in yellow, prefixed with a warning sign.
# Arguments: $1 - message text (backslash escapes are interpreted)
print_warning() {
    local message="$1"
    echo -e "${YELLOW}⚠️ ${message}${NC}"
}
# Refuse to run without root privileges (effective UID 0)
if (( EUID != 0 )); then
    print_error "This script must be run as root"
    print_info "Please run: sudo bash $0"
    exit 1
fi
# Function to detect PatchMon installations
#
# Scans the immediate subdirectories of a base directory and prints, on one
# space-separated line, the names of those that contain a
# backend/package.json mentioning "patchmon". Directories whose name
# contains ".backup." are ignored. A missing base directory yields an empty
# list rather than an error.
#
# Arguments: $1 - base directory to scan (optional, default: /opt)
# Outputs:   instance names on stdout
detect_installations() {
    local base_dir="${1:-/opt}"
    local installations=()
    local dir name

    if [ -d "$base_dir" ]; then
        for dir in "$base_dir"/*/; do
            # An unmatched glob stays literal; skip anything that is not a directory
            [ -d "$dir" ] || continue
            name=$(basename "$dir")
            # Skip backup directories
            if [[ "$name" =~ \.backup\. ]]; then
                continue
            fi
            # Check if it's a PatchMon installation
            if [ -f "$dir/backend/package.json" ] && grep -q "patchmon" "$dir/backend/package.json" 2>/dev/null; then
                installations+=("$name")
            fi
        done
    fi

    echo "${installations[@]}"
}
# Function to select installation
#
# Prints the name of the PatchMon instance to operate on. The caller
# captures stdout with $(...), so ONLY the chosen name is written there;
# the menu, the prompt and all errors go to stderr (otherwise they would
# become part of the returned instance name and never reach the user).
#
# Arguments: $1 - optional instance name; must exactly match a detected one
# Input:     without $1, a menu number is read from stdin (empty = 1)
# Exits:     with status 1 when nothing is installed, the requested instance
#            is unknown, or the menu selection is invalid.
select_installation() {
    local installations=()
    local candidate install selection
    local i=1
    declare -A install_map

    # Split on whitespace without subjecting the names to glob expansion
    read -r -a installations <<< "$(detect_installations)" || true

    if [ ${#installations[@]} -eq 0 ]; then
        print_error "No PatchMon installations found in /opt" >&2
        exit 1
    fi

    if [ -n "$1" ]; then
        # Use provided instance name; compare element by element so an
        # argument containing spaces cannot match across several names
        for candidate in "${installations[@]}"; do
            if [ "$candidate" = "$1" ]; then
                echo "$1"
                return 0
            fi
        done
        print_error "Instance '$1' not found" >&2
        exit 1
    fi

    print_info "Found ${#installations[@]} installation(s):" >&2
    echo "" >&2
    for install in "${installations[@]}"; do
        printf "%2d. %s\n" "$i" "$install" >&2
        install_map[$i]="$install"
        i=$((i + 1))
    done
    echo "" >&2

    echo -n -e "${BLUE}Select installation number [1]: ${NC}" >&2
    # EOF on stdin is treated like an empty answer (default entry)
    read -r selection || true
    selection=${selection:-1}

    if [[ "$selection" =~ ^[0-9]+$ ]] && [ -n "${install_map[$selection]}" ]; then
        echo "${install_map[$selection]}"
        return 0
    else
        print_error "Invalid selection" >&2
        exit 1
    fi
}
# Function to check and fix failed migrations
#
# Looks up rows in _prisma_migrations that were started but never finished
# and interactively offers to clear them, mark them as completed, or just
# show their details.
#
# Arguments: $1 - database name
#            $2 - database user
#            $3 - database password
#            $4 - database host (optional, default: localhost)
# Input:     the menu choice is read from stdin (empty = option 1)
# Returns:   0 when there was nothing to fix or the chosen action succeeded;
#            1 on cancel, invalid option, unreadable input, or when any SQL
#            statement failed.
fix_failed_migrations() {
    local db_name="$1"
    local db_user="$2"
    local db_pass="$3"
    local db_host="${4:-localhost}"
    local failed_migrations migration escaped option
    local sq="'"
    local had_error=false

    print_info "Checking for failed migrations in database..."
    # Query for failed migrations; a failing query is treated as "none found"
    failed_migrations=$(PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -t -A -c \
        "SELECT migration_name FROM _prisma_migrations WHERE finished_at IS NULL AND started_at IS NOT NULL;" 2>/dev/null || echo "")

    if [ -z "$failed_migrations" ]; then
        print_status "No failed migrations found"
        return 0
    fi

    print_warning "Found failed migration(s):"
    while IFS= read -r migration; do
        if [ -n "$migration" ]; then
            print_warning " - $migration"
        fi
    done <<< "$failed_migrations"

    echo ""
    print_info "What would you like to do?"
    echo " 1. Clean and retry (delete failed records and re-run migration)"
    echo " 2. Mark as completed (if schema changes are already applied)"
    echo " 3. Show migration details only"
    echo " 4. Cancel"
    echo ""
    echo -n -e "${BLUE}Select option [1]: ${NC}"
    # Without any input (EOF) do not fall through to the destructive default
    if ! read -r option; then
        print_info "Cancelled"
        return 1
    fi
    option=${option:-1}

    case $option in
        1)
            print_info "Cleaning failed migrations and preparing for retry..."
            while IFS= read -r migration; do
                [ -n "$migration" ] || continue
                # Double single quotes so the name is a safe SQL string literal
                escaped=${migration//$sq/$sq$sq}
                print_info "Processing: $migration"
                # Mark as rolled back, then delete the failed record
                if PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
                        "UPDATE _prisma_migrations SET rolled_back_at = NOW() WHERE migration_name = '$escaped' AND finished_at IS NULL;" >/dev/null 2>&1 \
                    && PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
                        "DELETE FROM _prisma_migrations WHERE migration_name = '$escaped' AND finished_at IS NULL;" >/dev/null 2>&1; then
                    print_status "Cleared: $migration"
                else
                    print_error "Failed to clear: $migration"
                    had_error=true
                fi
            done <<< "$failed_migrations"
            if [ "$had_error" = true ]; then
                print_error "Some failed migrations could not be cleared"
                return 1
            fi
            print_status "Failed migrations cleared - ready to retry"
            return 0
            ;;
        2)
            print_info "Marking migrations as completed..."
            while IFS= read -r migration; do
                [ -n "$migration" ] || continue
                escaped=${migration//$sq/$sq$sq}
                print_info "Marking as complete: $migration"
                if PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
                        "UPDATE _prisma_migrations SET finished_at = NOW(), logs = 'Manually resolved by fix-migrations.sh' WHERE migration_name = '$escaped' AND finished_at IS NULL;" >/dev/null 2>&1; then
                    print_status "Marked complete: $migration"
                else
                    print_error "Failed to mark complete: $migration"
                    had_error=true
                fi
            done <<< "$failed_migrations"
            if [ "$had_error" = true ]; then
                print_error "Some migrations could not be marked as completed"
                return 1
            fi
            print_status "All migrations marked as completed"
            return 0
            ;;
        3)
            print_info "Migration details:"
            if ! PGPASSWORD="$db_pass" psql -h "$db_host" -U "$db_user" -d "$db_name" -c \
                    "SELECT migration_name, started_at, finished_at, rolled_back_at, logs FROM _prisma_migrations WHERE finished_at IS NULL AND started_at IS NOT NULL;"; then
                print_error "Could not retrieve migration details"
                return 1
            fi
            return 0
            ;;
        4)
            print_info "Cancelled"
            return 1
            ;;
        *)
            print_error "Invalid option"
            return 1
            ;;
    esac
}
# Main script
#
# Selects a PatchMon instance, reads its database credentials from
# backend/.env, shows the Prisma migration status, lets the user repair
# failed migration records and optionally runs `npx prisma migrate deploy`.
#
# Arguments: $1 - optional instance name (a directory under /opt)
# Globals:   sets instance_name, instance_dir and DB_USER/DB_PASS/DB_HOST/
#            DB_PORT/DB_NAME; imports every variable defined in the .env
#            file; exports PGPORT when DATABASE_URL names a port.
# Exits:     1 on any unrecoverable problem. The script runs under `set -e`,
#            so a non-zero return from fix_failed_migrations (e.g. Cancel)
#            also ends the script.
main() {
    local run_migrate

    echo -e "${BLUE}====================================================${NC}"
    echo -e "${BLUE} PatchMon Migration Fixer${NC}"
    echo -e "${BLUE}====================================================${NC}"
    echo ""

    # Select instance
    instance_name=$(select_installation "$1") || exit 1
    if [ -z "$instance_name" ]; then
        print_error "No instance selected"
        exit 1
    fi
    instance_dir="/opt/$instance_name"
    print_info "Selected instance: $instance_name"
    print_info "Directory: $instance_dir"
    echo ""

    # Load .env to get database credentials
    if [ ! -f "$instance_dir/backend/.env" ]; then
        print_error "Cannot find .env file at $instance_dir/backend/.env"
        exit 1
    fi

    # Source .env (set -a exports every variable it defines)
    set -a
    # shellcheck disable=SC1090
    source "$instance_dir/backend/.env"
    set +a

    # Parse DATABASE_URL
    if [ -z "$DATABASE_URL" ]; then
        print_error "DATABASE_URL not found in .env file"
        exit 1
    fi
    # Both the postgresql:// and postgres:// schemes are accepted
    DB_USER=$(echo "$DATABASE_URL" | sed -nE 's|postgres(ql)?://([^:]*):.*|\2|p')
    DB_PASS=$(echo "$DATABASE_URL" | sed -nE 's|postgres(ql)?://[^:]*:([^@]*)@.*|\2|p')
    DB_HOST=$(echo "$DATABASE_URL" | sed -n 's|.*@\([^:]*\):.*|\1|p')
    DB_PORT=$(echo "$DATABASE_URL" | sed -n 's|.*:\([0-9]*\)/.*|\1|p')
    DB_NAME=$(echo "$DATABASE_URL" | sed -n 's|.*/\([^?]*\).*|\1|p')
    # The host pattern above needs an explicit port; retry without one
    if [ -z "$DB_HOST" ]; then
        DB_HOST=$(echo "$DATABASE_URL" | sed -n 's|.*@\([^:/]*\).*|\1|p')
    fi
    if [ -z "$DB_USER" ] || [ -z "$DB_HOST" ] || [ -z "$DB_NAME" ]; then
        print_error "Could not parse DATABASE_URL (expected postgresql://user:password@host[:port]/database)"
        exit 1
    fi

    print_info "Database: $DB_NAME"
    print_info "User: $DB_USER"
    print_info "Host: $DB_HOST"
    if [ -n "$DB_PORT" ]; then
        # psql reads PGPORT, so every psql call below (including those in
        # fix_failed_migrations) connects to the port from DATABASE_URL
        export PGPORT="$DB_PORT"
        print_info "Port: $DB_PORT"
    fi
    echo ""

    # Test database connection
    print_info "Testing database connection..."
    if ! PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1;" >/dev/null 2>&1; then
        print_error "Cannot connect to database"
        exit 1
    fi
    print_status "Database connection successful"
    echo ""

    # Check Prisma migration status
    print_info "Checking Prisma migration status..."
    if ! cd "$instance_dir/backend"; then
        print_error "Cannot enter $instance_dir/backend"
        exit 1
    fi
    echo ""
    echo -e "${YELLOW}=== Prisma Migration Status ===${NC}"
    # Non-zero status is expected while migrations are pending or failed
    npx prisma migrate status 2>&1 || true
    echo -e "${YELLOW}==============================${NC}"
    echo ""

    # Check for failed migrations
    fix_failed_migrations "$DB_NAME" "$DB_USER" "$DB_PASS" "$DB_HOST"

    # Ask if user wants to run migrations now
    echo ""
    echo -n -e "${BLUE}Do you want to run 'npx prisma migrate deploy' now? [y/N]: ${NC}"
    # EOF counts as "no" instead of aborting under set -e
    read -r run_migrate || run_migrate=""
    if [[ "$run_migrate" =~ ^[Yy] ]]; then
        print_info "Running migrations..."
        cd "$instance_dir/backend"
        if npx prisma migrate deploy; then
            print_status "Migrations completed successfully!"
        else
            print_error "Migration failed"
            print_info "You may need to run this script again or investigate further"
            exit 1
        fi
    else
        print_info "Skipped migration deployment"
        print_info "Run manually: cd $instance_dir/backend && npx prisma migrate deploy"
    fi

    echo ""
    print_status "Done!"
}
# Run main function, forwarding all command-line arguments
# (main uses $1 as the optional instance name)
main "$@"