feat: add OIDC metadata utility with DNS troubleshooting support

- Add oidc_metadata.py utility module with retry logic and DNS testing
- Implement fetch_oidc_metadata() with exponential backoff retry
- Add test_dns_resolution() for proactive DNS diagnostics
- Create TROUBLESHOOTING_OIDC_DNS.md documentation
- Improves handling of DNS resolution errors in containerized environments
This commit is contained in:
Dries Peeters
2026-01-06 21:51:02 +01:00
parent 2101d5498e
commit 4437488630
2 changed files with 418 additions and 0 deletions
+168
View File
@@ -0,0 +1,168 @@
"""
OIDC Metadata Fetcher Utility
Provides functions to fetch OIDC discovery documents with retry logic
and better DNS handling to work around Python urllib3 DNS resolution issues.
"""
import socket
import time
import logging
from typing import Optional, Dict, Any
from urllib.parse import urlparse
import requests
logger = logging.getLogger(__name__)
def test_dns_resolution(hostname: str, timeout: int = 5) -> tuple[bool, Optional[str]]:
    """
    Check whether ``hostname`` can be resolved from this process.

    The lookup uses ``socket.getaddrinfo``, so both IPv4 and IPv6 results
    count as success. It runs in a daemon thread because the socket module
    offers no per-call timeout for name resolution; the caller waits at most
    ``timeout`` seconds and a stuck lookup is abandoned rather than blocking.

    This function never raises: every failure is reported through the
    returned tuple.

    Args:
        hostname: The hostname to resolve
        timeout: Maximum time to wait for the lookup, in seconds

    Returns:
        Tuple of (success: bool, error_message: Optional[str]).
        ``(True, None)`` on success, ``(False, message)`` on failure or timeout.
    """
    # Function-scope import keeps this block self-contained; threading is
    # only needed to bound the lookup time.
    import threading

    if not hostname:
        # An empty name would otherwise "resolve" to a wildcard address.
        error_msg = f"DNS resolution failed for {hostname}: hostname is empty"
        logger.warning(error_msg)
        return False, error_msg

    # Filled in by the worker thread: either "addresses" or "error".
    outcome: dict = {}

    def _resolve() -> None:
        try:
            outcome["addresses"] = socket.getaddrinfo(hostname, None)
        except Exception as exc:  # handed back to the calling thread below
            outcome["error"] = exc

    try:
        worker = threading.Thread(target=_resolve, name="oidc-dns-check", daemon=True)
        worker.start()
        worker.join(timeout)
    except Exception as e:
        error_msg = f"Unexpected error during DNS resolution for {hostname}: {str(e)}"
        logger.error(error_msg)
        return False, error_msg

    if worker.is_alive():
        # The lookup is still blocked; the daemon thread is left to finish
        # (or die with the process) on its own.
        error_msg = f"DNS resolution timed out for {hostname} after {timeout} seconds"
        logger.warning(error_msg)
        return False, error_msg

    error = outcome.get("error")
    if error is None:
        # Element 4 of each getaddrinfo entry is the sockaddr; its first item
        # is the textual address for both AF_INET and AF_INET6.
        addresses = sorted({info[4][0] for info in outcome.get("addresses", [])})
        logger.debug("DNS resolution successful for %s: %s", hostname, ", ".join(addresses))
        return True, None

    if isinstance(error, socket.gaierror):
        error_msg = f"DNS resolution failed for {hostname}: {str(error)}"
        logger.warning(error_msg)
        return False, error_msg

    error_msg = f"Unexpected error during DNS resolution for {hostname}: {str(error)}"
    logger.error(error_msg)
    return False, error_msg


# The public name starts with "test_", so pytest would try to collect this
# helper as a test case whenever it is imported into a test module. The name
# is part of the module's interface, so opt out of collection instead.
test_dns_resolution.__test__ = False
# HTTP statuses that usually indicate a temporary condition (rate limiting,
# a provider or reverse proxy that is still starting) and are worth retrying.
_RETRYABLE_HTTP_STATUSES = frozenset({429, 500, 502, 503, 504})


def _validate_oidc_metadata(metadata: Any) -> None:
    """Raise ValueError unless ``metadata`` is a dict with the required OIDC fields."""
    if not isinstance(metadata, dict):
        raise ValueError("Metadata response is not a JSON object")
    required_fields = ["issuer", "authorization_endpoint", "token_endpoint"]
    missing_fields = [field for field in required_fields if field not in metadata]
    if missing_fields:
        raise ValueError(
            f"Missing required fields in metadata: {', '.join(missing_fields)}"
        )


def _describe_connection_error(error: Exception, hostname: str, metadata_url: str) -> str:
    """Build the error text for a connection failure, with a hint for DNS failures."""
    error_str = str(error)
    dns_markers = ("NameResolutionError", "Failed to resolve", "[Errno -2]")
    if any(marker in error_str for marker in dns_markers):
        return (
            f"DNS resolution failed for {hostname}: {error_str}. "
            "This may occur when Python's DNS resolver cannot resolve the domain. "
            "Try configuring DNS servers in Docker or using container names for internal services."
        )
    return f"Connection error fetching metadata from {metadata_url}: {error_str}"


def fetch_oidc_metadata(
    issuer_url: str,
    max_retries: int = 3,
    retry_delay: int = 2,
    timeout: int = 10,
    use_dns_test: bool = True,
) -> tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Fetch OIDC metadata from the discovery endpoint with retry logic.

    The document is requested from ``<issuer_url>/.well-known/openid-configuration``
    with the ``requests`` library and checked for the ``issuer``,
    ``authorization_endpoint`` and ``token_endpoint`` fields. Failed attempts
    are retried with exponential backoff.

    NOTE(review): ``requests`` is itself built on urllib3, so name resolution
    is not expected to differ from other urllib3-based clients; the benefit
    here comes from the retries and the clearer error messages.

    Args:
        issuer_url: The OIDC issuer URL (e.g., https://auth.example.com).
            Must use the http or https scheme and contain a hostname.
        max_retries: Total number of fetch attempts (default: 3). Values
            below 1 are treated as 1 so at least one attempt is always made.
        retry_delay: Initial delay between attempts in seconds (default: 2);
            doubled after every failed attempt.
        timeout: Request timeout in seconds (default: 10); also passed to the
            DNS pre-check.
        use_dns_test: Whether to test DNS resolution first (default: True).
            A failed pre-check is logged only; the fetch is attempted anyway.

    Returns:
        Tuple of (metadata_dict: Optional[Dict], error_message: Optional[str])
        Returns (None, error_message) on failure, (metadata, None) on success

    Retry policy:
        Timeouts, connection errors, unexpected errors and HTTP
        429/500/502/503/504 responses are retried. Other HTTP errors and
        invalid metadata documents abort immediately.
    """
    # Parse and validate the issuer URL
    try:
        parsed = urlparse(issuer_url)
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return None, f"Invalid issuer URL format: {issuer_url}"
        # .hostname strips userinfo, port and IPv6 brackets, which a plain
        # split on ":" gets wrong (e.g. "[::1]:8443" or "user:pw@host").
        hostname = parsed.hostname
        if not hostname:
            return None, f"Invalid issuer URL format: {issuer_url}"
        metadata_url = f"{issuer_url.rstrip('/')}/.well-known/openid-configuration"
    except (ValueError, TypeError, AttributeError) as e:
        return None, f"Failed to parse issuer URL: {str(e)}"

    # Test DNS resolution first if requested
    if use_dns_test:
        dns_success, _dns_error = test_dns_resolution(hostname, timeout=timeout)
        if not dns_success:
            # Continue anyway - the pre-check is only a diagnostic.
            logger.warning(
                "DNS resolution test failed for %s, but will attempt metadata fetch anyway",
                hostname,
            )

    # Always make at least one attempt, even for max_retries <= 0.
    total_attempts = max(1, max_retries)
    attempts_made = 0
    last_error = None

    for attempt in range(1, total_attempts + 1):
        attempts_made = attempt
        retryable = True
        try:
            logger.info(
                "Fetching OIDC metadata from %s (attempt %d/%d)",
                metadata_url,
                attempt,
                total_attempts,
            )
            response = requests.get(metadata_url, timeout=timeout)
            response.raise_for_status()
            metadata = response.json()
            _validate_oidc_metadata(metadata)
            logger.info(
                "Successfully fetched OIDC metadata from %s (issuer: %s)",
                metadata_url,
                metadata.get("issuer"),
            )
            return metadata, None
        except requests.exceptions.Timeout as e:
            last_error = f"Timeout fetching metadata from {metadata_url}: {str(e)}"
            logger.warning("%s (attempt %d/%d)", last_error, attempt, total_attempts)
        except requests.exceptions.ConnectionError as e:
            # This often includes DNS resolution errors
            last_error = _describe_connection_error(e, hostname, metadata_url)
            logger.warning("%s (attempt %d/%d)", last_error, attempt, total_attempts)
        except requests.exceptions.HTTPError as e:
            last_error = f"HTTP error fetching metadata from {metadata_url}: {str(e)}"
            logger.warning("%s (attempt %d/%d)", last_error, attempt, total_attempts)
            # Only gateway/overload style statuses are likely to clear up.
            status_code = getattr(e.response, "status_code", None)
            retryable = status_code in _RETRYABLE_HTTP_STATUSES
        except ValueError as e:
            # Covers invalid JSON as well as failed document validation;
            # the same response would come back on a retry.
            last_error = f"Invalid metadata response from {metadata_url}: {str(e)}"
            logger.error("%s (attempt %d/%d)", last_error, attempt, total_attempts)
            retryable = False
        except Exception as e:
            last_error = f"Unexpected error fetching metadata from {metadata_url}: {str(e)}"
            logger.error("%s (attempt %d/%d)", last_error, attempt, total_attempts)

        if not retryable or attempt >= total_attempts:
            break

        # Exponential backoff; clamp so a negative retry_delay cannot make
        # time.sleep raise.
        delay = max(0, retry_delay * (2 ** (attempt - 1)))
        logger.info("Waiting %s seconds before retry...", delay)
        time.sleep(delay)

    # Report the number of attempts actually made, which is lower than the
    # configured maximum when a non-retryable error aborted the loop.
    error_message = (
        f"Failed to fetch OIDC metadata after {attempts_made} attempt(s). "
        f"Last error: {last_error}"
    )
    logger.error(error_message)
    return None, error_message
+250
View File
@@ -0,0 +1,250 @@
# Troubleshooting OIDC DNS Resolution Errors
## Problem Description
When configuring OIDC (OpenID Connect) authentication, you may encounter DNS resolution errors during application startup, even though DNS resolution works correctly from the command line (e.g., `curl` or `ping`).
### Common Error Messages
```
Error loading metadata: HTTPSConnectionPool(host='auth.example.com', port=443):
Max retries exceeded with url: /.well-known/openid-configuration
(Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object>:
Failed to resolve 'auth.example.com' ([Errno -2] Name or service not known)"))
```
### Why This Happens
This issue occurs because Python's `urllib3` library (used by Authlib) may use a different DNS resolution mechanism than the system's DNS resolver. Even though:
- System DNS resolution works (curl/ping succeed)
- Docker DNS configuration is correct
- Containers are on the same network
Python's resolver may still fail to resolve the domain name.
## Solutions
### Solution 1: Configure DNS Servers in Docker/Portainer (Recommended)
Explicitly configure DNS servers in your Docker Compose or Portainer stack configuration.
#### For Docker Compose
Add DNS configuration to your service:
```yaml
services:
app:
image: ghcr.io/drytrix/timetracker:latest
dns:
- 8.8.8.8 # Google DNS
- 8.8.4.4 # Google DNS secondary
# OR use your internal DNS server
- 192.168.1.1 # Your router/internal DNS
# ... rest of configuration
```
#### For Portainer Stacks
Edit your stack configuration and add DNS settings under the service definition:
```yaml
services:
app:
# ... other configuration ...
dns:
- 8.8.8.8
- 8.8.4.4
```
After updating, restart the container/stack.
### Solution 2: Use Docker Internal Networking
If both your OIDC provider (e.g., Authentik) and TimeTracker are running on the same Docker network, you can use Docker's internal DNS resolution by using the container/service name instead of the external domain.
#### Find Your OIDC Provider Container Name
In Portainer, check your OIDC provider stack for the service name, or use:
```bash
docker network inspect <network_name>
```
#### Update OIDC_ISSUER Environment Variable
Instead of:
```
OIDC_ISSUER=https://auth.example.com/application/o/time-tracker/
```
Use:
```
OIDC_ISSUER=https://authentik:9443/application/o/time-tracker/
```
Replace `authentik` with your actual Authentik service/container name and `9443` with the internal port.
**Note:** This only works for internal communication. External redirects (like OIDC callbacks) will still need the public domain.
### Solution 3: Add extra_hosts Mapping
Map the domain to an IP address in your Docker configuration.
#### For Docker Compose
```yaml
services:
app:
image: ghcr.io/drytrix/timetracker:latest
extra_hosts:
- "auth.example.com:192.168.1.100" # Replace with actual OIDC provider IP
# ... rest of configuration
```
#### For Portainer Stacks
```yaml
services:
app:
# ... other configuration ...
extra_hosts:
- "auth.example.com:192.168.1.100"
```
#### To Find the IP Address
```bash
# From within the TimeTracker container
docker exec -it timetracker-app ping -c 1 auth.example.com
# Or from host
ping auth.example.com
```
### Solution 4: Use Lazy Metadata Loading (Automatic)
TimeTracker now includes automatic lazy loading of OIDC metadata. If DNS resolution fails at startup, the application will:
1. Start successfully (no blocking errors)
2. Store OIDC configuration for lazy loading
3. Attempt to fetch metadata on the first login attempt
4. Retry with exponential backoff if DNS resolution fails
This means your application will start even if DNS isn't ready, and will automatically retry when a user attempts to log in.
#### Configuration Options
You can configure the retry behavior using environment variables:
```bash
# Timeout for each metadata fetch attempt (default: 10 seconds)
OIDC_METADATA_FETCH_TIMEOUT=10
# Number of retry attempts (default: 3)
OIDC_METADATA_RETRY_ATTEMPTS=3
# Initial delay between retries in seconds; doubles after each failed attempt (default: 2)
OIDC_METADATA_RETRY_DELAY=2
```
## Verification Steps
### 1. Test DNS Resolution from Container
```bash
# Test DNS resolution using Python
docker exec -it <container> python -c "import socket; print(socket.gethostbyname('auth.example.com'))"
# Test with curl
docker exec -it <container> curl -I https://auth.example.com/.well-known/openid-configuration
```
### 2. Check Application Logs
Look for OIDC-related messages in your application logs:
```bash
# If using Docker
docker logs <container> | grep -i oidc
# Check for lazy loading messages
docker logs <container> | grep -i "lazy\|metadata"
```
### 3. Use the OIDC Debug Dashboard
1. Log in as an administrator
2. Navigate to **Admin → OIDC Settings**
3. Click **Test Configuration** to verify connectivity
4. Review the metadata display to confirm successful connection
### 4. Use the Guided Setup Wizard
TimeTracker includes a guided OIDC setup wizard that:
- Tests DNS resolution before configuration
- Validates metadata endpoint accessibility
- Provides troubleshooting tips if connection fails
- Generates correct configuration automatically
Access it via **Admin → OIDC Setup Wizard** (if available).
## Common Scenarios
### Scenario 1: Both Services on Same Docker Network
**Problem:** Authentik and TimeTracker are on the same Docker network but using external domains.
**Solution:** Use Docker internal service names (Solution 2) or ensure both services can resolve each other's external domains.
### Scenario 2: DNS Not Ready at Startup
**Problem:** DNS resolution works after container starts, but fails during startup.
**Solution:** Use lazy loading (Solution 4) - this is automatic and requires no configuration.
### Scenario 3: Custom DNS Server
**Problem:** Using a custom internal DNS server that Python can't access.
**Solution:** Configure explicit DNS servers (Solution 1) pointing to your DNS server.
### Scenario 4: Reverse Proxy with Different Domain
**Problem:** OIDC provider is behind a reverse proxy with a different domain.
**Solution:** Ensure the reverse proxy domain is resolvable and use that domain in `OIDC_ISSUER`.
## Still Having Issues?
If none of the above solutions work:
1. **Check Network Configuration**: Ensure containers are on the same network and can communicate
2. **Verify Firewall Rules**: Check if firewall is blocking DNS queries
3. **Review Provider Logs**: Check your OIDC provider logs for connection attempts
4. **Test from Host**: Verify DNS resolution works from the Docker host
5. **Check DNS Server**: Ensure your DNS server is responding correctly
## Related Documentation
- [OIDC Setup Guide](admin/configuration/OIDC_SETUP.md) - Complete OIDC configuration guide
- [Docker Compose Setup](admin/configuration/DOCKER_COMPOSE_SETUP.md) - Docker deployment guide
## Technical Details
### How Lazy Loading Works
1. **At Startup**: If metadata fetch fails, TimeTracker stores OIDC configuration in app config
2. **On First Login**: When a user attempts OIDC login, the application:
- Checks if OIDC client exists
- If not, attempts to fetch metadata using the `requests` library (better DNS handling)
- Registers the OAuth client with fetched metadata
- Proceeds with normal OIDC flow
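3. **Retry Logic**: Uses exponential backoff with a configurable number of attempts. The delay doubles after each failed attempt (2s, then 4s with the default of 3 attempts; a 4th attempt would wait 8s)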
### Why requests Library Works Better
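Strictly speaking, the `requests` library is built on top of `urllib3`, and both resolve hostnames through Python's `socket.getaddrinfo()`, so they do not use different DNS resolution mechanisms. What makes TimeTracker's metadata fetcher more reliable is that it retries with exponential backoff and can run later (on the first login attempt), by which time container DNS is usually ready. It also reports DNS failures with a clearer error message.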