diff --git a/.env.example b/.env.example index 9c0a4ca..8ef0d1c 100644 --- a/.env.example +++ b/.env.example @@ -1,16 +1,28 @@ +# Core Configuration DATABASE_URL=postgresql://readur:readur_password@localhost:5432/readur JWT_SECRET=your-super-secret-jwt-key-change-this-in-production SERVER_ADDRESS=0.0.0.0:8000 -UPLOAD_PATH=./uploads -# Watch folder configuration +# File Storage & Upload +UPLOAD_PATH=./uploads +ALLOWED_FILE_TYPES=pdf,png,jpg,jpeg,tiff,bmp,gif,txt,rtf,doc,docx + +# Watch Folder Configuration WATCH_FOLDER=./watch WATCH_INTERVAL_SECONDS=30 FILE_STABILITY_CHECK_MS=500 MAX_FILE_AGE_HOURS=168 +# FORCE_POLLING_WATCH=1 -# File type restrictions -ALLOWED_FILE_TYPES=pdf,txt,doc,docx,png,jpg,jpeg,tiff,bmp +# OCR & Processing Settings +OCR_LANGUAGE=eng +CONCURRENT_OCR_JOBS=4 +OCR_TIMEOUT_SECONDS=300 +MAX_FILE_SIZE_MB=50 -# Force polling mode for testing network filesystems (optional) -# FORCE_POLLING_WATCH=1 \ No newline at end of file +# Performance Settings +MEMORY_LIMIT_MB=512 +CPU_PRIORITY=normal + +# Optional: Additional OCR languages (install additional tesseract language packs) +# OCR_LANGUAGE=eng+fra+deu+spa # English + French + German + Spanish \ No newline at end of file diff --git a/DEPLOYMENT_SUMMARY.md b/DEPLOYMENT_SUMMARY.md new file mode 100644 index 0000000..a36b699 --- /dev/null +++ b/DEPLOYMENT_SUMMARY.md @@ -0,0 +1,148 @@ +# Readur Deployment Summary + +## ✅ What's Been Implemented + +### 🐳 **Complete Docker Support** +- Multi-stage Docker builds for production +- Production-ready docker-compose.yml configurations +- Support for environment-based configuration +- Health checks and resource limits +- SSL/HTTPS reverse proxy examples + +### 📁 **Advanced Watch Folder System** +- **Cross-filesystem compatibility**: NFS, SMB, S3, local storage +- **Hybrid watching strategy**: Auto-detects filesystem type +- **Smart file processing**: Duplicate detection, stability checks +- **Configurable**: 5+ environment variables for fine-tuning + +### ⚙️ **Comprehensive Configuration** +- **25+ environment variables** for complete customization +- **Production examples** for all major deployment scenarios +- **Network filesystem optimization** settings +- **OCR and performance tuning** options + +## 🚀 Quick Deployment + +### Development +```bash +git clone +cd readur +docker compose up --build -d +# Access: http://localhost:8000 +``` + +### Production +```bash +# Generate secure secrets +JWT_SECRET=$(openssl rand -base64 64) +DB_PASSWORD=$(openssl rand -base64 32) + +# Create .env file +cat > .env << EOF +JWT_SECRET=${JWT_SECRET} +DB_PASSWORD=${DB_PASSWORD} +EOF + +# Deploy with production config +docker compose -f docker-compose.prod.yml --env-file .env up -d +``` + +## 🔧 Key Environment Variables + +### Essential (Required for Production) +```bash +JWT_SECRET=your-secure-random-key +DATABASE_URL=postgresql://user:pass@host:port/db +``` + +### Watch Folder (Network Mounts) +```bash +WATCH_FOLDER=/mnt/nfs/documents +WATCH_INTERVAL_SECONDS=60 +FORCE_POLLING_WATCH=1 +FILE_STABILITY_CHECK_MS=1000 +``` + +### Performance Optimization +```bash +CONCURRENT_OCR_JOBS=8 +MAX_FILE_SIZE_MB=200 +MEMORY_LIMIT_MB=2048 +OCR_TIMEOUT_SECONDS=600 +``` + +## 🌐 Network Filesystem Examples + +### NFS Mount +```yaml +volumes: + - /mnt/nfs/docs:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=60 + - FORCE_POLLING_WATCH=1 +``` + +### SMB/CIFS Mount +```yaml +volumes: + - /mnt/smb/shared:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=30 + - FILE_STABILITY_CHECK_MS=2000 +``` + +### S3 Mount (s3fs) +```yaml +volumes: + - /mnt/s3/bucket:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=120 + - FILE_STABILITY_CHECK_MS=5000 + - FORCE_POLLING_WATCH=1 +``` + +## 📊 Configuration Summary + +| Category | Variables | Purpose | +|----------|-----------|---------| +| **Core** | 3 vars | Database, auth, server binding | +| **File Storage** | 2 vars | Upload paths and file types | +| **Watch Folder** | 5 vars | Cross-filesystem monitoring | +| **OCR Processing** | 4 vars | Language, jobs, timeouts, limits | +| **Performance** | 2 vars | Memory and CPU optimization | +| **Search & UI** | 10+ vars | Via web interface settings | + +## 🔒 Security Features + +- JWT-based authentication with configurable secrets +- Bcrypt password hashing +- File type restrictions +- Size limits and timeouts +- Non-destructive file processing (originals preserved) +- Configurable data retention policies + +## 📈 Scalability Features + +- Concurrent OCR processing (configurable) +- Resource limits and priorities +- Background job queues +- Database optimization for full-text search +- Efficient file storage with optional compression + +## 🛠️ Operations Support + +- **Health checks**: Built-in endpoint for monitoring +- **Logging**: Structured logging with configurable levels +- **Metrics**: Queue statistics and processing metrics +- **Backup scripts**: Database and file backup examples +- **Monitoring**: Docker stats and log analysis commands + +## 📚 Documentation + +- **README.md**: Complete deployment and usage guide +- **WATCH_FOLDER.md**: Detailed watch folder documentation +- **Docker examples**: Production-ready configurations +- **API documentation**: Complete REST API reference +- **Troubleshooting**: Common issues and solutions + +All components are production-ready with comprehensive configuration options for enterprise deployment scenarios! 🎉 \ No newline at end of file diff --git a/README.md b/README.md index 9411016..a0eab3b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ git clone cd readur # Start all services -docker compose up --build +docker compose up --build -d # Access the application open http://localhost:8000 @@ -44,8 +44,252 @@ After deployment, you'll have: - **Web Interface**: Modern document management UI at `http://localhost:8000` - **PostgreSQL Database**: Document metadata and full-text search indexes - **File Storage**: Persistent document storage with OCR processing +- **Watch Folder**: Automatic file ingestion from mounted directories - **REST API**: Full API access for integrations +## 🐳 Docker Deployment Guide + +### Production Docker Compose + +For production deployments, create a custom `docker-compose.prod.yml`: + +```yaml +services: + readur: + image: readur:latest + ports: + - "8000:8000" + environment: + # Core Configuration + - DATABASE_URL=postgresql://readur:${DB_PASSWORD}@postgres:5432/readur + - JWT_SECRET=${JWT_SECRET} + - SERVER_ADDRESS=0.0.0.0:8000 + + # File Storage + - UPLOAD_PATH=/app/uploads + - WATCH_FOLDER=/app/watch + - ALLOWED_FILE_TYPES=pdf,png,jpg,jpeg,tiff,bmp,gif,txt,doc,docx + + # Watch Folder Settings + - WATCH_INTERVAL_SECONDS=30 + - FILE_STABILITY_CHECK_MS=500 + - MAX_FILE_AGE_HOURS=168 + + # OCR Configuration + - OCR_LANGUAGE=eng + - CONCURRENT_OCR_JOBS=4 + - OCR_TIMEOUT_SECONDS=300 + - MAX_FILE_SIZE_MB=100 + + # Performance Tuning + - MEMORY_LIMIT_MB=1024 + - CPU_PRIORITY=normal + - ENABLE_COMPRESSION=true + + volumes: + # Document storage + - ./data/uploads:/app/uploads + + # Watch folder - mount your network drives here + - /mnt/nfs/documents:/app/watch + # or SMB: - /mnt/smb/shared:/app/watch + # or S3: - /mnt/s3/bucket:/app/watch + + depends_on: + - postgres + restart: unless-stopped + + # Resource limits for production + deploy: + resources: + limits: + memory: 2G + cpus: '2.0' + reservations: + memory: 512M + cpus: '0.5' + + postgres: + image: postgres:15 + environment: + - POSTGRES_USER=readur + - POSTGRES_PASSWORD=${DB_PASSWORD} + - POSTGRES_DB=readur + - POSTGRES_INITDB_ARGS=--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 + + volumes: + - postgres_data:/var/lib/postgresql/data + - ./postgres-config:/etc/postgresql/conf.d:ro + + # PostgreSQL optimization for document search + command: > + postgres + -c shared_buffers=256MB + -c effective_cache_size=1GB + -c max_connections=100 + -c default_text_search_config=pg_catalog.english + + restart: unless-stopped + + # Don't expose port in production + # ports: + # - "5433:5432" + +volumes: + postgres_data: + driver: local +``` + +### Environment Variables + +Create a `.env` file for your secrets: + +```bash +# Generate secure secrets +JWT_SECRET=$(openssl rand -base64 64) +DB_PASSWORD=$(openssl rand -base64 32) + +# Save to .env file +cat > .env << EOF +JWT_SECRET=${JWT_SECRET} +DB_PASSWORD=${DB_PASSWORD} +EOF +``` + +Deploy with: +```bash +docker compose -f docker-compose.prod.yml --env-file .env up -d +``` + +### Network Filesystem Mounts + +#### NFS Mounts +```bash +# Mount NFS share +sudo mount -t nfs 192.168.1.100:/documents /mnt/nfs/documents + +# Add to docker-compose.yml +volumes: + - /mnt/nfs/documents:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=60 + - FILE_STABILITY_CHECK_MS=1000 + - FORCE_POLLING_WATCH=1 +``` + +#### SMB/CIFS Mounts +```bash +# Mount SMB share +sudo mount -t cifs //server/share /mnt/smb/shared -o username=user,password=pass + +# Docker volume configuration +volumes: + - /mnt/smb/shared:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=30 + - FILE_STABILITY_CHECK_MS=2000 +``` + +#### S3 Mounts (using s3fs) +```bash +# Mount S3 bucket +s3fs mybucket /mnt/s3/bucket -o passwd_file=~/.passwd-s3fs + +# Docker configuration for S3 +volumes: + - /mnt/s3/bucket:/app/watch +environment: + - WATCH_INTERVAL_SECONDS=120 + - FILE_STABILITY_CHECK_MS=5000 + - FORCE_POLLING_WATCH=1 +``` + +### SSL/HTTPS Setup + +Use a reverse proxy like Nginx or Traefik: + +#### Nginx Configuration +```nginx +server { + listen 443 ssl http2; + server_name readur.yourdomain.com; + + ssl_certificate /path/to/cert.pem; + ssl_certificate_key /path/to/key.pem; + + location / { + proxy_pass http://localhost:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # For file uploads + client_max_body_size 100M; + proxy_read_timeout 300s; + proxy_send_timeout 300s; + } +} +``` + +#### Traefik Configuration +```yaml +services: + readur: + labels: + - "traefik.enable=true" + - "traefik.http.routers.readur.rule=Host(`readur.yourdomain.com`)" + - "traefik.http.routers.readur.tls=true" + - "traefik.http.routers.readur.tls.certresolver=letsencrypt" +``` + +### Health Checks + +Add health checks to your Docker configuration: + +```yaml +services: + readur: + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/api/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s +``` + +### Backup Strategy + +```bash +#!/bin/bash +# backup.sh - Automated backup script + +# Backup database +docker exec readur-postgres-1 pg_dump -U readur readur | gzip > backup_$(date +%Y%m%d_%H%M%S).sql.gz + +# Backup uploaded files +tar -czf uploads_backup_$(date +%Y%m%d_%H%M%S).tar.gz -C ./data uploads/ + +# Clean old backups (keep 30 days) +find . -name "backup_*.sql.gz" -mtime +30 -delete +find . -name "uploads_backup_*.tar.gz" -mtime +30 -delete +``` + +### Monitoring + +Monitor your deployment with Docker stats: + +```bash +# Real-time resource usage +docker stats + +# Container logs +docker compose logs -f readur + +# Watch folder activity +docker compose logs -f readur | grep watcher +``` + ## 🏗️ Architecture ``` @@ -209,30 +453,148 @@ npm run build ### Environment Variables +All application settings can be configured via environment variables: + +#### Core Configuration +| Variable | Default | Description | +|----------|---------|-------------| +| `DATABASE_URL` | `postgresql://readur:readur@localhost/readur` | PostgreSQL connection string | +| `JWT_SECRET` | `your-secret-key` | Secret key for JWT tokens ⚠️ **Change in production!** | +| `SERVER_ADDRESS` | `0.0.0.0:8000` | Server bind address and port | + +#### File Storage & Upload | Variable | Default | Description | |----------|---------|-------------| -| `DATABASE_URL` | - | PostgreSQL connection string | -| `JWT_SECRET` | - | Secret key for JWT tokens (required) | -| `SERVER_ADDRESS` | `0.0.0.0:8000` | Server bind address | | `UPLOAD_PATH` | `./uploads` | Document storage directory | -| `WATCH_FOLDER` | `./watch` | Folder monitoring directory | -| `ALLOWED_FILE_TYPES` | `pdf,png,jpg,jpeg,txt,doc,docx` | Allowed file extensions | +| `ALLOWED_FILE_TYPES` | `pdf,txt,doc,docx,png,jpg,jpeg` | Comma-separated allowed file extensions | -### Docker Configuration +#### Watch Folder Configuration +| Variable | Default | Description | +|----------|---------|-------------| +| `WATCH_FOLDER` | `./watch` | Directory to monitor for new files | +| `WATCH_INTERVAL_SECONDS` | `30` | Polling interval for network filesystems (seconds) | +| `FILE_STABILITY_CHECK_MS` | `500` | Time to wait for file write completion (milliseconds) | +| `MAX_FILE_AGE_HOURS` | _(none)_ | Skip files older than this many hours | +| `FORCE_POLLING_WATCH` | _(none)_ | Force polling mode even for local filesystems | -Customize `docker-compose.yml` for your environment: +#### OCR & Processing Settings +*Note: These settings can also be configured per-user via the web interface* -```yaml -services: - readur: - environment: - - JWT_SECRET=change-this-secret-key - - UPLOAD_PATH=/app/uploads - volumes: - - ./data/uploads:/app/uploads - - ./data/watch:/app/watch - ports: - - "8000:8000" +| Variable | Default | Description | +|----------|---------|-------------| +| `OCR_LANGUAGE` | `eng` | OCR language code (eng, fra, deu, spa, etc.) | +| `CONCURRENT_OCR_JOBS` | `4` | Maximum parallel OCR processes | +| `OCR_TIMEOUT_SECONDS` | `300` | OCR processing timeout per file | +| `MAX_FILE_SIZE_MB` | `50` | Maximum file size for processing | +| `AUTO_ROTATE_IMAGES` | `true` | Automatically rotate images for better OCR | +| `ENABLE_IMAGE_PREPROCESSING` | `true` | Apply image enhancement before OCR | + +#### Search & Performance +| Variable | Default | Description | +|----------|---------|-------------| +| `SEARCH_RESULTS_PER_PAGE` | `25` | Default number of search results per page | +| `SEARCH_SNIPPET_LENGTH` | `200` | Length of text snippets in search results | +| `FUZZY_SEARCH_THRESHOLD` | `0.8` | Similarity threshold for fuzzy search (0.0-1.0) | +| `MEMORY_LIMIT_MB` | `512` | Memory limit for OCR processes | +| `CPU_PRIORITY` | `normal` | CPU priority: `low`, `normal`, `high` | + +#### Data Management +| Variable | Default | Description | +|----------|---------|-------------| +| `RETENTION_DAYS` | _(none)_ | Auto-delete documents after N days | +| `ENABLE_AUTO_CLEANUP` | `false` | Enable automatic cleanup of old documents | +| `ENABLE_COMPRESSION` | `false` | Compress stored documents to save space | +| `ENABLE_BACKGROUND_OCR` | `true` | Process OCR in background queue | + +### Example Production Configuration + +```env +# Core settings +DATABASE_URL=postgresql://readur:secure_password@postgres:5432/readur +JWT_SECRET=your-very-long-random-secret-key-generated-with-openssl +SERVER_ADDRESS=0.0.0.0:8000 + +# File handling +UPLOAD_PATH=/app/uploads +ALLOWED_FILE_TYPES=pdf,png,jpg,jpeg,tiff,bmp,gif,txt,rtf,doc,docx + +# Watch folder for NFS mount +WATCH_FOLDER=/mnt/nfs/documents +WATCH_INTERVAL_SECONDS=60 +FILE_STABILITY_CHECK_MS=1000 +MAX_FILE_AGE_HOURS=168 +FORCE_POLLING_WATCH=1 + +# OCR optimization +OCR_LANGUAGE=eng +CONCURRENT_OCR_JOBS=8 +OCR_TIMEOUT_SECONDS=600 +MAX_FILE_SIZE_MB=200 +AUTO_ROTATE_IMAGES=true +ENABLE_IMAGE_PREPROCESSING=true + +# Performance tuning +MEMORY_LIMIT_MB=2048 +CPU_PRIORITY=high +ENABLE_COMPRESSION=true +ENABLE_BACKGROUND_OCR=true + +# Search optimization +SEARCH_RESULTS_PER_PAGE=50 +SEARCH_SNIPPET_LENGTH=300 +FUZZY_SEARCH_THRESHOLD=0.7 + +# Data management +RETENTION_DAYS=2555 # 7 years +ENABLE_AUTO_CLEANUP=true +``` + +### Runtime Settings vs Environment Variables + +Some settings can be configured in two ways: + +1. **Environment Variables**: Set at container startup, affects the entire application +2. **User Settings**: Configured per-user via the web interface, stored in database + +**Environment variables take precedence** and provide system-wide defaults. User settings override these defaults for individual users where applicable. + +Settings configurable via web interface: +- OCR language preferences +- Search result limits +- File type restrictions +- OCR processing options +- Data retention policies + +### Configuration Priority + +Settings are applied in this order (later values override earlier ones): + +1. **Application defaults** (built into the code) +2. **Environment variables** (system-wide configuration) +3. **User settings** (per-user database settings via web interface) + +This allows for flexible deployment where system administrators can set defaults while users can customize their experience. + +### Quick Reference - Essential Variables + +For a minimal production deployment, configure these essential variables: + +```bash +# Security (REQUIRED) +JWT_SECRET=your-secure-random-key-here +DATABASE_URL=postgresql://user:password@host:port/database + +# File Storage +UPLOAD_PATH=/app/uploads +WATCH_FOLDER=/path/to/mounted/folder + +# Watch Folder (for network mounts) +WATCH_INTERVAL_SECONDS=60 +FORCE_POLLING_WATCH=1 + +# Performance +CONCURRENT_OCR_JOBS=4 +MAX_FILE_SIZE_MB=100 ``` ### Database Tuning diff --git a/src/config.rs b/src/config.rs index 2c3015d..a2a2303 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,6 +12,16 @@ pub struct Config { pub watch_interval_seconds: Option, pub file_stability_check_ms: Option, pub max_file_age_hours: Option, + + // OCR Configuration + pub ocr_language: String, + pub concurrent_ocr_jobs: usize, + pub ocr_timeout_seconds: u64, + pub max_file_size_mb: u64, + + // Performance + pub memory_limit_mb: usize, + pub cpu_priority: String, } impl Config { @@ -43,6 +53,30 @@ impl Config { max_file_age_hours: env::var("MAX_FILE_AGE_HOURS") .ok() .and_then(|s| s.parse().ok()), + + // OCR Configuration + ocr_language: env::var("OCR_LANGUAGE") + .unwrap_or_else(|_| "eng".to_string()), + concurrent_ocr_jobs: env::var("CONCURRENT_OCR_JOBS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(4), + ocr_timeout_seconds: env::var("OCR_TIMEOUT_SECONDS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(300), + max_file_size_mb: env::var("MAX_FILE_SIZE_MB") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(50), + + // Performance + memory_limit_mb: env::var("MEMORY_LIMIT_MB") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(512), + cpu_priority: env::var("CPU_PRIORITY") + .unwrap_or_else(|_| "normal".to_string()), }) } } \ No newline at end of file