Files
Warracker/backend/paperless_handler.py
sassanix 6b035b59a8 Fix Paperless-ngx issues, restores missing assets, and improves stability.
- Fixed: Paperless-ngx document uploads were being incorrectly flagged as duplicates due to invalid API parameter usage (checksum → checksum__iexact).

- Fixed: API token authentication with Paperless-ngx now works properly when Two-Factor Authentication (2FA) is enabled, ensuring secure token-only integration.

- Fixed: Restored missing i18next JavaScript libraries for non-Docker installations, ensuring status page and i18n features function correctly.

- Enhanced: Replaced psycopg2-binary with psycopg2 for production stability and compatibility.

- Enhanced: Adjusted .gitignore to track /lib directory, ensuring essential libraries are available across environments.
2025-09-18 10:56:43 -03:00

754 lines
31 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Paperless-ngx API Handler for Warracker
This module provides functionality to interact with Paperless-ngx API
for uploading, retrieving, and managing documents.
"""
import requests
import logging
from typing import Optional, Dict, Any, Tuple
import os
from io import BytesIO
import hashlib
logger = logging.getLogger(__name__)
class PaperlessHandler:
    """Client for the Paperless-ngx REST API that authenticates with an API token only.

    Every request carries an ``Authorization: Token <token>`` header, is sent
    without cookies and, unless the caller asks otherwise, does not follow
    redirects. Any redirect response is treated as an authentication failure.

    Most public methods report failures through their return value (a tuple
    whose first element is a success flag) rather than by raising; ``get`` and
    ``_request`` are the exceptions and may raise ``requests`` errors.
    """

    # Identifies this integration in the User-Agent header of every request.
    _USER_AGENT = 'Warracker-PaperlessIntegration/1.0'
    # Seconds to wait for the connectivity probe in test_connection().
    _CONNECTION_TEST_TIMEOUT = 10

    def __init__(self, paperless_url: str, api_token: str):
        """
        Initialize Paperless handler

        Args:
            paperless_url: Base URL of Paperless-ngx instance (e.g., https://paperless.example.com)
            api_token: API token for authentication
        """
        self.paperless_url = paperless_url.rstrip('/')
        self.base_url = self.paperless_url  # Add base_url alias for compatibility
        self.api_token = api_token
        self.session = requests.Session()
        self.session.headers.update(self._auth_headers())
        # Ensure no environment-provided authentication (proxies, netrc) interferes
        self.session.trust_env = False

    def _auth_headers(self) -> Dict[str, str]:
        """Return a fresh dict with the token-auth headers sent on every request."""
        return {
            'Authorization': f'Token {self.api_token}',
            'User-Agent': self._USER_AGENT,
            'Accept': 'application/json'
        }

    def _build_url(self, url_or_path: str) -> str:
        """Build absolute URL for Paperless-ngx API calls.

        Absolute ``http://`` / ``https://`` URLs are returned unchanged; anything
        else is treated as a path and joined onto the configured base URL with
        exactly one slash between them.
        """
        if url_or_path.startswith(('http://', 'https://')):
            return url_or_path
        return f"{self.paperless_url.rstrip('/')}/{url_or_path.lstrip('/')}"

    def _request(self, method: str, url_or_path: str, **kwargs) -> "requests.Response":
        """
        Perform a request ensuring token-only auth:
        - Always include Authorization: Token <token>
        - Clear cookies before sending (avoid session/CSRF/2FA paths)
        - Do not auto-follow redirects to login pages unless explicitly requested

        Args:
            method: HTTP method name passed to ``Session.request``.
            url_or_path: Absolute URL or a path relative to the base URL.
            **kwargs: Forwarded to ``Session.request``. A ``headers`` entry is
                merged over the default auth headers (caller values win).

        Returns:
            The response object. 4xx/5xx statuses are NOT raised here; callers
            decide whether to call ``raise_for_status()``.

        Raises:
            requests.exceptions.HTTPError: If the response is a 3xx redirect.
        """
        caller_headers = kwargs.pop('headers', None) or {}
        merged_headers = self._auth_headers()
        merged_headers.update(caller_headers)

        # Avoid sending any cookies that could switch us to session auth
        self.session.cookies.clear()

        kwargs.setdefault('allow_redirects', False)

        url = self._build_url(url_or_path)
        response = self.session.request(method, url, headers=merged_headers, **kwargs)

        # Treat redirects to login (or any redirect) as auth failures for API token mode
        if 300 <= response.status_code < 400:
            location = response.headers.get('Location', '')
            raise requests.exceptions.HTTPError(
                f"Unexpected redirect (HTTP {response.status_code}) to '{location}'. Token auth likely rejected.",
                response=response
            )
        return response

    def get(self, url_or_path: str, **kwargs) -> "requests.Response":
        """Send a token-authenticated GET request; see ``_request`` for semantics."""
        return self._request('GET', url_or_path, **kwargs)

    def test_connection(self) -> Tuple[bool, str]:
        """
        Test connection to Paperless-ngx instance

        Returns:
            (success: bool, message: str)
        """
        try:
            # A timeout is required here: without one a stalled server would
            # block forever and the Timeout handler below could never run.
            response = self.get(
                '/api/documents/',
                params={'page_size': 1},
                timeout=self._CONNECTION_TEST_TIMEOUT
            )
            response.raise_for_status()
            return True, "Connection successful"
        except requests.exceptions.ConnectionError:
            return False, "Cannot connect to Paperless-ngx instance. Check URL and network connectivity."
        except requests.exceptions.Timeout:
            return False, "Connection timeout. Paperless-ngx instance might be slow or unresponsive."
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 401:
                return False, "Authentication failed. Check your API token."
            elif e.response.status_code == 403:
                return False, "Access forbidden. Check your API token permissions."
            else:
                return False, f"HTTP error: {e.response.status_code} - {e.response.reason}"
        except Exception as e:
            return False, f"Unexpected error: {str(e)}"

    def find_document_by_checksum(self, checksum: str) -> Tuple[bool, Optional[int], str]:
        """
        Find a document by its checksum in Paperless-ngx

        Args:
            checksum: Document checksum to search for

        Returns:
            (success: bool, document_id: Optional[int], message: str)
            ``success`` is False both when nothing matches and when the
            search itself fails; the message distinguishes the two.
        """
        try:
            logger.info("Searching for document by checksum: %s", checksum)
            response = self.get(
                '/api/documents/',
                params={
                    'checksum__iexact': checksum,
                    'ordering': '-created',
                    'page_size': 1
                },
                timeout=15
            )
            response.raise_for_status()
            result = response.json()
            if 'results' in result and result['results']:
                document = result['results'][0]
                document_id = document.get('id')
                logger.info("Found existing document with checksum %s: ID %s", checksum, document_id)
                return True, document_id, f"Found existing document: ID {document_id}"
            else:
                return False, None, "No document found with matching checksum"
        except requests.exceptions.HTTPError as e:
            logger.error("HTTP error searching by checksum: %s", e)
            return False, None, f"Search failed: HTTP {e.response.status_code}"
        except Exception as e:
            logger.error("Error searching by checksum: %s", e)
            return False, None, f"Search failed: {str(e)}"

    @staticmethod
    def _describe_error_payload(error_detail: Any) -> str:
        """Render a parsed JSON error body as a `` - ...`` suffix for an error message.

        Handles a dict with a ``detail`` key, a dict of per-field errors (whose
        values may be lists of arbitrary items), and any other JSON value.
        Returns an empty string for an empty dict.
        """
        if isinstance(error_detail, dict):
            if 'detail' in error_detail:
                return f" - {error_detail['detail']}"
            # Handle field-specific errors
            error_parts = []
            for field, errors in error_detail.items():
                if isinstance(errors, list):
                    # Entries are not guaranteed to be strings, so stringify
                    # each one instead of letting str.join raise.
                    error_parts.append(f"{field}: {', '.join(str(err) for err in errors)}")
                else:
                    error_parts.append(f"{field}: {errors}")
            return f" - {'; '.join(error_parts)}" if error_parts else ""
        return f" - {error_detail}"

    def upload_document(self, file_content: bytes, filename: str, title: Optional[str] = None,
                        tags: Optional[list] = None,
                        correspondent: Optional[str] = None) -> Tuple[bool, Optional[int], str]:
        """
        Upload a document to Paperless-ngx, refusing files that already exist there

        The MD5 of ``file_content`` is looked up first; if a document with that
        checksum is found the upload is skipped.

        Args:
            file_content: Raw bytes of the file to upload
            filename: File name sent with the upload; also used to guess the MIME type
            title: Optional document title
            tags: Currently ignored (see TODO in the body)
            correspondent: Currently ignored (see TODO in the body)

        Returns:
            (success: bool, document_id: Optional[int], message: str)
            For a duplicate, ``success`` is False and ``document_id`` is the
            existing document's ID. On success ``document_id`` may be None when
            the ID could not be determined from the response.
        """
        import mimetypes
        import re

        # Check for duplicate by checksum before uploading
        checksum = hashlib.md5(file_content).hexdigest()
        is_duplicate, existing_id, _ = self.find_document_by_checksum(checksum)
        if is_duplicate:
            return False, existing_id, "The file that is being uploaded to Paperless is a duplicate."

        try:
            # Detect MIME type from filename
            mime_type, _ = mimetypes.guess_type(filename)
            if not mime_type:
                mime_type = 'application/octet-stream'

            # Prepare files for multipart upload
            # Paperless-ngx expects the file under 'document' field
            files = {
                'document': (filename, BytesIO(file_content), mime_type)
            }

            # Prepare form data - Paperless-ngx API requirements
            # Note: Don't include 'document' in data, only in files
            data = {}
            if title:
                data['title'] = title

            # TODO: For future enhancement, implement proper tag/correspondent handling:
            # - correspondent expects PK (ID) of existing correspondent, not string name
            # - tags expects PKs (IDs) of existing tags, not string names
            # For now, these optional fields are skipped to keep basic upload working.

            logger.info("Uploading document to Paperless-ngx: %s", filename)
            logger.info("Upload data: %s", data)
            logger.info("MIME type: %s", mime_type)

            # Don't set Content-Type manually - let requests handle it
            response = self._request(
                'POST',
                '/api/documents/post_document/',
                files=files,
                data=data,
                timeout=60  # Longer timeout for uploads
            )
            logger.info("Paperless-ngx upload response status: %s", response.status_code)
            logger.info("Paperless-ngx upload response text: %s...", response.text[:500])  # First 500 chars
            response.raise_for_status()  # This will raise an exception for 4xx/5xx status codes

            # Try to parse response as JSON first
            try:
                result = response.json()
                logger.info("Paperless-ngx upload response: %s", result)
            except ValueError as e:
                logger.warning("Could not parse response as JSON: %s", e)
                # If Paperless returns plain text, it is often just the task UUID.
                text_body = response.text.strip().strip('"')  # paperless may wrap uuid in quotes
                uuid_pattern = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
                if uuid_pattern.match(text_body):
                    # Treat as task ID and try to resolve the document ID
                    resolved_id = self._get_document_id_from_task(text_body)
                    if resolved_id:
                        return True, resolved_id, "Document uploaded and processed successfully"
                    logger.info("Upload accepted; processing asynchronously (task %s)", text_body)
                    return True, None, f"Document uploaded successfully: {text_body}"
                # If we can't recognise the content, still mark success but without ID
                return True, None, "Document uploaded successfully"

            # Handle different possible response formats from Paperless-ngx
            if isinstance(result, dict):
                # JSON object response
                if 'task_id' in result:
                    # Task-based response (asynchronous processing)
                    task_id = result.get('task_id')
                    logger.info("Document upload task created: %s", task_id)
                    # Poll Paperless-ngx task endpoint to resolve the final document ID.
                    resolved_id = self._get_document_id_from_task(task_id)
                    if resolved_id:
                        logger.info("Resolved task %s to document ID %s", task_id, resolved_id)
                        return True, resolved_id, "Document uploaded and processed successfully"
                    logger.warning(
                        "Timed out waiting for Paperless-ngx to finish processing task %s", task_id
                    )
                    # Processing will still finish in background; caller can attempt later auto-link.
                    return True, None, "Document uploaded (processing asynchronously link pending)"
                elif 'id' in result:
                    # Direct document ID response (synchronous processing)
                    document_id = result.get('id')
                    logger.info("Document uploaded with ID: %s", document_id)
                    return True, document_id, "Document uploaded successfully"
                elif result.get('success') or response.status_code == 200:
                    # Generic success response
                    logger.info("Document uploaded successfully (generic success)")
                    return True, None, "Document uploaded successfully"
                else:
                    logger.warning("Unexpected JSON response format from Paperless-ngx: %s", result)
                    # Even if format is unexpected, a non-error status means it is likely successful
                    return True, None, "Document uploaded successfully (unknown JSON format)"
            elif isinstance(result, str):
                # String response - might contain an ID or just be a success message
                # NOTE(review): a JSON-encoded task UUID (a quoted string) parses
                # successfully and lands here, so it is NOT polled via
                # _get_document_id_from_task like the plain-text fallback above.
                # Confirm whether that asymmetry is intended.
                logger.info("Document uploaded successfully (string response): %s", result)
                # Only match an explicit "id": <number> fragment, not task IDs
                id_match = re.search(r'"id"\s*:\s*(\d+)', result)
                if id_match:
                    document_id = int(id_match.group(1))
                    logger.info("Extracted document ID from string: %s", document_id)
                    return True, document_id, f"Document uploaded successfully: {result}"
                return True, None, f"Document uploaded successfully: {result}"
            else:
                # Other response type
                logger.warning("Unexpected response type from Paperless-ngx: %s - %s", type(result), result)
                return True, None, "Document uploaded successfully (unknown response type)"
        except requests.exceptions.Timeout:
            return False, None, "Upload timeout. The file might be too large or the connection is slow."
        except requests.exceptions.HTTPError as e:
            error_msg = f"Upload failed: HTTP {e.response.status_code}"
            try:
                error_detail = e.response.json()
            except ValueError as parse_error:
                logger.error("Could not parse error response: %s", parse_error)
                error_msg += f" - {e.response.reason}"
            else:
                logger.error("Paperless-ngx detailed error: %s", error_detail)
                error_msg += self._describe_error_payload(error_detail)
            return False, None, error_msg
        except Exception as e:
            logger.error("Error uploading document to Paperless-ngx: %s", e)
            return False, None, f"Upload failed: {str(e)}"

    def get_document_preview(self, document_id: int) -> Tuple[bool, Optional[bytes], str, Optional[str]]:
        """
        Get document preview/content from Paperless-ngx

        Tries the ``preview`` endpoint first and falls back to ``download``
        when the first attempt returns 404 or fails with a non-HTTP error.

        Args:
            document_id: Paperless-ngx document ID

        Returns:
            (success: bool, content: Optional[bytes], message: str, content_type: Optional[str])
        """
        # Try multiple endpoints in order of preference
        endpoints_to_try = [
            ('preview', f'/api/documents/{document_id}/preview/'),
            ('download', f'/api/documents/{document_id}/download/'),
        ]
        last_error = None
        for endpoint_name, endpoint_path in endpoints_to_try:
            try:
                logger.info("Fetching document %s from Paperless-ngx: %s", endpoint_name, document_id)
                response = self.get(endpoint_path, timeout=30)
                response.raise_for_status()
                content_type = response.headers.get('Content-Type', 'application/octet-stream')
                logger.info("Successfully retrieved document %s via %s endpoint", document_id, endpoint_name)
                return True, response.content, f"Document retrieved successfully via {endpoint_name}", content_type
            except requests.exceptions.HTTPError as e:
                logger.warning(
                    "Failed to retrieve document %s via %s: HTTP %s",
                    document_id, endpoint_name, e.response.status_code
                )
                last_error = e
                if e.response.status_code != 404:
                    # For non-404 errors, don't try other endpoints
                    return False, None, f"Failed to retrieve document: HTTP {e.response.status_code}", None
                # 404: try next endpoint
            except Exception as e:
                logger.warning("Error retrieving document %s via %s: %s", document_id, endpoint_name, e)
                last_error = e
                # Try next endpoint

        # If we get here, all endpoints failed
        if isinstance(last_error, requests.exceptions.HTTPError) and last_error.response.status_code == 404:
            return False, None, "Document not found in Paperless-ngx", None
        return False, None, f"Retrieval failed: {str(last_error) if last_error else 'All endpoints failed'}", None

    def get_document_thumbnail(self, document_id: int) -> Tuple[bool, Optional[bytes], str]:
        """
        Get document thumbnail from Paperless-ngx

        Args:
            document_id: Paperless-ngx document ID

        Returns:
            (success: bool, content: Optional[bytes], message: str)
        """
        try:
            response = self.get(f'/api/documents/{document_id}/thumb/', timeout=15)
            response.raise_for_status()
            return True, response.content, "Thumbnail retrieved successfully"
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                return False, None, "Document or thumbnail not found"
            return False, None, f"Failed to retrieve thumbnail: HTTP {e.response.status_code}"
        except Exception as e:
            logger.error("Error retrieving thumbnail from Paperless-ngx: %s", e)
            return False, None, f"Thumbnail retrieval failed: {str(e)}"

    def search_documents(self, query: str, limit: int = 25) -> Tuple[bool, Optional[list], str]:
        """
        Search documents in Paperless-ngx

        Args:
            query: Search query string
            limit: Maximum number of results to return; clamped to the range 1..100

        Returns:
            (success: bool, documents: Optional[list], message: str)
        """
        try:
            params = {
                'query': query,
                # Cap at 100 for performance; never send a zero/negative page size
                'page_size': max(1, min(limit, 100))
            }
            response = self.get('/api/documents/', params=params, timeout=15)
            response.raise_for_status()
            result = response.json()
            documents = result.get('results', [])
            return True, documents, f"Found {len(documents)} documents"
        except Exception as e:
            logger.error("Error searching documents in Paperless-ngx: %s", e)
            return False, None, f"Search failed: {str(e)}"

    def get_document_info(self, document_id: int) -> Tuple[bool, Optional[Dict[str, Any]], str]:
        """
        Get document information from Paperless-ngx

        Args:
            document_id: Paperless-ngx document ID

        Returns:
            (success: bool, document_info: Optional[Dict], message: str)
        """
        try:
            response = self.get(f'/api/documents/{document_id}/', timeout=15)
            response.raise_for_status()
            document_info = response.json()
            return True, document_info, "Document info retrieved successfully"
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                return False, None, "Document not found"
            return False, None, f"Failed to retrieve document info: HTTP {e.response.status_code}"
        except Exception as e:
            logger.error("Error retrieving document info from Paperless-ngx: %s", e)
            return False, None, f"Info retrieval failed: {str(e)}"

    def debug_document_status(self, document_id: int) -> Dict[str, Any]:
        """
        Debug method to check document status and available endpoints

        Args:
            document_id: Paperless-ngx document ID

        Returns:
            Dictionary with debug information: per-endpoint results under
            ``endpoints_tested``, ``document_exists``, ``document_info`` and a
            summary of the ten most recently created documents.
        """
        debug_info = {
            'document_id': document_id,
            'endpoints_tested': {},
            'document_exists': False,
            'document_info': None
        }

        # Test different endpoints
        endpoints_to_test = [
            ('info', f'/api/documents/{document_id}/'),
            ('preview', f'/api/documents/{document_id}/preview/'),
            ('download', f'/api/documents/{document_id}/download/'),
            ('thumb', f'/api/documents/{document_id}/thumb/')
        ]
        for endpoint_name, endpoint_path in endpoints_to_test:
            try:
                logger.info("Testing endpoint: %s%s", self.paperless_url, endpoint_path)
                response = self.get(endpoint_path, timeout=15)
                debug_info['endpoints_tested'][endpoint_name] = {
                    'status_code': response.status_code,
                    'success': response.status_code < 400,
                    'content_type': response.headers.get('Content-Type', 'unknown'),
                    'content_length': len(response.content) if response.content else 0
                }
                if endpoint_name == 'info' and response.status_code == 200:
                    debug_info['document_exists'] = True
                    try:
                        debug_info['document_info'] = response.json()
                    except ValueError:
                        # JSON decoding errors are ValueError subclasses
                        debug_info['document_info'] = 'Could not parse JSON'
            except Exception as e:
                debug_info['endpoints_tested'][endpoint_name] = {
                    'error': str(e),
                    'success': False
                }

        # Also try to list recent documents to see if our document is there
        try:
            response = self.get('/api/documents/', params={'ordering': '-created', 'page_size': 10}, timeout=15)
            if response.status_code == 200:
                recent_docs = response.json().get('results', [])
                debug_info['recent_documents'] = [
                    {'id': doc.get('id'), 'title': doc.get('title'), 'created': doc.get('created')}
                    for doc in recent_docs
                ]
                debug_info['document_in_recent'] = any(doc.get('id') == document_id for doc in recent_docs)
            else:
                debug_info['recent_documents'] = f'Error: {response.status_code}'
        except Exception as e:
            debug_info['recent_documents'] = f'Exception: {str(e)}'
        return debug_info

    def document_exists(self, document_id: int) -> bool:
        """
        Check if a document exists in Paperless-ngx

        Args:
            document_id: Paperless-ngx document ID

        Returns:
            True if document exists, False otherwise (including on any error)
        """
        try:
            response = self.get(f'/api/documents/{document_id}/', timeout=10)
            return response.status_code == 200
        except Exception as e:
            logger.warning("Error checking document existence %s: %s", document_id, e)
            return False

    def find_document_by_title(self, title: str) -> Tuple[bool, Optional[int], str]:
        """
        Find a document by its title in Paperless-ngx

        Args:
            title: Document title to search for

        Returns:
            (success: bool, document_id: Optional[int], message: str)
        """
        try:
            logger.info("Searching for document by title: %s", title)
            # Search for documents with the given title
            response = self.get(
                '/api/documents/',
                params={
                    'title__icontains': title,  # Case-insensitive partial match
                    'ordering': '-created',     # Most recent first
                    'page_size': 10             # Limit results
                },
                timeout=15
            )
            response.raise_for_status()
            result = response.json()
            if 'results' in result and result['results']:
                # Return the first (most recent) match
                document = result['results'][0]
                document_id = document.get('id')
                document_title = document.get('title', 'Unknown')
                logger.info("Found document: ID %s, Title: %s", document_id, document_title)
                return True, document_id, f"Found document: {document_title}"
            else:
                logger.info("No document found with title containing: %s", title)
                return False, None, "Document not found"
        except requests.exceptions.HTTPError as e:
            logger.error("HTTP error searching for document: %s", e)
            return False, None, f"Search failed: HTTP {e.response.status_code}"
        except Exception as e:
            logger.error("Error searching for document: %s", e)
            return False, None, f"Search failed: {str(e)}"

    # ---------------------------------------------------------------------
    # Internal helpers
    # ---------------------------------------------------------------------
    def _get_document_id_from_task(
        self,
        task_id: str,
        timeout_secs: int = 600,      # 10 minutes
        poll_interval: float = 5.0,   # infrequent polling to reduce load
    ) -> Optional[int]:
        """Poll /api/tasks endpoint until the task completes and returns a document ID.

        This call blocks the current thread for up to ``timeout_secs``.

        Args:
            task_id: The UUID returned by the document upload request.
            timeout_secs: Maximum time to wait before giving up.
            poll_interval: Seconds between polls.

        Returns:
            The related document ID if the task completed successfully within the
            timeout window; otherwise, ``None`` (task failed or was revoked,
            succeeded without a resolvable document ID, or timed out).
        """
        import re
        import time

        if not task_id:
            return None

        # Try the per-task endpoint first and fall back to the list + filter
        # variant (?task_id=<uuid>) whenever the former answers 404.
        task_url_primary = f"{self.paperless_url}/api/tasks/{task_id}/"
        task_url_legacy_list = f"{self.paperless_url}/api/tasks/"

        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + timeout_secs
        while time.monotonic() < deadline:
            try:
                # self.get() only raises HTTPError for redirects, so a 404 is
                # seen here as a normal response and checked via status_code.
                resp = self.get(task_url_primary, timeout=10)
                if resp.status_code == 404:
                    resp = self.get(task_url_legacy_list, params={"task_id": task_id}, timeout=10)
                resp.raise_for_status()

                payload = resp.json()  # parse once per poll
                if isinstance(payload, list):
                    # Legacy endpoint returns a list
                    task_info = payload[0] if payload else {}
                else:
                    task_info = payload

                # The field may be called "state" or "status" depending on the
                # Paperless version; accept either.
                state = task_info.get("state") or task_info.get("status")
                related_doc = task_info.get("related_document")

                # Some Paperless versions don't fill related_document but embed the
                # newly-created ID in the free-text "result" string, e.g.
                # "Success. New document id 416 created" - see GH#3064.
                if not related_doc and isinstance(task_info.get("result"), str):
                    m = re.search(r"document id (\d+)", task_info["result"])
                    if m:
                        related_doc = m.group(1)

                if state == "SUCCESS":
                    if not related_doc:
                        # Terminal state without an ID: further polling cannot
                        # produce one, so stop instead of waiting for the timeout.
                        logger.warning(
                            "Paperless task %s succeeded but reported no document ID", task_id
                        )
                        return None
                    try:
                        return int(related_doc)
                    except (ValueError, TypeError):
                        logger.warning("Unexpected related_document value: %s", related_doc)
                        return None

                if state in {"FAILURE", "REVOKED"}:
                    logger.error("Paperless task %s finished with state %s", task_id, state)
                    return None
            except Exception as poll_err:
                # Transient network or parsing error - log and retry until deadline
                logger.warning("Error polling task %s: %s", task_id, poll_err)

            # Task still running (or the poll failed) - wait and try again
            time.sleep(poll_interval)

        # Timed out waiting for the task to finish
        logger.warning(
            "Timed out after %s seconds waiting for task %s (state pending) will return None so frontend can attempt auto-link",
            timeout_secs,
            task_id,
        )
        return None
def get_paperless_handler(conn) -> Optional["PaperlessHandler"]:
    """
    Get a configured Paperless handler from site settings

    Reads ``paperless_enabled``, ``paperless_url`` and ``paperless_api_token``
    from the ``site_settings`` table. NULL values are treated the same as
    missing rows.

    Args:
        conn: Database connection whose ``cursor()`` can be used as a context
            manager (presumably a psycopg2 connection - confirm with the caller)

    Returns:
        PaperlessHandler instance or None if not configured/enabled
    """
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT key, value FROM site_settings
                WHERE key IN ('paperless_enabled', 'paperless_url', 'paperless_api_token')
            """)
            settings = {row[0]: row[1] for row in cur.fetchall()}

        def read_setting(key: str, default: str = '') -> str:
            # A NULL column arrives as None; fall back to the default instead
            # of letting .lower()/.strip() raise AttributeError.
            value = settings.get(key)
            return default if value is None else str(value)

        # Check if Paperless-ngx is enabled
        if read_setting('paperless_enabled', 'false').lower() != 'true':
            return None

        # Check required settings
        paperless_url = read_setting('paperless_url').strip()
        paperless_token = read_setting('paperless_api_token').strip()
        if not paperless_url or not paperless_token:
            logger.warning("Paperless-ngx is enabled but URL or API token is missing")
            return None

        return PaperlessHandler(paperless_url, paperless_token)
    except Exception as e:
        logger.error(f"Error creating Paperless handler: {e}")
        return None