Files
TimeTracker/app/utils/ocr.py
Dries Peeters b353184a4f feat: implement advanced expense management with templates and navigation
Implement complete Advanced Expense Management feature set with UI templates,
database schema fixes, and reorganized navigation structure.

Features:
- Expense Categories: Full CRUD with budget tracking and visualization
- Mileage Tracking: Vehicle mileage entries with approval workflow
- Per Diem Management: Daily allowance claims with location-based rates
- Receipt OCR: Infrastructure for receipt scanning (utilities ready)

Database:
- Migration 037: Create expense_categories, mileage, per_diem_rates, per_diems tables
- Migration 038: Fix schema column name mismatches (trip_purpose→purpose, etc.)
- Add missing columns (description, odometer, rates, reimbursement tracking)
- Fix circular foreign key dependencies

Templates (11 new files):
- expense_categories/: list, form, view
- mileage/: list, form, view
- per_diem/: list, form, view, rates_list, rate_form

Navigation:
- Move Mileage and Per Diem to Expenses sub-pages (header buttons)
- Move Expense Categories to Admin menu only
- Remove expense management items from Finance menu

Fixes:
- Fix NoneType comparison error in expense categories utilization
- Handle None values safely in budget progress bars
- Resolve database column name mismatches

UI/UX:
- Responsive design with Tailwind CSS and dark mode support
- Real-time calculations for mileage amounts
- Color-coded budget utilization progress bars
- Status badges for approval workflow states
- Advanced filtering on all list views

Default data:
- 7 expense categories (Travel, Meals, Accommodation, etc.)
- 4 per diem rates (US, GB, DE, FR)
2025-10-31 06:21:35 +01:00

345 lines
11 KiB
Python

"""
OCR utilities for receipt scanning and text extraction.
This module provides functionality to extract text and data from receipt images
using Tesseract OCR and parse common receipt information.
"""
import logging
import os
import re
from datetime import datetime
from decimal import Decimal, InvalidOperation
logger = logging.getLogger(__name__)

# Probe for the optional OCR dependencies once, at import time. The flag is
# only switched on when both pytesseract and Pillow import cleanly.
try:
    import pytesseract
    from PIL import Image
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract or PIL not installed. Receipt OCR will not be available.")
else:
    TESSERACT_AVAILABLE = True
def is_ocr_available():
    """
    Report whether receipt OCR can be used in this process.

    Returns:
        The module-level TESSERACT_AVAILABLE flag, which is set once at
        import time depending on whether pytesseract and PIL could be
        imported.
    """
    return TESSERACT_AVAILABLE
def extract_text_from_image(image_path, lang='eng'):
    """
    Extract text from an image using Tesseract OCR.

    The image is opened through a context manager so the underlying file
    handle is released as soon as OCR finishes, including when OCR fails.

    Args:
        image_path: Path to the image file
        lang: OCR language (default: 'eng', can be 'eng+deu' for multilingual)

    Returns:
        Extracted text as string

    Raises:
        RuntimeError: If pytesseract or PIL could not be imported.
        Exception: Any error raised while opening the image or running OCR
            is logged and then re-raised unchanged.
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("Tesseract OCR is not available. Install pytesseract and PIL.")

    try:
        with Image.open(image_path) as source_image:
            # OCR is run on an RGB image; any other mode is converted first.
            if source_image.mode != 'RGB':
                ocr_image = source_image.convert('RGB')
            else:
                ocr_image = source_image
            return pytesseract.image_to_string(ocr_image, lang=lang)
    except Exception as e:
        logger.error(f"Error extracting text from image {image_path}: {e}")
        raise
def parse_receipt_data(text):
    """
    Parse common receipt information from extracted text.

    Args:
        text: Extracted text from receipt. Empty or None input yields a
            result containing only the defaults.

    Returns:
        Dictionary with the keys 'vendor', 'date', 'total', 'tax',
        'subtotal', 'items', 'currency' and 'raw_text'. Fields that could
        not be determined stay None ('items' stays an empty list and
        'currency' defaults to 'EUR').
    """
    data = {
        'vendor': None,
        'date': None,
        'total': None,
        'tax': None,
        'subtotal': None,
        'items': [],
        'currency': 'EUR',
        'raw_text': text
    }

    if not text:
        return data

    # Vendor: the first of the top five lines that is longer than 3 characters.
    for raw_line in text.split('\n')[:5]:
        candidate = raw_line.strip()
        if len(candidate) > 3:
            data['vendor'] = candidate
            break

    # 'total' is a substring of 'subtotal' and 'sous-total', so the subtotal
    # keywords must be tested first; otherwise a subtotal line is recorded as
    # the total. Total is still tested before tax, so a label mentioning both
    # counts as a total.
    subtotal_keywords = ('subtotal', 'zwischensumme', 'sous-total')
    total_keywords = ('total', 'gesamt', 'suma', 'totale')
    tax_keywords = ('tax', 'vat', 'mwst', 'iva', 'tva')

    unlabeled_amounts = []
    for amount_info in extract_amounts(text):
        label = amount_info.get('label', '').lower()
        amount = amount_info['amount']
        if any(keyword in label for keyword in subtotal_keywords):
            data['subtotal'] = amount
        elif any(keyword in label for keyword in total_keywords):
            data['total'] = amount
        elif any(keyword in label for keyword in tax_keywords):
            data['tax'] = amount
        else:
            unlabeled_amounts.append(amount)

    # Fall back to the largest unlabeled amount only when no labeled total was
    # found. Compare against None so a labeled total of 0.00 is kept.
    if data['total'] is None and unlabeled_amounts:
        data['total'] = max(unlabeled_amounts)

    # Extract date
    date = extract_date(text)
    if date:
        data['date'] = date

    # Extract currency
    currency = extract_currency(text)
    if currency:
        data['currency'] = currency

    return data
# Pieces, in order: optional label (letters/whitespace), optional colon,
# optional currency symbol ($, euro, pound, yen), the amount, and an optional
# upper-case 3-letter code on the same line.
# - The amount is either thousands-grouped ("1.234,56", "1,234.56") or a plain
#   digit run ("1234.56"); both end in a separator plus exactly two digits.
# - The lookarounds stop a match from starting or ending inside a longer
#   number, so "31.10.2025" is not read as the amount 31.10.
# - No IGNORECASE: the label class already spells out both cases, and a
#   case-insensitive currency group would swallow the first three letters of
#   the next line's label (e.g. "Tax", "Tot").
_AMOUNT_PATTERN = re.compile(
    r'([A-Za-z\s]*?)[\s:]*([$\u20ac\u00a3\u00a5]?)\s*'
    r'(?<![\d.,])'
    r'(\d{1,3}(?:[.,]\d{3})+[.,]\d{2}|\d+[.,]\d{2})'
    r'(?![.,]?\d)'
    r'(?:[ \t]*([A-Z]{3})\b)?'
)


def extract_amounts(text):
    """
    Extract monetary amounts from text.

    Supports formats such as 12.34, 12,34, $12.34, 1,234.56, 1.234,56 and
    12.34 EUR. The last separator in a matched amount is always treated as
    the decimal separator; every earlier '.' or ',' is a thousands separator.

    Args:
        text: Text to scan. Empty or None input yields an empty list.

    Returns:
        List of dictionaries with 'amount' (Decimal), 'label', 'symbol' and
        'currency' keys, in order of appearance. Missing parts are ''.
    """
    if not text:
        return []

    amounts = []
    for match in _AMOUNT_PATTERN.finditer(text):
        label = (match.group(1) or '').strip()
        symbol = match.group(2) or ''
        amount_str = match.group(3)
        currency = match.group(4) or ''

        # The pattern guarantees the amount ends in "<sep><2 digits>", so the
        # character at index -3 is the decimal separator regardless of locale.
        integer_part = amount_str[:-3].replace('.', '').replace(',', '')
        normalized = f"{integer_part}.{amount_str[-2:]}"

        try:
            amount = Decimal(normalized)
        except (ValueError, InvalidOperation):
            # Defensive: skip anything Decimal still refuses to parse.
            continue

        amounts.append({
            'amount': amount,
            'label': label,
            'symbol': symbol,
            'currency': currency
        })

    return amounts
# Month abbreviations used by the "DD Month YYYY" pattern.
_MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
    'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}

# The digit lookarounds keep a pattern from matching inside a longer number;
# without them "2025-10-31" also matches the D/M/Y pattern as "25-10-31".
_ISO_DATE_RE = re.compile(r'(?<!\d)(\d{4})[./\-](\d{1,2})[./\-](\d{1,2})(?!\d)')
_NUMERIC_DATE_RE = re.compile(r'(?<!\d)(\d{1,2})[./\-](\d{1,2})[./\-](\d{4}|\d{2})(?!\d)')
_TEXTUAL_DATE_RE = re.compile(
    r'(?<!\d)(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+'
    r'(\d{4}|\d{2})(?!\d)',
    re.IGNORECASE,
)


def _expand_two_digit_year(year):
    """Map a two-digit year onto the 2000s; leave other years unchanged."""
    return year + 2000 if year < 100 else year


def _safe_date(year, month, day):
    """Return the date for the given parts, or None if they are not a valid date."""
    try:
        return datetime(year, month, day).date()
    except ValueError:
        return None


def extract_date(text):
    """
    Extract date from receipt text.

    Formats are tried in this order, and within a format every occurrence is
    tried until one is a valid calendar date:
      1. YYYY-MM-DD (separators '.', '/' or '-')
      2. DD/MM/YYYY, falling back to MM/DD/YYYY when the first reading is
         not a valid date
      3. DD Month YYYY with an English month name or abbreviation

    Two-digit years are interpreted as 20YY.

    Args:
        text: Text to scan. Empty or None input yields None.

    Returns:
        datetime.date object or None
    """
    if not text:
        return None

    # Unambiguous ISO-style dates first.
    for match in _ISO_DATE_RE.finditer(text):
        year, month, day = (int(part) for part in match.groups())
        found = _safe_date(year, month, day)
        if found is not None:
            return found

    # Ambiguous numeric dates: European reading first, then US.
    for match in _NUMERIC_DATE_RE.finditer(text):
        first, second, year = (int(part) for part in match.groups())
        year = _expand_two_digit_year(year)
        found = _safe_date(year, second, first)
        if found is None:
            found = _safe_date(year, first, second)
        if found is not None:
            return found

    # Day, month name, year.
    for match in _TEXTUAL_DATE_RE.finditer(text):
        day_str, month_str, year_str = match.groups()
        month = _MONTH_NUMBERS.get(month_str.lower()[:3])
        if month is None:
            continue
        found = _safe_date(_expand_two_digit_year(int(year_str)), month, int(day_str))
        if found is not None:
            return found

    return None
def extract_currency(text):
    """
    Extract currency code from receipt text.

    Currency symbols are checked first, in a fixed order, then the
    stand-alone abbreviation 'Fr', then explicit 3-letter codes from a short
    list of common currencies.

    Args:
        text: Text to scan. Empty or None input yields the default.

    Returns:
        3-letter currency code (ISO 4217) or 'EUR' as default
    """
    if not text:
        return 'EUR'

    # Non-ASCII symbols are written as escapes so they survive re-encoding.
    # An empty-string key would match every text, because '' is a substring
    # of any string.
    symbol_codes = (
        ('$', 'USD'),
        ('\u20ac', 'EUR'),  # euro sign
        ('\u00a3', 'GBP'),  # pound sign
        ('\u00a5', 'JPY'),  # yen sign
        ('\u20b9', 'INR'),  # rupee sign
    )
    for symbol, code in symbol_codes:
        if symbol in text:
            return code

    # 'Fr' only counts when it stands alone ("Fr 12.50", "Fr. 12.50"), not
    # as the start of words such as "Friday" or "From".
    if re.search(r'(?<![A-Za-z])Fr(?![A-Za-z])', text):
        return 'CHF'

    # Look for explicit currency codes (3 uppercase letters as a whole word).
    common_currencies = {'USD', 'EUR', 'GBP', 'JPY', 'CHF', 'CAD', 'AUD', 'INR'}
    for candidate in re.findall(r'\b([A-Z]{3})\b', text):
        if candidate in common_currencies:
            return candidate

    return 'EUR'  # Default
def scan_receipt(image_path, lang='eng'):
    """
    Run OCR on a receipt image and return the parsed result.

    Args:
        image_path: Path to the receipt image
        lang: OCR language(s) to use (e.g., 'eng', 'eng+deu')

    Returns:
        The dictionary produced by parse_receipt_data() on success. When OCR
        is unavailable or any step raises, a dictionary with 'error' and
        'message' keys is returned instead of raising.
    """
    if is_ocr_available():
        try:
            # OCR first, then structured parsing of the recognised text.
            recognised_text = extract_text_from_image(image_path, lang=lang)
            return parse_receipt_data(recognised_text)
        except Exception as e:
            logger.error(f"Error scanning receipt {image_path}: {e}")
            failure = {
                'error': str(e),
                'message': 'Failed to scan receipt'
            }
            return failure

    return {
        'error': 'OCR not available',
        'message': 'Please install pytesseract and Pillow: pip install pytesseract pillow'
    }
def get_suggested_expense_data(receipt_data):
    """
    Build expense-form suggestions from scanned receipt data.

    Only fields with a truthy value in receipt_data produce a suggestion, so
    an error dictionary from a failed scan yields an empty result.

    Args:
        receipt_data: Dictionary returned by scan_receipt()

    Returns:
        Dictionary that may contain 'vendor', 'title', 'amount',
        'tax_amount', 'expense_date' and 'currency_code'.
    """
    suggestions = {}

    vendor = receipt_data.get('vendor')
    if vendor:
        suggestions['vendor'] = vendor
        suggestions['title'] = f"Receipt from {vendor}"

    total = receipt_data.get('total')
    if total:
        suggestions['amount'] = float(total)

    tax = receipt_data.get('tax')
    if tax:
        suggestions['tax_amount'] = float(tax)

    receipt_date = receipt_data.get('date')
    if receipt_date:
        suggestions['expense_date'] = receipt_date.isoformat()

    currency = receipt_data.get('currency')
    if currency:
        suggestions['currency_code'] = currency

    return suggestions