# TimeTracker/app/utils/ocr.py

"""
OCR utilities for receipt scanning and text extraction.

This module provides functionality to extract text and data from receipt images
using Tesseract OCR and parse common receipt information.
"""

import os
import re
from decimal import Decimal
from datetime import datetime
import logging

# Logger named after this module.
logger = logging.getLogger(__name__)

# pytesseract and Pillow are optional dependencies. Record whether both could
# be imported so that importing this module never fails because of them.
try:
    import pytesseract
    from PIL import Image
except ImportError:
    TESSERACT_AVAILABLE = False
    logger.warning("pytesseract or PIL not installed. Receipt OCR will not be available.")
else:
    TESSERACT_AVAILABLE = True


def is_ocr_available():
    """
    Tell whether receipt OCR can be used.

    Returns:
        The module-level TESSERACT_AVAILABLE flag
    """
    ocr_ready = TESSERACT_AVAILABLE
    return ocr_ready


def extract_text_from_image(image_path, lang="eng"):
    """
    Extract text from an image using Tesseract OCR.

    Args:
        image_path: Path to the image file
        lang: OCR language (default: 'eng', can be 'eng+deu' for multilingual)

    Returns:
        Extracted text as string

    Raises:
        RuntimeError: If pytesseract or PIL could not be imported
        Exception: Any error raised while opening, converting or recognising
            the image is logged and then re-raised unchanged
    """
    if not TESSERACT_AVAILABLE:
        raise RuntimeError("Tesseract OCR is not available. Install pytesseract and PIL.")

    try:
        # Open the image in a context manager so the underlying file handle is
        # released even when conversion or OCR fails.
        with Image.open(image_path) as source_image:
            # Convert to RGB if necessary
            if source_image.mode != "RGB":
                ocr_input = source_image.convert("RGB")
            else:
                ocr_input = source_image

            # Run OCR while the source image is still open
            return pytesseract.image_to_string(ocr_input, lang=lang)
    except Exception as e:
        logger.error(f"Error extracting text from image {image_path}: {e}")
        raise


def parse_receipt_data(text):
    """
    Parse common receipt information from extracted text.

    Args:
        text: Extracted text from receipt

    Returns:
        Dictionary with the keys 'vendor', 'date', 'total', 'tax', 'subtotal',
        'items', 'currency' and 'raw_text'. Values that could not be found stay
        None; 'items' is always an empty list (this function does not fill
        it) and 'currency' falls back to 'EUR'.
    """
    data = {
        "vendor": None,
        "date": None,
        "total": None,
        "tax": None,
        "subtotal": None,
        "items": [],
        "currency": "EUR",
        "raw_text": text,
    }

    # Vendor: the first of the leading five lines that is longer than three
    # characters once surrounding whitespace is removed.
    leading_lines = (line.strip() for line in text.split("\n")[:5])
    data["vendor"] = next((line for line in leading_lines if len(line) > 3), None)

    # Keywords are matched as substrings of the lower-cased label.
    subtotal_keywords = ("subtotal", "zwischensumme", "sous-total")
    total_keywords = ("total", "gesamt", "suma", "totale")
    tax_keywords = ("tax", "vat", "mwst", "iva", "tva")

    unlabeled_amounts = []
    for amount_info in extract_amounts(text):
        label = amount_info.get("label", "").lower()
        amount = amount_info["amount"]

        # Subtotal must be tested before total: "subtotal" and "sous-total"
        # both contain "total" and would otherwise be stored as the total.
        if any(keyword in label for keyword in subtotal_keywords):
            data["subtotal"] = amount
        elif any(keyword in label for keyword in total_keywords):
            data["total"] = amount
        elif any(keyword in label for keyword in tax_keywords):
            data["tax"] = amount
        else:
            unlabeled_amounts.append(amount)

    # If no labeled total was found, use the largest unlabeled amount. The
    # check is against None so that a labeled total of 0.00 is kept.
    if data["total"] is None and unlabeled_amounts:
        data["total"] = max(unlabeled_amounts)

    # Extract date
    date = extract_date(text)
    if date:
        data["date"] = date

    # Extract currency
    currency = extract_currency(text)
    if currency:
        data["currency"] = currency

    return data


def extract_amounts(text):
    """
    Extract monetary amounts from text.

    An amount is a number with exactly two decimal places, optionally using
    '.' or ',' as thousands separators. Examples: 12.34, 12,34, 1234.56,
    1.234,56, 1,234.56, $12.34, €12,34, 12.34 EUR. The last separator is
    always taken as the decimal point. Numbers that are part of a longer
    dotted/comma sequence (such as the date 31.12.2024) or that have more
    than two decimals are skipped.

    Args:
        text: Text to search, for example the OCR output of a receipt

    Returns:
        List of dictionaries in order of appearance, each with the keys
        'amount' (Decimal), 'label' (letters and whitespace found directly
        before the amount, stripped), 'symbol' (one of $ € £ ¥ or '') and
        'currency' (upper-case three-letter code following the amount on the
        same line, or '')
    """
    # InvalidOperation is defined on the decimal module, not on the Decimal class.
    from decimal import InvalidOperation

    amount_pattern = re.compile(
        r"([A-Za-z\s]*?)\s*"  # optional label in front of the amount
        r"([$€£¥]?)\s*"  # optional currency symbol
        r"(?<!\d)(?<!\d[.,])"  # never start in the middle of a longer number
        r"(\d+(?:[.,]\d{3})*[.,]\d{2})"  # digits, optional groups, two decimals
        r"(?![.,]?\d)"  # never stop in the middle of a longer number
        # Upper-case code on the same line only, so the first letters of the
        # next line's label are not mistaken for a currency.
        r"(?:[ \t]*([A-Z]{3})\b)?"
    )

    amounts = []
    for match in amount_pattern.finditer(text):
        label = (match.group(1) or "").strip()
        symbol = match.group(2) or ""
        amount_str = match.group(3)
        currency = match.group(4) or ""

        # The pattern guarantees that the last separator is followed by the
        # two decimal digits; every earlier separator is a thousands separator.
        decimal_pos = max(amount_str.rfind("."), amount_str.rfind(","))
        whole_part = amount_str[:decimal_pos].replace(".", "").replace(",", "")
        normalized = f"{whole_part}.{amount_str[decimal_pos + 1:]}"

        try:
            amount = Decimal(normalized)
        except (ValueError, InvalidOperation):
            continue

        amounts.append({"amount": amount, "label": label, "symbol": symbol, "currency": currency})

    return amounts


# The look-arounds keep each pattern from matching inside a longer run of
# digits, e.g. reading "24-01-15" out of the ISO date "2024-01-15".
_NUMERIC_DATE_RE = re.compile(r"(?<!\d)(\d{1,2})[./\-](\d{1,2})[./\-](\d{2,4})(?!\d)")
_ISO_DATE_RE = re.compile(r"(?<!\d)(\d{4})[./\-](\d{1,2})[./\-](\d{1,2})(?!\d)")
_MONTH_NAME_DATE_RE = re.compile(
    r"(?<!\d)(\d{1,2})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+(\d{2,4})(?!\d)",
    re.IGNORECASE,
)
_MONTH_NUMBERS = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "oct": 10,
    "nov": 11,
    "dec": 12,
}


def _safe_date(year, month, day):
    """Return the date for the given parts, or None if they do not form a valid date."""
    try:
        return datetime(year, month, day).date()
    except ValueError:
        return None


def extract_date(text):
    """
    Extract date from receipt text.

    Three notations are tried in this order, and within each notation every
    occurrence is tried until one forms a valid calendar date:

    1. DD/MM/YYYY, falling back to MM/DD/YYYY when day-first is invalid
    2. YYYY-MM-DD
    3. DD Month YYYY with an English month name or abbreviation

    '.', '/' and '-' are accepted as separators. Years below 100 are
    interpreted as 20YY.

    Args:
        text: Text to search, for example the OCR output of a receipt

    Returns:
        datetime.date object or None
    """
    # DD/MM/YYYY or MM/DD/YYYY
    for match in _NUMERIC_DATE_RE.finditer(text):
        first, second, year = (int(part) for part in match.groups())
        if year < 100:
            year += 2000
        # Try DD/MM/YYYY first (European format), then MM/DD/YYYY (US format)
        parsed = _safe_date(year, second, first)
        if parsed is None:
            parsed = _safe_date(year, first, second)
        if parsed is not None:
            return parsed

    # YYYY-MM-DD
    for match in _ISO_DATE_RE.finditer(text):
        year, month, day = (int(part) for part in match.groups())
        parsed = _safe_date(year, month, day)
        if parsed is not None:
            return parsed

    # DD Month YYYY
    for match in _MONTH_NAME_DATE_RE.finditer(text):
        day = int(match.group(1))
        month = _MONTH_NUMBERS[match.group(2).lower()]
        year = int(match.group(3))
        if year < 100:
            year += 2000
        parsed = _safe_date(year, month, day)
        if parsed is not None:
            return parsed

    return None


def extract_currency(text):
    """
    Extract currency code from receipt text.

    Detection order: a currency symbol ($, €, £, ¥, ₹), then the Swiss franc
    abbreviation 'Fr' / 'SFr' (optionally followed by a dot) written as a
    separate token, then a known upper-case three-letter code.

    Args:
        text: Text to search, for example the OCR output of a receipt

    Returns:
        3-letter currency code (ISO 4217) or 'EUR' as default
    """
    # Currency symbols and their codes
    currency_symbols = {"$": "USD", "€": "EUR", "£": "GBP", "¥": "JPY", "₹": "INR"}

    # Look for currency symbols
    for symbol, code in currency_symbols.items():
        if symbol in text:
            return code

    # "Fr" is an abbreviation made of ordinary letters, so it must stand on
    # its own; a plain substring test would also fire on "Friday" or "From".
    if re.search(r"(?<![A-Za-z])S?Fr\.?(?![A-Za-z])", text):
        return "CHF"

    # Look for currency codes (3 uppercase letters)
    common_currencies = {"USD", "EUR", "GBP", "JPY", "CHF", "CAD", "AUD", "INR"}
    for candidate in re.findall(r"\b([A-Z]{3})\b", text):
        if candidate in common_currencies:
            return candidate

    return "EUR"  # Default


def scan_receipt(image_path, lang="eng"):
    """
    Run OCR on a receipt image and parse the recognised text.

    Args:
        image_path: Path to the receipt image
        lang: OCR language(s) to use (e.g., 'eng', 'eng+deu')

    Returns:
        Dictionary with extracted receipt data, or a dictionary with the
        keys 'error' and 'message' when OCR is unavailable or scanning fails
    """
    if is_ocr_available():
        try:
            # OCR first, then structured parsing of the recognised text
            recognised_text = extract_text_from_image(image_path, lang=lang)
            return parse_receipt_data(recognised_text)
        except Exception as exc:
            # Failures are reported to the caller as data, never raised
            logger.error(f"Error scanning receipt {image_path}: {exc}")
            return {"error": str(exc), "message": "Failed to scan receipt"}

    return {
        "error": "OCR not available",
        "message": "Please install pytesseract and Pillow: pip install pytesseract pillow",
    }


def get_suggested_expense_data(receipt_data):
    """
    Build expense form suggestions from scanned receipt data.

    Only truthy entries of receipt_data produce a suggestion, so an error
    dictionary from scan_receipt() yields an empty result.

    Args:
        receipt_data: Dictionary returned by scan_receipt()

    Returns:
        Dictionary that may contain 'vendor', 'title', 'amount',
        'tax_amount', 'expense_date' and 'currency_code'
    """
    suggestions = {}

    vendor = receipt_data.get("vendor")
    if vendor:
        suggestions["vendor"] = vendor
        suggestions["title"] = f"Receipt from {vendor}"

    # Monetary values are handed to the form as plain floats
    for source_key, target_key in (("total", "amount"), ("tax", "tax_amount")):
        value = receipt_data.get(source_key)
        if value:
            suggestions[target_key] = float(value)

    receipt_date = receipt_data.get("date")
    if receipt_date:
        suggestions["expense_date"] = receipt_date.isoformat()

    currency = receipt_data.get("currency")
    if currency:
        suggestions["currency_code"] = currency

    return suggestions