make789 committed
Commit 7f46686 · verified · 1 Parent(s): 0e7bad9

Upload ocr_service.py

Files changed (1)
  1. ocr_service.py +561 -101
ocr_service.py CHANGED
@@ -2,19 +2,59 @@ import asyncio
 import json
 import math
 import os
+import platform
 import secrets
+import tempfile
 from collections import defaultdict, deque
+from pathlib import Path
 from time import monotonic
-from typing import Any, Deque, DefaultDict
+from typing import Any, Deque, DefaultDict, Optional

-import cv2
 import numpy as np
 from fastapi import Depends, FastAPI, Form, HTTPException, Request, UploadFile, status
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import APIKeyHeader
-from paddleocr import PaddleOCR
-from paddleocr.ppstructure import PPStructure
+from PIL import Image

-from .llm_splitter import call_llm_splitter
+# Lazy import DeepSeek-OCR dependencies (only load when needed)
+_torch = None
+_transformers = None
+
+def _get_torch():
+    global _torch
+    if _torch is None:
+        try:
+            import torch
+            _torch = torch
+        except ImportError:
+            raise RuntimeError(
+                "torch is not installed. Install with: pip install torch"
+            )
+    return _torch
+
+def _get_transformers():
+    global _transformers
+    if _transformers is None:
+        try:
+            from transformers import AutoModel, AutoTokenizer
+            _transformers = (AutoModel, AutoTokenizer)
+        except ImportError:
+            raise RuntimeError(
+                "transformers is not installed. Install with: pip install transformers"
+            )
+    return _transformers
+
+# Import llm_splitter (works as module or direct import)
+try:
+    from llm_splitter import call_llm_splitter
+except ImportError:
+    # Fallback for relative import
+    try:
+        from .llm_splitter import call_llm_splitter
+    except ImportError:
+        # If llm_splitter doesn't exist, define a stub
+        async def call_llm_splitter(*args, **kwargs):
+            raise NotImplementedError("llm_splitter not available")

 ALLOWED_CONTENT_TYPES = {
     "image/jpeg",
@@ -24,11 +64,9 @@ ALLOWED_CONTENT_TYPES = {
 MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", str(5 * 1024 * 1024)))
 RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "30"))
 RATE_LIMIT_WINDOW_SECONDS = float(os.getenv("RATE_LIMIT_WINDOW_SECONDS", "60"))
-SERVICE_API_KEY = os.getenv("SERVICE_API_KEY")
-if not SERVICE_API_KEY:
-    raise RuntimeError(
-        "SERVICE_API_KEY environment variable must be set before starting the OCR service"
-    )
+# Allow API key to be optional for development (security risk in production!)
+SERVICE_API_KEY = os.getenv("SERVICE_API_KEY", "dev-key-change-in-production")
+REQUIRE_API_KEY = os.getenv("REQUIRE_API_KEY", "false").lower() == "true"
 API_KEY_HEADER_NAME = "X-API-Key"
 MAX_CHILD_LINES = 500
 MAX_JSON_DEPTH = 4
@@ -36,9 +74,350 @@ MAX_JSON_STRING_LENGTH = 512
 MAX_JSON_DICT_KEYS = 50
 MAX_JSON_LIST_ITEMS = 100

-app = FastAPI()
-ocr = PaddleOCR(use_angle_cls=True, lang="en")
-pp = PPStructure(show_log=False, layout=True)
+# DeepSeek-OCR Model Configuration - Maximum Quality Settings for CPU/Spaces
+MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
+# PIN MODEL REVISION to prevent auto-updates that break compatibility
+MODEL_REVISION = os.getenv("DEEPSEEK_MODEL_REVISION", "2c968b433af61a059311cbf8997765023806a24d")
+
+# Detect Apple Silicon (M1/M2/M3/M4) - use MPS if available, otherwise CPU
+IS_APPLE_SILICON = platform.machine() == "arm64"
+USE_GPU = os.getenv("USE_GPU", "true").lower() == "true" and not IS_APPLE_SILICON
+USE_MPS = IS_APPLE_SILICON
+# Quality settings - Gundam preset recommended for CPU/Spaces
+BASE_SIZE = int(os.getenv("DEEPSEEK_BASE_SIZE", "1024"))
+IMAGE_SIZE = int(os.getenv("DEEPSEEK_IMAGE_SIZE", "640"))
+CROP_MODE = os.getenv("DEEPSEEK_CROP_MODE", "true").lower() == "true"
+
+app = FastAPI(
+    title="DeepSeek-OCR API",
+    description="OCR Service using DeepSeek-OCR for maximum quality text extraction",
+    version="1.0.0"
+)
+
+# Add root endpoint for health check (compatible with HuggingFace Spaces)
+@app.get("/")
+async def root(__sign: Optional[str] = None):
+    """
+    Root endpoint - compatible with HuggingFace Spaces authentication.
+    The __sign parameter is used by HuggingFace's proxy but can be ignored.
+    """
+    return {
+        "service": "DeepSeek-OCR API",
+        "status": "running",
+        "version": "1.0.0",
+        "endpoints": {
+            "docs": "/docs",
+            "ocr": "/ocr",
+            "split": "/split"
+        }
+    }
+
+# Add CORS middleware to allow frontend requests
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with specific origins
+    allow_credentials=True,
+    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
+    allow_headers=["*"],
+    expose_headers=["*"],
+)
+
+# Initialize DeepSeek-OCR model
+_ocr_model = None
+_ocr_tokenizer = None
+_model_lock = asyncio.Lock()
+
+
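+# The source patching below assumes the pinned snapshot still ships
+# modeling_deepseekocr.py / deepencoder.py and that its CUDA/BF16 call sites
+# match the literal strings and regexes used here; a new revision may not.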
+def _download_and_patch_model_locally(model_id: str, revision: str) -> str:
+    """
+    Download DeepSeek-OCR to a local dir, patch for CPU:
+    - remove hardcoded .cuda()
+    - force float32 (strip .bfloat16() / .to(torch.bfloat16))
+    - disable torch.autocast("cuda", ...) blocks
+
+    Return local path for from_pretrained(...).
+
+    Per official HuggingFace discussions:
+    - https://huggingface.co/deepseek-ai/DeepSeek-OCR/discussions/21 (CPU inference)
+    - https://huggingface.co/deepseek-ai/DeepSeek-OCR/discussions/20 (BF16/FP32 issues)
+    """
+    import re
+
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError:
+        raise RuntimeError("huggingface_hub is required. Install with: pip install huggingface_hub")
+
+    print(f" 📥 Downloading model (revision {revision[:8]})...")
+    local_dir = snapshot_download(model_id, revision=revision)
+    print(f" ✅ Downloaded to: {local_dir}")
+    local_dir = Path(local_dir)
+
+    def ensure_imports(src: str) -> str:
+        """Ensure torch and contextlib are imported"""
+        if "import torch" not in src and "from torch" not in src:
+            src = "import torch\n" + src
+        if "import contextlib" not in src:
+            # Add after torch import (first occurrence only, so substrings
+            # such as "import torchvision" are not rewritten repeatedly)
+            if "import torch" in src:
+                src = src.replace("import torch", "import torch\nimport contextlib", 1)
+            elif "from torch" in src:
+                src = src.replace("from torch", "import contextlib\nfrom torch", 1)
+            else:
+                # Add before the first non-import line if there is no torch import
+                lines = src.split('\n')
+                for i, line in enumerate(lines):
+                    if line.startswith("import ") or line.startswith("from "):
+                        continue
+                    else:
+                        lines.insert(i, "import contextlib")
+                        break
+                src = '\n'.join(lines)
+        return src
+
+    def patch_text(src: str) -> str:
+        """Patch text to remove CUDA/BF16 and force FP32"""
+        # A) Remove hardcoded CUDA device moves
+        src = src.replace(".unsqueeze(-1).cuda()", ".unsqueeze(-1)")
+        src = src.replace("input_ids.unsqueeze(0).cuda()", "input_ids.unsqueeze(0)")
+        src = src.replace("(images_crop.cuda(), images_ori.cuda())", "(images_crop, images_ori)")
+        src = src.replace("images_seq_mask = images_seq_mask.unsqueeze(0).cuda()",
+                          "images_seq_mask = images_seq_mask.unsqueeze(0)")
+        src = src.replace("input_ids.unsqueeze(0).cuda().shape[1]",
+                          "input_ids.unsqueeze(0).shape[1]")
+
+        # B) Force FP32 (strip BF16 casts)
+        src = re.sub(r"\.bfloat16\(\)", ".float()", src)
+        src = re.sub(r"\.to\(\s*torch\.bfloat16\s*\)", ".to(torch.float32)", src)
+        src = re.sub(r"\.to\(\s*dtype\s*=\s*torch\.bfloat16\s*\)", ".to(dtype=torch.float32)", src)
+
+        # C) Disable CUDA autocast blocks
+        src = ensure_imports(src)
+        # Match both torch.autocast("cuda", ...) and torch.autocast(device_type="cuda", ...)
+        src = re.sub(r'torch\.autocast\(\s*["\']cuda["\']\s*,[^)]*\)',
+                     "contextlib.nullcontext()", src)
+        src = re.sub(r'torch\.autocast\(\s*device_type\s*=\s*["\']cuda["\'][^)]*\)',
+                     "contextlib.nullcontext()", src)
+
+        return src
+
+    # Patch both files where they may appear
+    targets = list(local_dir.rglob("modeling_deepseekocr.py")) + \
+              list(local_dir.rglob("deepencoder.py"))
+
+    if not targets:
+        raise RuntimeError("Could not locate DeepSeek-OCR source files to patch")
+
+    for p in targets:
+        print(f" 🔍 Found file: {p.name}")
+        txt = p.read_text(encoding="utf-8")
+        new = patch_text(txt)
+
+        if new != txt:
+            p.write_text(new, encoding="utf-8")
+            print(f" ✅ Patched for CPU+FP32: {p.name}")
+        else:
+            print(f" ℹ️ Already CPU-patched: {p.name}")
+
+    return str(local_dir)
+
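+# Double-checked locking: the unlocked check keeps the fast path cheap once
+# the model is cached, and the re-check inside the lock guarantees only one
+# coroutine runs the expensive download/patch/load sequence.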
+async def get_ocr_model():
+    """Lazy load DeepSeek-OCR model with compatibility patching"""
+    global _ocr_model, _ocr_tokenizer
+    if _ocr_model is None or _ocr_tokenizer is None:
+        async with _model_lock:
+            if _ocr_model is None or _ocr_tokenizer is None:
+                # Lazy import dependencies
+                AutoModel, AutoTokenizer = _get_transformers()
+                torch = _get_torch()
+
+                print(f"Loading DeepSeek-OCR model (MAXIMUM QUALITY): {MODEL_NAME}")
+                print(f" - Base size: {BASE_SIZE}")
+                print(f" - Image size: {IMAGE_SIZE}")
+                print(f" - Crop mode: {CROP_MODE}")
+
+                # 1) Download & patch; 2) Load from local dir so our patch is used
+                local_dir = _download_and_patch_model_locally(MODEL_NAME, MODEL_REVISION)
+
+                print(" - Loading tokenizer (local, pinned revision)...")
+                _ocr_tokenizer = AutoTokenizer.from_pretrained(
+                    local_dir,
+                    trust_remote_code=True,
+                    local_files_only=True  # Load from local patched directory
+                )
+                print(" - Tokenizer loaded successfully")
+
+                # Fix pad_token_id warning
+                if _ocr_tokenizer.pad_token_id is None:
+                    _ocr_tokenizer.pad_token = _ocr_tokenizer.eos_token or _ocr_tokenizer.unk_token
+
+                # Load model with compatibility settings
+                load_kwargs = {
+                    "trust_remote_code": True,
+                    "use_safetensors": True,
+                    "attn_implementation": "eager",  # SDPA not supported by this arch
+                }
+
+                # Load from patched local directory
+                _ocr_model = AutoModel.from_pretrained(
+                    local_dir,
+                    local_files_only=True,  # Load from local patched directory
+                    **load_kwargs
+                ).eval()
+
+                # Handle device placement (force FP32 on CPU/MPS)
+                if USE_MPS and torch.backends.mps.is_available():
+                    _ocr_model = _ocr_model.to("mps").to(dtype=torch.float32)
+                    print(" - DeepSeek-OCR on MPS (float32)")
+                elif USE_GPU and torch.cuda.is_available():
+                    _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
+                    print(" - DeepSeek-OCR on CUDA (bf16)")
+                else:
+                    _ocr_model = _ocr_model.to(dtype=torch.float32)
+                    print(" - DeepSeek-OCR on CPU (float32)")
+    return _ocr_model, _ocr_tokenizer
+
+
+async def run_deepseek_ocr(
+    image_path: str,
+    prompt: str = "<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
+    use_grounding: bool = True
+) -> dict:
+    """
+    Run DeepSeek-OCR on an image file with advanced grounding support.
+    """
+    model, tokenizer = await get_ocr_model()
+
+    output_path = tempfile.mkdtemp()
+
+    try:
+        # OCR quality settings - Gundam preset recommended for CPU/Spaces
+        torch = _get_torch()
+        if USE_GPU and torch.cuda.is_available():
+            # GPU: Use maximum quality (Large preset)
+            actual_base_size = BASE_SIZE
+            actual_image_size = IMAGE_SIZE
+        else:
+            # CPU/Spaces: Use Gundam preset (recommended for CPU to avoid OOM)
+            actual_base_size = 1024
+            actual_image_size = 640
+            print(f" - Using CPU-optimized quality: base_size={actual_base_size}, image_size={actual_image_size}")
+
+        # Use torch.inference_mode() to reduce overhead on CPU
+        with torch.inference_mode():
+            result = model.infer(
+                tokenizer,
+                prompt=prompt,
+                image_file=image_path,
+                output_path=output_path,
+                base_size=actual_base_size,
+                image_size=actual_image_size,
+                crop_mode=CROP_MODE,
+                save_results=False,
+                test_compress=False,
+            )
+
+        # Parse result - DeepSeek-OCR returns structured markdown output
+        ocr_text = result if isinstance(result, str) else str(result)
+
+        # Extract structured lines from markdown
+        lines = _parse_deepseek_output(ocr_text)
+
+        return {
+            "text": ocr_text,
+            "lines": lines,
+        }
+    except Exception as e:
+        print(f"DeepSeek-OCR error: {e}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"OCR processing failed: {str(e)}",
+        )
+    finally:
+        # Cleanup temp directory
+        try:
+            import shutil
+            if os.path.exists(output_path):
+                shutil.rmtree(output_path)
+        except Exception:
+            pass
+
+
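+# The bboxes below are synthetic layout estimates (fixed line height, ~8 px
+# per character): DeepSeek-OCR's markdown output carries no pixel coordinates,
+# so callers should treat them as ordering hints, not measurements.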
+def _parse_deepseek_output(ocr_text: str) -> list:
+    """
+    Extract structured lines from DeepSeek-OCR markdown output.
+    Preserves layout, handles tables, lists, and structured content.
+    """
+    lines = []
+    text_lines = ocr_text.split('\n')
+
+    y_offset = 0
+    line_height = 24  # Estimated line height in pixels
+
+    for line in text_lines:
+        stripped = line.strip()
+        if not stripped:
+            # Empty lines still take space
+            y_offset += line_height // 2
+            continue
+
+        # Remove markdown formatting but preserve text structure
+        # Handle markdown tables (| separated)
+        if '|' in stripped and stripped.count('|') >= 2:
+            # Table row - split by | and process each cell
+            cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
+            for cell_idx, cell in enumerate(cells):
+                if cell:
+                    lines.append({
+                        "bbox": [
+                            cell_idx * 200,  # Approximate x position
+                            y_offset,
+                            (cell_idx + 1) * 200,
+                            y_offset + line_height
+                        ],
+                        "text": cell,
+                        "conf": 0.95,
+                    })
+            y_offset += line_height
+        # Handle markdown lists (-, *, 1., etc.)
+        elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
+            # List item - remove list marker
+            text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
+            if text:
+                lines.append({
+                    "bbox": [40, y_offset, 1000, y_offset + line_height],
+                    "text": text,
+                    "conf": 0.95,
+                })
+            y_offset += line_height
+        # Handle headers (# ## ###)
+        elif stripped.startswith('#'):
+            header_level = len(stripped) - len(stripped.lstrip('#'))
+            text = stripped.lstrip('#').strip()
+            # Headers are typically larger (computed unconditionally so the
+            # y_offset advance below is always defined)
+            header_height = line_height + (header_level * 4)
+            if text:
+                lines.append({
+                    "bbox": [0, y_offset, 1000, y_offset + header_height],
+                    "text": text,
+                    "conf": 0.95,
+                })
+            y_offset += header_height
+        # Regular text line
+        else:
+            # Estimate width based on text length (rough approximation)
+            estimated_width = min(len(stripped) * 8, 1000)  # ~8px per char average
+            lines.append({
+                "bbox": [0, y_offset, estimated_width, y_offset + line_height],
+                "text": stripped,
+                "conf": 0.95,
+            })
+            y_offset += line_height
+
+    return lines

 api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False)
 _rate_limit_lock = asyncio.Lock()
@@ -46,11 +425,22 @@ _request_log: DefaultDict[str, Deque[float]] = defaultdict(deque)


 def ensure_upload_is_safe(file: UploadFile) -> None:
+    # Check content type from header
     content_type = (file.content_type or "").lower()
-    if content_type not in ALLOWED_CONTENT_TYPES:
+
+    # Also check file extension as fallback (browsers sometimes send application/octet-stream)
+    filename = (file.filename or "").lower()
+    extension = filename.split('.')[-1] if '.' in filename else ""
+    allowed_extensions = {'jpg', 'jpeg', 'png', 'webp'}
+
+    # Allow if content type matches OR extension matches
+    content_type_valid = content_type in ALLOWED_CONTENT_TYPES
+    extension_valid = extension in allowed_extensions
+
+    if not content_type_valid and not extension_valid:
         raise HTTPException(
             status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
-            detail="Unsupported file type",
+            detail=f"Unsupported file type. Content-Type: {content_type}, Extension: {extension}. Allowed: {', '.join(ALLOWED_CONTENT_TYPES)}",
         )

     file.file.seek(0, os.SEEK_END)
@@ -63,7 +453,11 @@ def ensure_upload_is_safe(file: UploadFile) -> None:
     )


-async def verify_api_key(api_key: str | None = Depends(api_key_header)) -> str:
+async def verify_api_key(api_key: Optional[str] = Depends(api_key_header)) -> str:
+    # Skip API key verification in development mode
+    if not REQUIRE_API_KEY:
+        return api_key or SERVICE_API_KEY
+    # Enforce API key in production
     if not api_key or not secrets.compare_digest(api_key, SERVICE_API_KEY):
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
@@ -92,25 +486,35 @@ async def enforce_rate_limit(


 def _decode_image(file: UploadFile):
+    """Decode uploaded image file to PIL Image"""
     data = file.file.read()
     if not data:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Uploaded file is empty",
         )
-    img = cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)
-    if img is None:
+
+    # Save to temp file for DeepSeek-OCR
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
+        tmp_file.write(data)
+        tmp_path = tmp_file.name
+
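+    # The temp file is intentionally left on disk: its path is returned to the
+    # caller, and the endpoint handlers unlink it in their finally blocks.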
+    try:
+        img = Image.open(tmp_path).convert("RGB")
+        return img, tmp_path
+    except Exception as e:
+        os.unlink(tmp_path)
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail="Unable to decode image",
+            detail=f"Unable to decode image: {str(e)}",
         )
-    return img


-def load_img(file: UploadFile):
+async def load_img(file: UploadFile):
     ensure_upload_is_safe(file)
     file.file.seek(0)
-    return _decode_image(file)
+    img, img_path = _decode_image(file)
+    return img, img_path


 def _parse_json_field(name: str, raw: str, expected_type: type) -> Any:
@@ -274,20 +678,65 @@ def _parse_rules(raw: str) -> list:
     return rules


+@app.options("/ocr")
+async def ocr_options():
+    """Handle CORS preflight requests (required by HuggingFace Spaces)"""
+    return {"message": "OK"}
+
+@app.options("/api/predict")
+async def predict_options():
+    """Handle CORS preflight for HuggingFace Spaces auto-routing"""
+    return {"message": "OK"}
+
 @app.post("/ocr")
+@app.post("/api/predict")  # HuggingFace Spaces may auto-route POST requests here
 async def ocr_page(
     file: UploadFile,
     _: None = Depends(enforce_rate_limit),
 ):
-    img = load_img(file)
-    res = ocr.ocr(img, cls=True)
-    lines = []
-    full = []
-    for line in (res[0] or []):
-        (x1, y1, x2, y2), (txt, conf) = line
-        lines.append({"bbox": [x1, y1, x2, y2], "text": txt, "conf": float(conf)})
-        full.append(txt)
-    return {"text": "\n".join(full), "lines": lines}
+    """OCR endpoint using DeepSeek-OCR"""
+    img, img_path = await load_img(file)
+    try:
+        # Save PIL image to temporary file for DeepSeek-OCR
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
+            img.save(tmp_file, 'JPEG', quality=95)
+            tmp_img_path = tmp_file.name
+
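+        # Re-encoding to JPEG at quality 95 is slightly lossy for PNG/WebP
+        # uploads; fine for OCR, but not a byte-exact copy of the original.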
+        try:
+            # Use grounding prompt for better structure extraction
+            result = await run_deepseek_ocr(
+                tmp_img_path,
+                prompt="<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
+                use_grounding=True
+            )
+            return result
+        except Exception as e:
+            # Log the error but don't crash - return a helpful error message
+            error_msg = str(e)
+            print(f"OCR processing error: {error_msg}")
+
+            # Check if it's a model loading issue
+            if "matplotlib" in error_msg or "torchvision" in error_msg or "ImportError" in error_msg:
+                raise HTTPException(
+                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                    detail=f"OCR model dependencies missing: {error_msg}. Please install required packages."
+                )
+            elif "Connection" in error_msg or "timeout" in error_msg.lower():
+                raise HTTPException(
+                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                    detail=f"OCR service temporarily unavailable: {error_msg}"
+                )
+            else:
+                raise HTTPException(
+                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                    detail=f"OCR processing failed: {error_msg}"
+                )
+        finally:
+            if os.path.exists(tmp_img_path):
+                os.unlink(tmp_img_path)
+    finally:
+        if os.path.exists(img_path):
+            os.unlink(img_path)


 @app.post("/split")
@@ -300,74 +749,85 @@ async def split(
     rules: str = Form("[]"),
     _: None = Depends(enforce_rate_limit),
 ):
-    img = load_img(file)
-    height, width = img.shape[:2]
-    parent_box = _parse_parent_bbox(parent_bbox, width, height)
-    x1, y1, x2, y2 = parent_box
-    x1_i, y1_i, x2_i, y2_i = [int(round(v)) for v in parent_box]
-    crop = img[y1_i:y2_i, x1_i:x2_i]
-    if crop.size == 0:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail="parent_bbox region is empty",
-        )
-    blocks = pp(crop)
-
-    child_lines = []
-    for b in blocks:
-        bx1, by1, bx2, by2 = b["bbox"]
-        sub = crop[by1:by2, bx1:bx2]
-        det = ocr.ocr(sub, cls=True)
-        for ln in (det[0] or []):
-            (lx1, ly1, lx2, ly2), (txt, conf) = ln
-            child_lines.append(
-                {
-                    "bbox": [lx1 + bx1 + x1, ly1 + by1 + y1, lx2 + bx1 + x1, ly2 + by1 + y1],
-                    "text": txt,
-                    "conf": float(conf),
-                    "blockType": b.get("type", "text"),
-                }
-            )
-            if len(child_lines) >= MAX_CHILD_LINES:
-                break
-        if len(child_lines) >= MAX_CHILD_LINES:
-            break
-
-    sanitized_splitter = _sanitize_label("splitter", splitter)
-    sanitized_schema = _sanitize_label("schemaType", schemaType)
-    parsed_settings = _parse_settings(settings)
-    parsed_rules = _parse_rules(rules)
-
-    raw_text = "\n".join([l["text"] for l in child_lines])
-    text_truncated = False
-    if len(raw_text) > 5000:
-        raw_text = raw_text[:5000]
-        text_truncated = True
-
-    llm_input = {
-        "schemaType": sanitized_schema,
-        "splitter": sanitized_splitter,
-        "page": {"width": width, "height": height},
-        "parentBox": parent_box,
-        "rawText": raw_text,
-        "ocrLines": child_lines,
-        "rawTextTruncated": text_truncated,
-        "ocrLinesTruncated": len(child_lines) >= MAX_CHILD_LINES,
-        "settings": parsed_settings,
-        "rules": parsed_rules,
-    }
-
+    """Split endpoint - uses DeepSeek-OCR for region extraction"""
+    img, img_path = await load_img(file)
     try:
-        llm_result = await call_llm_splitter(llm_input)
-    except ValueError as exc:
-        raise HTTPException(
-            status_code=status.HTTP_502_BAD_GATEWAY,
-            detail=str(exc),
-        ) from exc
-    return llm_result
-
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host="0.0.0.0", port=8080)
+        width, height = img.size
+
+        # Save image for DeepSeek-OCR
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
+            img.save(tmp_file, 'JPEG', quality=95)
+            tmp_img_path = tmp_file.name
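+        # Note: this full-page JPEG is not passed to the OCR call below (only
+        # the crop at crop_path is OCR'd); it is only removed in the finally block.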
+
+        try:
+            parent_box = _parse_parent_bbox(parent_bbox, width, height)
+            x1, y1, x2, y2 = parent_box
+
+            # Crop image to parent bbox
+            crop_img = img.crop((int(x1), int(y1), int(x2), int(y2)))
+            crop_path = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg').name
+            crop_img.save(crop_path, 'JPEG', quality=95)
+
+            try:
+                # Use DeepSeek-OCR with grounding prompt for better structured extraction
+                prompt = "<image>\n<|grounding|>Convert the document region to markdown with preserved layout."
+                ocr_result = await run_deepseek_ocr(crop_path, prompt=prompt, use_grounding=True)
+
+                # Parse OCR result to extract lines
+                child_lines = ocr_result.get("lines", [])
+
+                # Adjust bboxes to parent coordinate space
+                for line in child_lines:
+                    bbox = line["bbox"]
+                    line["bbox"] = [
+                        bbox[0] + x1,
+                        bbox[1] + y1,
+                        bbox[2] + x1,
+                        bbox[3] + y1,
+                    ]
+                    line["blockType"] = "text"
+
+                if len(child_lines) > MAX_CHILD_LINES:
+                    child_lines = child_lines[:MAX_CHILD_LINES]
+
+                sanitized_splitter = _sanitize_label("splitter", splitter)
+                sanitized_schema = _sanitize_label("schemaType", schemaType)
+                parsed_settings = _parse_settings(settings)
+                parsed_rules = _parse_rules(rules)
+
+                raw_text = "\n".join([l["text"] for l in child_lines])
+                text_truncated = False
+                if len(raw_text) > 5000:
+                    raw_text = raw_text[:5000]
+                    text_truncated = True
+
+                llm_input = {
+                    "schemaType": sanitized_schema,
+                    "splitter": sanitized_splitter,
+                    "page": {"width": width, "height": height},
+                    "parentBox": parent_box,
+                    "rawText": raw_text,
+                    "ocrLines": child_lines,
+                    "rawTextTruncated": text_truncated,
+                    "ocrLinesTruncated": len(child_lines) >= MAX_CHILD_LINES,
+                    "settings": parsed_settings,
+                    "rules": parsed_rules,
+                }
+
+                try:
+                    llm_result = await call_llm_splitter(llm_input)
+                except ValueError as exc:
+                    raise HTTPException(
+                        status_code=status.HTTP_502_BAD_GATEWAY,
+                        detail=str(exc),
+                    ) from exc
+                return llm_result
+            finally:
+                if os.path.exists(crop_path):
+                    os.unlink(crop_path)
+        finally:
+            if os.path.exists(tmp_img_path):
+                os.unlink(tmp_img_path)
+    finally:
+        if os.path.exists(img_path):
+            os.unlink(img_path)