make789 committed on
Commit 0e7bad9 · verified · 1 Parent(s): 0d047c5

Upload ocr_service.py

Files changed (1)
  1. ocr_service.py +96 -552
ocr_service.py CHANGED
@@ -2,59 +2,19 @@ import asyncio
 import json
 import math
 import os
-import platform
 import secrets
-import tempfile
 from collections import defaultdict, deque
 from time import monotonic
-from typing import Any, Deque, DefaultDict, Optional
-from pathlib import Path
+from typing import Any, Deque, DefaultDict
 
+import cv2
 import numpy as np
 from fastapi import Depends, FastAPI, Form, HTTPException, Request, UploadFile, status
-from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import APIKeyHeader
-from PIL import Image
+from paddleocr import PaddleOCR
+from paddleocr.ppstructure import PPStructure
 
-# Lazy import DeepSeek-OCR dependencies (only load when needed)
-_torch = None
-_transformers = None
-
-def _get_torch():
-    global _torch
-    if _torch is None:
-        try:
-            import torch
-            _torch = torch
-        except ImportError:
-            raise RuntimeError(
-                "torch is not installed. Install with: pip install torch"
-            )
-    return _torch
-
-def _get_transformers():
-    global _transformers
-    if _transformers is None:
-        try:
-            from transformers import AutoModel, AutoTokenizer
-            _transformers = (AutoModel, AutoTokenizer)
-        except ImportError:
-            raise RuntimeError(
-                "transformers is not installed. Install with: pip install transformers"
-            )
-    return _transformers
-
-# Import llm_splitter (works as module or direct import)
-try:
-    from llm_splitter import call_llm_splitter
-except ImportError:
-    # Fallback for relative import
-    try:
-        from .llm_splitter import call_llm_splitter
-    except ImportError:
-        # If llm_splitter doesn't exist, define a stub
-        async def call_llm_splitter(*args, **kwargs):
-            raise NotImplementedError("llm_splitter not available")
+from .llm_splitter import call_llm_splitter
 
 ALLOWED_CONTENT_TYPES = {
     "image/jpeg",
@@ -64,9 +24,11 @@ ALLOWED_CONTENT_TYPES = {
 MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_BYTES", str(5 * 1024 * 1024)))
 RATE_LIMIT_REQUESTS = int(os.getenv("RATE_LIMIT_REQUESTS", "30"))
 RATE_LIMIT_WINDOW_SECONDS = float(os.getenv("RATE_LIMIT_WINDOW_SECONDS", "60"))
-# Allow API key to be optional for development (security risk in production!)
-SERVICE_API_KEY = os.getenv("SERVICE_API_KEY", "dev-key-change-in-production")
-REQUIRE_API_KEY = os.getenv("REQUIRE_API_KEY", "false").lower() == "true"
+SERVICE_API_KEY = os.getenv("SERVICE_API_KEY")
+if not SERVICE_API_KEY:
+    raise RuntimeError(
+        "SERVICE_API_KEY environment variable must be set before starting the OCR service"
+    )
 API_KEY_HEADER_NAME = "X-API-Key"
 MAX_CHILD_LINES = 500
 MAX_JSON_DEPTH = 4
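
This hunk makes the key mandatory: the service now refuses to start without SERVICE_API_KEY instead of falling back to a development default, so every caller has to send the real key in the X-API-Key header. A minimal client sketch (the URL, key value, and sample.jpg are illustrative assumptions, not part of the commit):

    import requests

    headers = {"X-API-Key": "my-secret-key"}  # must match the server's SERVICE_API_KEY
    with open("sample.jpg", "rb") as f:
        resp = requests.post(
            "http://localhost:8000/ocr",
            headers=headers,
            files={"file": ("sample.jpg", f, "image/jpeg")},
        )
    resp.raise_for_status()
    print(resp.json()["text"])
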
@@ -74,340 +36,9 @@ MAX_JSON_STRING_LENGTH = 512
 MAX_JSON_DICT_KEYS = 50
 MAX_JSON_LIST_ITEMS = 100
 
-# DeepSeek-OCR Model Configuration - Maximum Quality Settings for M4 Mac (Apple Silicon)
-MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
-# PIN MODEL REVISION to prevent auto-updates that break compatibility
-# Use a commit hash from https://huggingface.co/deepseek-ai/DeepSeek-OCR/tree/main
-# This prevents "A new version of ... was downloaded" warnings and keeps code stable
-MODEL_REVISION = os.getenv("DEEPSEEK_MODEL_REVISION", "2c968b433af61a059311cbf8997765023806a24d")  # Latest stable commit
-
-# Detect Apple Silicon (M1/M2/M3/M4) - use MPS if available, otherwise CPU
-IS_APPLE_SILICON = platform.machine() == "arm64"
-USE_GPU = os.getenv("USE_GPU", "true").lower() == "true" and not IS_APPLE_SILICON  # M4 uses MPS, not CUDA
-USE_MPS = IS_APPLE_SILICON  # Use Metal Performance Shaders on Apple Silicon
-# Maximum quality settings (larger = better, slower = more accurate)
-BASE_SIZE = int(os.getenv("DEEPSEEK_BASE_SIZE", "1280"))  # Maximum quality: 1280 (not light!)
-IMAGE_SIZE = int(os.getenv("DEEPSEEK_IMAGE_SIZE", "1280"))  # Maximum quality: 1280 (not light!)
-CROP_MODE = os.getenv("DEEPSEEK_CROP_MODE", "true").lower() == "true"  # True for best accuracy
-
-app = FastAPI(
-    title="DeepSeek-OCR API",
-    description="OCR Service using DeepSeek-OCR for maximum quality text extraction",
-    version="1.0.0"
-)
-
-# Add root endpoint for health check
-@app.get("/")
-async def root(__sign: Optional[str] = None):
-    """
-    Root endpoint - compatible with HuggingFace Spaces authentication.
-    The __sign parameter is used by HuggingFace's proxy but can be ignored.
-    """
-    return {
-        "service": "DeepSeek-OCR API",
-        "status": "running",
-        "version": "1.0.0",
-        "endpoints": {
-            "docs": "/docs",
-            "ocr": "/ocr",
-            "split": "/split"
-        }
-    }
-
-# Add CORS middleware to allow frontend requests
-# Configured for HuggingFace Spaces which requires explicit CORS handling
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # In production, replace with specific origins
-    allow_credentials=True,
-    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
-    allow_headers=["*"],
-    expose_headers=["*"],
-)
-
-# Initialize DeepSeek-OCR model
-_ocr_model = None
-_ocr_tokenizer = None
-_model_lock = asyncio.Lock()
-
-
-def _download_and_patch_model_locally(model_id: str, revision: str) -> str:
-    """
-    Download DeepSeek-OCR to a local dir, patch out hardcoded .cuda() in infer(),
-    and return the local path for from_pretrained(...).
-
-    This ensures we control the exact file that gets imported (patched version).
-
-    Per official HuggingFace discussion: https://huggingface.co/deepseek-ai/DeepSeek-OCR/discussions/21
-    """
-    from pathlib import Path
-    import re
-
-    try:
-        from huggingface_hub import snapshot_download
-    except ImportError:
-        raise RuntimeError("huggingface_hub is required. Install with: pip install huggingface_hub")
-
-    print(f" 📥 Downloading model (revision {revision[:8]})...")
-    local_dir = snapshot_download(model_id, revision=revision)
-    print(f" ✅ Downloaded to: {local_dir}")
-
-    # Find modeling_deepseekocr.py (may be in subdirectories with trust_remote_code)
-    target = Path(local_dir) / "modeling_deepseekocr.py"
-    if not target.exists():
-        # trust_remote_code sometimes stores under modules; fall back to a glob
-        hits = list(Path(local_dir).rglob("modeling_deepseekocr.py"))
-        target = hits[0] if hits else None
-
-    if not target or not target.exists():
-        raise RuntimeError("Could not locate modeling_deepseekocr.py to patch")
-
-    print(f" 🔍 Found model file: {target}")
-
-    # Read file
-    src = target.read_text(encoding='utf-8')
-
-    # Check if already patched
-    if ".unsqueeze(-1).cuda()" not in src and "input_ids.unsqueeze(0).cuda()" not in src:
-        print(f" ✅ Model already patched for CPU")
-        return local_dir
-
-    # CPU patches from the HF discussion (remove hardcoded .cuda() in infer)
-    # https://huggingface.co/deepseek-ai/DeepSeek-OCR/discussions/21
-    repl = [
-        (r"\.unsqueeze\(-1\)\.cuda\(\)", ".unsqueeze(-1)"),
-        (r"input_ids\.unsqueeze\(0\)\.cuda\(\)", "input_ids.unsqueeze(0)"),
-        (r"\(images_crop\.cuda\(\), images_ori\.cuda\(\)\)", "(images_crop, images_ori)"),
-        (r"images_seq_mask = images_seq_mask\.unsqueeze\(0\)\.cuda\(\)",
-         "images_seq_mask = images_seq_mask.unsqueeze(0)"),
-        (r"input_ids\.unsqueeze\(0\)\.cuda\(\)\.shape\[1\]",
-         "input_ids.unsqueeze(0).shape[1]"),
-    ]
-
-    original_src = src
-    for pat, rep in repl:
-        src = re.sub(pat, rep, src)
-
-    # Write patched file
-    if src != original_src:
-        target.write_text(src, encoding='utf-8')
-        print(f" ✅ Patched for CPU: {target}")
-    else:
-        print(f" ⚠️ No .cuda() calls found to patch")
-
-    return local_dir
-
-async def get_ocr_model():
-    """Lazy load DeepSeek-OCR model with compatibility patching"""
-    global _ocr_model, _ocr_tokenizer
-    if _ocr_model is None or _ocr_tokenizer is None:
-        async with _model_lock:
-            if _ocr_model is None or _ocr_tokenizer is None:
-                # Lazy import dependencies
-                # Note: Patching no longer needed - we pin transformers==4.46.3 and model revision
-                AutoModel, AutoTokenizer = _get_transformers()
-                torch = _get_torch()
-
-                print(f"Loading DeepSeek-OCR model (MAXIMUM QUALITY): {MODEL_NAME}")
-                print(f" - Base size: {BASE_SIZE}")
-                print(f" - Image size: {IMAGE_SIZE}")
-                print(f" - Crop mode: {CROP_MODE}")
-
-                # 1) Download & patch; 2) Load from local dir so our patch is used
-                local_dir = _download_and_patch_model_locally(MODEL_NAME, MODEL_REVISION)
-
-                print(" - Loading tokenizer (local, pinned revision)...")
-                _ocr_tokenizer = AutoTokenizer.from_pretrained(
-                    local_dir,
-                    trust_remote_code=True,
-                    local_files_only=True  # Load from local patched directory
-                )
-                print(" - Tokenizer loaded successfully")
-
-                # Load model with compatibility settings
-                # Official DeepSeek-OCR usage: https://huggingface.co/deepseek-ai/DeepSeek-OCR
-                # GPU version uses: attn_implementation='flash_attention_2', use_safetensors=True
-                # CPU/Spaces version MUST use: attn_implementation='eager' (SDPA not implemented for this arch)
-                # Fix per: https://github.com/huggingface/transformers/issues/28005
-                load_kwargs = {
-                    "trust_remote_code": True,
-                    "use_safetensors": True,
-                    "attn_implementation": "eager",  # SDPA not implemented for this arch
-                }
-
-                # If CUDA exists you can still cast later; but keep eager attention
-                _ocr_model = AutoModel.from_pretrained(
-                    local_dir,
-                    local_files_only=True,  # Load from local patched directory
-                    **load_kwargs
-                ).eval()
-
-                # Handle device placement (per official DeepSeek-OCR usage)
-                if USE_MPS and torch.backends.mps.is_available():
-                    # Apple Silicon: MPS (Metal Performance Shaders)
-                    _ocr_model = _ocr_model.to("mps")
-                    print(" - DeepSeek-OCR loaded on Apple Silicon GPU (MPS/M4)")
-                elif USE_GPU and torch.cuda.is_available():
-                    # NVIDIA GPU: CUDA with bfloat16 (per official usage)
-                    _ocr_model = _ocr_model.cuda().to(torch.bfloat16)
-                    print(" - DeepSeek-OCR loaded on NVIDIA GPU (CUDA + bfloat16)")
-                else:
-                    # CPU: No device placement needed
-                    print(" - DeepSeek-OCR loaded on CPU")
-    return _ocr_model, _ocr_tokenizer
-
-
-async def run_deepseek_ocr(
-    image_path: str,
-    prompt: str = "<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
-    use_grounding: bool = True
-) -> dict:
-    """
-    Run DeepSeek-OCR on an image file with advanced grounding support.
-
-    Genius enhancement: Uses grounding prompts for better structure extraction
-    and layout preservation, following DeepSeek-OCR best practices.
-    """
-    model, tokenizer = await get_ocr_model()
-
-    output_path = tempfile.mkdtemp()
-
-    try:
-        # OCR quality settings
-        # Official DeepSeek-OCR quality presets (from https://huggingface.co/deepseek-ai/DeepSeek-OCR):
-        #   - Tiny: base_size=512, image_size=512, crop_mode=False
-        #   - Small: base_size=640, image_size=640, crop_mode=False
-        #   - Base: base_size=1024, image_size=1024, crop_mode=False
-        #   - Large: base_size=1280, image_size=1280, crop_mode=False (GPU only)
-        #   - Gundam: base_size=1024, image_size=640, crop_mode=True (recommended for CPU/ZeroGPU)
-        #
-        # Note: On CPU/Spaces, use lower quality to avoid slowdowns/OOM
-        # We adjust based on device availability
-        # Get torch to check for GPU availability
-        torch = _get_torch()
-        if USE_GPU and torch.cuda.is_available():
-            # GPU: Use maximum quality (Large preset)
-            actual_base_size = BASE_SIZE  # 1280
-            actual_image_size = IMAGE_SIZE  # 1280
-        else:
-            # CPU/Spaces: Use Gundam preset (recommended for CPU to avoid OOM)
-            actual_base_size = 1024
-            actual_image_size = 640
-            print(f" - Using CPU-optimized quality: base_size={actual_base_size}, image_size={actual_image_size}")
-
-        result = model.infer(
-            tokenizer,
-            prompt=prompt,
-            image_file=image_path,
-            output_path=output_path,
-            base_size=actual_base_size,
-            image_size=actual_image_size,
-            crop_mode=CROP_MODE,  # True = best accuracy for complex documents
-            save_results=False,  # Don't save intermediate files
-            test_compress=False,  # False = maximum quality, no compression
-        )
-
-        # Parse result - DeepSeek-OCR returns structured markdown output
-        ocr_text = result if isinstance(result, str) else str(result)
-
-        # Genius parsing: Extract structured lines from markdown with better layout awareness
-        lines = _parse_deepseek_output(ocr_text)
-
-        return {
-            "text": ocr_text,
-            "lines": lines,
-        }
-    except Exception as e:
-        print(f"DeepSeek-OCR error: {e}")
-        import traceback
-        traceback.print_exc()
-        raise HTTPException(
-            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail=f"OCR processing failed: {str(e)}",
-        )
-    finally:
-        # Cleanup temp directory
-        try:
-            import shutil
-            if os.path.exists(output_path):
-                shutil.rmtree(output_path)
-        except:
-            pass
-
-
-def _parse_deepseek_output(ocr_text: str) -> list:
-    """
-    Genius parser: Extract structured lines from DeepSeek-OCR markdown output.
-    Preserves layout, handles tables, lists, and structured content.
-    """
-    lines = []
-    text_lines = ocr_text.split('\n')
-
-    y_offset = 0
-    line_height = 24  # Estimated line height in pixels
-
-    for line_idx, line in enumerate(text_lines):
-        stripped = line.strip()
-        if not stripped:
-            # Empty lines still take space
-            y_offset += line_height // 2
-            continue
-
-        # Remove markdown formatting but preserve text structure
-        # Handle markdown tables (| separated)
-        if '|' in stripped and stripped.count('|') >= 2:
-            # Table row - split by | and process each cell
-            cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
-            for cell_idx, cell in enumerate(cells):
-                if cell:
-                    lines.append({
-                        "bbox": [
-                            cell_idx * 200,  # Approximate x position
-                            y_offset,
-                            (cell_idx + 1) * 200,
-                            y_offset + line_height
-                        ],
-                        "text": cell,
-                        "conf": 0.95,
-                    })
-            y_offset += line_height
-        # Handle markdown lists (-, *, 1., etc.)
-        elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
-            # List item - remove list marker
-            text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
-            if text:
-                lines.append({
-                    "bbox": [40, y_offset, 1000, y_offset + line_height],
-                    "text": text,
-                    "conf": 0.95,
-                })
-            y_offset += line_height
-        # Handle headers (# ## ###)
-        elif stripped.startswith('#'):
-            header_level = len(stripped) - len(stripped.lstrip('#'))
-            text = stripped.lstrip('#').strip()
-            if text:
-                # Headers are typically larger
-                header_height = line_height + (header_level * 4)
-                lines.append({
-                    "bbox": [0, y_offset, 1000, y_offset + header_height],
-                    "text": text,
-                    "conf": 0.95,
-                })
-            y_offset += header_height
-        # Regular text line
-        else:
-            # Estimate width based on text length (rough approximation)
-            estimated_width = min(len(stripped) * 8, 1000)  # ~8px per char average
-            lines.append({
-                "bbox": [0, y_offset, estimated_width, y_offset + line_height],
-                "text": stripped,
-                "conf": 0.95,
-            })
-            y_offset += line_height
-
-    return lines
-
+app = FastAPI()
+ocr = PaddleOCR(use_angle_cls=True, lang="en")
+pp = PPStructure(show_log=False, layout=True)
 
 api_key_header = APIKeyHeader(name=API_KEY_HEADER_NAME, auto_error=False)
 _rate_limit_lock = asyncio.Lock()
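
Everything DeepSeek-specific above (snapshot download, .cuda() patching, device placement, markdown re-parsing with estimated boxes) collapses into two module-level PaddleOCR objects shared by both endpoints. A sketch of the result shape the handlers below rely on, assuming the PaddleOCR 2.x API this code targets:

    import cv2
    from paddleocr import PaddleOCR

    ocr = PaddleOCR(use_angle_cls=True, lang="en")
    img = cv2.imread("sample.jpg")  # hypothetical test image
    result = ocr.ocr(img, cls=True)
    # One entry per input image; each detected line pairs a four-point
    # quad with a (text, confidence) tuple.
    for quad, (text, confidence) in result[0] or []:
        print(round(confidence, 3), text, quad)

Building the models at import time also moves the load cost from the first request to process startup.
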
@@ -415,22 +46,11 @@ _request_log: DefaultDict[str, Deque[float]] = defaultdict(deque)
 
 
 def ensure_upload_is_safe(file: UploadFile) -> None:
-    # Check content type from header
     content_type = (file.content_type or "").lower()
-
-    # Also check file extension as fallback (browsers sometimes send application/octet-stream)
-    filename = (file.filename or "").lower()
-    extension = filename.split('.')[-1] if '.' in filename else ""
-    allowed_extensions = {'jpg', 'jpeg', 'png', 'webp'}
-
-    # Allow if content type matches OR extension matches
-    content_type_valid = content_type in ALLOWED_CONTENT_TYPES
-    extension_valid = extension in allowed_extensions
-
-    if not content_type_valid and not extension_valid:
+    if content_type not in ALLOWED_CONTENT_TYPES:
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
-            detail=f"Unsupported file type. Content-Type: {content_type}, Extension: {extension}. Allowed: {', '.join(ALLOWED_CONTENT_TYPES)}",
+            detail="Unsupported file type",
        )
 
     file.file.seek(0, os.SEEK_END)
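
With the extension fallback gone, an upload that arrives as application/octet-stream is rejected even if its filename ends in .jpg, so clients should declare the MIME type explicitly. A sketch (file name, key, and URL are assumptions):

    import requests

    with open("scan.jpg", "rb") as f:
        files = {"file": ("scan.jpg", f, "image/jpeg")}  # explicit MIME type
        r = requests.post(
            "http://localhost:8000/ocr",
            headers={"X-API-Key": "my-secret-key"},
            files=files,
        )
    print(r.status_code)  # 415 if the declared type is not in ALLOWED_CONTENT_TYPES
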
@@ -443,11 +63,7 @@ def ensure_upload_is_safe(file: UploadFile) -> None:
         )
 
 
-async def verify_api_key(api_key: Optional[str] = Depends(api_key_header)) -> str:
-    # Skip API key verification in development mode
-    if not REQUIRE_API_KEY:
-        return api_key or SERVICE_API_KEY
-    # Enforce API key in production
+async def verify_api_key(api_key: str | None = Depends(api_key_header)) -> str:
     if not api_key or not secrets.compare_digest(api_key, SERVICE_API_KEY):
         raise HTTPException(
             status_code=status.HTTP_401_UNAUTHORIZED,
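
The dev-mode bypass is gone, so every request goes through secrets.compare_digest, which takes time independent of where the inputs first differ and therefore does not leak key prefixes through response timing. An illustration with a stand-in key value:

    import secrets

    SERVICE_API_KEY = "my-secret-key"  # stand-in for the real environment value

    print(secrets.compare_digest("my-secret-key", SERVICE_API_KEY))  # True
    print(secrets.compare_digest("my-secret-kez", SERVICE_API_KEY))  # False
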
@@ -475,36 +91,26 @@ async def enforce_rate_limit(
     window.append(now)
 
 
-def _decode_image(file: UploadFile) -> Image.Image:
-    """Decode uploaded image file to PIL Image"""
+def _decode_image(file: UploadFile):
     data = file.file.read()
     if not data:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Uploaded file is empty",
         )
-
-    # Save to temp file for DeepSeek-OCR
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
-        tmp_file.write(data)
-        tmp_path = tmp_file.name
-
-    try:
-        img = Image.open(tmp_path).convert("RGB")
-        return img, tmp_path
-    except Exception as e:
-        os.unlink(tmp_path)
+    img = cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)
+    if img is None:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unable to decode image: {str(e)}",
+            detail="Unable to decode image",
         )
+    return img
 
 
-async def load_img(file: UploadFile):
+def load_img(file: UploadFile):
     ensure_upload_is_safe(file)
     file.file.seek(0)
-    img, img_path = _decode_image(file)
-    return img, img_path
+    return _decode_image(file)
 
 
 def _parse_json_field(name: str, raw: str, expected_type: type) -> Any:
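
Decoding now happens in memory with OpenCV instead of round-tripping through temp files, which also removes all the unlink bookkeeping from the endpoints. The same path in isolation (sample.jpg is an assumption):

    import cv2
    import numpy as np

    with open("sample.jpg", "rb") as f:
        data = f.read()
    # imdecode returns None on undecodable input rather than raising,
    # which is why the handler checks for None.
    img = cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError("not a decodable image")
    print(img.shape)  # (height, width, 3), BGR channel order
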
@@ -668,65 +274,20 @@ def _parse_rules(raw: str) -> list:
     return rules
 
 
-@app.options("/ocr")
-async def ocr_options():
-    """Handle CORS preflight requests (required by HuggingFace Spaces)"""
-    return {"message": "OK"}
-
-@app.options("/api/predict")
-async def predict_options():
-    """Handle CORS preflight for HuggingFace Spaces auto-routing"""
-    return {"message": "OK"}
-
 @app.post("/ocr")
-@app.post("/api/predict")  # HuggingFace Spaces may auto-route POST requests here
 async def ocr_page(
     file: UploadFile,
     _: None = Depends(enforce_rate_limit),
 ):
-    """OCR endpoint using DeepSeek-OCR"""
-    img, img_path = await load_img(file)
-    try:
-        # Save PIL image to temporary file for DeepSeek-OCR
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
-            img.save(tmp_file, 'JPEG', quality=95)
-            tmp_img_path = tmp_file.name
-
-        try:
-            # Use grounding prompt for better structure extraction
-            result = await run_deepseek_ocr(
-                tmp_img_path,
-                prompt="<image>\n<|grounding|>Convert the document to markdown with preserved layout.",
-                use_grounding=True
-            )
-            return result
-        except Exception as e:
-            # Log the error but don't crash - return a helpful error message
-            error_msg = str(e)
-            print(f"OCR processing error: {error_msg}")
-
-            # Check if it's a model loading issue
-            if "matplotlib" in error_msg or "torchvision" in error_msg or "ImportError" in error_msg:
-                raise HTTPException(
-                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-                    detail=f"OCR model dependencies missing: {error_msg}. Please install required packages."
-                )
-            elif "Connection" in error_msg or "timeout" in error_msg.lower():
-                raise HTTPException(
-                    status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-                    detail=f"OCR service temporarily unavailable: {error_msg}"
-                )
-            else:
-                raise HTTPException(
-                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                    detail=f"OCR processing failed: {error_msg}"
-                )
-        finally:
-            if os.path.exists(tmp_img_path):
-                os.unlink(tmp_img_path)
-    finally:
-        if os.path.exists(img_path):
-            os.unlink(img_path)
+    img = load_img(file)
+    res = ocr.ocr(img, cls=True)
+    lines = []
+    full = []
+    for quad, (txt, conf) in (res[0] or []):  # each line: four-point quad + (text, conf)
+        # Reduce the quad to an axis-aligned [x1, y1, x2, y2] envelope.
+        xs, ys = [p[0] for p in quad], [p[1] for p in quad]
+        lines.append({"bbox": [min(xs), min(ys), max(xs), max(ys)], "text": txt, "conf": float(conf)})
+        full.append(txt)
+    return {"text": "\n".join(full), "lines": lines}
 
 
 @app.post("/split")
@@ -739,88 +300,71 @@ async def split(
     rules: str = Form("[]"),
     _: None = Depends(enforce_rate_limit),
 ):
-    """Split endpoint - uses DeepSeek-OCR for region extraction"""
-    img, img_path = await load_img(file)
-    try:
-        width, height = img.size
-
-        # Save image for DeepSeek-OCR
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
-            img.save(tmp_file, 'JPEG', quality=95)
-            tmp_img_path = tmp_file.name
-
-        try:
-            parent_box = _parse_parent_bbox(parent_bbox, width, height)
-            x1, y1, x2, y2 = parent_box
-
-            # Crop image to parent bbox
-            crop_img = img.crop((int(x1), int(y1), int(x2), int(y2)))
-            crop_path = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg').name
-            crop_img.save(crop_path, 'JPEG', quality=95)
-
-            try:
-                # Use DeepSeek-OCR with grounding prompt for better structured extraction
-                prompt = "<image>\n<|grounding|>Convert the document region to markdown with preserved layout."
-                ocr_result = await run_deepseek_ocr(crop_path, prompt=prompt, use_grounding=True)
-
-                # Parse OCR result to extract lines
-                child_lines = ocr_result.get("lines", [])
-
-                # Adjust bboxes to parent coordinate space
-                for line in child_lines:
-                    bbox = line["bbox"]
-                    line["bbox"] = [
-                        bbox[0] + x1,
-                        bbox[1] + y1,
-                        bbox[2] + x1,
-                        bbox[3] + y1,
-                    ]
-                    line["blockType"] = "text"
-
-                if len(child_lines) > MAX_CHILD_LINES:
-                    child_lines = child_lines[:MAX_CHILD_LINES]
-
-                sanitized_splitter = _sanitize_label("splitter", splitter)
-                sanitized_schema = _sanitize_label("schemaType", schemaType)
-                parsed_settings = _parse_settings(settings)
-                parsed_rules = _parse_rules(rules)
-
-                raw_text = "\n".join([l["text"] for l in child_lines])
-                text_truncated = False
-                if len(raw_text) > 5000:
-                    raw_text = raw_text[:5000]
-                    text_truncated = True
-
-                llm_input = {
-                    "schemaType": sanitized_schema,
-                    "splitter": sanitized_splitter,
-                    "page": {"width": width, "height": height},
-                    "parentBox": parent_box,
-                    "rawText": raw_text,
-                    "ocrLines": child_lines,
-                    "rawTextTruncated": text_truncated,
-                    "ocrLinesTruncated": len(child_lines) >= MAX_CHILD_LINES,
-                    "settings": parsed_settings,
-                    "rules": parsed_rules,
-                }
-
-                try:
-                    llm_result = await call_llm_splitter(llm_input)
-                except ValueError as exc:
-                    raise HTTPException(
-                        status_code=status.HTTP_502_BAD_GATEWAY,
-                        detail=str(exc),
-                    ) from exc
-                return llm_result
-            finally:
-                if os.path.exists(crop_path):
-                    os.unlink(crop_path)
-        finally:
-            if os.path.exists(tmp_img_path):
-                os.unlink(tmp_img_path)
-    finally:
-        if os.path.exists(img_path):
-            os.unlink(img_path)
+    img = load_img(file)
+    height, width = img.shape[:2]
+    parent_box = _parse_parent_bbox(parent_bbox, width, height)
+    x1, y1, x2, y2 = parent_box
+    x1_i, y1_i, x2_i, y2_i = [int(round(v)) for v in parent_box]
+    crop = img[y1_i:y2_i, x1_i:x2_i]
+    if crop.size == 0:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="parent_bbox region is empty",
+        )
+    blocks = pp(crop)
+
+    child_lines = []
+    for b in blocks:
+        bx1, by1, bx2, by2 = b["bbox"]
+        sub = crop[by1:by2, bx1:bx2]
+        det = ocr.ocr(sub, cls=True)
+        for quad, (txt, conf) in (det[0] or []):  # quad: four [x, y] corner points
+            lxs, lys = [p[0] for p in quad], [p[1] for p in quad]
+            child_lines.append(
+                {
+                    "bbox": [min(lxs) + bx1 + x1, min(lys) + by1 + y1, max(lxs) + bx1 + x1, max(lys) + by1 + y1],
+                    "text": txt,
+                    "conf": float(conf),
+                    "blockType": b.get("type", "text"),
+                }
+            )
+            if len(child_lines) >= MAX_CHILD_LINES:
+                break
+        if len(child_lines) >= MAX_CHILD_LINES:
+            break
+
+    sanitized_splitter = _sanitize_label("splitter", splitter)
+    sanitized_schema = _sanitize_label("schemaType", schemaType)
+    parsed_settings = _parse_settings(settings)
+    parsed_rules = _parse_rules(rules)
+
+    raw_text = "\n".join([l["text"] for l in child_lines])
+    text_truncated = False
+    if len(raw_text) > 5000:
+        raw_text = raw_text[:5000]
+        text_truncated = True
+
+    llm_input = {
+        "schemaType": sanitized_schema,
+        "splitter": sanitized_splitter,
+        "page": {"width": width, "height": height},
+        "parentBox": parent_box,
+        "rawText": raw_text,
+        "ocrLines": child_lines,
+        "rawTextTruncated": text_truncated,
+        "ocrLinesTruncated": len(child_lines) >= MAX_CHILD_LINES,
+        "settings": parsed_settings,
+        "rules": parsed_rules,
+    }
+
+    try:
+        llm_result = await call_llm_splitter(llm_input)
+    except ValueError as exc:
+        raise HTTPException(
+            status_code=status.HTTP_502_BAD_GATEWAY,
+            detail=str(exc),
+        ) from exc
+    return llm_result
 
 
 if __name__ == "__main__":
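
Each line box in /split is detected in the frame of the block sub-crop, which sits inside the parent crop, so two offsets bring it back to page coordinates. A worked example with made-up numbers:

    x1, y1 = 100.0, 200.0  # parent_box origin on the page
    bx1, by1 = 20, 30      # layout block origin inside the crop
    lx, ly = 5, 4          # line envelope origin inside the block
    print(lx + bx1 + x1, ly + by1 + y1)  # 125.0 234.0
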