| """ |
| Tesseract OCR Engine |
| |
| Fallback OCR engine using Tesseract. |
| Provides broad language support and is widely available. |
| """ |
|
|
| import time |
| from typing import List, Optional, Dict, Any |
| import numpy as np |
| from loguru import logger |
|
|
| from .base import OCREngine, OCRConfig, OCRResult |
| from ..schemas.core import BoundingBox, OCRRegion |
|
|
| |
# Optional dependency guard: pytesseract and Pillow are imported together.
# If either import fails, the module still loads, HAS_TESSERACT is False and
# a warning is logged.
try:
    import pytesseract
    from PIL import Image
except ImportError:
    HAS_TESSERACT = False
    logger.warning(
        "pytesseract not installed. Install with: pip install pytesseract "
        "Also install Tesseract: apt-get install tesseract-ocr"
    )
else:
    # Only reached when both imports above succeeded.
    HAS_TESSERACT = True
|
|
|
|
class TesseractOCREngine(OCREngine):
    """
    OCR engine using Tesseract.

    Features:
    - Broad language support (100+ languages)
    - Mature and well-tested
    - No GPU required
    - Page segmentation modes for different layouts

    Language codes listed in ``config.languages`` are translated to Tesseract
    traineddata names through ``LANGUAGE_MAP``. Codes that are already one of
    the Tesseract names in that map are passed through unchanged; anything
    else falls back to English.
    """

    # Maps the language codes accepted in ``config.languages`` to the
    # traineddata names passed to Tesseract via ``lang=``.
    LANGUAGE_MAP = {
        "en": "eng",
        "ch": "chi_sim",
        "chinese_cht": "chi_tra",
        "fr": "fra",
        "german": "deu",
        "es": "spa",
        "it": "ita",
        "pt": "por",
        "ru": "rus",
        "japan": "jpn",
        "korean": "kor",
        "ar": "ara",
        "hi": "hin",
        "latin": "lat",
    }

    # Values for Tesseract's ``--psm`` option. Only PSM_AUTO is used by
    # ``_build_config``; the others are exposed as named constants.
    PSM_AUTO = 3
    PSM_SINGLE_BLOCK = 6
    PSM_SINGLE_LINE = 7
    PSM_SPARSE = 11

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize Tesseract OCR engine.

        Args:
            config: Engine configuration, forwarded unchanged to the base
                class constructor.
        """
        super().__init__(config)
        # Not read anywhere in this class; kept as an attribute for callers.
        self._tesseract_cmd: Optional[str] = None

    def initialize(self):
        """Initialize Tesseract engine.

        Checks that pytesseract was importable and that the Tesseract binary
        answers a version query, then marks the engine as initialized.
        Calling it again after a successful initialization is a no-op.

        Raises:
            RuntimeError: If pytesseract is not installed, or if querying the
                Tesseract version fails (the original error is chained).
        """
        if not HAS_TESSERACT:
            raise RuntimeError(
                "pytesseract not installed. Install with: pip install pytesseract. "
                "Also install Tesseract: apt-get install tesseract-ocr"
            )

        if self._initialized:
            return

        logger.info("Initializing Tesseract OCR engine...")

        # Querying the version is the probe for a working Tesseract binary.
        try:
            version = pytesseract.get_tesseract_version()
        except Exception as e:
            logger.error(f"Tesseract not properly installed: {e}")
            raise RuntimeError(
                f"Tesseract not properly installed: {e}. "
                "Install with: apt-get install tesseract-ocr"
            ) from e

        logger.info(f"Tesseract version: {version}")
        self._initialized = True

    def recognize(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> OCRResult:
        """
        Perform OCR on an image using Tesseract.

        Runs ``image_to_data`` to obtain word-level boxes and confidences,
        then ``image_to_string`` to obtain the page text. Failures during
        recognition are logged and reported through the returned result
        (``success=False`` with ``error`` set) instead of being raised.

        Args:
            image: Image as numpy array (RGB, HWC format)
            page_number: Page number for multi-page documents

        Returns:
            OCRResult with recognized text and regions

        Raises:
            RuntimeError: Propagated from ``initialize()`` when the engine
                has not been initialized yet and cannot be.
        """
        if not self._initialized:
            self.initialize()

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted while OCR is running.
        start_time = time.perf_counter()

        try:
            pil_image = Image.fromarray(image)
            lang = self._get_tesseract_lang()
            custom_config = self._build_config()

            # Word-level data: one entry per row of Tesseract's TSV output.
            data = pytesseract.image_to_data(
                pil_image,
                lang=lang,
                config=custom_config,
                output_type=pytesseract.Output.DICT,
            )

            height, width = image.shape[:2]
            regions, confidence_sum = self._build_regions(
                data, page_number, width, height
            )

            # Second pass: page text as formatted by Tesseract itself. It is
            # not filtered by min_confidence, unlike ``regions``.
            full_text = pytesseract.image_to_string(
                pil_image,
                lang=lang,
                config=custom_config,
            )

            processing_time = (time.perf_counter() - start_time) * 1000

            return OCRResult(
                regions=regions,
                full_text=full_text.strip(),
                confidence_avg=confidence_sum / len(regions) if regions else 0.0,
                processing_time_ms=processing_time,
                engine="tesseract",
                success=True,
            )

        except Exception as e:
            logger.error(f"Tesseract recognition failed: {e}")
            return OCRResult(
                regions=[],
                full_text="",
                confidence_avg=0.0,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
                engine="tesseract",
                success=False,
                error=str(e),
            )

    def _build_regions(
        self,
        data: Dict[str, Any],
        page_number: int,
        width: int,
        height: int,
    ) -> tuple:
        """Convert ``image_to_data`` output into word regions.

        Rows with empty text, a negative confidence, or a confidence below
        ``config.min_confidence`` are skipped.

        Args:
            data: Column dict returned by ``pytesseract.image_to_data`` with
                ``output_type=Output.DICT``.
            page_number: Page number stored on every region.
            width: Image width in pixels, stored on every bounding box.
            height: Image height in pixels, stored on every bounding box.

        Returns:
            Tuple of (list of OCRRegion, sum of their confidences).
        """
        regions = []
        confidence_sum = 0.0

        # Line tracking: word_id restarts at 0 whenever the line changes.
        current_line_id = -1
        word_id = 0

        # Tolerate a missing paragraph column by treating every row as
        # paragraph 0.
        par_nums = data.get('par_num') or [0] * len(data['text'])

        rows = zip(
            data['text'],
            data['conf'],
            data['block_num'],
            par_nums,
            data['line_num'],
            data['left'],
            data['top'],
            data['width'],
            data['height'],
        )
        for raw_text, raw_conf, block_num, par_num, line_num, x, y, w, h in rows:
            text = raw_text.strip()
            # Confidence may arrive as an int, a float, or a numeric string
            # with a fractional part; float() accepts all of them, whereas
            # int() rejects a string such as '96.06'.
            conf = float(raw_conf)

            # Skip empty cells and rows with a negative confidence.
            if not text or conf < 0:
                continue

            confidence = conf / 100.0
            if confidence < self.config.min_confidence:
                continue

            # line_num restarts inside every paragraph, so the paragraph
            # number must be part of the id; otherwise lines of different
            # paragraphs in the same block would share a line_id.
            line_id = (block_num * 1000 + par_num) * 1000 + line_num

            if line_id != current_line_id:
                current_line_id = line_id
                word_id = 0
            else:
                word_id += 1

            bbox = BoundingBox(
                x_min=float(x),
                y_min=float(y),
                x_max=float(x + w),
                y_max=float(y + h),
                normalized=False,
                page_width=width,
                page_height=height,
            )

            regions.append(
                OCRRegion(
                    text=text,
                    confidence=confidence,
                    bbox=bbox,
                    page=page_number,
                    line_id=line_id,
                    word_id=word_id,
                    engine="tesseract",
                )
            )
            confidence_sum += confidence

        return regions, confidence_sum

    def _get_tesseract_lang(self) -> str:
        """Get Tesseract language string from config.

        Each entry of ``config.languages`` is resolved in this order:
        a key of ``LANGUAGE_MAP`` is translated; a value of ``LANGUAGE_MAP``
        (already a Tesseract name such as ``"deu"``) is kept as is; anything
        else falls back to ``"eng"``. Duplicates are dropped, keeping
        first-seen order.

        Returns:
            Tesseract names joined with ``+``, or ``"eng"`` when no language
            is configured.
        """
        native_codes = set(self.LANGUAGE_MAP.values())
        langs = []
        for lang in self.config.languages or ():
            if lang in self.LANGUAGE_MAP:
                tess_lang = self.LANGUAGE_MAP[lang]
            elif lang in native_codes:
                # Caller already supplied a Tesseract name; do not replace
                # it with English.
                tess_lang = lang
            else:
                tess_lang = "eng"
            if tess_lang not in langs:
                langs.append(tess_lang)
        return "+".join(langs) if langs else "eng"

    def _build_config(self) -> str:
        """Build Tesseract config string.

        Returns:
            Space-separated command-line options: ``--psm`` set to PSM_AUTO,
            ``--oem 3``, plus ``-c preserve_interword_spaces=1`` when
            ``config.return_word_boxes`` is truthy.
        """
        config_parts = [
            f"--psm {self.PSM_AUTO}",
            "--oem 3",
        ]

        if self.config.return_word_boxes:
            config_parts.append("-c preserve_interword_spaces=1")

        return " ".join(config_parts)

    def get_supported_languages(self) -> List[str]:
        """Return list of supported language codes (the LANGUAGE_MAP keys)."""
        return list(self.LANGUAGE_MAP)

    def get_installed_languages(self) -> List[str]:
        """Get list of languages installed in Tesseract.

        Returns:
            The list reported by ``pytesseract.get_languages()``, or
            ``["eng"]`` if that query fails (the failure is logged as a
            warning).

        Raises:
            RuntimeError: Propagated from ``initialize()`` when the engine
                has not been initialized yet and cannot be.
        """
        if not self._initialized:
            self.initialize()

        try:
            return pytesseract.get_languages()
        except Exception as e:
            logger.warning(f"Could not get installed languages: {e}")
            return ["eng"]

    def recognize_with_hocr(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> tuple:
        """
        Perform OCR and return hOCR format for detailed layout.

        The regular ``recognize`` pass runs first; hOCR generation is a
        separate Tesseract invocation on the same image. If producing the
        hOCR fails for any reason (including an image that cannot be
        converted), the OCR result is still returned and the hOCR element
        is ``None``.

        Args:
            image: Image as numpy array
            page_number: Page number

        Returns:
            Tuple of (OCRResult, hOCR string), where the hOCR string is
            ``None`` if hOCR generation failed

        Raises:
            RuntimeError: Propagated from ``initialize()`` when the engine
                has not been initialized yet and cannot be.
        """
        if not self._initialized:
            self.initialize()

        ocr_result = self.recognize(image, page_number)

        try:
            # The conversion sits inside the try so that a bad image is
            # reported the same way as in ``recognize`` instead of raising.
            pil_image = Image.fromarray(image)
            hocr = pytesseract.image_to_pdf_or_hocr(
                pil_image,
                lang=self._get_tesseract_lang(),
                config=self._build_config(),
                extension='hocr',
            )
            return ocr_result, hocr.decode('utf-8')
        except Exception as e:
            logger.warning(f"Failed to generate hOCR: {e}")
            return ocr_result, None
|
|