Spaces:

make789
/

OCRdeepSeekService

Sleeping

App Files Files Community

make789 committed on Oct 29, 2025

Commit

d59ba4a

verified ·

1 Parent(s): 8d0b826

Upload ocr_service.py

Browse files

Files changed (1) hide show

ocr_service.py +124 -59

ocr_service.py CHANGED Viewed

@@ -303,14 +303,18 @@ async def run_deepseek_ocr(
             )
         # Parse result - DeepSeek-OCR returns structured markdown output
-        ocr_text = result if isinstance(result, str) else str(result)
-        # Extract structured lines from markdown
-        lines = _parse_deepseek_output(ocr_text)
         return {
-            "text": ocr_text,
-            "lines": lines,
         }
     except Exception as e:
         print(f"DeepSeek-OCR error: {e}")
@@ -330,76 +334,137 @@ async def run_deepseek_ocr(
             pass
 def _parse_deepseek_output(ocr_text: str) -> list:
     """
     Extract structured lines from DeepSeek-OCR markdown output.
-    Preserves layout, handles tables, lists, and structured content.
     """
     lines = []
     text_lines = ocr_text.split('\n')
-    y_offset = 0
-    line_height = 24  # Estimated line height in pixels
-    for line_idx, line in enumerate(text_lines):
-        stripped = line.strip()
-        if not stripped:
-            # Empty lines still take space
-            y_offset += line_height // 2
-            continue
-        # Remove markdown formatting but preserve text structure
-        # Handle markdown tables (| separated)
-        if '|' in stripped and stripped.count('|') >= 2:
-            # Table row - split by | and process each cell
-            cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
-            for cell_idx, cell in enumerate(cells):
-                if cell:
                     lines.append({
-                        "bbox": [
-                            cell_idx * 200,  # Approximate x position
-                            y_offset,
-                            (cell_idx + 1) * 200,
-                            y_offset + line_height
-                        ],
-                        "text": cell,
                         "conf": 0.95,
                     })
-            y_offset += line_height
-        # Handle markdown lists (-, *, 1., etc.)
-        elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
-            # List item - remove list marker
-            text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
-            if text:
                 lines.append({
-                    "bbox": [40, y_offset, 1000, y_offset + line_height],
-                    "text": text,
                     "conf": 0.95,
                 })
                 y_offset += line_height
-        # Handle headers (# ## ###)
-        elif stripped.startswith('#'):
-            header_level = len(stripped) - len(stripped.lstrip('#'))
-            text = stripped.lstrip('#').strip()
-            if text:
-                # Headers are typically larger
-                header_height = line_height + (header_level * 4)
-                lines.append({
-                    "bbox": [0, y_offset, 1000, y_offset + header_height],
-                    "text": text,
-                    "conf": 0.95,
-                })
-                y_offset += header_height
-        # Regular text line
-        else:
-            # Estimate width based on text length (rough approximation)
-            estimated_width = min(len(stripped) * 8, 1000)  # ~8px per char average
-            lines.append({
-                "bbox": [0, y_offset, estimated_width, y_offset + line_height],
-                "text": stripped,
-                "conf": 0.95,
-            })
-            y_offset += line_height
     return lines

             )
         # Parse result - DeepSeek-OCR returns structured markdown output
+        raw_text = result if isinstance(result, str) else str(result)
+        # Extract structured lines from raw text (before cleaning)
+        # This parses grounding annotations to get bounding boxes
+        lines = _parse_deepseek_output(raw_text)
+        # Convert to clean markdown (remove tags, keep text)
+        clean_markdown = _deepseek_to_markdown(raw_text)
         return {
+            "text": clean_markdown,  # Return clean markdown without tags
+            "lines": lines,  # Structured lines with bounding boxes
         }
     except Exception as e:
         print(f"DeepSeek-OCR error: {e}")
             pass
+def _deepseek_to_markdown(s: str) -> str:
+    """
+    Convert DeepSeek-OCR tagged output to clean Markdown.
+    Removes grounding tags (<|ref|>...</|ref|>) and bbox annotations (<|det|>[...]<|/det|>)
+    while preserving the text content.
+    """
+    import re
+    # Remove bbox annotations first
+    det_pattern = re.compile(r'<\|det\|>\[[^\]]*\]<\|\/det\|>', re.DOTALL)
+    s = det_pattern.sub('', s)
+    # Remove ref tags
+    ref_pattern = re.compile(r'<\|ref\|>.*?<\|\/ref\|>', re.DOTALL)
+    s = ref_pattern.sub('', s)
+    # Tidy multiple blank lines
+    s = re.sub(r'\n{3,}', '\n\n', s).strip()
+    return s
 def _parse_deepseek_output(ocr_text: str) -> list:
     """
     Extract structured lines from DeepSeek-OCR markdown output.
+    DeepSeek-OCR returns grounding annotations like:
+    <|ref|>title<|/ref|><|det|>[[x,y,w,h]]<|/det|># Title
+    We parse these annotations to extract precise bounding boxes.
     """
+    import re
     lines = []
+    # Pattern to match grounding annotations: <|ref|>TYPE<|/ref|><|det|>[[x,y,w,h]]<|/det|>CONTENT
+    # Example: <|ref|>title<|/ref|><|det|>[[292, 29, 634, 54]]<|/det|># Taйский карри...
+    grounding_pattern = re.compile(
+        r'<\|ref\|>([^<]+)<\|\/ref\|><\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|\/det\|>(.*?)(?=<\|ref\||$)',
+        re.DOTALL
+    )
     text_lines = ocr_text.split('\n')
+    found_grounding = False
+    # Try to parse grounding annotations first
+    for line in text_lines:
+        matches = list(grounding_pattern.finditer(line))
+        if matches:
+            found_grounding = True
+            for match in matches:
+                type_name = match.group(1).strip()
+                x = int(match.group(2))
+                y = int(match.group(3))
+                w = int(match.group(4))  # Width
+                h = int(match.group(5))  # Height
+                content = match.group(6).strip()
+                # Remove markdown formatting from content
+                content = re.sub(r'^#+\s*', '', content)  # Remove headers
+                content = re.sub(r'\*\*', '', content)  # Remove bold
+                content = re.sub(r'\*', '', content)  # Remove italic
+                content = content.strip()
+                if content:
+                    lines.append({
+                        "bbox": [x, y, x + w, y + h],  # Convert [x, y, w, h] to [x0, y0, x1, y1]
+                        "text": content,
+                        "conf": 0.95,
+                        "type": type_name,  # title, text, sub_title, etc.
+                    })
+    # Fallback: if no grounding annotations found, parse markdown as before
+    if not found_grounding:
+        y_offset = 0
+        line_height = 24
+        for line_idx, line in enumerate(text_lines):
+            stripped = line.strip()
+            if not stripped:
+                y_offset += line_height // 2
+                continue
+            # Remove grounding annotations if present (but use fallback positioning)
+            stripped = re.sub(r'<\|ref\|>[^<]+<\|\/ref\|><\|det\|>\[\[.*?\]\]<\|\/det\|>', '', stripped)
+            stripped = stripped.strip()
+            if not stripped:
+                continue
+            # Handle markdown tables (| separated)
+            if '|' in stripped and stripped.count('|') >= 2:
+                cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
+                for cell_idx, cell in enumerate(cells):
+                    if cell:
+                        lines.append({
+                            "bbox": [cell_idx * 200, y_offset, (cell_idx + 1) * 200, y_offset + line_height],
+                            "text": cell,
+                            "conf": 0.95,
+                        })
+                y_offset += line_height
+            # Handle markdown lists (-, *, 1., etc.)
+            elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
+                text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
+                if text:
                     lines.append({
+                        "bbox": [40, y_offset, 1000, y_offset + line_height],
+                        "text": text,
                         "conf": 0.95,
                     })
+                    y_offset += line_height
+            # Handle headers (# ## ###)
+            elif stripped.startswith('#'):
+                header_level = len(stripped) - len(stripped.lstrip('#'))
+                text = stripped.lstrip('#').strip()
+                if text:
+                    header_height = line_height + (header_level * 4)
+                    lines.append({
+                        "bbox": [0, y_offset, 1000, y_offset + header_height],
+                        "text": text,
+                        "conf": 0.95,
+                    })
+                    y_offset += header_height
+            # Regular text line
+            else:
+                estimated_width = min(len(stripped) * 8, 1000)
                 lines.append({
+                    "bbox": [0, y_offset, estimated_width, y_offset + line_height],
+                    "text": stripped,
                     "conf": 0.95,
                 })
                 y_offset += line_height
     return lines