make789 committed on
Commit
d59ba4a
·
verified ·
1 Parent(s): 8d0b826

Upload ocr_service.py

Browse files
Files changed (1) hide show
  1. ocr_service.py +124 -59
ocr_service.py CHANGED
@@ -303,14 +303,18 @@ async def run_deepseek_ocr(
303
  )
304
 
305
  # Parse result - DeepSeek-OCR returns structured markdown output
306
- ocr_text = result if isinstance(result, str) else str(result)
307
 
308
- # Extract structured lines from markdown
309
- lines = _parse_deepseek_output(ocr_text)
 
 
 
 
310
 
311
  return {
312
- "text": ocr_text,
313
- "lines": lines,
314
  }
315
  except Exception as e:
316
  print(f"DeepSeek-OCR error: {e}")
@@ -330,76 +334,137 @@ async def run_deepseek_ocr(
330
  pass
331
 
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  def _parse_deepseek_output(ocr_text: str) -> list:
334
  """
335
  Extract structured lines from DeepSeek-OCR markdown output.
336
- Preserves layout, handles tables, lists, and structured content.
 
 
 
337
  """
 
 
338
  lines = []
 
 
 
 
 
 
 
 
339
  text_lines = ocr_text.split('\n')
 
340
 
341
- y_offset = 0
342
- line_height = 24 # Estimated line height in pixels
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
- for line_idx, line in enumerate(text_lines):
345
- stripped = line.strip()
346
- if not stripped:
347
- # Empty lines still take space
348
- y_offset += line_height // 2
349
- continue
350
 
351
- # Remove markdown formatting but preserve text structure
352
- # Handle markdown tables (| separated)
353
- if '|' in stripped and stripped.count('|') >= 2:
354
- # Table row - split by | and process each cell
355
- cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
356
- for cell_idx, cell in enumerate(cells):
357
- if cell:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  lines.append({
359
- "bbox": [
360
- cell_idx * 200, # Approximate x position
361
- y_offset,
362
- (cell_idx + 1) * 200,
363
- y_offset + line_height
364
- ],
365
- "text": cell,
366
  "conf": 0.95,
367
  })
368
- y_offset += line_height
369
- # Handle markdown lists (-, *, 1., etc.)
370
- elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
371
- # List item - remove list marker
372
- text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
373
- if text:
 
 
 
 
 
 
 
 
 
 
374
  lines.append({
375
- "bbox": [40, y_offset, 1000, y_offset + line_height],
376
- "text": text,
377
  "conf": 0.95,
378
  })
379
  y_offset += line_height
380
- # Handle headers (# ## ###)
381
- elif stripped.startswith('#'):
382
- header_level = len(stripped) - len(stripped.lstrip('#'))
383
- text = stripped.lstrip('#').strip()
384
- if text:
385
- # Headers are typically larger
386
- header_height = line_height + (header_level * 4)
387
- lines.append({
388
- "bbox": [0, y_offset, 1000, y_offset + header_height],
389
- "text": text,
390
- "conf": 0.95,
391
- })
392
- y_offset += header_height
393
- # Regular text line
394
- else:
395
- # Estimate width based on text length (rough approximation)
396
- estimated_width = min(len(stripped) * 8, 1000) # ~8px per char average
397
- lines.append({
398
- "bbox": [0, y_offset, estimated_width, y_offset + line_height],
399
- "text": stripped,
400
- "conf": 0.95,
401
- })
402
- y_offset += line_height
403
 
404
  return lines
405
 
 
303
  )
304
 
305
  # Parse result - DeepSeek-OCR returns structured markdown output
306
+ raw_text = result if isinstance(result, str) else str(result)
307
 
308
+ # Extract structured lines from raw text (before cleaning)
309
+ # This parses grounding annotations to get bounding boxes
310
+ lines = _parse_deepseek_output(raw_text)
311
+
312
+ # Convert to clean markdown (remove tags, keep text)
313
+ clean_markdown = _deepseek_to_markdown(raw_text)
314
 
315
  return {
316
+ "text": clean_markdown, # Return clean markdown without tags
317
+ "lines": lines, # Structured lines with bounding boxes
318
  }
319
  except Exception as e:
320
  print(f"DeepSeek-OCR error: {e}")
 
334
  pass
335
 
336
 
337
+ def _deepseek_to_markdown(s: str) -> str:
338
+ """
339
+ Convert DeepSeek-OCR tagged output to clean Markdown.
340
+ Removes grounding tags (<|ref|>...</|ref|>) and bbox annotations (<|det|>[...]<|/det|>)
341
+ while preserving the text content.
342
+ """
343
+ import re
344
+
345
+ # Remove bbox annotations first
346
+ det_pattern = re.compile(r'<\|det\|>\[[^\]]*\]<\|\/det\|>', re.DOTALL)
347
+ s = det_pattern.sub('', s)
348
+
349
+ # Remove ref tags
350
+ ref_pattern = re.compile(r'<\|ref\|>.*?<\|\/ref\|>', re.DOTALL)
351
+ s = ref_pattern.sub('', s)
352
+
353
+ # Tidy multiple blank lines
354
+ s = re.sub(r'\n{3,}', '\n\n', s).strip()
355
+
356
+ return s
357
+
358
+
359
  def _parse_deepseek_output(ocr_text: str) -> list:
360
  """
361
  Extract structured lines from DeepSeek-OCR markdown output.
362
+ DeepSeek-OCR returns grounding annotations like:
363
+ <|ref|>title<|/ref|><|det|>[[x,y,w,h]]<|/det|># Title
364
+
365
+ We parse these annotations to extract precise bounding boxes.
366
  """
367
+ import re
368
+
369
  lines = []
370
+
371
+ # Pattern to match grounding annotations: <|ref|>TYPE<|/ref|><|det|>[[x,y,w,h]]<|/det|>CONTENT
372
+ # Example: <|ref|>title<|/ref|><|det|>[[292, 29, 634, 54]]<|/det|># Taйский карри...
373
+ grounding_pattern = re.compile(
374
+ r'<\|ref\|>([^<]+)<\|\/ref\|><\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|\/det\|>(.*?)(?=<\|ref\||$)',
375
+ re.DOTALL
376
+ )
377
+
378
  text_lines = ocr_text.split('\n')
379
+ found_grounding = False
380
 
381
+ # Try to parse grounding annotations first
382
+ for line in text_lines:
383
+ matches = list(grounding_pattern.finditer(line))
384
+ if matches:
385
+ found_grounding = True
386
+ for match in matches:
387
+ type_name = match.group(1).strip()
388
+ x = int(match.group(2))
389
+ y = int(match.group(3))
390
+ w = int(match.group(4)) # Width
391
+ h = int(match.group(5)) # Height
392
+ content = match.group(6).strip()
393
+
394
+ # Remove markdown formatting from content
395
+ content = re.sub(r'^#+\s*', '', content) # Remove headers
396
+ content = re.sub(r'\*\*', '', content) # Remove bold
397
+ content = re.sub(r'\*', '', content) # Remove italic
398
+ content = content.strip()
399
+
400
+ if content:
401
+ lines.append({
402
+ "bbox": [x, y, x + w, y + h], # Convert [x, y, w, h] to [x0, y0, x1, y1]
403
+ "text": content,
404
+ "conf": 0.95,
405
+ "type": type_name, # title, text, sub_title, etc.
406
+ })
407
 
408
+ # Fallback: if no grounding annotations found, parse markdown as before
409
+ if not found_grounding:
410
+ y_offset = 0
411
+ line_height = 24
 
 
412
 
413
+ for line_idx, line in enumerate(text_lines):
414
+ stripped = line.strip()
415
+ if not stripped:
416
+ y_offset += line_height // 2
417
+ continue
418
+
419
+ # Remove grounding annotations if present (but use fallback positioning)
420
+ stripped = re.sub(r'<\|ref\|>[^<]+<\|\/ref\|><\|det\|>\[\[.*?\]\]<\|\/det\|>', '', stripped)
421
+ stripped = stripped.strip()
422
+
423
+ if not stripped:
424
+ continue
425
+
426
+ # Handle markdown tables (| separated)
427
+ if '|' in stripped and stripped.count('|') >= 2:
428
+ cells = [cell.strip() for cell in stripped.split('|') if cell.strip()]
429
+ for cell_idx, cell in enumerate(cells):
430
+ if cell:
431
+ lines.append({
432
+ "bbox": [cell_idx * 200, y_offset, (cell_idx + 1) * 200, y_offset + line_height],
433
+ "text": cell,
434
+ "conf": 0.95,
435
+ })
436
+ y_offset += line_height
437
+ # Handle markdown lists (-, *, 1., etc.)
438
+ elif stripped.startswith(('-', '*', '+')) or (len(stripped) > 2 and stripped[1] == '.'):
439
+ text = stripped.lstrip('-*+').lstrip('0123456789.').strip()
440
+ if text:
441
  lines.append({
442
+ "bbox": [40, y_offset, 1000, y_offset + line_height],
443
+ "text": text,
 
 
 
 
 
444
  "conf": 0.95,
445
  })
446
+ y_offset += line_height
447
+ # Handle headers (# ## ###)
448
+ elif stripped.startswith('#'):
449
+ header_level = len(stripped) - len(stripped.lstrip('#'))
450
+ text = stripped.lstrip('#').strip()
451
+ if text:
452
+ header_height = line_height + (header_level * 4)
453
+ lines.append({
454
+ "bbox": [0, y_offset, 1000, y_offset + header_height],
455
+ "text": text,
456
+ "conf": 0.95,
457
+ })
458
+ y_offset += header_height
459
+ # Regular text line
460
+ else:
461
+ estimated_width = min(len(stripped) * 8, 1000)
462
  lines.append({
463
+ "bbox": [0, y_offset, estimated_width, y_offset + line_height],
464
+ "text": stripped,
465
  "conf": 0.95,
466
  })
467
  y_offset += line_height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
 
469
  return lines
470