Spaces:

faizan24
/

Text_Extraction

Sleeping

App Files Files Community

Text_Extraction / utils /text_preprocessing.py

faizan24

Initialized

f094211 verified 16 days ago

raw

history blame contribute delete

7.06 kB

	"""
	Urdu text preprocessing utilities.
	Handles Unicode normalization, RTL text cleanup, and Urdu-specific processing.
	"""

	import re
	import unicodedata


	def normalize_unicode(text: str) -> str:
	    """Return ``text`` converted to Unicode Normalization Form C (NFC).

	    NFC composes base characters with their combining marks where a
	    precomposed code point exists, so visually identical Urdu strings end up
	    with one canonical code-point sequence.
	    """
	    normalization_form = "NFC"
	    return unicodedata.normalize(normalization_form, text)


	def remove_diacritics(text: str) -> str:
	    """Strip Arabic/Urdu diacritical marks (tashkeel/harakat) from ``text``.

	    Only code points listed in the character class below are deleted; every
	    other character is returned unchanged.
	    """
	    return re.sub(
	        r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED]',
	        '',
	        text,
	    )


	def normalize_whitespace(text: str) -> str:
	    """Collapse redundant spaces/tabs and tidy line breaks.

	    Steps, in order:
	      1. Convert Windows line endings (CRLF) to LF.
	      2. Squeeze every run of spaces and tabs into a single space.
	      3. Remove spaces at the beginning and end of every line.
	      4. Collapse three or more consecutive newlines into exactly two.
	      5. Strip leading/trailing whitespace from the whole text.

	    Args:
	        text: Text to clean.

	    Returns:
	        The cleaned text.
	    """
	    # Normalize CRLF first so that "$" in step 3 sees the real end of each
	    # line; otherwise spaces sitting in front of "\r\n" would survive.
	    text = text.replace('\r\n', '\n')
	    # Replace multiple spaces/tabs with a single space.
	    text = re.sub(r'[ \t]+', ' ', text)
	    # Trim spaces at line beginnings/ends. The "|" must be an unescaped
	    # alternation: an escaped "\|" would only match a literal pipe character.
	    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
	    # Allow at most one blank line between paragraphs.
	    text = re.sub(r'\n{3,}', '\n\n', text)
	    return text.strip()


	def normalize_urdu_characters(text: str) -> str:
	    """Replace Arabic letter variants and Arabic-Indic digits in ``text``.

	    Each key of the table below is a single character that is replaced
	    unconditionally, wherever it occurs, by the mapped character. No mapped
	    value is itself a key, so one translation pass gives the same result as
	    applying the replacements one after another.
	    """
	    char_map = {
	        '\u0643': '\u06A9',  # Arabic Kaf ك → Urdu Kaf ک
	        '\u064A': '\u06CC',  # Arabic Yeh ي → Urdu Yeh ی
	        # NOTE(review): Heh and Teh Marbuta are context-dependent in Urdu, yet
	        # they are always replaced here — confirm this is the intended target.
	        '\u0647': '\u06BE',  # Arabic Heh ه → ھ
	        '\u0629': '\u06C3',  # Teh Marbuta ة → ۃ
	        # Arabic-Indic digits (U+0660–U+0669) → ASCII digits. The Extended
	        # Arabic-Indic digits (U+06F0–U+06F9) are not part of this table.
	        '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
	        '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
	    }
	    return text.translate(str.maketrans(char_map))


	def clean_ocr_artifacts(text: str) -> str:
	    """Delete characters that are typical OCR noise.

	    The patterns are applied one after another, each match being replaced
	    by the empty string.
	    """
	    noise_patterns = (
	        # ASCII control characters; tab, newline and carriage return are kept.
	        r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]',
	        # Runs of three or more dots that do not touch a word character.
	        r'(?<!\w)\.{3,}(?!\w)',
	        # Box, bullet, star and card-suit glyphs.
	        r'[□■▪▫●○◆◇★☆♦♣♠♥]',
	    )
	    for pattern in noise_patterns:
	        text = re.sub(pattern, '', text)
	    return text


	def segment_into_chunks(text: str, max_bytes: int = 256, overlap_bytes: int = 64) -> list:
	    """
	    Segment long text into overlapping byte-length chunks for ByT5 processing.

	    Chunk boundaries are always placed on UTF-8 character boundaries, so a
	    multi-byte character is never split between two chunks. Segmentation
	    stops as soon as a chunk reaches the end of the text.

	    Args:
	        text: Input text to segment.
	        max_bytes: Maximum UTF-8 bytes per chunk (ByT5 input limit). A chunk
	            exceeds it only when a single character is wider than
	            ``max_bytes``; that character is then emitted whole.
	        overlap_bytes: Target number of bytes shared by consecutive chunks.
	            The real overlap can be a few bytes larger because the start of
	            a chunk is moved back to a character boundary. If the overlap
	            would prevent forward progress, the next chunk starts where the
	            previous one ended (no overlap).

	    Returns:
	        List of (chunk_text, start_pos, end_pos) tuples, where the positions
	        are character indices into ``text`` and
	        ``chunk_text == text[start_pos:end_pos]``.

	    Raises:
	        ValueError: If ``max_bytes`` is not positive or ``overlap_bytes`` is
	            negative.
	    """
	    if max_bytes <= 0:
	        raise ValueError("max_bytes must be a positive integer")
	    if overlap_bytes < 0:
	        raise ValueError("overlap_bytes must not be negative")

	    text_bytes = text.encode('utf-8')
	    total_bytes = len(text_bytes)

	    if total_bytes <= max_bytes:
	        return [(text, 0, len(text))]

	    def is_continuation(index: int) -> bool:
	        # UTF-8 continuation bytes have the bit pattern 10xxxxxx.
	        return (text_bytes[index] & 0xC0) == 0x80

	    chunks = []
	    byte_start = 0  # always sits on a character boundary
	    char_start = 0  # character index corresponding to byte_start

	    while True:
	        byte_end = min(byte_start + max_bytes, total_bytes)

	        # Avoid cutting in the middle of a UTF-8 character.
	        while byte_start < byte_end < total_bytes and is_continuation(byte_end):
	            byte_end -= 1
	        if byte_end == byte_start:
	            # The next character alone is wider than max_bytes: take it whole
	            # rather than emitting an empty chunk and never advancing.
	            byte_end = byte_start + 1
	            while byte_end < total_bytes and is_continuation(byte_end):
	                byte_end += 1

	        chunk_text = text_bytes[byte_start:byte_end].decode('utf-8')
	        chunks.append((chunk_text, char_start, char_start + len(chunk_text)))

	        # The text is fully covered; stepping back by the overlap here would
	        # re-emit the tail forever.
	        if byte_end >= total_bytes:
	            break

	        # Move forward with overlap, again aligned to a character boundary.
	        next_start = byte_end - overlap_bytes
	        while next_start > byte_start and is_continuation(next_start):
	            next_start -= 1
	        if next_start <= byte_start:
	            # Overlap is too large to advance; continue without overlap.
	            next_start = byte_end

	        # Track the character offset incrementally instead of re-decoding the
	        # whole prefix for every chunk.
	        char_start += len(text_bytes[byte_start:next_start].decode('utf-8'))
	        byte_start = next_start

	    return chunks


	def merge_chunks(chunks: list, overlap_chars: int = 32) -> str:
	    """
	    Merge overlapping chunks back into a single text.

	    Each pair of neighbouring chunks is joined at the middle of their
	    overlap: the first half of the overlap is taken from the earlier chunk
	    and the second half from the later chunk, so the overlapping text
	    appears exactly once in the result.

	    Args:
	        chunks: List of (corrected_text, start_pos, end_pos) tuples. Only the
	            text (index 0) is used; the positions are not consulted.
	        overlap_chars: Approximate character overlap between consecutive
	            chunks. Negative values are treated as 0.

	    Returns:
	        Merged text string ("" for an empty list).
	    """
	    if not chunks:
	        return ""
	    if len(chunks) == 1:
	        return chunks[0][0]

	    overlap_chars = max(overlap_chars, 0)
	    # Characters dropped from the start of each later chunk...
	    head_trim = overlap_chars // 2
	    # ...and from the end of the chunk before it. Together they remove the
	    # overlap once; trimming only one side would duplicate half of it.
	    tail_trim = overlap_chars - head_trim

	    pieces = [chunks[0][0]]
	    for chunk in chunks[1:]:
	        current_text = chunk[0]
	        previous_text = pieces[-1]

	        # Only trim a side that is long enough to keep some text afterwards;
	        # unexpectedly short chunks are kept whole (best effort).
	        if len(previous_text) > tail_trim:
	            pieces[-1] = previous_text[:len(previous_text) - tail_trim]
	        if len(current_text) > head_trim:
	            pieces.append(current_text[head_trim:])
	        else:
	            pieces.append(current_text)

	    return "".join(pieces)


	def preprocess_for_byt5(text: str) -> str:
	    """Prepare ``text`` for ByT5 by running the cleanup steps in order.

	    Applies Unicode normalization, then OCR artifact removal, then
	    whitespace normalization.
	    """
	    pipeline = (normalize_unicode, clean_ocr_artifacts, normalize_whitespace)
	    for step in pipeline:
	        text = step(text)
	    return text


	def preprocess_for_ner(text: str) -> str:
	    """Prepare ``text`` for NER by running the cleanup steps in order.

	    Applies Unicode normalization, then Urdu character normalization, then
	    whitespace normalization.
	    """
	    pipeline = (normalize_unicode, normalize_urdu_characters, normalize_whitespace)
	    for step in pipeline:
	        text = step(text)
	    return text


	def extract_section_blocks(text: str) -> dict:
	    """
	    Parse section-tagged text from VLM output into a structured dictionary.

	    A tag is an upper-case identifier in square brackets (letters and
	    underscores only). Text up to the first tag is stored under "UNTAGGED".
	    Text that precedes a tag on the same line belongs to the section that
	    was open before the tag, and several tags on one line each start their
	    own section. When a section name occurs more than once, its pieces are
	    joined with a newline instead of the later one replacing the earlier.
	    Sections without any non-whitespace content are omitted.

	    Args:
	        text: Text with section tags like [HEADER], [INCIDENT_DETAILS], etc.

	    Returns:
	        Dictionary mapping section names to their text content
	    """
	    sections = {}
	    # Pattern to match section tags
	    section_pattern = re.compile(r'\[([A-Z_]+)\]')

	    def store(name, lines):
	        # Join the collected lines; skip empty sections and append to an
	        # existing entry so repeated tags do not lose earlier content.
	        content = '\n'.join(lines).strip()
	        if not content:
	            return
	        if name in sections:
	            sections[name] = sections[name] + '\n' + content
	        else:
	            sections[name] = content

	    current_section = "UNTAGGED"
	    current_content = []

	    for line in text.split('\n'):
	        matches = list(section_pattern.finditer(line))
	        if not matches:
	            current_content.append(line)
	            continue

	        cursor = 0  # end of the previous tag on this line
	        for match in matches:
	            # Text in front of the tag still belongs to the open section.
	            leading = line[cursor:match.start()].strip()
	            if leading:
	                current_content.append(leading)
	            # Save previous section and start the new one.
	            store(current_section, current_content)
	            current_section = match.group(1)
	            current_content = []
	            cursor = match.end()

	        # Text after the last tag on the line opens the new section's content.
	        remaining = line[cursor:].strip()
	        if remaining:
	            current_content.append(remaining)

	    # Save last section
	    store(current_section, current_content)

	    return sections


	def get_full_text_from_sections(sections: dict) -> str:
	    """Join the section contents into one text, without the section names.

	    Contents are stripped, empty ones are skipped, and the rest are joined
	    in dictionary order with a blank line between them.
	    """
	    stripped_contents = (content.strip() for content in sections.values())
	    return '\n\n'.join(part for part in stripped_contents if part)