# Page-scrape residue: Spaces: Sleeping; File size: 7,064 Bytes; f094211
"""
Urdu text preprocessing utilities.
Handles Unicode normalization, RTL text cleanup, and Urdu-specific processing.
"""
import re
import unicodedata
def normalize_unicode(text: str) -> str:
    """Return ``text`` converted to Unicode Normalization Form C (NFC)."""
    # NFC composes base characters with their combining marks wherever a
    # precomposed code point exists, so equivalent spellings compare equal.
    composed = unicodedata.normalize("NFC", text)
    return composed
def remove_diacritics(text: str) -> str:
    """Strip Arabic-script diacritical marks (tashkeel/harakat) from ``text``."""
    # Character class listing the code point ranges treated as diacritics.
    marks = r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED]'
    return re.sub(marks, '', text)
def normalize_whitespace(text: str) -> str:
    """
    Collapse runs of spaces/tabs and tidy up line breaks.

    Steps, in order:
      1. Convert Windows line endings (CRLF) to LF.
      2. Collapse every run of spaces and tabs into one space.
      3. Drop spaces at the beginning and end of each line.
      4. Reduce three or more consecutive newlines to exactly two.
      5. Strip leading/trailing whitespace from the whole text.

    Only ASCII space and tab are collapsed; other Unicode space or
    zero-width characters are left untouched.

    Args:
        text: Input text.
    Returns:
        The whitespace-normalized text.
    """
    # CRLF must be converted first: with a trailing "\r" still present, the
    # multiline `$` below would not match right after the spaces, leaving
    # trailing blanks on every Windows-style line.
    text = text.replace('\r\n', '\n')
    # Replace multiple spaces/tabs with a single space
    text = re.sub(r'[ \t]+', ' ', text)
    # Remove spaces at line beginnings/ends
    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
    # Limit consecutive blank lines to one
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def normalize_urdu_characters(text: str) -> str:
    """
    Normalize variant forms of Urdu characters to standard forms.

    Arabic code points that commonly appear in Urdu text are mapped to their
    Urdu counterparts, and both Arabic-Indic (U+0660-U+0669) and Extended
    Arabic-Indic (U+06F0-U+06F9) digits are converted to ASCII digits.

    Args:
        text: Input text.
    Returns:
        Text with the mapped characters replaced; all others are unchanged.
    """
    mapping = {
        # Arabic Kaf -> Urdu Kaf (Keheh)
        0x0643: '\u06A9',
        # Arabic Yeh -> Urdu (Farsi) Yeh
        0x064A: '\u06CC',
        # Arabic Heh -> Heh Doachashmee.
        # NOTE(review): Urdu's plain heh is normally encoded as U+06C1 (heh
        # goal) while U+06BE marks aspiration; the original mapping is kept
        # unchanged here - confirm which target downstream models expect.
        0x0647: '\u06BE',
        # Teh Marbuta -> Teh Marbuta Goal (context-dependent)
        0x0629: '\u06C3',
    }
    # Digits to ASCII (useful for legal section numbers). The extended block
    # (U+06F0..) holds the digit forms used in Urdu, so it is covered as
    # well as the Arabic-Indic block (U+0660..).
    for value in range(10):
        mapping[0x0660 + value] = str(value)
        mapping[0x06F0 + value] = str(value)
    # No replacement value is itself a key, so one translate pass gives the
    # same result as applying the replacements one after another.
    return text.translate(mapping)
def clean_ocr_artifacts(text: str) -> str:
    """Strip typical OCR noise (control characters, dot runs, stray symbols)."""
    # The patterns are applied one after another, in this order.
    noise_patterns = (
        # Control characters, except tab, newline and carriage return
        r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]',
        # Runs of three or more dots that do not touch a word character
        r'(?<!\w)\.{3,}(?!\w)',
        # Geometric and card-suit symbols that OCR tends to hallucinate
        r'[□■▪▫●○◆◇★☆♦♣♠♥]',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text
def segment_into_chunks(text: str, max_bytes: int = 256, overlap_bytes: int = 64) -> list:
    """
    Segment long text into overlapping byte-length chunks for ByT5 processing.

    Chunk boundaries are always placed on UTF-8 character boundaries, so no
    character is ever split between two chunks. Every iteration is guaranteed
    to advance, so the function terminates for any valid arguments.

    Args:
        text: Input text to segment
        max_bytes: Maximum bytes per chunk (ByT5 input limit). A chunk only
            exceeds this when a single character is wider than ``max_bytes``.
        overlap_bytes: Overlap between consecutive chunks. The overlap is
            dropped for a step when honouring it would not move forward
            (e.g. ``overlap_bytes >= max_bytes``).
    Returns:
        List of (chunk_text, start_pos, end_pos) tuples, where the positions
        are character offsets into ``text`` (``text[start_pos:end_pos]``
        equals ``chunk_text``).
    Raises:
        ValueError: If ``max_bytes`` is not positive or ``overlap_bytes`` is
            negative.
    """
    if max_bytes < 1:
        raise ValueError("max_bytes must be a positive integer")
    if overlap_bytes < 0:
        raise ValueError("overlap_bytes must be non-negative")

    text_bytes = text.encode('utf-8')
    total_bytes = len(text_bytes)
    if total_bytes <= max_bytes:
        return [(text, 0, len(text))]

    def is_continuation(index: int) -> bool:
        # UTF-8 continuation bytes have the bit pattern 10xxxxxx.
        return (text_bytes[index] & 0xC0) == 0x80

    chunks = []
    byte_start = 0
    char_start = 0
    while True:
        byte_end = min(byte_start + max_bytes, total_bytes)
        # Avoid cutting in the middle of a UTF-8 character
        while byte_start < byte_end < total_bytes and is_continuation(byte_end):
            byte_end -= 1
        if byte_end == byte_start:
            # max_bytes is narrower than the next character: take that whole
            # character anyway rather than emitting an empty chunk forever.
            byte_end = byte_start + 1
            while byte_end < total_bytes and is_continuation(byte_end):
                byte_end += 1

        # Both ends sit on character boundaries, so strict decoding is safe.
        chunk_text = text_bytes[byte_start:byte_end].decode('utf-8')
        char_end = char_start + len(chunk_text)
        chunks.append((chunk_text, char_start, char_end))

        # The chunk that reaches the end of the text is the last one. Without
        # this stop the start would be pulled back by the overlap forever.
        if byte_end >= total_bytes:
            break

        # Move forward with overlap, again aligned to a character boundary
        next_start = byte_end - overlap_bytes
        while next_start > byte_start and is_continuation(next_start):
            next_start -= 1
        if next_start <= byte_start:
            # The overlap would stall (or move backwards); continue without it.
            next_start = byte_end

        # Track the character offset incrementally: only the overlapped tail
        # needs decoding, not the whole prefix of the text.
        overlap_text = text_bytes[next_start:byte_end].decode('utf-8')
        char_start = char_end - len(overlap_text)
        byte_start = next_start
    return chunks
def merge_chunks(chunks: list, overlap_chars: int = 32) -> str:
    """
    Merge overlapping chunks back into a single text.

    Each seam is cut in the middle of the overlap: the trailing half of the
    overlap is removed from the earlier chunk and the leading half from the
    later chunk, so the overlapped region appears once in the result.

    Only the text element of each tuple is used; the positions are ignored
    because corrected text may differ in length from the original.

    Args:
        chunks: List of (corrected_text, start_pos, end_pos) tuples
        overlap_chars: Approximate character overlap between chunks.
            Negative values are treated as 0 (plain concatenation).
    Returns:
        Merged text string
    """
    if not chunks:
        return ""
    if len(chunks) == 1:
        return chunks[0][0]

    overlap = max(overlap_chars, 0)
    # For an odd overlap the extra character is trimmed from the earlier
    # chunk, so head_cut + tail_cut always equals the full overlap.
    head_cut = overlap // 2
    tail_cut = overlap - head_cut

    last_index = len(chunks) - 1
    pieces = []
    for index, chunk in enumerate(chunks):
        piece = chunk[0]
        head = head_cut if index > 0 else 0
        tail = tail_cut if index < last_index else 0
        # Chunks too short to trim are kept whole rather than emptied.
        if len(piece) > head + tail:
            piece = piece[head:len(piece) - tail]
        pieces.append(piece)
    return "".join(pieces)
def preprocess_for_byt5(text: str) -> str:
    """Prepare text for ByT5: NFC normalization, OCR cleanup, whitespace tidy-up."""
    # Steps run in order: the output of each one feeds the next.
    for step in (normalize_unicode, clean_ocr_artifacts, normalize_whitespace):
        text = step(text)
    return text
def preprocess_for_ner(text: str) -> str:
    """Prepare text for NER: NFC normalization, Urdu character mapping, whitespace tidy-up."""
    # Steps run in order: the output of each one feeds the next.
    for step in (normalize_unicode, normalize_urdu_characters, normalize_whitespace):
        text = step(text)
    return text
def extract_section_blocks(text: str) -> dict:
    """
    Parse section-tagged text from VLM output into a structured dictionary.

    A tag is an upper-case/underscore name in square brackets, e.g.
    ``[HEADER]``. Text belongs to the most recent tag seen; text before the
    first tag is stored under ``"UNTAGGED"``.

    Handling of less regular input:
      * Text preceding a tag on the same line is kept and assigned to the
        section that was open before that tag.
      * Several tags on one line each start their own section.
      * If a section name occurs more than once, its contents are joined
        with newlines instead of the later one replacing the earlier one.

    Args:
        text: Text with section tags like [HEADER], [INCIDENT_DETAILS], etc.
    Returns:
        Dictionary mapping section names to their text content. Sections
        whose content is empty after stripping are omitted.
    """
    # Pattern to match section tags
    tag_pattern = re.compile(r'\[([A-Z_]+)\]')

    collected = {}  # section name -> list of content lines
    current_section = "UNTAGGED"
    for line in text.split('\n'):
        cursor = 0
        saw_tag = False
        for match in tag_pattern.finditer(line):
            saw_tag = True
            # Text in front of the tag still belongs to the open section.
            leading = line[cursor:match.start()].strip()
            if leading:
                collected.setdefault(current_section, []).append(leading)
            current_section = match.group(1)
            cursor = match.end()

        if saw_tag:
            # Text after the last tag on the line opens the new section.
            remaining = line[cursor:].strip()
            if remaining:
                collected.setdefault(current_section, []).append(remaining)
        else:
            # Untagged lines are kept verbatim to preserve inner layout.
            collected.setdefault(current_section, []).append(line)

    sections = {}
    for name, lines in collected.items():
        content = '\n'.join(lines).strip()
        if content:
            sections[name] = content
    return sections
def get_full_text_from_sections(sections: dict) -> str:
    """Join the non-empty section contents, in dict order, separated by blank lines."""
    # Section names (the tags) are ignored; only the contents are emitted.
    trimmed = (content.strip() for content in sections.values())
    return '\n\n'.join(part for part in trimmed if part)
|