# File size: 7,064 Bytes
# f094211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
Urdu text preprocessing utilities.
Handles Unicode normalization, RTL text cleanup, and Urdu-specific processing.
"""

import re
import unicodedata


def normalize_unicode(text: str) -> str:
    """Return ``text`` converted to Unicode Normalization Form C (NFC)."""
    composed = unicodedata.normalize("NFC", text)
    return composed


def remove_diacritics(text: str) -> str:
    """Strip Arabic-script diacritical marks (tashkeel/harakat) from ``text``.

    Every character that falls inside the mark ranges listed in the
    character class below is deleted; all other characters are kept as-is.
    """
    # Character class listing the Arabic-block mark ranges to delete.
    marks = r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED]'
    return re.sub(marks, '', text)


def normalize_whitespace(text: str) -> str:
    """Collapse runs of spaces/tabs and tidy line breaks.

    Steps, in order:
      1. Windows line endings (CRLF) become a single newline.
      2. Every run of spaces and/or tabs becomes one space.
      3. Spaces at the start and end of each line are removed.
      4. Three or more consecutive newlines are reduced to two.
      5. Leading/trailing whitespace of the whole text is stripped.

    Only the space and tab characters are collapsed; other Unicode
    whitespace or zero-width characters are left untouched.

    Args:
        text: Input text.

    Returns:
        The whitespace-normalized text.
    """
    # CRLF must be converted first: otherwise the '\r' sits between a
    # line's trailing spaces and the '\n', and the per-line trim below
    # cannot reach those spaces.
    text = text.replace('\r\n', '\n')
    # Replace each run of spaces/tabs with a single space
    text = re.sub(r'[ \t]+', ' ', text)
    # Remove spaces at line beginnings/ends
    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
    # Keep at most one blank line between paragraphs
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


def normalize_urdu_characters(text: str) -> str:
    """Map Arabic letter variants and Arabic-Indic digits to the forms used here.

    Each character listed in the table below is replaced wherever it
    occurs; characters not in the table are left unchanged.
    """
    char_map = {
        '\u0643': '\u06A9',  # Arabic Kaf ك → Urdu Kaf ک
        '\u064A': '\u06CC',  # Arabic Yeh ي → Urdu Yeh ی
        # NOTE(review): every U+0647 is rewritten to U+06BE unconditionally,
        # although the right target is context-dependent — confirm intent.
        '\u0647': '\u06BE',
        # Teh Marbuta ة → ۃ (also applied unconditionally)
        '\u0629': '\u06C3',
        # Arabic-Indic digits → ASCII digits (useful for legal section numbers)
        '٠': '0', '١': '1', '٢': '2', '٣': '3', '٤': '4',
        '٥': '5', '٦': '6', '٧': '7', '٨': '8', '٩': '9',
    }
    # No replacement value is itself a key, so a single translate pass gives
    # the same result as applying the substitutions one after another.
    return text.translate(str.maketrans(char_map))


def clean_ocr_artifacts(text: str) -> str:
    """Delete characters and sequences that are typical OCR noise.

    The patterns are applied in order and every match is removed.
    """
    noise_patterns = (
        # ASCII control characters and DEL; tab, newline and carriage
        # return are not in the class and therefore survive.
        r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]',
        # Runs of three or more dots that are not directly preceded or
        # followed by a word character.
        r'(?<!\w)\.{3,}(?!\w)',
        # Box, bullet, star and card-suit symbols.
        r'[□■▪▫●○◆◇★☆♦♣♠♥]',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text


def segment_into_chunks(text: str, max_bytes: int = 256, overlap_bytes: int = 64) -> list:
    """
    Segment long text into overlapping byte-length chunks for ByT5 processing.

    Chunk boundaries are always moved back onto UTF-8 character boundaries,
    so no character is ever split. Each chunk is at most ``max_bytes`` long,
    except when a single character is wider than ``max_bytes``; that
    character is then emitted whole as its own chunk.

    Args:
        text: Input text to segment
        max_bytes: Maximum bytes per chunk (ByT5 input limit)
        overlap_bytes: Overlap between consecutive chunks. If the overlap
            would prevent forward progress (e.g. it is >= ``max_bytes``),
            the next chunk starts where the previous one ended instead.

    Returns:
        List of (chunk_text, start_pos, end_pos) tuples, where the positions
        are character offsets into ``text`` (end exclusive).

    Raises:
        ValueError: If the text has to be split and ``max_bytes`` is < 1.
    """
    text_bytes = text.encode('utf-8')
    total_bytes = len(text_bytes)

    if total_bytes <= max_bytes:
        return [(text, 0, len(text))]
    if max_bytes < 1:
        raise ValueError("max_bytes must be a positive integer")

    def is_continuation(index: int) -> bool:
        # UTF-8 continuation bytes have the bit pattern 10xxxxxx.
        return (text_bytes[index] & 0xC0) == 0x80

    chunks = []
    byte_start = 0
    char_start = 0  # character offset of byte_start, tracked incrementally

    while byte_start < total_bytes:
        byte_end = min(byte_start + max_bytes, total_bytes)

        # Avoid cutting in the middle of a UTF-8 character
        while byte_start < byte_end < total_bytes and is_continuation(byte_end):
            byte_end -= 1
        if byte_end == byte_start:
            # The next character alone is wider than max_bytes: take it whole
            # so the loop always advances.
            byte_end += 1
            while byte_end < total_bytes and is_continuation(byte_end):
                byte_end += 1

        chunk_text = text_bytes[byte_start:byte_end].decode('utf-8')
        chunks.append((chunk_text, char_start, char_start + len(chunk_text)))

        # The chunk that reaches the end of the text is the last one; stepping
        # back by the overlap here would re-emit the tail forever.
        if byte_end >= total_bytes:
            break

        # Move forward with overlap
        next_start = byte_end - overlap_bytes
        if next_start >= total_bytes:
            break
        # Avoid starting in the middle of a UTF-8 character
        while next_start > byte_start and is_continuation(next_start):
            next_start -= 1
        if next_start <= byte_start:
            # Overlap too large to make progress: continue without overlap.
            next_start = byte_end

        # Both offsets sit on character boundaries, so this slice decodes cleanly.
        char_start += len(text_bytes[byte_start:next_start].decode('utf-8'))
        byte_start = next_start

    return chunks


def _chunk_overlap(previous, current, default: int) -> int:
    """Return how many characters ``previous`` and ``current`` share.

    Uses the recorded positions (``previous[2] - current[1]``) when both are
    present and numeric; otherwise falls back to ``default``. Never negative.
    """
    try:
        overlap = previous[2] - current[1]
    except (IndexError, TypeError):
        overlap = default
    return max(overlap, 0)


def merge_chunks(chunks: list, overlap_chars: int = 32) -> str:
    """
    Merge overlapping chunks back into a single text.

    Each overlap region is split at its midpoint: the first part is kept
    from the earlier chunk and the second part from the later chunk, so the
    shared text appears exactly once in the result.

    Args:
        chunks: List of (corrected_text, start_pos, end_pos) tuples
        overlap_chars: Character overlap assumed between neighbouring chunks
            when their positions are missing or not numeric. When positions
            are present, the overlap is computed from them instead.

    Returns:
        Merged text string
    """
    if not chunks:
        return ""
    if len(chunks) == 1:
        return chunks[0][0]

    texts = [chunk[0] for chunk in chunks]
    head_drop = [0] * len(chunks)  # chars to discard from the start of chunk i
    tail_drop = [0] * len(chunks)  # chars to discard from the end of chunk i

    for i in range(1, len(chunks)):
        overlap = _chunk_overlap(chunks[i - 1], chunks[i], overlap_chars)
        # Corrected text may be shorter than the recorded span; never trim
        # more than either neighbour actually contains.
        overlap = min(overlap, len(texts[i - 1]), len(texts[i]))
        tail_drop[i - 1] = overlap // 2
        head_drop[i] = overlap - overlap // 2

    parts = []
    for text, head, tail in zip(texts, head_drop, tail_drop):
        # Guard against head/tail trims crossing on very short chunks.
        end = max(head, len(text) - tail)
        parts.append(text[head:end])
    return "".join(parts)


def preprocess_for_byt5(text: str) -> str:
    """Prepare ``text`` for ByT5 by running the cleanup steps in order.

    Order: Unicode normalization, OCR-artifact removal, whitespace
    normalization.
    """
    pipeline = (normalize_unicode, clean_ocr_artifacts, normalize_whitespace)
    for step in pipeline:
        text = step(text)
    return text


def preprocess_for_ner(text: str) -> str:
    """Prepare ``text`` for NER by running the normalization steps in order.

    Order: Unicode normalization, Urdu character normalization, whitespace
    normalization.
    """
    pipeline = (normalize_unicode, normalize_urdu_characters, normalize_whitespace)
    for step in pipeline:
        text = step(text)
    return text


def extract_section_blocks(text: str) -> dict:
    """
    Parse section-tagged text from VLM output into a structured dictionary.

    The text is scanned line by line. The first ``[TAG]`` on a line (upper
    case letters and underscores only) starts a new section; anything before
    the tag on that line stays with the previous section and anything after
    it becomes the first content of the new one. Lines seen before any tag
    are collected under ``"UNTAGGED"``. If the same tag occurs more than
    once, the contents are joined with a newline instead of being
    overwritten. Sections whose content is empty after stripping are omitted.

    Args:
        text: Text with section tags like [HEADER], [INCIDENT_DETAILS], etc.

    Returns:
        Dictionary mapping section names to their text content
    """
    sections = {}
    # Pattern to match section tags
    section_pattern = re.compile(r'\[([A-Z_]+)\]')

    def store(name: str, lines: list) -> None:
        """Record the collected lines under ``name`` if they hold any text."""
        content = '\n'.join(lines).strip()
        if not content:
            return
        if name in sections:
            # Repeated tag: keep the earlier content rather than losing it.
            sections[name] += '\n' + content
        else:
            sections[name] = content

    current_section = "UNTAGGED"
    current_content = []

    for line in text.split('\n'):
        match = section_pattern.search(line)
        if match is None:
            current_content.append(line)
            continue

        # Text in front of the tag still belongs to the section being closed.
        leading = line[:match.start()].strip()
        if leading:
            current_content.append(leading)
        store(current_section, current_content)

        # Start the new section, seeded with any text after the tag.
        current_section = match.group(1)
        trailing = line[match.end():].strip()
        current_content = [trailing] if trailing else []

    # Save last section
    store(current_section, current_content)

    return sections


def get_full_text_from_sections(sections: dict) -> str:
    """Join the stripped, non-empty section contents with blank lines.

    Section names are not included in the output; contents appear in the
    dictionary's iteration order.
    """
    stripped = (content.strip() for content in sections.values())
    return '\n\n'.join(part for part in stripped if part)