# Spaces: faizan24 / Text_Extraction (Sleeping)
# File size: 7,064 Bytes
# f094211

"""
Urdu text preprocessing utilities.
Handles Unicode normalization, RTL text cleanup, and Urdu-specific processing.
"""

import re
import unicodedata


def normalize_unicode(text: str) -> str:
    """Return ``text`` converted to Unicode NFC (canonical composed) form."""
    composed = unicodedata.normalize("NFC", text)
    return composed


def remove_diacritics(text: str) -> str:
    """Strip Arabic-script diacritical marks (tashkeel/harakat) from ``text``."""
    # Character class listing every code point that is treated as a diacritic.
    marks = r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED]'
    return re.sub(marks, '', text)


def normalize_whitespace(text: str) -> str:
    """Collapse runs of spaces/tabs, trim every line, and tidy line breaks.

    Args:
        text: Input text, possibly with CRLF/CR line endings.

    Returns:
        The text with ``\\r\\n`` and lone ``\\r`` converted to ``\\n``, each
        run of spaces/tabs reduced to one space, leading and trailing spaces
        removed from every line, runs of three or more newlines reduced to
        two, and the whole result stripped.
    """
    # Line endings must be unified first: with re.MULTILINE, ``$`` only
    # matches before "\n", so trailing spaces in front of "\r\n" would
    # otherwise survive the per-line trim below.
    text = re.sub(r'\r\n?', '\n', text)
    # Replace each run of spaces/tabs with a single space.
    text = re.sub(r'[ \t]+', ' ', text)
    # Remove spaces at line beginnings/ends.
    text = re.sub(r'^ +| +$', '', text, flags=re.MULTILINE)
    # Whitespace-only lines are empty by now, so this also collapses them.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


# Single-character translation table used by normalize_urdu_characters().
_URDU_CHAR_TABLE = str.maketrans({
    '\u0643': '\u06A9',  # ك Arabic Kaf → ک Urdu Keheh
    '\u064A': '\u06CC',  # ي Arabic Yeh → ی Farsi/Urdu Yeh
    # NOTE(review): the next two rewrites are applied unconditionally even
    # though the correct Urdu form is context-dependent — confirm intended.
    '\u0647': '\u06BE',  # ه Arabic Heh → ھ Heh Doachashmee
    '\u0629': '\u06C3',  # ة Teh Marbuta → ۃ Teh Marbuta Goal
    # Arabic-Indic digits (U+0660–U+0669) → ASCII digits.
    **{chr(0x0660 + digit): str(digit) for digit in range(10)},
    # Extended Arabic-Indic digits (U+06F0–U+06F9, the forms used in Urdu
    # text) → ASCII digits, so both digit sets normalize the same way.
    **{chr(0x06F0 + digit): str(digit) for digit in range(10)},
})


def normalize_urdu_characters(text: str) -> str:
    """Normalize variant forms of Urdu characters to standard forms.

    Maps Arabic Kaf/Yeh/Heh/Teh Marbuta to the code points listed in
    ``_URDU_CHAR_TABLE`` and converts both Arabic-Indic and Extended
    Arabic-Indic digits to ASCII ``0``-``9``.

    Args:
        text: Input text.

    Returns:
        The text with every mapped character replaced; all other characters
        are left untouched.
    """
    # Every mapping is one character to one character and no replacement is
    # itself a key, so a single translate() pass equals chained replaces.
    return text.translate(_URDU_CHAR_TABLE)


def clean_ocr_artifacts(text: str) -> str:
    """Delete noise characters that OCR commonly leaves behind.

    The patterns are applied in order; every match is removed outright.
    """
    noise_patterns = (
        # Control characters; tab, newline and carriage return are kept.
        r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]',
        # Runs of three or more dots with no word character on either side.
        r'(?<!\w)\.{3,}(?!\w)',
        # Geometric/card-suit symbols that OCR might produce.
        r'[□■▪▫●○◆◇★☆♦♣♠♥]',
    )
    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned


def segment_into_chunks(text: str, max_bytes: int = 256, overlap_bytes: int = 64) -> list:
    """
    Segment long text into overlapping byte-length chunks for ByT5 processing.

    Chunk boundaries are always moved onto UTF-8 character boundaries, so a
    chunk may be a few bytes shorter than ``max_bytes`` and an overlap a few
    bytes longer than ``overlap_bytes``.

    Args:
        text: Input text to segment
        max_bytes: Maximum bytes per chunk (ByT5 input limit)
        overlap_bytes: Overlap between consecutive chunks

    Returns:
        List of (chunk_text, start_pos, end_pos) tuples, where the positions
        are character offsets into ``text`` (``text[start_pos:end_pos]`` is
        ``chunk_text``). Text that fits in one chunk is returned as a single
        tuple.

    Raises:
        ValueError: If the text needs splitting and ``max_bytes`` is not
            positive or ``overlap_bytes`` is outside ``[0, max_bytes)``.
    """
    text_bytes = text.encode('utf-8')
    total_bytes = len(text_bytes)

    if total_bytes <= max_bytes:
        return [(text, 0, len(text))]

    # Validated only once splitting is required, so single-chunk calls keep
    # accepting whatever arguments they accepted before.
    if max_bytes <= 0:
        raise ValueError("max_bytes must be positive")
    if not 0 <= overlap_bytes < max_bytes:
        raise ValueError("overlap_bytes must satisfy 0 <= overlap_bytes < max_bytes")

    def is_continuation(index: int) -> bool:
        # UTF-8 continuation bytes have the bit pattern 10xxxxxx.
        return (text_bytes[index] & 0xC0) == 0x80

    chunks = []
    byte_start = 0
    char_start = 0  # character offset corresponding to byte_start

    while byte_start < total_bytes:
        byte_end = min(byte_start + max_bytes, total_bytes)

        # Avoid cutting in the middle of a UTF-8 character.
        while byte_start < byte_end < total_bytes and is_continuation(byte_end):
            byte_end -= 1
        if byte_end == byte_start:
            # The window is smaller than one character: take that whole
            # character anyway so the loop always moves forward.
            byte_end = byte_start + 1
            while byte_end < total_bytes and is_continuation(byte_end):
                byte_end += 1

        chunk_text = text_bytes[byte_start:byte_end].decode('utf-8')
        chunks.append((chunk_text, char_start, char_start + len(chunk_text)))

        # The chunk that reaches the end of the text is the last one;
        # stepping back by the overlap here would repeat it forever.
        if byte_end >= total_bytes:
            break

        # Move forward with overlap, again landing on a character boundary.
        next_start = byte_end - overlap_bytes
        while next_start > byte_start and is_continuation(next_start):
            next_start -= 1
        if next_start <= byte_start:
            # The overlap would swallow the whole step; drop it this once.
            next_start = byte_end

        # Both offsets are character boundaries, so this slice decodes
        # cleanly and keeps the character offset in step incrementally.
        char_start += len(text_bytes[byte_start:next_start].decode('utf-8'))
        byte_start = next_start

    return chunks


def _overlap_between(previous, current, default_overlap: int) -> int:
    """Return the character overlap between two consecutive chunk tuples."""
    if len(previous) >= 3 and len(current) >= 3:
        prev_end, cur_start = previous[2], current[1]
        if isinstance(prev_end, int) and isinstance(cur_start, int):
            # Positions are known: the overlap is exactly how far the
            # previous chunk extends past the start of the current one.
            return max(0, prev_end - cur_start)
    return max(0, default_overlap)


def merge_chunks(chunks: list, overlap_chars: int = 32) -> str:
    """
    Merge overlapping chunks back into a single text.

    Each overlap is cut in the middle: the second half of the overlap is
    dropped from the end of the preceding chunk and the first half from the
    start of the following chunk, so overlapping text appears only once.

    Args:
        chunks: List of (corrected_text, start_pos, end_pos) tuples
        overlap_chars: Approximate character overlap between chunks. Only
            used for a pair of chunks whose tuples carry no integer
            positions; otherwise the overlap is taken from the positions.

    Returns:
        Merged text string
    """
    if not chunks:
        return ""

    pieces = [chunks[0][0]]

    for previous, current in zip(chunks, chunks[1:]):
        previous_text = previous[0]
        current_text = current[0]

        # An overlap can never be longer than either chunk it joins.
        overlap = min(
            _overlap_between(previous, current, overlap_chars),
            len(previous_text),
            len(current_text),
        )
        head_cut = overlap // 2
        # The preceding piece may already have lost its own head; never cut
        # more than what is left of it.
        tail_cut = min(overlap - head_cut, len(pieces[-1]))

        if tail_cut:
            pieces[-1] = pieces[-1][:-tail_cut]
        pieces.append(current_text[head_cut:])

    return "".join(pieces)


def preprocess_for_byt5(text: str) -> str:
    """Prepare text for ByT5: NFC normalization, OCR cleanup, whitespace tidy."""
    # Steps run in this exact order; each consumes the previous step's output.
    pipeline = (normalize_unicode, clean_ocr_artifacts, normalize_whitespace)
    for step in pipeline:
        text = step(text)
    return text


def preprocess_for_ner(text: str) -> str:
    """Prepare text for NER: NFC normalization, Urdu character mapping, whitespace tidy."""
    # Steps run in this exact order; each consumes the previous step's output.
    pipeline = (normalize_unicode, normalize_urdu_characters, normalize_whitespace)
    for step in pipeline:
        text = step(text)
    return text


# Matches a section tag such as [HEADER] or [INCIDENT_DETAILS].
_SECTION_TAG_RE = re.compile(r'\[([A-Z_]+)\]')


def _store_section(sections: dict, name: str, lines: list) -> None:
    """Add the joined ``lines`` to ``sections[name]``, skipping blank content."""
    content = '\n'.join(lines).strip()
    if not content:
        return
    if name in sections:
        # A tag that appears again extends its section instead of
        # overwriting what was collected earlier.
        sections[name] = sections[name] + '\n' + content
    else:
        sections[name] = content


def extract_section_blocks(text: str) -> dict:
    """
    Parse section-tagged text from VLM output into a structured dictionary.

    Text that precedes the first tag is stored under ``"UNTAGGED"``. Text
    before a tag on the same line stays with the section that was open, every
    tag on a line starts a new section, and content of a repeated tag is
    appended (newline-separated) to that section's earlier content.

    Args:
        text: Text with section tags like [HEADER], [INCIDENT_DETAILS], etc.

    Returns:
        Dictionary mapping section names to their text content. Sections
        with no non-whitespace content are omitted.
    """
    sections = {}
    current_section = "UNTAGGED"
    current_content = []

    for line in text.split('\n'):
        cursor = 0
        saw_tag = False

        for match in _SECTION_TAG_RE.finditer(line):
            saw_tag = True
            # Anything between the previous tag (or line start) and this tag
            # still belongs to the section that is currently open.
            leading = line[cursor:match.start()].strip()
            if leading:
                current_content.append(leading)
            _store_section(sections, current_section, current_content)

            current_section = match.group(1)
            current_content = []
            cursor = match.end()

        if saw_tag:
            # Text after the last tag on the line opens the new section.
            trailing = line[cursor:].strip()
            if trailing:
                current_content.append(trailing)
        else:
            current_content.append(line)

    # Save last section
    _store_section(sections, current_section, current_content)
    return sections


def get_full_text_from_sections(sections: dict) -> str:
    """Join the non-blank section bodies, in dict order, with blank lines between them."""
    trimmed = (body.strip() for body in sections.values())
    return '\n\n'.join(body for body in trimmed if body)