Spaces:
Runtime error
Runtime error
| import re | |
| from klpt.preprocess import Preprocess | |
| from klpt.tokenize import Tokenize | |
| import unicodedata | |
| preprocessor_ckb = Preprocess("Sorani", "Arabic", numeral="Arabic") | |
| tokenizer_ckb = Tokenize("Sorani", "Arabic") | |
| unify_numbers = { | |
| "٠|۰": "0", | |
| "١|۱": "1", | |
| "٢|۲": "2", | |
| "٣|۳": "3", | |
| "٤|۴": "4", | |
| "٥|۵": "5", | |
| "٦|۶": "6", | |
| "٧|۷": "7", | |
| "٨|۸": "8", | |
| "٩|۹": "9" | |
| } | |
| # Taken from AsoSoft library | |
| def number_to_word(text): | |
| # convert numbers to latin | |
| for k, v in unify_numbers.items(): | |
| text = re.sub(k, v, text) | |
| text = re.sub(r"([0-9]{1,3})[,،](?=[0-9]{3})", r"\1", text); # remove thousend seperator 12,345,678 => 12345678 | |
| text = re.sub(r"(?<![0-9])-([0-9]+)", r"ناقس \1", text); # negative | |
| text = text.replace("٪", "%") # Replace arabic percent sign with latin | |
| text = re.sub(r"(?<![0-9])% ?([0-9]+)", r"لە سەددا \1", text); # percent sign before | |
| text = re.sub(r"([0-9]+) ?%", r"\1 لە سەد", text); # percent sign after | |
| text = re.sub(r"\$ ?([0-9]+(\.[0-9]+)?)", r"\1 دۆلار", text) # $ querency | |
| text = re.sub(r"£ ?([0-9]+(\.[0-9]+)?)", r"\1 پاوەن", text) # £ querency | |
| text = re.sub(r"€ ?([0-9]+(\.[0-9]+)?)", r"\1 یۆرۆ", text) # € querency | |
| # convert float numbers | |
| text = re.sub(r"([0-9]+)\.([0-9]+)", lambda x: float_name(x.group(1), x.group(2)), text) | |
| # convert remaining integr numbers | |
| text = re.sub(r"([0-9]+)", lambda match: integer_name(match.group(1)), text) | |
| return text | |
| def float_name(integerPart, decimalPart): | |
| zeros = re.search("^0+", decimalPart) | |
| point = " پۆینت " | |
| if(zeros): | |
| point = point + re.sub("0", " سفر ", zeros[0]) | |
| return integer_name(integerPart) + point + integer_name(decimalPart) | |
| ones = ["", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ"] | |
| teens = [ "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" ] | |
| tens = [ "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد"] | |
| hundreds = ["", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد"] | |
| thousands = ["", " هەزار", " ملیۆن", " ملیار", " ترلیۆن", " کوادرلیۆن", " کوینتلیۆن"] | |
| def integer_name(inputInteger): | |
| output = "" | |
| if (inputInteger != "0"): | |
| temp = inputInteger | |
| for i in range(0, len(inputInteger), 3): | |
| matched_numbers = re.findall(r"[0-9]{1,3}$", temp) | |
| currentThree = matched_numbers[0] if matched_numbers else "" | |
| temp = temp[:len(temp) - len(currentThree)] | |
| currentThree = currentThree.rjust(3, '0') | |
| C = int(currentThree[0]) | |
| X = int(currentThree[1]) | |
| I = int(currentThree[2]) | |
| conjunction1 = " و " if (C != 0) and (X != 0 or I != 0) else "" | |
| conjunction2 = " و " if X != 0 and I != 0 else "" | |
| if (X == 1): | |
| currentThree = hundreds[C] + conjunction1 + teens[I] | |
| else: | |
| currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I] | |
| currentThree += "" if currentThree == "" else thousands[i // 3] | |
| conjunction3 = "" if output == "" else " و " | |
| if (currentThree != ""): | |
| output = currentThree + conjunction3 + output | |
| output = output.replace("یەک هەزار", "هەزار") | |
| else: # if input number = 0 | |
| output = "سفر" | |
| return output | |
| def replace_words_in_corpus(sentence): | |
| modified_corpus = [] | |
| words = sentence.split() | |
| modified_words = [] | |
| for word in words: | |
| if word in word_replacements: | |
| modified_words.append(word_replacements[word]) | |
| else: | |
| modified_words.append(word) | |
| modified_sentence = " ".join(modified_words) | |
| return modified_sentence | |
| # put this in a json file | |
| word_replacements = { | |
| "ههڵاڵەەي": "هەڵاڵەی", | |
| "وهەمهەمه": "وهەمهەمه", | |
| "ئهباتههوه": "ئەباتەوە", | |
| "بەخءرایی": "بەخێرایی", | |
| "ئیثانۆڵ": "ئیسانۆڵ", | |
| "عەبدوڵڵاهـ": "عەبدوڵڵا", | |
| "کولاهـ": "کولاه", | |
| "ئاھ": "ئاه", | |
| } | |
| char_replacements = { | |
| '\u200e': '', | |
| '\u200f': '', | |
| '\u200c': '', | |
| 'õ': '', | |
| 'ھ': 'ه' | |
| } | |
| def apply_char_replacements(text: str): | |
| for old, new in char_replacements.items(): | |
| text = text.replace(old, new) | |
| return text | |
| def remove_arabic_alphabets(text: str): | |
| """ | |
| Removes ``Arabic`` words and digits from a ``text`` | |
| Args: | |
| text (str): Sorani text | |
| Returns: | |
| str: ``str`` object with arabic alphabets removed | |
| """ | |
| characters = "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ" | |
| table = str.maketrans({key: None for key in characters}) | |
| return text.translate(table) | |
| def filtered_arabic_characters(): | |
| kurdish_characters = set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ") | |
| arabic_characters = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ") | |
| # Create a new set of Arabic characters without the Kurdish characters | |
| filtered_arabic_characters = arabic_characters - kurdish_characters | |
| return ''.join(filtered_arabic_characters) | |
| def is_arabic_string(text): | |
| """Returns True if the text contains any Arabic characters, False otherwise.""" | |
| # arabic_characters = set("ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْٰٱ") | |
| arabic_characters = filtered_arabic_characters() | |
| for ch in text: | |
| if ch in arabic_characters: | |
| return True | |
| return False | |
| def contains_arabic(text): | |
| arabic_characters = filtered_arabic_characters() | |
| return any(char in arabic_characters for char in text) | |
| def is_english_string(text): | |
| """Returns True if the text contains only English characters, False otherwise.""" | |
| english_pattern = re.compile(r'[a-zA-Z]') | |
| return bool(english_pattern.search(text)) | |
| def remove_english_alphabets(text: str): | |
| """ | |
| Removes ``English`` words and digits from a ``text`` | |
| """ | |
| characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | |
| table = str.maketrans({key: None for key in characters}) | |
| return text.translate(table) | |
| def resolve_ae(text): | |
| """ | |
| This function takes a text input in Central Kurdish (Sorani) script and performs a series of character replacements | |
| to standardize variations in the script. Specifically, it addresses cases where the character 'ە' (Arabic letter | |
| AE) may be used in different contexts. | |
| """ | |
| # First replace all occurrences of 'ه' with 'ە' | |
| text = re.sub("ه", "ە", text) | |
| # Replace specific combinations with 'ها', 'هێ', and 'ه' | |
| text = re.sub("ەا", "ها", text) # Replace ەا with ها | |
| text = re.sub("ەێ", "هێ", text) # Replace ەێ with هێ | |
| text = re.sub("ەۆ", "هۆ", text) # Replace ەۆ with هۆ | |
| # Replace ە (AE) at the beginning of a word with ه (HEH) | |
| text = re.sub(r"\b(ە\w*)", lambda match: "ه" + match.group(1)[1:], text) | |
| # Replace ALEF+AE with ALEF+HEH | |
| text = re.sub("اە", "اه", text) | |
| # Special words should go here before the replcement of 'ە' at the end of the word | |
| # Special case: گەهـ or گاهـ but without the tatweel since tatweel is not a phoneme in Kurdish and it will be a class for the model | |
| text = re.sub(r'\bگەە[-ـ]?\b', "گەه", text) | |
| # Replace 'ەە' at the beginning and end with 'هە' | |
| text = re.sub(r"\bەە|ەە\b", "هە", text) | |
| # Special case if two AEs come before ۆ it should be replaced with AE+HEH | |
| text = re.sub(r"ەە(?=ۆ)", "ەه", text) | |
| # Special case if two AEs come after either و or ب or ئ or ڕ or ق or ز they should be replaced with AE+HEH | |
| text = re.sub(r"(?<=\b[بوئڕقزژ])ەە", "ەه", text) | |
| # The following special case should happen after the previous special case and before the following speciall case | |
| # Special case when two words are together with waw and the the AEs after the waw becomes HEH+AE | |
| text = re.sub(r'(?<=و)ەە(?=\w)', "هە", text) | |
| # Replace Three AEs with AE+HEH+AE (This has to be run before the following special case so words like لەهەوادا will not be ruined) | |
| text = re.sub(r"(?<=\w)ەەە(?=\w)", "ەهە", text) | |
| # Special case if two AEs are in the middle of a word and come before YEH ی or TCHEH چ or و they will be replaced with AE+HEH if the YEH or TCHEH are not at the END of the word | |
| text = re.sub(r"(?<=\w)ەە(?=[چیو]\B)", "ەه", text) | |
| # Replace 'ەە'AE+AE in the middle of a word with HEH+AE | |
| text = re.sub(r"(?<=\w)ەە(?=\w)", "هە", text) | |
| # Replace two AE with spaces in between with AE HEH | |
| text = re.sub("ە ە", "ە ه", text) | |
| # Replace all HEH DOACHASHMEE with HEH | |
| # text = text.replace('ھ', 'ە') | |
| return text | |
| clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!\/](?!\d)") | |
| def remove_punctuation(text): | |
| """Remove all punctuation from string, except if it's between digits""" | |
| return clean_punctuation.sub("", text) | |
| def extract_punctuation(text): | |
| # Initialize an empty string to store the extracted punctuation | |
| extracted_punctuation = "" | |
| # Iterate through each character in the input text | |
| for char in text: | |
| # Check if the character is categorized as punctuation | |
| if unicodedata.category(char).startswith('P'): | |
| extracted_punctuation += char # Add it to the result | |
| return set(extracted_punctuation) | |
| ARABIC_PUCTUATIONS = "،؛۔٫٪؟" | |
| CKB_PUNCTUATIONS = "!.:;?،؛؟«»" + ARABIC_PUCTUATIONS | |
| KURDISH_CHARS = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ") | |
| def contains_non_kurdish_characters(text): | |
| # kurdish_characters = set("ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ") | |
| kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ") | |
| non_kurdish_chars = set(text) - kurdish_characters | |
| return len(non_kurdish_chars) > 0 | |
| def keep_kurdish_characters(text): | |
| kurdish_characters = set(f"{CKB_PUNCTUATIONS}ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنهەوووۆیێ٠١٢٣٤٥٦٧٨٩ ") | |
| cleaned_text = ''.join(char for char in text if char in kurdish_characters) | |
| return cleaned_text | |
| def remove_emojis(text): | |
| emoji_pattern = re.compile("[" | |
| "\U0001F600-\U0001F64F" # Emoticons | |
| "\U0001F300-\U0001F5FF" # Symbols & Pictographs | |
| "\U0001F680-\U0001F6FF" # Transport & Map Symbols | |
| "\U0001F700-\U0001F77F" # Alchemical Symbols | |
| "\U0001F780-\U0001F7FF" # Geometric Shapes Extended | |
| "\U0001F800-\U0001F8FF" # Supplemental Arrows-C | |
| "\U0001F900-\U0001F9FF" # Supplemental Symbols and Pictographs | |
| "\U0001FA00-\U0001FA6F" # Chess Symbols | |
| "\U0001FA70-\U0001FAFF" # Symbols and Pictographs Extended-A | |
| "\U00002702-\U000027B0" # Dingbats | |
| "]+", flags=re.UNICODE) | |
| return emoji_pattern.sub(r'', text) | |
| def remove_language_families(text): | |
| patterns = [ | |
| "[\u1100-\u11FF\u2E80-\u4DBF\u4E00-\u9FFF\uAC00-\uD7AF]+", # Asian scripts | |
| "[\u0000-\u024F]+", # Basic Latin and Latin-1 Supplement | |
| "[\u0400-\u04FF]+", # Cyrillic | |
| "[\u0370-\u03FF]+", # Greek | |
| "[\u0900-\u097F]+", # Devanagari | |
| r"\u0B80-\u0BFF", # Tamil | |
| r"\u4E00-\u9FFF", # Han | |
| r"\u10A0-\u10FF", # Georgian | |
| r"\u0C80-\u0CFF" # Kannada | |
| ] | |
| combined_pattern = re.compile("|".join(patterns)) | |
| cleaned_text = combined_pattern.sub(r'', text) | |
| return cleaned_text | |
| clean_punctuation = re.compile(r"(?<!\d)[.,;:'?!،.؟؛:](?!\d)") | |
| def remove_punctuation(text): | |
| """Remove all punctuation from string, except if it's between digits""" | |
| return clean_punctuation.sub("", text) | |
| def contains_repeated_ngram(window, n): | |
| ngrams = generate_ngrams(window, n) | |
| ngram_set = set(ngrams) | |
| return len(ngrams) != len(ngram_set) | |
| def generate_ngrams(text, n): | |
| words = text.split() | |
| output = [] | |
| for i in range(len(words)- n+1): | |
| output.append(tuple(words[i:i+n])) | |
| return output | |
| def remove_repeated_ngram(text, n): | |
| words = text.split() | |
| output = [] | |
| for i in range(len(words)- n+1): | |
| if not contains_repeated_ngram(" ".join(words[i:i+n]), n): | |
| output.append(words[i]) | |
| return " ".join(output) | |
| def normalize_punctuations(text: str) -> str: | |
| # Replace , with ، | |
| text = text.replace(',', '،') | |
| # Replace ? with ؟ | |
| text = text.replace('?', '؟') | |
| # Replace two or three of the same punctuation marks with a single one | |
| text = re.sub(r'([.,;:?!،؛؟])\1{1,2}', r'\1', text) | |
| # Replace double opening and closing parentheses with guillemets | |
| text = re.sub(r'\(\(', '«', text) | |
| text = re.sub(r'\)\)', '»', text) | |
| # Normalize space around the guillemets and other punctuation marks | |
| text = re.sub(r'\s*«\s*', ' «', text) | |
| text = re.sub(r'\s*»\s*', '» ', text) | |
| # Additional punctuation normalization | |
| text = re.sub(r'\s*([,،؟])\s*', r'\1 ', text) | |
| # Ensure there is no space before a guillemet at the beginning of the text or after a | |
| # guillemet at the end of the text | |
| text = re.sub(r'^\s*«', '«', text) | |
| text = re.sub(r'»\s*$', '»', text) | |
| # If multiple punctuation marks come after each other only keep the first one | |
| # text = re.sub(r'([.!?؟،؛])\1+', r'\1', text) | |
| # if conective punctuation marks come after each other only keep the first one | |
| text = re.sub(r'([.!?؟،؛])\1+', r'\1', text) | |
| # if punctuation marks come after each other with space between them like: ? ? ? keep the first one remove the rest | |
| text = re.sub(r'([.!?؟،؛])\s\1+', r'\1', text) | |
| # Trim leading and trailing spaces and return the normalized text | |
| text = text.strip() | |
| return text | |
| def fix_sentence(sentence): | |
| if sentence.startswith('"') and sentence.endswith('"'): | |
| # we can remove trailing quotation marks as they do not affect the sentence | |
| sentence = sentence[1:-1] | |
| if sentence[-1] not in [".", "?", "!"]: | |
| # append a full-stop to sentences that do not end in punctuation | |
| sentence = sentence + "." | |
| # sentence = sentence[:-1].translate(str.maketrans('', '', string.punctuation)) + sentence[-1] | |
| return sentence | |
| def add_period_abbreviations(text): | |
| abbreviations = set(["پ", "د"]) # Add more abbreviations as needed | |
| # Define a regular expression pattern to match a letter followed by a space and then a word character | |
| pattern = re.compile(r'([{}]) (?=\w)'.format(''.join(abbreviations))) | |
| # Use regex to add periods after the specified abbreviations with a space after the period | |
| text = pattern.sub(r'\1. ', text) | |
| # Add periods after each letter if "د" and "خ" appear together | |
| text = re.sub(r'د\sخ|خ ?د|د\.?خ|خ\.?د', 'د. خ.', text) | |
| # Abbreviated dates | |
| # text = re.sub(r'\b(پ\. ز)\b', r'\1.', text) | |
| return text | |
| def process_text(text): | |
| # text = replace_words_in_corpus(text) | |
| text = resolve_ae(text) | |
| # text = number_to_word(text) | |
| text = preprocessor_ckb.preprocess(text) | |
| # text = normalizer(text).strip() | |
| text = remove_emojis(text) | |
| text = normalize_punctuations(text) | |
| text = fix_sentence(text) | |
| text = apply_char_replacements(text) | |
| return text | |
| if __name__ == "__main__": | |
| # text = "لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت. 'õ'\u200c\u200f\u200e'ھ'" | |
| # print(process_text(text)) | |
| # print(contains_non_kurdish_characters(text)) | |
| # text = "دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟" | |
| # correct = "دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟" | |
| # print("Before punctuation normalization:", text) | |
| # print("After punctuation normalization:", normalize_punctuations(text)) | |
| # print("Correct:\t\t\t", correct) | |
| # print(normalize_punctuations(text) == correct) | |
| # print(normalize_punctuations("ڕەوا بورهان 4 تەمموز ، کوردستانی سلێمانی?!!")) | |
| # print(normalize_punctuations("یانەی کوردژین تکایە چۆن بە شی سە ڕە کی و لاوە کی بۆ مالپە ڕە کە م زیاد بکە م؟؟ ؟ ؟ لە سکرێپە یتی ژومیلە")) | |
| # with open('data/data.ckb.txt', 'r', encoding='utf-8') as src_file: | |
| # source_data = src_file.read() | |
| # unified_data = normalize_punctuations(source_data) | |
| # # Save the unified data to a new file | |
| # with open('data/unified_data.txt', 'w', encoding='utf-8') as file: | |
| # file.writelines(unified_data) | |
| # print("Unified data saved to unified_data.txt") | |
| text = "Hello ((Friend)) Hello , Friend World" | |
| # print(remove_repeated_ngram(text, 2)) | |
| # print(remove_repeated_ngrams(text, )) | |
| print(process_text(text)) |