import re
import unicodedata

import nltk
import numpy as np
import torch

nltk.download('stopwords')
from nltk.corpus import stopwords

# Merge the Russian and English stopword lists; words() reads only its first
# positional argument as a language, so each list must be requested separately.
stop_words = set(stopwords.words('russian') + stopwords.words('english'))

|
def data_preprocessing(text: str) -> str:
    """Lowercase text, strip HTML tags and punctuation, and drop stopwords and standalone digits."""
    text = text.lower()
    text = text.replace('-', ' ').replace('\n', ' ')
    text = re.sub(r'<.*?>', '', text)
    # Keep only letters (L), digits (N), separators (Z) and apostrophes.
    text = ''.join(c for c in text if unicodedata.category(c).startswith(('L', 'N', 'Z')) or c == "'")
    # Drop stopwords and purely numeric tokens in a single pass
    # (the text is already lowercased above).
    return ' '.join(word for word in text.split() if word not in stop_words and not word.isdigit())
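
# Illustrative example (not from the original source): tags, punctuation,
# stopwords, and standalone digits are all removed.
# >>> data_preprocessing('This is <b>a GREAT</b> movie! 10 out of 10')
# 'great movie'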
|
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Return only the (word, count) pairs whose count exceeds n."""
    return [pair for pair in sorted_words if pair[1] > n]
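
# Illustrative example (hypothetical counts): with the default n=10, only
# words seen more than 10 times survive.
# >>> get_words_by_freq([('movie', 25), ('great', 12), ('rare', 3)])
# [('movie', 25), ('great', 12)]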
|
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad each encoded review with zeros to seq_len, truncating longer ones."""
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            # Pad on the left so the meaningful tokens end up at the tail.
            features[i, seq_len - len(review):] = review
        else:
            features[i, :] = review[:seq_len]
    return features
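
# Illustrative example: a short review is left-padded with zeros, a long one
# is truncated to seq_len.
# >>> padding([[1, 2, 3], [1, 2, 3, 4, 5, 6]], seq_len=5)
# array([[0, 0, 1, 2, 3],
#        [1, 2, 3, 4, 5]])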
|
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False,
) -> torch.Tensor:
    """Clean a raw string, encode it with vocab_to_int, and pad it to seq_len."""
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            # Out-of-vocabulary words are silently skipped unless verbose is set.
            if verbose:
                print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
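
# Minimal end-to-end sketch with a toy vocabulary; in practice vocab_to_int
# would come from the training corpus, so this mapping is hypothetical.
if __name__ == '__main__':
    toy_vocab = {'great': 1, 'movie': 2}
    encoded = preprocess_single_string(
        'This is a GREAT movie!',
        seq_len=8,
        vocab_to_int=toy_vocab,
        verbose=True,
    )
    print(encoded)  # tensor([0, 0, 0, 0, 0, 0, 1, 2])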