| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """ |
| | If you use the VADER sentiment analysis tools, please cite: |
| | |
| | Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for |
| | Sentiment Analysis of Social Media Text. Eighth International Conference on |
| | Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. |
| | """ |
| |
|
| | import math |
| | import re |
| | import string |
| | from itertools import product |
| |
|
| | import nltk.data |
| | from nltk.util import pairwise |
| |
|
| |
|
| | class VaderConstants: |
| | """ |
| | A class to keep the Vader lists and constants. |
| | """ |
| |
|
| | |
| | |
| | B_INCR = 0.293 |
| | B_DECR = -0.293 |
| |
|
| | |
| | |
| | C_INCR = 0.733 |
| |
|
| | N_SCALAR = -0.74 |
| |
|
| | NEGATE = { |
| | "aint", |
| | "arent", |
| | "cannot", |
| | "cant", |
| | "couldnt", |
| | "darent", |
| | "didnt", |
| | "doesnt", |
| | "ain't", |
| | "aren't", |
| | "can't", |
| | "couldn't", |
| | "daren't", |
| | "didn't", |
| | "doesn't", |
| | "dont", |
| | "hadnt", |
| | "hasnt", |
| | "havent", |
| | "isnt", |
| | "mightnt", |
| | "mustnt", |
| | "neither", |
| | "don't", |
| | "hadn't", |
| | "hasn't", |
| | "haven't", |
| | "isn't", |
| | "mightn't", |
| | "mustn't", |
| | "neednt", |
| | "needn't", |
| | "never", |
| | "none", |
| | "nope", |
| | "nor", |
| | "not", |
| | "nothing", |
| | "nowhere", |
| | "oughtnt", |
| | "shant", |
| | "shouldnt", |
| | "uhuh", |
| | "wasnt", |
| | "werent", |
| | "oughtn't", |
| | "shan't", |
| | "shouldn't", |
| | "uh-uh", |
| | "wasn't", |
| | "weren't", |
| | "without", |
| | "wont", |
| | "wouldnt", |
| | "won't", |
| | "wouldn't", |
| | "rarely", |
| | "seldom", |
| | "despite", |
| | } |
| |
|
| | |
| | |
| |
|
| | BOOSTER_DICT = { |
| | "absolutely": B_INCR, |
| | "amazingly": B_INCR, |
| | "awfully": B_INCR, |
| | "completely": B_INCR, |
| | "considerably": B_INCR, |
| | "decidedly": B_INCR, |
| | "deeply": B_INCR, |
| | "effing": B_INCR, |
| | "enormously": B_INCR, |
| | "entirely": B_INCR, |
| | "especially": B_INCR, |
| | "exceptionally": B_INCR, |
| | "extremely": B_INCR, |
| | "fabulously": B_INCR, |
| | "flipping": B_INCR, |
| | "flippin": B_INCR, |
| | "fricking": B_INCR, |
| | "frickin": B_INCR, |
| | "frigging": B_INCR, |
| | "friggin": B_INCR, |
| | "fully": B_INCR, |
| | "fucking": B_INCR, |
| | "greatly": B_INCR, |
| | "hella": B_INCR, |
| | "highly": B_INCR, |
| | "hugely": B_INCR, |
| | "incredibly": B_INCR, |
| | "intensely": B_INCR, |
| | "majorly": B_INCR, |
| | "more": B_INCR, |
| | "most": B_INCR, |
| | "particularly": B_INCR, |
| | "purely": B_INCR, |
| | "quite": B_INCR, |
| | "really": B_INCR, |
| | "remarkably": B_INCR, |
| | "so": B_INCR, |
| | "substantially": B_INCR, |
| | "thoroughly": B_INCR, |
| | "totally": B_INCR, |
| | "tremendously": B_INCR, |
| | "uber": B_INCR, |
| | "unbelievably": B_INCR, |
| | "unusually": B_INCR, |
| | "utterly": B_INCR, |
| | "very": B_INCR, |
| | "almost": B_DECR, |
| | "barely": B_DECR, |
| | "hardly": B_DECR, |
| | "just enough": B_DECR, |
| | "kind of": B_DECR, |
| | "kinda": B_DECR, |
| | "kindof": B_DECR, |
| | "kind-of": B_DECR, |
| | "less": B_DECR, |
| | "little": B_DECR, |
| | "marginally": B_DECR, |
| | "occasionally": B_DECR, |
| | "partly": B_DECR, |
| | "scarcely": B_DECR, |
| | "slightly": B_DECR, |
| | "somewhat": B_DECR, |
| | "sort of": B_DECR, |
| | "sorta": B_DECR, |
| | "sortof": B_DECR, |
| | "sort-of": B_DECR, |
| | } |
| |
|
| | |
| | SPECIAL_CASE_IDIOMS = { |
| | "the shit": 3, |
| | "the bomb": 3, |
| | "bad ass": 1.5, |
| | "yeah right": -2, |
| | "cut the mustard": 2, |
| | "kiss of death": -1.5, |
| | "hand to mouth": -2, |
| | } |
| |
|
| | |
| | REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]") |
| |
|
| | PUNC_LIST = [ |
| | ".", |
| | "!", |
| | "?", |
| | ",", |
| | ";", |
| | ":", |
| | "-", |
| | "'", |
| | '"', |
| | "!!", |
| | "!!!", |
| | "??", |
| | "???", |
| | "?!?", |
| | "!?!", |
| | "?!?!", |
| | "!?!?", |
| | ] |
| |
|
| | def __init__(self): |
| | pass |
| |
|
| | def negated(self, input_words, include_nt=True): |
| | """ |
| | Determine if input contains negation words |
| | """ |
| | neg_words = self.NEGATE |
| | if any(word.lower() in neg_words for word in input_words): |
| | return True |
| | if include_nt: |
| | if any("n't" in word.lower() for word in input_words): |
| | return True |
| | for first, second in pairwise(input_words): |
| | if second.lower() == "least" and first.lower() != "at": |
| | return True |
| | return False |
| |
|
| | def normalize(self, score, alpha=15): |
| | """ |
| | Normalize the score to be between -1 and 1 using an alpha that |
| | approximates the max expected value |
| | """ |
| | norm_score = score / math.sqrt((score * score) + alpha) |
| | return norm_score |
| |
|
| | def scalar_inc_dec(self, word, valence, is_cap_diff): |
| | """ |
| | Check if the preceding words increase, decrease, or negate/nullify the |
| | valence |
| | """ |
| | scalar = 0.0 |
| | word_lower = word.lower() |
| | if word_lower in self.BOOSTER_DICT: |
| | scalar = self.BOOSTER_DICT[word_lower] |
| | if valence < 0: |
| | scalar *= -1 |
| | |
| | if word.isupper() and is_cap_diff: |
| | if valence > 0: |
| | scalar += self.C_INCR |
| | else: |
| | scalar -= self.C_INCR |
| | return scalar |
| |
|
| |
|
class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        """
        :param text: the text to analyze; ``bytes`` are decoded as UTF-8 and
            any other non-str value is coerced with ``str()``
        :param punc_list: punctuation tokens that may cling to word edges
        :param regex_remove_punctuation: compiled regex matching punctuation
            characters to strip
        """
        if not isinstance(text, str):
            # BUGFIX: the original `str(text.encode("utf-8"))` always raised
            # in Python 3 — only str has .encode(), and this branch is only
            # reached for non-str input. Decode bytes; coerce everything else.
            text = text.decode("utf-8") if isinstance(text, bytes) else str(text)
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # True iff SOME but not all tokens are ALL CAPS (an emphasis signal;
        # all-caps throughout carries no differential information).
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        i.e. each word with exactly one PUNC_LIST token glued before or after
        it, mapped back to the bare word.
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # Removes punctuation (but loses emoticons & contractions).
        words_only = no_punc_text.split()
        # Remove singletons — one-char tokens are never mapped.
        words_only = {w for w in words_only if len(w) > 1}
        # product() gives every (punct, word) and (word, punct) pairing.
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing puncutation.
        Leaves contractions and most emoticons.
        Does not preserve punc-plus-letter emoticons (e.g. :D).
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        # Drop one-character tokens (lone punctuation, stray letters).
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes

    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS.

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        # Strictly between 0 and len(words): a genuine mix of cases.
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different
| |
|
| |
|
class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="config/vader_lexicon.txt",
    ):
        """
        :param lexicon_file: resource path of the tab-separated VADER lexicon,
            resolved and read through ``nltk.data.load``.
        """
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary mapping each token to its
        mean valence rating (a float).
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            line = line.strip()
            if not line:
                # ROBUSTNESS: skip blank lines (e.g. a trailing newline);
                # the original unpacked split("\t")[0:2] and raised ValueError.
                continue
            (word, measure) = line.split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def polarity_scores(self, text):
        """
        Return a float for sentiment strength based on the input text.
        Positive values are positive valence, negative value are negative
        valence.

        :returns: dict with keys ``neg``, ``neu``, ``pos`` (proportions) and
            ``compound`` (normalized overall score in [-1, 1])
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # BUGFIX: iterate with enumerate instead of list.index(item) —
        # .index() always returns the FIRST occurrence, so a repeated word
        # was scored with the wrong context window every time after the first.
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # Boosters ("very", and the two-word "kind of") contribute no
            # valence of their own; they only modify neighbouring words.
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the valence of token ``item`` at position ``i`` — applying
        ALL-CAPS emphasis, up-to-three-back booster/negation context and
        idiom checks — and append it to ``sentiments``.
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # Get the sentiment valence from the lexicon.
            valence = self.lexicon[item_lowercase]

            # ALL-CAPS emphasis counts only when the text mixes cases.
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            # Look back up to three tokens for boosters/dampeners/negations,
            # skipping any preceding token that is itself in the lexicon.
            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    # Dampen the scalar the farther back the modifier sits.
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate the valence when the word is preceded by "least", unless the
        phrase is "at least" / "very least".
        """
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence

    def _but_check(self, words_and_emoticons, sentiments):
        """
        "but" shifts emphasis: halve sentiments before the first "but" and
        amplify those after it by 1.5.
        """
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments

    def _idioms_check(self, valence, words_and_emoticons, i):
        """
        Replace the valence with a fixed rating when position ``i`` sits
        inside a special-case idiom; also dampen booster-phrase bigrams.
        """
        # Windows ending at the current token.
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        # Windows starting at the current token (idiom may continue forward).
        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # Check for booster/dampener bi-grams such as "sort of" or "kind of".
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        """
        Handle negation in the look-back window at distance ``start_i + 1``,
        with special amplification for "never so/this ...".
        """
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence

    def _punctuation_emphasis(self, sum_s, text):
        """
        Total emphasis contributed by exclamation and question marks.
        (``sum_s`` is unused here; kept for interface compatibility.)
        """
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        """
        Check for added emphasis resulting from exclamation points
        (up to 4 of them).
        """
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # Empirically derived mean sentiment intensity rating increase
        # for exclamation points.
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    def _amplify_qm(self, text):
        """
        Check for added emphasis resulting from question marks
        (2 or 3+).
        """
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # Empirically derived mean sentiment intensity rating
                # increase for question marks.
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    def _sift_sentiment_scores(self, sentiments):
        """
        Separate positive versus negative sentiment scores.

        :returns: (pos_sum, neg_sum, neu_count); each non-zero score is
            padded by +/-1 so lone hits still register.
        """
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Fold the per-token sentiments plus punctuation emphasis into the
        final ``neg``/``neu``/``pos``/``compound`` dictionary.
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # Compute and add emphasis from punctuation in text.
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # Discriminate between positive, negative and neutral scores.
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
| |
|