import emoji, re, string, time, os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import PorterStemmer, ngrams
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import data_processing as data_proc
import vocab_helpers as helper


# Get 13 pragmatic features (like the presence of laughter, capitalized words, emojis, hashtags, user mentions)
# but also counts of punctuation, strong affirmatives, negations, intensifiers, tokens and average token size
def get_pragmatic_features(tweet_tokens):
    capitalized_words = laughter = user_mentions = negations = affirmatives \
        = interjections = intensifiers = punctuation = emojis = tweet_len_ch = hashtags = 0
    for t in tweet_tokens:
        tweet_len_ch += len(t)
        if t.isupper():
            capitalized_words = 1   # binary feature marking the presence of capitalized words
        if t.startswith("@"):
            user_mentions = 1       # binary feature for the presence of user mentions
        if t.startswith("#"):
            hashtags += 1           # count-based feature of hashtags used (excluding sarcasm or sarcastic)
        if t.lower().startswith("haha") or re.match('l(o)+l$', t.lower()):
            laughter = 1            # binary feature marking the presence of laughter
        if t in helper.strong_negations:
            negations += 1          # count-based feature of strong negations
        if t in helper.strong_affirmatives:
            affirmatives += 1       # count-based feature of strong affirmatives
        if t in helper.interjections:
            interjections += 1      # count-based feature of relevant interjections
        if t in helper.intensifiers:
            intensifiers += 1       # count-based feature of relevant intensifiers
        if t in helper.punctuation:
            punctuation += 1        # count-based feature of relevant punctuation signs
        if t in emoji.UNICODE_EMOJI:
            emojis += 1             # count-based feature of emojis
    tweet_len_tokens = len(tweet_tokens)  # length of the tweet in tokens
    average_token_length = float(tweet_len_ch) / max(1.0, float(tweet_len_tokens))  # average token length
    feature_list = {'tw_len_ch': tweet_len_ch, 'tw_len_tok': tweet_len_tokens, 'avg_len': average_token_length,
                    'capitalized': capitalized_words, 'laughter': laughter, 'user_mentions': user_mentions,
                    'negations': negations, 'affirmatives': affirmatives, 'interjections': interjections,
                    'intensifiers': intensifiers, 'punctuation': punctuation, 'emojis': emojis,
                    'hashtags': hashtags}
    return feature_list
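

# Illustrative usage sketch for the pragmatic features (the tweet below is made up, and the
# negation/affirmative/interjection/intensifier/punctuation counts depend on the word lists in
# vocab_helpers, so the elided values are indicative rather than exact):
#
#   >>> get_pragmatic_features(['@user', 'HAHA', 'this', 'is', 'SO', 'great', '!', '#mondays'])
#   {'tw_len_ch': 31, 'tw_len_tok': 8, 'avg_len': 3.875, 'capitalized': 1, 'laughter': 1,
#    'user_mentions': 1, ..., 'hashtags': 1}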


# Obtain 25 POS tags from the CMU Twitter Part-of-Speech Tagger according to the paper
# "Part-of-Speech Tagging for Twitter: Annotation, Features, and Experiments" by Gimpel et al.
def get_pos_features(pos_tweet):
    pos_dict = dict.fromkeys(['N', 'O', 'S', '^', 'Z', 'L', 'M', 'V', 'A', 'R', '!', 'D', 'P', '&', 'T', 'X',
                              'Y', '#', '@', '~', 'U', 'E', '$', ',', 'G'], 0)
    for pos in pos_tweet:
        pos_dict[pos] += 1
    return pos_dict


# Extract the n-grams (specified as a list n = [1, 2, 3, ...])
# e.g. if n = [1, 2, 3] then ngram_features is a dictionary of all unigrams, bigrams and trigrams
# This n-gram extractor works for any kind of tokens, i.e. both words and POS tags
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features
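

# Illustrative sketch of the n-gram extractor (made-up tokens; note that every feature key keeps
# the 'gram ' prefix and a trailing space, and that key order may vary since it comes from a set):
#
#   >>> get_ngrams(['so', 'much', 'fun', 'so', 'much'], n=[1, 2])
#   {'gram so ': 2, 'gram much ': 2, 'gram fun ': 1,
#    'gram so much ': 2, 'gram much fun ': 1, 'gram fun so ': 1}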


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis,
# along with the ratio of positive to negative emojis and of neutral to negative emojis
# Using the MPQA subjectivity lexicon, we check words against their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total count of sentiment words
# Also use the VADER sentiment analyser to obtain scores of the sentiment held in a tweet (4 features)
def get_sentiment_features(tweet, tweet_tokens, tweet_pos, emoji_sent_dict, subj_dict):
    sent_features = dict.fromkeys(["positive emoji", "negative emoji", "neutral emoji",
                                   "emojis pos:neg", "emojis neutral:neg",
                                   "subjlexicon weaksubj", "subjlexicon strongsubj",
                                   "subjlexicon positive", "subjlexicon negative", "subjlexicon neutral",
                                   "words pos:neg", "words neutral:neg",
                                   "subjectivity strong:weak", "total sentiment words"], 0.0)
    for t in tweet_tokens:
        if t in emoji_sent_dict.keys():
            sent_features['negative emoji'] += float(emoji_sent_dict[t][0])
            sent_features['neutral emoji'] += float(emoji_sent_dict[t][1])
            sent_features['positive emoji'] += float(emoji_sent_dict[t][2])
    # Obtain the ratio of positive to negative emojis
    if sent_features['negative emoji'] != 0:
        if sent_features['positive emoji'] == 0:
            sent_features['emojis pos:neg'] = -1.0 / float(sent_features['negative emoji'])
        else:
            sent_features['emojis pos:neg'] = sent_features['positive emoji'] \
                                              / float(sent_features['negative emoji'])
    # Obtain the ratio of neutral to negative emojis
    if sent_features['negative emoji'] != 0:
        if sent_features['neutral emoji'] == 0:
            sent_features['emojis neutral:neg'] = -1.0 / float(sent_features['negative emoji'])
        else:
            sent_features['emojis neutral:neg'] = sent_features['neutral emoji'] \
                                                  / float(sent_features['negative emoji'])
    lemmatizer = WordNetLemmatizer()
    pos_translation = {'N': 'noun', 'V': 'verb', 'A': 'adj', 'R': 'adverb'}
    for index in range(len(tweet_tokens)):
        lemma = lemmatizer.lemmatize(tweet_tokens[index], 'v')
        if lemma in subj_dict.keys():
            if tweet_pos[index] in pos_translation and pos_translation[tweet_pos[index]] in subj_dict[lemma].keys():
                # Get the type of subjectivity (strong or weak) of this lemmatized word
                sent_features['subjlexicon ' + subj_dict[lemma][pos_translation[tweet_pos[index]]][0]] += 1
                # Get the type of polarity (pos, neg, neutral) of this lemmatized word
                if subj_dict[lemma][pos_translation[tweet_pos[index]]][1] == 'both':
                    sent_features['subjlexicon positive'] += 1
                    sent_features['subjlexicon negative'] += 1
                else:
                    sent_features['subjlexicon ' + subj_dict[lemma][pos_translation[tweet_pos[index]]][1]] += 1
            else:
                if 'anypos' in subj_dict[lemma].keys():
                    # Get the type of subjectivity (strong or weak) of this lemmatized word
                    sent_features['subjlexicon ' + subj_dict[lemma]['anypos'][0]] += 1
                    # Get the type of polarity (pos, neg, neutral) of this lemmatized word
                    if subj_dict[lemma]['anypos'][1] == 'both':
                        sent_features['subjlexicon positive'] += 1
                        sent_features['subjlexicon negative'] += 1
                    else:
                        sent_features['subjlexicon ' + subj_dict[lemma]['anypos'][1]] += 1
    # Use the total number of sentiment words as a feature
    sent_features["total sentiment words"] = sent_features["subjlexicon positive"] \
                                             + sent_features["subjlexicon negative"] \
                                             + sent_features["subjlexicon neutral"]
    if sent_features["subjlexicon negative"] != 0:
        if sent_features["subjlexicon positive"] == 0:
            sent_features["words pos:neg"] = -1.0 / float(sent_features["subjlexicon negative"])
        else:
            sent_features["words pos:neg"] = sent_features["subjlexicon positive"] \
                                             / float(sent_features["subjlexicon negative"])
    if sent_features["subjlexicon negative"] != 0:
        if sent_features["subjlexicon neutral"] == 0:
            sent_features["words neutral:neg"] = -1.0 / float(sent_features["subjlexicon negative"])
        else:
            sent_features["words neutral:neg"] = sent_features["subjlexicon neutral"] \
                                                 / float(sent_features["subjlexicon negative"])
    if sent_features["subjlexicon weaksubj"] != 0:
        if sent_features["subjlexicon strongsubj"] == 0:
            sent_features["subjectivity strong:weak"] = -1.0 / float(sent_features["subjlexicon weaksubj"])
        else:
            sent_features["subjectivity strong:weak"] = sent_features["subjlexicon strongsubj"] \
                                                        / float(sent_features["subjlexicon weaksubj"])
    # VADER sentiment analyser
    # Obtain the negative, positive, neutral and compound scores of a tweet
    sia = SentimentIntensityAnalyzer()
    polarity_scores = sia.polarity_scores(tweet)
    for name, score in polarity_scores.items():
        sent_features["Vader score " + name] = score
    return sent_features
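

# Illustrative sketch of the sentiment features (made-up tweet and CMU-style POS tags; assumes the
# emoji sentiment dictionary and MPQA subjectivity lexicon load via data_processing and that NLTK's
# VADER lexicon is available, so the lexicon-dependent values are elided):
#
#   >>> emoji_dict = data_proc.build_emoji_sentiment_dictionary()
#   >>> subj_dict = data_proc.get_subj_lexicon()
#   >>> tweet = "I absolutely love waiting in line"
#   >>> get_sentiment_features(tweet, tweet.split(), ['O', 'R', 'V', 'V', 'P', 'N'],
#   ...                        emoji_dict, subj_dict)
#   {'positive emoji': 0.0, 'negative emoji': 0.0, ..., 'subjlexicon strongsubj': ...,
#    'Vader score neg': ..., 'Vader score compound': ...}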


# Get the necessary data to perform topic modelling,
# including clean noun and verb phrases (lemmatized, lower-case)
# Tokenization and POS labelling are done with the CMU Twitter tagger
def build_lda_model(tokens_tags, pos_tags, use_nouns=True, use_verbs=True, use_all=False,
                    num_of_topics=8, passes=25, verbose=True):
    path = os.getcwd()[:os.getcwd().rfind('/')]
    topics_filename = str(num_of_topics) + "topics"
    if use_nouns:
        topics_filename += "_nouns"
    if use_verbs:
        topics_filename += "_verbs"
    if use_all:
        topics_filename += "_all"

    # Set the LDA, Dictionary and Corpus filenames
    lda_filename = path + "/models/topic_models/lda_" + topics_filename + ".model"
    dict_filename = path + "/res/topic_data/dict/dict_" + topics_filename + ".dict"
    corpus_filename = path + "/res/topic_data/corpus/corpus_" + topics_filename + ".mm"

    # Build a topic model if it wasn't created yet
    if not os.path.exists(lda_filename):
        # Extract the lemmatized documents
        docs = []
        for index in range(len(tokens_tags)):
            tokens = tokens_tags[index].split()
            pos = pos_tags[index].split()
            docs.append(data_proc.extract_lemmatized_tweet(tokens, pos, use_verbs, use_nouns, use_all))

        # Compute the dictionary and save it
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(keep_n=60000)
        dictionary.compactify()
        Dictionary.save(dictionary, dict_filename)

        # Compute the BoW corpus and save it
        corpus = [dictionary.doc2bow(d) for d in docs]
        MmCorpus.serialize(corpus_filename, corpus)

        if verbose:
            print("\nCleaned documents:")
            print(docs)
            print("\nDictionary:")
            print(dictionary)
            print("\nCorpus in BoW form:")
            print(corpus)

        # Start training an LDA model
        start = time.time()
        print("\nBuilding the LDA topic model...")
        lda_model = LdaModel(corpus=corpus, num_topics=num_of_topics, passes=passes, id2word=dictionary)
        lda_model.save(lda_filename)
        end = time.time()
        print("Completion time for building LDA model: %.3f s = %.3f min" % ((end - start), (end - start) / 60.0))

        if verbose:
            print("\nList of words associated with each topic:")
            lda_topics = lda_model.show_topics(formatted=False)
            lda_topics_list = [[word for word, prob in topic] for topic_id, topic in lda_topics]
            print([t for t in lda_topics_list])

    # Load the previously saved dictionary
    dictionary = Dictionary.load(dict_filename)

    # Load the previously saved corpus
    mm_corpus = MmCorpus(corpus_filename)

    # Load the previously saved LDA model
    lda_model = LdaModel.load(lda_filename)

    # Print the top 10 words for each topic
    for topic_id in range(num_of_topics):
        print("\nTop 10 words for topic ", topic_id)
        print([dictionary[word_id] for (word_id, prob) in lda_model.get_topic_terms(topic_id, topn=10)])

    index = 0
    if verbose:
        for doc_topics, word_topics, word_phis in lda_model.get_document_topics(mm_corpus, per_word_topics=True):
            print('Index ', index)
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', word_phis)
            print('-------------- \n')
            index += 1
    return dictionary, mm_corpus, lda_model


# Predict the topic of an unseen testing example based on the LDA model built on the train set
def get_topic_features_for_unseen_tweet(dictionary, lda_model, tokens_tags, pos_tags,
                                        use_nouns=True, use_verbs=True, use_all=False, verbose=False):
    # Extract the lemmatized document
    docs = data_proc.extract_lemmatized_tweet(tokens_tags, pos_tags, use_verbs, use_nouns, use_all)
    tweet_bow = dictionary.doc2bow(docs)
    topic_prediction = lda_model[tweet_bow]
    topic_features = {}
    if verbose:
        print("\nTopic prediction\n")
        for t in topic_prediction:
            print(t)
    if any(isinstance(topic_list, type([])) for topic_list in topic_prediction):
        topic_prediction = topic_prediction[0]
    for topic in topic_prediction:
        topic_features['topic ' + str(topic[0])] = topic[1]
    return topic_features


# Use the distribution of topics in a tweet as features
def get_topic_features(corpus, lda_model, index):
    topic_features = {}
    doc_topics, word_topic, phi_values = lda_model.get_document_topics(corpus, per_word_topics=True)[index]
    for topic in doc_topics:
        topic_features['topic ' + str(topic[0])] = topic[1]
    return topic_features
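

# Illustrative sketch of the topic features (made-up tweets and POS tags; assumes the
# models/topic_models and res/topic_data directories expected by build_lda_model already exist
# one level above the working directory, so the probabilities are shown as placeholders):
#
#   >>> train_tokens = ["love rainy mondays so much", "my cat ignores me again"]
#   >>> train_pos = ["V A N R A", "D N V O R"]
#   >>> dictionary, corpus, lda = build_lda_model(train_tokens, train_pos,
#   ...                                           num_of_topics=2, passes=5, verbose=False)
#   >>> get_topic_features_for_unseen_tweet(dictionary, lda, "rainy mondays again".split(), "A N R".split())
#   {'topic 0': ..., 'topic 1': ...}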


# Collect all features
def get_feature_set(tweets_tokens, tweets_pos, pragmatic=False, pos_unigrams=False, pos_bigrams=False,
                    lexical=False, ngram_list=[1], sentiment=False, topic=True):
    features = []
    if sentiment:
        # Emoji lexicon - underlying sentiment (pos, neutral, neg)
        emoji_dict = data_proc.build_emoji_sentiment_dictionary()
        # Obtain subjectivity features from the MPQA lexicon and build the subjectivity lexicon
        subj_dict = data_proc.get_subj_lexicon()
    if topic:
        use_nouns = True
        use_verbs = True
        use_all = False
        dictionary, corpus, lda_model = build_lda_model(tweets_tokens, tweets_pos, use_nouns=use_nouns,
                                                        use_verbs=use_verbs, use_all=use_all,
                                                        num_of_topics=6, passes=20, verbose=False)
    for index in range(len(tweets_tokens)):
        tokens_this_tweet = tweets_tokens[index].split()
        pos_this_tweet = tweets_pos[index].split()
        pragmatic_features = {}
        pos_unigrams_features = {}
        pos_bigrams_features = {}
        words_ngrams = {}
        sentiment_features = {}
        topic_features = {}
        if pragmatic:
            pragmatic_features = get_pragmatic_features(tokens_this_tweet)
        if pos_unigrams:
            pos_unigrams_features = get_pos_features(pos_this_tweet)
        if pos_bigrams:
            # POS n-grams (note: n=[3] extracts POS 3-grams)
            pos_bigrams_features = get_ngrams(pos_this_tweet, n=[3], for_semantics=True)
        if lexical:
            words_ngrams = get_ngrams(tokens_this_tweet, n=ngram_list, use_just_words=True)
        if sentiment:
            sentiment_features = get_sentiment_features(tweets_tokens[index], tokens_this_tweet,
                                                        pos_this_tweet, emoji_dict, subj_dict)
        if topic:
            topic_features = get_topic_features_for_unseen_tweet(dictionary, lda_model, tokens_this_tweet,
                                                                 pos_this_tweet, use_nouns=use_nouns,
                                                                 use_verbs=use_verbs, use_all=use_all)
        # Append all extracted features of this tweet to the final list of all features
        features.append({**pragmatic_features, **pos_unigrams_features, **pos_bigrams_features,
                         **words_ngrams, **sentiment_features, **topic_features})
    return features
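

# Minimal usage sketch, kept behind a __main__ guard so importing this module stays side-effect-free.
# The two tweets and their CMU-style POS tag strings below are made up; only the pragmatic, POS-unigram
# and lexical features are requested so that no lexicon files or topic-model directories are needed
# (it still assumes data_processing and vocab_helpers import cleanly).
if __name__ == "__main__":
    sample_tokens = ["I just love being ignored #blessed", "Great , another Monday !"]
    sample_pos = ["O R V V V #", "A , D ^ !"]
    sample_features = get_feature_set(sample_tokens, sample_pos,
                                      pragmatic=True, pos_unigrams=True, pos_bigrams=False,
                                      lexical=True, ngram_list=[1, 2], sentiment=False, topic=False)
    for tweet_features in sample_features:
        print(tweet_features)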