import re
import os
import itertools
from collections import Counter
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import words
import numpy as np
import emoji
import utils
from vocab_helpers import contractions, implicit_emoticons, slang, \
    wikipedia_emoticons, emotiocons_to_emojis

path = os.getcwd()[:os.getcwd().rfind('/')]
dict_filename = "word_list.txt"
word_filename = "word_list_freq.txt"


def build_subj_dictionary(lines):
    subj_dict = dict()
    for line in lines:
        splits = line.split(' ')
        if len(splits) == 6:
            word = splits[2][6:]        # the word analysed
            word_type = splits[0][5:]   # weak or strong subjective
            pos = splits[3][5:]         # part of speech: noun, verb, adj, adv or anypos
            polarity = splits[5][14:]   # its polarity: positive, negative or neutral
            new_dict_entry = {pos: [word_type, polarity]}
            if word in subj_dict:
                subj_dict[word].update(new_dict_entry)
            else:
                subj_dict[word] = new_dict_entry
    return subj_dict


def get_subj_lexicon():
    lexicon = utils.load_file(path + "/res/subjectivity_lexicon.tff")
    subj_dict = build_subj_dictionary(lexicon)
    return subj_dict


def get_emoji_dictionary():
    emojis = utils.load_file(path + "/res/emoji/emoji_list.txt")
    emoji_dict = {}
    for line in emojis:
        line = line.split(" ", 1)
        # Avoid naming the locals "emoji" - that would shadow the imported module
        emoji_char = line[0]
        description = line[1]
        emoji_dict[emoji_char] = description
    return emoji_dict


def build_emoji_sentiment_dictionary():
    new_emoji_sentiment_filename = path + "/res/emoji/emoji_sentiment_dictionary.txt"
    if not os.path.exists(new_emoji_sentiment_filename):
        filename = path + "/res/emoji/emoji_sentiment_raw.txt"
        emojis = utils.load_file(filename)[1:]
        lines = []
        for line in emojis:
            line = line.split(",")
            emoji_char = line[0]
            occurrences = float(line[2])
            negative = float(line[4]) / occurrences
            neutral = float(line[5]) / occurrences
            positive = float(line[6]) / occurrences
            description = line[7]
            lines.append(str(emoji_char) + "\t" + str(negative) + "\t" + str(neutral)
                         + "\t" + str(positive) + "\t" + description.lower())
        utils.save_file(lines, new_emoji_sentiment_filename)
    emoji_sentiment_data = utils.load_file(new_emoji_sentiment_filename)
    emoji_sentiment_dict = {}
    for line in emoji_sentiment_data:
        line = line.split("\t")
        # Get the emoji characteristics as a list [negative, neutral, positive, description]
        emoji_sentiment_dict[line[0]] = [line[1], line[2], line[3], line[4]]
    return emoji_sentiment_dict

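
# Illustrative sketch (not part of the original pipeline): how a single
# MPQA-style lexicon line maps to a build_subj_dictionary entry. The sample
# line is an assumption about the .tff format, inferred from the fixed offsets
# used above ("type=" is 5 chars, "word1=" is 6, "pos1=" is 5,
# "priorpolarity=" is 14).
def _example_subj_dictionary():
    sample = "type=weaksubj len=1 word1=abandon pos1=verb stemmed1=y priorpolarity=negative"
    entry = build_subj_dictionary([sample])
    assert entry == {'abandon': {'verb': ['weaksubj', 'negative']}}
    return entry
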

# Extract each tweet's emojis - a brute-force, character-by-character scan
# (slow, but it covers all cases)
def extract_emojis(tweets):
    emojis = []
    for tw in tweets:
        tw_emojis = []
        for word in tw:
            for ch in word:
                if ch in emoji.UNICODE_EMOJI:
                    tw_emojis.append(ch)
        emojis.append(' '.join(tw_emojis))
    return emojis


# Replace a contraction (coming from possessives, verbs, emphasis or just bad language) by its long form
def replace_contracted_form(contracted_word, pos, dictionary):
    long_form = []
    if "'" in contracted_word:
        # print("Found apostrophe in word: ", contracted_word, ' with pos: ', pos)
        split_words = contracted_word.split("'")
        check_if_in_dict = False
        # If the contraction is a nominal + verbal or a proper noun + verbal
        if pos == 'L' or pos == 'M':
            long_form.append(split_words[0])
            if split_words[1].lower() in contractions:
                long_form.extend(contractions[split_words[1].lower()].split())
        # If the contraction is a whole verb (like let's or isn't)
        elif pos in ['V', 'Y', 'O'] and contracted_word.lower() in contractions:
            long_form.extend(contractions[contracted_word.lower()].split())
        # If the contraction is a proper noun or a nominal with a possessive, or even a plain (proper) noun
        elif pos in ['S', 'Z', 'D', 'N', '^']:
            if contracted_word.lower() in contractions:
                long_form.extend(contractions[contracted_word.lower()].split())
            elif split_words[1].lower() == 's':
                long_form.append(split_words[0])
            else:
                check_if_in_dict = True
        # Can skip ' when it is just a punctuation mark (usually used to emphasize or quote something)
        elif pos == ',':
            # print("Punctuation, nothing to replace.", split_words[0], ' -- ', split_words[1])
            return []
        # Never replace contractions in emojis or emoticons (they will be translated later)
        elif pos == 'E':
            long_form.append(contracted_word)
        else:
            check_if_in_dict = True
        if check_if_in_dict:
            # Attempt to separate words which have been joined by ' through human error
            clean0 = re.findall("[a-zA-Z]+", split_words[0])
            clean1 = re.findall("[a-zA-Z]+", split_words[1])
            if clean0 != [] and clean0[0].lower() in dictionary \
                    and clean1 != [] and clean1[0].lower() in dictionary:
                # print("Cleaned to ", clean0, ', ', clean1)
                long_form.extend([clean0[0], clean1[0]])
            else:
                # print("Word couldn't be de-contracted!")
                long_form.append(contracted_word)
        return long_form
    else:
        long_form.append(contracted_word)
        return long_form


# NLTK cannot lemmatize without changing the case - which we don't want -
# so lemmatize but remember whether the word was upper case or started with a capital letter.
# This is needed when performing CMU POS-tagging or when extracting pragmatic features.
def correct_spelling_but_preserve_case(lemmatizer, word):
    corrected = lemmatizer.lemmatize(word.lower(), 'v')
    corrected = lemmatizer.lemmatize(corrected)
    if word.isupper():
        return corrected.upper()
    if word[0].isupper():
        return corrected[0].upper() + corrected[1:]
    return corrected


# Reduce the length of the pattern (if repeating characters are found)
def reduce_lengthening(word, dictionary):
    if word.lower() in dictionary or word.isnumeric():
        return word
    # Pattern matching characters repeated 3 or more times
    pattern2 = re.compile(r"(.)\1{2,}")
    # Pattern matching characters repeated 2 or more times
    pattern1 = re.compile(r"(.)\1{1,}")
    # Word obtained by reducing repeating sequences to length 2
    word2 = pattern2.sub(r"\1\1", word)
    # Word obtained by reducing repeating sequences to length 1
    word1 = pattern1.sub(r"\1", word)
    # print("Reduced length from ", word, " w2 -- ", word2, " w1 -- ", word1)
    if word1.lower() in dictionary:
        return word1
    else:
        return word2

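
# Quick behaviour sketch for reduce_lengthening (the inputs and the toy
# dictionary are made up; in the pipeline the dictionary comes from word_file):
def _example_reduce_lengthening():
    dictionary = {"cool", "soon"}
    # The full reduction "col" is not a dictionary word,
    # so the length-2 reduction "cool" is returned instead:
    assert reduce_lengthening("cooool", dictionary) == "cool"
    assert reduce_lengthening("soooon", dictionary) == "soon"
    # Unknown words also fall back to the length-2 reduction:
    assert reduce_lengthening("grrrr", dictionary) == "grr"
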
print("Reduced length from ", word, " w2 -- ", word2, " w1 -- ", word1) if word1.lower() in dictionary: return word1 else: return word2 # Translate emojis (or a group of emojis) into a list of descriptions def process_emojis(word, emoji_dict, translate_emojis=True): processed = [] chars = list(word) remaining = "" for c in chars: if c in emoji_dict.keys() or c in emoji.UNICODE_EMOJI: if remaining != "": processed.append(remaining) remaining = "" if translate_emojis: if c in emoji_dict: processed.extend(emoji_dict[c][3].lower().split()) else: processed.extend(c) else: remaining += c if remaining != "": processed.append(remaining) if processed != []: return ' '.join(processed) else: return word # TODO: Numerals - sarcasm heavily relies on them so find a way to extract meaning behind numbers # Attempt to clean each tweet and make it as grammatical as possible def grammatical_clean(tweets, pos_tags, word_file, filename, translate_emojis=True, replace_slang=True, lowercase=False): if not os.path.exists(filename): dictionary = utils.load_file(word_file).split() emoji_dict = build_emoji_sentiment_dictionary() lemmatizer = WordNetLemmatizer() corrected_tweets = [] for tweet, pos_tag in zip(tweets, pos_tags): corrected_tweet = [] # print("Tweet: ", tweet) # print("POS: ", pos_tag) for word, pos in zip(tweet.split(), pos_tag.split()): if lowercase: t = word.lower() else: t = word if t.startswith("#"): t = t[1:] # Remove unnecessary hyphens that just add noise (but not from composed words) if t.startswith('-') or t.endswith('-'): t = re.sub('[-]', '', t) # Process emojis (not written with parenthesis, but with symbols) emoji_translation = process_emojis(t, emoji_dict, translate_emojis=translate_emojis) if emoji_translation != t: corrected_tweet.append(emoji_translation) continue # Replace contractions with long-forms if "'" in t: long_form = replace_contracted_form(t, pos, dictionary) corrected_tweet.extend(long_form) # print("Removed contracted form of ", t, " to ", long_form) continue # Check if token contains repeating characters and if so, remove them # Exclude removal of repeating punctuation, numerals, user mentions if pos not in [',', '$', '~', '@'] and len(t) > 0: t = correct_spelling_but_preserve_case(lemmatizer, t) reduced = reduce_lengthening(t, dictionary) if reduced != t.lower: # print("Reduced length of word ", t, " to ", reduced) t = reduced # Translate emoticons to their description if translate_emojis and t.lower() in wikipedia_emoticons: translated_emoticon = wikipedia_emoticons[t.lower()].split() # print("WIKI emoticon translated from ", t, " to ", translated_emoticon) corrected_tweet.extend(translated_emoticon) continue elif t.lower() in emotiocons_to_emojis: translated_emoticon = emotiocons_to_emojis[t.lower()] corrected_tweet.append(translated_emoticon) # print("Replaced emoticon from ", t, " to ", translated_emoticon) continue # Replace all slang (or twitter abbreviations) to explicit form if replace_slang and t.lower() in slang.keys(): slang_translation = slang[t.lower()] # print("Slang word replaced from ", t, " to ", slang_translation) corrected_tweet.extend(slang_translation.split()) continue if t != '': # print("Corrected tweet ", t) corrected_tweet.append(t) corrected_tweets.append(corrected_tweet) # Save the grammatical set to filename lines = [' '.join(line) for line in corrected_tweets] # Used for comparison between previous data and the cleaned, grammatical data for dirty, corrected in zip(tweets, lines): print("Dirty:\t%s\nGr\t%s\nammatical:" % (dirty, corrected)) 

def get_stopwords_list(filename="stopwords.txt"):
    stopwords = utils.load_file(path + "/res/" + filename)
    return stopwords


def build_vocabulary(vocab_filename, lines, minimum_occurrence=1):
    if not os.path.exists(vocab_filename):
        stopwords = get_stopwords_list(filename="stopwords_loose.txt")
        print("Building vocabulary...")
        vocabulary = Counter()
        for line in lines:
            vocabulary.update([l.lower() for l in line.split() if l not in stopwords])
        print("The top 10 most common words: ", vocabulary.most_common(10))
        # Filter out the words that appear too rarely to be conclusive
        vocabulary = {key: vocabulary[key] for key in vocabulary
                      if vocabulary[key] >= minimum_occurrence}
        utils.save_file(vocabulary.keys(), vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = set(utils.load_file(vocab_filename))
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary


def build_vocabulary_for_dnn_tasks(vocab_filename, lines):
    if not os.path.exists(vocab_filename):
        print("Building vocabulary...")
        vocabulary = Counter()
        for line in lines:
            vocabulary.update([l.lower() for l in line])
        # Assign each word a 1-based index according to its frequency rank (most frequent first)
        vocabulary = sorted(vocabulary.items(), key=lambda pair: pair[1], reverse=True)
        indexed_vocabulary = {}
        for counter, (key, _) in enumerate(vocabulary, start=1):
            indexed_vocabulary[key] = counter
        indexed_vocabulary['unk'] = len(indexed_vocabulary) + 1
        utils.save_dictionary(indexed_vocabulary, vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = utils.load_dictionary(vocab_filename)
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary


def vocabulary_filtering(vocabulary, lines):
    filtered_lines = []
    indices = []
    for line in lines:
        filtered_line = []
        individual_word_indices = []
        for word in line:
            word = word.lower()
            if word in vocabulary:
                individual_word_indices.append(vocabulary[word])
                filtered_line.append(word)
            else:
                individual_word_indices.append(vocabulary['unk'])
                filtered_line.append('unk')
        indices.append(individual_word_indices)
        filtered_lines.append(filtered_line)
    return filtered_lines, indices


# Extract the lemmatized nouns and/or verbs from a set of documents - used in LDA modelling
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
    lemmatizer = WordNetLemmatizer()
    clean_data = []
    for index in range(len(tokens)):
        if use_verbs and pos[index] == 'V':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
        if use_nouns and pos[index] == 'N':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
        if use_all:
            lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
            word = lemmatizer.lemmatize(lemmatized_word)
            if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
                clean_data.append(word)
    return clean_data


def filter_based_on_vocab(tweets, vocab_filename, min_occ=5):
    vocab = build_vocabulary(vocab_filename, tweets, minimum_occurrence=min_occ)
    filtered = []
    for tw in tweets:
        filtered.append(' '.join([t for t in tw.split() if t.lower() in vocab]))
    return filtered


def ulterior_clean(tweets, filename):
    if not os.path.exists(filename):
        stopwords = get_stopwords_list()
        lemmatizer = WordNetLemmatizer()
        filtered_tweets = []
        for tw in tweets:
            filtered_tweet = []
            for t in tw.split():
                token = t.lower()
                if token in stopwords:
                    continue
                filtered_token = lemmatizer.lemmatize(token, 'v')
                filtered_token = lemmatizer.lemmatize(filtered_token)
                filtered_tweet.append(filtered_token)
            filtered_tweets.append(' '.join(filtered_tweet))
        utils.save_file(filtered_tweets, filename)
    # Load the filtered tokens
    filtered_tweets = utils.load_file(filename)
    return filtered_tweets

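
# Minimal sketch of vocabulary_filtering with a hypothetical indexed
# vocabulary, shaped like the output of build_vocabulary_for_dnn_tasks
# (word -> 1-based frequency rank, plus an 'unk' entry):
def _example_vocabulary_filtering():
    vocab = {"good": 1, "morning": 2, "unk": 3}
    lines, indices = vocabulary_filtering(vocab, [["Good", "morning", "Vietnam"]])
    # 'Vietnam' is out of vocabulary, so it maps to 'unk' and index 3:
    assert lines == [["good", "morning", "unk"]]
    assert indices == [[1, 2, 3]]
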

def get_tags_for_each_tweet(tweets_filename, tokens_filename, pos_filename):
    if not os.path.exists(pos_filename):
        tweets = utils.load_file(tweets_filename)
        tokens_lines = []
        pos_lines = []
        tokens_line = ""
        pos_line = ""
        for t in tweets:
            # An empty line marks the end of a tweet
            if len(t) < 1:
                tokens_lines.append(tokens_line[:])
                pos_lines.append(pos_line[:])
                tokens_line = ""
                pos_line = ""
            else:
                t_split = t.split("\t")
                tokens_line += t_split[0] + " "
                pos_line += t_split[1] + " "
        utils.save_file(tokens_lines, tokens_filename)
        utils.save_file(pos_lines, pos_filename)
    # Load the tokens and the POS tags for the tweets in this set
    tokens = utils.load_file(tokens_filename)
    pos = utils.load_file(pos_filename)
    return tokens, pos


# Based on the tokenization and POS tagging probabilities obtained from the CMU tagger, get back coherent files
def cmu_probs_to_files(filename):
    # Get the tags corresponding to the test and train files
    tokens, pos = get_tags_for_each_tweet(path + "/res/cmu_tweet_tagger/" + filename,
                                          path + "/res/tokens/tokens_" + filename,
                                          path + "/res/pos/pos_" + filename)
    return tokens, pos


# Split based on camel case
def camel_case_split(term):
    term = re.sub(r'([0-9]+)', r' \1', term)
    term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
    splits = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', term)
    return [s.group(0) for s in splits]

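
# Behaviour sketch for camel_case_split (the example hashtags are made up):
def _example_camel_case_split():
    assert camel_case_split("SoProud") == ["So", "Proud"]
    # Runs of capitals are kept together unless a lower-case letter follows:
    assert camel_case_split("ILoveNYC") == ["I", "Love", "NYC"]
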

# Split a long, compound hashtag into its component tags. Given the character limit of tweets,
# people often stick words together to save space, so this is a useful tool.
# Examples of hashtag splits from real data (train set) are in /stats/hashtag_splits.txt
# Implementation adapted from https://github.com/matchado/HashTagSplitter
def split_hashtag_to_words_all_possibilities(hashtag, word_dictionary):
    all_possibilities = []
    split_possibility = [hashtag[:i] in word_dictionary
                         for i in reversed(range(len(hashtag) + 1))]
    possible_split_positions = [i for i, x in enumerate(split_possibility) if x]
    for split_pos in possible_split_positions:
        split_words = []
        word_1, word_2 = hashtag[:len(hashtag) - split_pos], hashtag[len(hashtag) - split_pos:]
        if word_2 in word_dictionary:
            split_words.append(word_1)
            split_words.append(word_2)
            all_possibilities.append(split_words)
            another_round = split_hashtag_to_words_all_possibilities(word_2, word_dictionary)
            if len(another_round) > 0:
                all_possibilities = all_possibilities \
                    + [[a1] + a2 for a1, a2 in zip([word_1] * len(another_round), another_round)]
        else:
            another_round = split_hashtag_to_words_all_possibilities(word_2, word_dictionary)
            if len(another_round) > 0:
                all_possibilities = all_possibilities \
                    + [[a1] + a2 for a1, a2 in zip([word_1] * len(another_round), another_round)]
    return all_possibilities


def split_hashtag(hashtag, word_list):
    split_words = []
    if hashtag != hashtag.lower() and hashtag != hashtag.upper():
        split_words = camel_case_split(hashtag)
    else:
        # Greedily take the longest dictionary word starting at each position
        j = 0
        while j <= len(hashtag):
            loc = j
            for i in range(j + 1, len(hashtag) + 1, 1):
                if hashtag[j:i].lower() in word_list:
                    loc = i
            if loc == j:
                j += 1
            else:
                split_words.append(hashtag[j:loc])
                j = loc
    split_words = ['#' + str(s) for s in split_words]
    return split_words


# Select the best possible hashtag split, based either on camel case or on component
# words, preferring the split with fewer (and hence longer) component words
def split_hashtag_long_version(hashtag):
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    # Drop single letters that are not words ('a' and 'i' are kept)
    for alphabet in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(alphabet)
    all_poss = split_hashtag_to_words_all_possibilities(hashtag.lower(), word_dictionary)
    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = 0
        for p in poss:
            if p in word_list:
                counter += 1
        if counter == len(poss) and min_len > counter:
            found = True
            min_len = counter
            best_p = poss
        else:
            if counter > max_p and not found:
                max_p = counter
                best_p = poss
    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    else:
        if best_p == [] and best_p_v2 == []:
            split_words = [hashtag]
        else:
            split_words = best_p if best_p_v2 == [] else best_p_v2
    split_words = ['#' + str(s) for s in split_words]
    return split_words

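
# Sketch of the greedy split performed by split_hashtag on an all-lower-case
# hashtag (the word list is a toy stand-in for res/word_list.txt); at each
# position the longest matching dictionary word wins:
def _example_split_hashtag():
    word_list = ["so", "proud", "of", "you"]
    assert split_hashtag("soproudofyou", word_list) == ["#so", "#proud", "#of", "#you"]
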

def split_hashtags2(hashtag, word_list, verbose=False):
    if verbose:
        print("Hashtag is %s" % hashtag)
    # Get rid of the leading '#'
    if hashtag.startswith('#'):
        term = hashtag[1:]
    else:
        term = hashtag
    # If the hashtag is already an existing word (a single word), return it
    if word_list is not None and term.lower() in word_list:
        return ['#' + term]
    # First, attempt to split by camel case
    if term[1:] != term[1:].lower() and term[1:] != term[1:].upper():
        splits = camel_case_split(term)
    elif '#' in term:
        splits = term.split("#")
    elif len(term) > 27:
        if verbose:
            print("Hashtag %s is too long, so it is left as it is." % term)
        splits = [term]
    else:
        # Second, build the possible splits and choose the best one by assigning
        # a score to each candidate, based on the frequency with which each word occurs
        penalty = -69971
        max_coverage = penalty
        max_splits = 6
        n_splits = 0
        term = re.sub(r'([0-9]+)', r' \1', term)
        term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
        term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
        term = re.sub(r'([A-Z]{2,})+', r' \1', term)
        splits = term.strip().split(' ')
        if len(splits) < 3:
            # Try all ways of cutting the hashtag at up to max_splits positions
            chars = [c for c in term.lower()]
            found_all_words = False
            while n_splits < max_splits and not found_all_words:
                for index in itertools.combinations(range(0, len(chars)), n_splits):
                    output = np.split(chars, index)
                    line = [''.join(o) for o in output]
                    score = 0.0
                    for word in line:
                        stripped = word.strip()
                        if stripped in word_list:
                            score += int(word_list.get(stripped))
                        else:
                            if stripped.isnumeric():
                                score += 0.0
                            else:
                                score += penalty
                    score = score / float(len(line))
                    if score > max_coverage:
                        splits = line
                        max_coverage = score
                        line_is_valid_word = [word.strip() in word_list if not word.isnumeric()
                                              else True for word in line]
                        if all(line_is_valid_word):
                            found_all_words = True
                n_splits = n_splits + 1
    splits = ['#' + str(s) for s in splits]
    if verbose:
        print("Split to: ", splits)
    return splits


# Initial tweet cleaning - useful to filter the data before tokenization
def clean_tweet(tweet, word_list, split_hashtag_method,
                replace_user_mentions=True, remove_hashtags=False,
                remove_emojis=False, all_to_lower_case=False):
    # Add a white space before every punctuation sign so that we can split around it and keep it
    tweet = re.sub('([!?*&%"~`^+{}])', r' \1 ', tweet)
    tweet = re.sub(r'\s{2,}', ' ', tweet)
    tokens = tweet.split()
    valid_tokens = []
    for word in tokens:
        # Never include #sarca* hashtags
        if word.lower().startswith('#sarca'):
            continue
        # Never include URLs
        if 'http' in word:
            continue
        # Replace specific user mentions with a generic user name
        if replace_user_mentions and word.startswith('@'):
            word = '@user'
        # Split or remove hashtags
        if word.startswith('#'):
            if remove_hashtags:
                continue
            splits = split_hashtag_method(word[1:], word_list)
            if all_to_lower_case:
                valid_tokens.extend([split.lower() for split in splits])
            else:
                valid_tokens.extend(splits)
            continue
        if remove_emojis and word in emoji.UNICODE_EMOJI:
            continue
        if all_to_lower_case:
            word = word.lower()
        valid_tokens.append(word)
    return ' '.join(valid_tokens)


def process_tweets(tweets, word_list, split_hashtag_method):
    clean_tweets = []
    for tweet in tweets:
        clean_tw = clean_tweet(tweet, word_list, split_hashtag_method)
        clean_tweets.append(clean_tw)
    return clean_tweets


def process_set(dataset_filename, vocab_filename, word_list, min_occ=10):
    data, labels = utils.load_data_panda(dataset_filename)
    tweets = process_tweets(data, word_list, split_hashtag)
    vocabulary = build_vocabulary(vocab_filename, tweets, minimum_occurrence=min_occ)
    filtered_tweets = []
    for tweet in tweets:
        filtered_tweets.append([t for t in tweet.split() if t in vocabulary])
    return filtered_tweets, labels

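
# End-to-end sketch of clean_tweet on a made-up tweet, using the greedy
# splitter defined above and a toy word list:
def _example_clean_tweet():
    word_list = ["so", "proud"]
    tweet = "@john I'm #SoProud of this! http://t.co/xyz"
    # '@john' becomes '@user', the URL is dropped, '#SoProud' is
    # camel-case split, and '!' is padded into its own token:
    assert clean_tweet(tweet, word_list, split_hashtag) == "@user I'm #So #Proud of this !"
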

def initial_clean(tweets, clean_filename, word_file, word_file_is_dict=False,
                  split_hashtag_method=split_hashtag):
    if not os.path.exists(clean_filename):
        if word_file_is_dict:
            word_list = utils.load_dictionary(path + "/res/" + word_file)
        else:
            word_list = utils.load_file(path + "/res/" + word_file).split()
        filtered_tweets = process_tweets(tweets, word_list, split_hashtag_method)
        utils.save_file(filtered_tweets, clean_filename)
        return filtered_tweets
    else:
        filtered_tweets = utils.load_file(clean_filename)
        return filtered_tweets


# Return True or False depending on whether the word contains an emoji
def check_if_emoji(word, emoji_dict):
    for em in word:
        if em in emoji_dict or em in emoji.UNICODE_EMOJI:
            return True
    return False


# A strict clean of the Twitter data - removes emojis, hashtags, URLs and user mentions
def strict_clean(tweets, filename):
    if not os.path.exists(filename):
        strict_tweets = []
        emoji_dict = get_emoji_dictionary()
        for tweet in tweets:
            strict_tweet = []
            for word in tweet.split():
                if '#' in word:
                    continue
                if '@' in word:
                    continue
                if 'http' in word:
                    continue
                if check_if_emoji(word, emoji_dict):
                    continue
                strict_tweet.append(word)
            strict_tweets.append(' '.join(strict_tweet))
        utils.save_file(strict_tweets, filename)
        return strict_tweets
    else:
        strict_tweets = utils.load_file(filename)
        return strict_tweets


# Get the strictly cleaned data (designed to be applied on top of the original data - e.g. original_train.txt)
def get_strict_data(train_filename, test_filename):
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/data/" + train_filename)
    test_tweets = utils.load_file(path + "/res/data/" + test_filename)
    # Strict clean of the data
    strict_tweets_train = strict_clean(train_tweets, path + "/res/data/strict_" + train_filename)
    strict_tweets_test = strict_clean(test_tweets, path + "/res/data/strict_" + test_filename)
    return strict_tweets_train, strict_tweets_test


# Initial clean of the data (designed to be applied on top of the original data - e.g. original_train.txt)
def get_clean_data(train_filename, test_filename, word_filename):
    # Load the (original) train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/datasets/sarcasmdetection/sd_" + train_filename)
    test_tweets = utils.load_file(path + "/res/datasets/sarcasmdetection/sd_" + test_filename)
    # Initial clean of the data
    clean_train = initial_clean(train_tweets,
                                path + "/res/datasets/sarcasmdetection/clean_" + train_filename,
                                word_filename, word_file_is_dict=True,
                                split_hashtag_method=split_hashtags2)
    clean_test = initial_clean(test_tweets,
                               path + "/res/datasets/sarcasmdetection/clean_" + test_filename,
                               word_filename, word_file_is_dict=True,
                               split_hashtag_method=split_hashtags2)
    return clean_train, clean_test


# A further (ulterior) clean of the data (designed to be applied on top of the initial clean - e.g. train.txt)
def get_filtered_clean_data(train_filename, test_filename):
    # Load the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/data/" + train_filename)
    test_tokens = utils.load_file(path + "/res/data/" + test_filename)
    filtered_train_tokens = ulterior_clean(train_tokens, path + "/res/data/filtered_" + train_filename)
    filtered_test_tokens = ulterior_clean(test_tokens, path + "/res/data/filtered_" + test_filename)
    return filtered_train_tokens, filtered_test_tokens

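
# Token-level sketch of the filter strict_clean applies to each tweet
# (hypothetical input; an empty emoji dictionary is passed so that, for this
# plain-ASCII example, check_if_emoji simply returns False):
def _example_strict_filter():
    tweet = "@user loved the ending #spoiler http://t.co/x so much"
    kept = [w for w in tweet.split()
            if '#' not in w and '@' not in w and 'http' not in w
            and not check_if_emoji(w, {})]
    assert kept == ["loved", "the", "ending", "so", "much"]
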

# Grammatical clean of the data (designed to be applied on top of the initial clean - e.g. train.txt)
def get_grammatical_data(train_filename, test_filename, dict_filename,
                         translate_emojis=True, replace_slang=True, lowercase=True):
    # Load the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/tokens/tokens_" + train_filename)
    train_pos = utils.load_file(path + "/res/pos/pos_" + train_filename)
    test_tokens = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    test_pos = utils.load_file(path + "/res/pos/pos_" + test_filename)
    if translate_emojis and replace_slang and lowercase:
        save_path = path + "/res/data/finest_grammatical_"
    else:
        save_path = path + "/res/data/grammatical_"
    # Clean the data and bring it to the most *grammatical* form possible
    gramm_train = grammatical_clean(train_tokens, train_pos, path + "/res/" + dict_filename,
                                    save_path + train_filename, translate_emojis=translate_emojis,
                                    replace_slang=replace_slang, lowercase=lowercase)
    gramm_test = grammatical_clean(test_tokens, test_pos, path + "/res/" + dict_filename,
                                   save_path + test_filename, translate_emojis=translate_emojis,
                                   replace_slang=replace_slang, lowercase=lowercase)
    return gramm_train, gramm_test


# Get the train and test tokens, as well as the indices assigned according to a vocabulary
# (designed to be applied on top of the initially cleaned tokens - e.g. train.txt)
def get_clean_dl_data(train_filename, test_filename, word_list):
    vocab_filename = "dnn_vocabulary_" + train_filename
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/tokens/tokens_" + train_filename)
    test_tweets = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    vocabulary = build_vocabulary_for_dnn_tasks(path + "/res/vocabulary/" + vocab_filename,
                                                train_tweets)
    clean_train_tweets, train_indices = vocabulary_filtering(vocabulary, train_tweets)
    clean_test_tweets, test_indices = vocabulary_filtering(vocabulary, test_tweets)
    return clean_train_tweets, train_indices, clean_test_tweets, test_indices, len(vocabulary)


def get_dataset(dataset):
    data_path = path + "/res/datasets/" + dataset + "/"
    train_tweets = utils.load_file(data_path + "tokens_train.txt")
    test_tweets = utils.load_file(data_path + "tokens_test.txt")
    train_pos = utils.load_file(data_path + "pos_train.txt")
    test_pos = utils.load_file(data_path + "pos_test.txt")
    train_labels = [int(l) for l in utils.load_file(data_path + "labels_train.txt")]
    test_labels = [int(l) for l in utils.load_file(data_path + "labels_test.txt")]
    print("Size of the train set: ", len(train_labels))
    print("Size of the test set: ", len(test_labels))
    return train_tweets, train_pos, train_labels, test_tweets, test_pos, test_labels


if __name__ == '__main__':
    train_filename = "clean_original_train.txt"
    test_filename = "clean_original_test.txt"

    # For a superficial clean
    clean_train, clean_test = get_clean_data(train_filename, test_filename, word_filename)

    # For a more aggressive clean
    filtered_train_tokens, filtered_test_tokens = get_filtered_clean_data(train_filename, test_filename)

    # For complete removal of any Twitter-specific data
    strict_tweets_train, strict_tweets_test = get_strict_data(train_filename, test_filename)

    # For an attempt at a grammatical clean
    gramm_train, gramm_test = get_grammatical_data(train_filename, test_filename, dict_filename,
                                                   translate_emojis=False, replace_slang=False,
                                                   lowercase=False)

    # For a more aggressive attempt at a grammatical clean
    finest_gramm_train, finest_gramm_test = get_grammatical_data(train_filename, test_filename,
                                                                 dict_filename,
                                                                 translate_emojis=True,
                                                                 replace_slang=True,
                                                                 lowercase=True)