Python nltk.corpus.words.words() Examples

The following are 28 code examples of nltk.corpus.words.words(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.corpus.words, or try the search function.

Example #1

Source File: wordfinder.py From razzy-spinner with GNU General Public License v3.0

7 votes

def word_finder():
    """Print a random word-search puzzle followed by the words hidden in it.

    The NLTK word list is shuffled, the first 200 entries are taken, and of
    those only the words with 3 to 12 characters are handed to
    ``wordfinder``.  The resulting grid is printed row by row, then the
    placed words are listed with 1-based numbering.
    """
    from nltk.corpus import words

    candidates = words.words()
    random.shuffle(candidates)
    # Length filter is applied after the cut, so fewer than 200 words remain.
    candidates = [w for w in candidates[:200] if 3 <= len(w) <= 12]
    grid, used = wordfinder(candidates)

    print("Word Finder\n")
    for row in grid:
        for cell in row:
            print(cell, end=' ')
        print()
    print()

    for number, word in enumerate(used, start=1):
        print("%d:" % number, word)

Example #2

Source File: data_processing.py From Sarcasm-Detection with MIT License

6 votes

def split_hashtag(hashtag, word_list):
    """Split a hashtag into component words, each prefixed with '#'.

    A mixed-case hashtag is delegated to ``camel_case_split``.  Otherwise the
    hashtag is scanned left to right: at every position the longest slice
    whose lower-cased form is in ``word_list`` is taken as the next word;
    when no slice starting at that position is known, the character at that
    position is skipped.

    :param hashtag: hashtag text (without the leading '#').
    :param word_list: container of known lower-case words.
    :return: list of '#'-prefixed components (possibly empty).
    """
    is_mixed_case = hashtag != hashtag.lower() and hashtag != hashtag.upper()
    if is_mixed_case:
        pieces = camel_case_split(hashtag)
    else:
        pieces = []
        total = len(hashtag)
        start = 0
        while start <= total:
            # Furthest end index that still yields a known word from `start`.
            end = start
            for stop in range(start + 1, total + 1):
                if hashtag[start:stop].lower() in word_list:
                    end = stop
            if end != start:
                pieces.append(hashtag[start:end])
                start = end
            else:
                start += 1
    return ['#' + str(piece) for piece in pieces]


# Select the best possible hashtag split based on upper-case
# or component words maximizing the length of the possible word split

Example #3

Source File: wordfinder.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

6 votes

def word_finder():
    """Generate and print a word-search grid built from random NLTK words.

    Takes the first 200 words of the shuffled NLTK word list, keeps those
    that are 3 to 12 characters long, passes them to ``wordfinder`` and
    prints the grid followed by a numbered list of the words that were used.
    """
    from nltk.corpus import words

    pool = words.words()
    random.shuffle(pool)
    sample = pool[:200]
    sample = [entry for entry in sample if 3 <= len(entry) <= 12]
    grid, used = wordfinder(sample)

    print("Word Finder\n")
    for line in grid:
        for letter in line:
            print(letter, end=' ')
        print()
    print()

    for position, placed in enumerate(used, 1):
        print("%d:" % position, placed)

Example #4

Source File: res_sen2vec.py From resilient-community-apps with MIT License

6 votes

def __init__(self, w2v, sif, log=None):
        """Set up the sentence-vector builder.

        :param w2v: object used to look up the vector of a word
        :param sif: object used to look up word counts
        :param log: optional logger; a module-level logger is used when omitted
        """
        # Collaborators: word vectors, word counts and text pre-processing.
        self.word2vec = w2v
        self.sif = sif
        self.utils = WordSentenceUtils()
        self.log = log or logging.getLogger(__name__)

        # State filled in later.
        self.sentence_vectors = []
        self.feature_size = 0

        # Fetch the nltk 'words' resource if needed, then keep it as a set
        # for fast membership tests.
        nltk.download('words', quiet=True)
        self.setofwords = set(nltk_words.words())

        # pca vector
        self.pca_u = []

Example #5

Source File: suggest.py From gitsuggest with MIT License

6 votes

def __get_words_to_ignore(self):
        """Collect every word that should be ignored.

        The result combines the English stop words with the stripped lines of
        ``gitlang/languages.txt`` and ``gitlang/others.txt``, both located
        next to this source file.

        :return: Set of words to ignore.
        """
        def read_stripped_lines(relative_name):
            # One entry per line, surrounding whitespace removed.
            with open(path.join(base_dir, relative_name), "r") as handle:
                return [row.strip() for row in handle]

        # Stop words in English.
        ignored = set(stopwords.words("english"))

        base_dir = path.abspath(path.dirname(__file__))

        # Languages in git repositories.
        ignored.update(read_stripped_lines("gitlang/languages.txt"))

        # Other words to avoid in git repositories.
        ignored.update(read_stripped_lines("gitlang/others.txt"))

        return ignored

Example #6

Source File: data_processing.py From Sarcasm-Detection with MIT License

6 votes

def build_vocabulary(vocab_filename, lines, minimum_occurrence=1):
    """Load the vocabulary stored in ``vocab_filename``, building it first if absent.

    When the file does not exist, the whitespace-separated tokens of every
    line that are not in the loose stop-word list are lower-cased and
    counted; words seen at least ``minimum_occurrence`` times are saved to
    the file.  The file is then (re)loaded in every case.

    :param vocab_filename: path of the vocabulary file.
    :param lines: iterable of text lines used when the file must be built.
    :param minimum_occurrence: minimum count a word needs to be kept.
    :return: set of vocabulary entries loaded from the file.
    """
    if not os.path.exists(vocab_filename):
        stopwords = get_stopwords_list(filename="stopwords_loose.txt")
        print("Building vocabulary...")
        counts = Counter()
        for line in lines:
            # The stop-word test is applied to the token before lower-casing.
            counts.update(token.lower() for token in line.split() if token not in stopwords)
        print("The top 10 most common words: ", counts.most_common(10))
        # Drop words that appear too rarely to be conclusive.
        kept = {word: freq for word, freq in counts.items()
                if freq >= minimum_occurrence}
        utils.save_file(kept.keys(), vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = set(utils.load_file(vocab_filename))
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary

Example #7

Source File: wordfinder.py From luscan-devel with GNU General Public License v2.0

6 votes

def word_finder():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print "Word Finder\n"
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print grid[i][j],
        print
    print

    for i in range(len(used)):
        print "%d:" % (i+1), used[i]

Example #8

Source File: suggest.py From gitsuggest with MIT License

5 votes

def __get_words_to_consider(self):
        """Build the vocabulary of words worth keeping.

        :return: Set holding every entry returned by ``words.words()``.
        """
        vocabulary = set(words.words())
        return vocabulary

Example #9

Source File: named_entity.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

5 votes

def postag_tree(tree):
    """Return a copy of ``tree`` in which every leaf is paired with its POS tag.

    The leaves of ``tree`` are tagged in one ``pos_tag`` call.  The result is
    a new ``Tree`` labelled 'S': a direct child that is a ``Tree`` becomes a
    subtree with the same label whose children are ``(child, tag)`` tuples,
    and any other direct child becomes a ``(child, tag)`` tuple itself.
    """
    tokens = tree.leaves()
    # Tags are consumed in leaf order while the tree is walked.
    tags = (tag for (_token, tag) in pos_tag(tokens))
    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append((node, next(tags)))
            continue
        subtree = Tree(node.label(), [])
        tagged.append(subtree)
        for leaf in node:
            subtree.append((leaf, next(tags)))
    return tagged

Example #10

Source File: named_entity.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

5 votes

def _english_wordlist(self):
        """Return the 'en-basic' NLTK word list as a set, loading it on first use.

        The set is stored on the instance as ``_en_wordlist`` so the corpus
        is read at most once per instance.
        """
        if not hasattr(self, '_en_wordlist'):
            from nltk.corpus import words

            self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist

Example #11

Source File: named_entity.py From razzy-spinner with GNU General Public License v3.0

5 votes

def _english_wordlist(self):
        """Return the cached set of 'en-basic' words, building it if missing.

        On the first call the NLTK ``words`` corpus is imported, its
        'en-basic' list is turned into a set and kept in ``_en_wordlist``.
        """
        if not hasattr(self, '_en_wordlist'):
            from nltk.corpus import words
            self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist

Example #12

Source File: res_sen2vec.py From resilient-community-apps with MIT License

5 votes

def find_keywords(self, sentence, closest):
        """
        For each incident in closest, find the words in sentence with the highest contribution.

        Every word of the sentence is scored against the incident vector by a
        weighted cosine similarity; the five best-scoring words are stored,
        comma separated, under the "keywords" key of the incident.

        :param sentence: description of the input (new) incident
        :param closest: the top closest incidents found; each must provide "vec"
        :return: None; "keywords" is added to each closest incident in place
        """
        w_util = WordSentenceUtils()
        words = w_util.get_words(sentence)
        for cl in closest:
            v2_1 = cl["vec"]
            v2_norm_1 = np.linalg.norm(v2_1)
            word_sim = []
            for w in words:
                try:
                    wc = self.sif.get_word_count(w)
                    # Very low counts are pulled halfway towards the threshold.
                    if wc < self.COUNT_THRESHOLD:
                        wc = self.COUNT_THRESHOLD - (self.COUNT_THRESHOLD - wc)/2

                    a_value = ResSen2Vec.SIF_A / (ResSen2Vec.SIF_A + wc)
                    w_v = self.word2vec[w]
                    sub = np.multiply(self.pca_u, w_v)
                    w_v = np.subtract(w_v, sub)

                    sim = a_value * np.dot(w_v, v2_1) / (np.linalg.norm(w_v) * v2_norm_1)
                except Exception as err:
                    # Best effort: a word that cannot be scored (e.g. missing
                    # from the word2vec vocab) is skipped.  Catching Exception
                    # instead of a bare except lets KeyboardInterrupt and
                    # SystemExit propagate.
                    self.log.debug("Skipping keyword candidate {}: {}".format(w, err))
                    continue
                word_sim.append((sim, w))
            # Highest similarity first, so the slice keeps the strongest
            # contributors (an ascending sort would keep the weakest ones).
            word_sim.sort(key=lambda u: u[0], reverse=True)
            top_5 = word_sim[:5]
            cl["keywords"] = ', '.join([ws[1] for ws in top_5])

Example #13

Source File: res_sen2vec.py From resilient-community-apps with MIT License

5 votes

def remove_pca(self, vecs):
        """
        Remove the first principal component from the vectors, following the
        recommendation of "A SIMPLE BUT TOUGH-TO-BEAT BASELINE FOR SENTENCE
        EMBEDDINGS": all vectors share a common component that comes from
        common words.  sk-learn PCA is used to find it.

        The raw first component is also written as JSON to
        FileManage.DEFAULT_PCA_FILE for later use when incidents are compared.

        :param vecs: input vectors
        :return: list of vectors with the component removed
        """
        model = PCA()
        model.fit(np.array(vecs))
        first_component = model.components_[0]

        # Need to store this PCA for later use when we compare incidents
        with open(FileManage.DEFAULT_PCA_FILE, 'w') as outfile:
            json.dump(list(first_component), outfile)

        # NOTE(review): if first_component is 1-D, np.transpose leaves it
        # unchanged and this is an element-wise square, not the outer
        # product u.u^T from the paper - confirm this is intended.
        weights = np.multiply(first_component, np.transpose(first_component))

        # Corner case for small number of dataset. Pad with 0
        missing = self.word2vec.vector_size - len(weights)
        if missing > 0:
            weights = np.append(weights, [0] * missing)

        return [np.subtract(v, np.multiply(weights, v)) for v in vecs]

Example #14

Source File: res_sen2vec.py From resilient-community-apps with MIT License

5 votes

def get_vec_for_sentence(self, sentence):
        """
        Compute the vector of a sentence.

        :param sentence: text that ``self.utils.get_words`` splits into words
        :return: whatever ``get_vec_for_words`` returns for those words
        """
        return self.get_vec_for_words(self.utils.get_words(sentence))

Example #15

Source File: res_sen2vec.py From resilient-community-apps with MIT License

5 votes

def get_vec_for_words(self, words):
        """
        Build a sentence vector with SIF (Smooth Inverse Frequency).

        Only words found in the nltk word set with a positive count
        contribute.  Each contributing word vector is weighted by
        SIF_A / (SIF_A + count) and the weighted vectors are averaged.

        :param words: words of the sentence
        :return: float64 vector of size ``self.word2vec.vector_size``; all
                 zeros when no word contributed
        """
        self.feature_size = self.word2vec.vector_size
        # Accumulator for the weighted word vectors.
        sentence_vec = np.zeros(self.feature_size, dtype="float64")
        # How many words actually contributed to the sentence.
        contributing = 0

        for token in words:
            # We only care about words in nltk words set
            if token not in self.setofwords:
                continue
            freq = self.sif.get_word_count(token)
            if not freq > 0:
                continue
            # some words have unreasonably low count and adjust it a little bit
            if freq < self.COUNT_THRESHOLD:
                freq = self.COUNT_THRESHOLD - (self.COUNT_THRESHOLD - freq)/2
            try:
                # This is the SIF method
                weight = self.SIF_A / (self.SIF_A + freq)
                weighted = np.multiply(weight, self.word2vec[token])
                sentence_vec = np.add(sentence_vec, weighted)
                contributing += 1
            except Exception:
                # Not an error if word is not in the vocab
                self.log.debug("{} is not in the vocab of the word2vec model".format(token))

        # normalize it
        if contributing != 0:
            sentence_vec = np.divide(sentence_vec, contributing)

        return sentence_vec

Example #16

Source File: named_entity.py From razzy-spinner with GNU General Public License v3.0

5 votes

def postag_tree(tree):
    """Rebuild ``tree`` with a part-of-speech tag attached to every leaf.

    All leaves are tagged with a single ``pos_tag`` call.  The returned
    ``Tree`` is labelled 'S'; direct ``Tree`` children keep their label and
    hold ``(child, tag)`` tuples, other direct children become
    ``(child, tag)`` tuples.
    """
    leaves = tree.leaves()
    # Lazily hand out the tags in the same order as the leaves.
    pos_tags = (pos for (_word, pos) in pos_tag(leaves))
    result = Tree('S', [])
    for child in tree:
        if not isinstance(child, Tree):
            result.append((child, next(pos_tags)))
            continue
        branch = Tree(child.label(), [])
        result.append(branch)
        for grandchild in child:
            branch.append((grandchild, next(pos_tags)))
    return result

Example #17

Source File: suggest.py From gitsuggest with MIT License

5 votes

def __construct_lda_model(self):
        """Train the LDA model from which topics of interest are procured.

        The descriptions of the repositories the user has shown interest in
        are fetched, cleaned and tokenized; the resulting token lists are
        used to train a single-topic LDA model stored in ``self.lda_model``.
        """
        # Descriptions of repos of interest, reduced to clean tokens.
        descriptions = self.__get_interests()
        token_lists = self.__clean_and_tokenize(descriptions)

        # Empty input can make LDA generation fail.  A meaningless filler
        # token avoids the exception without producing any real suggestion.
        token_lists = token_lists or [["zkfgzkfgzkfgzkfgzkfgzkfg"]]

        # Setup LDA requisites.
        vocab = corpora.Dictionary(token_lists)
        bags_of_words = [vocab.doc2bow(doc_tokens) for doc_tokens in token_lists]

        # Generate LDA model
        self.lda_model = models.ldamodel.LdaModel(
            bags_of_words, num_topics=1, id2word=vocab, passes=10
        )

Example #18

Source File: rd_ft.py From DeepLearn with MIT License

5 votes

def WPS(text):
    """Count the tokens of ``text`` that appear in the NLTK word list.

    ``text`` is split on whitespace.  Tokens are compared as-is against the
    lower-cased NLTK words, so tokens containing upper-case letters never
    match.

    :param text: string to analyse.
    :return: number of matching tokens (int).
    """
    tokens = text.split()
    if not tokens:
        # Nothing to look up, so the corpus is not loaded at all.
        return 0
    # Build the lower-cased vocabulary once; rebuilding it for every token
    # re-reads the whole corpus on each iteration.
    vocabulary = {w.lower() for w in words.words()}
    return sum(1 for token in tokens if token in vocabulary)

#Average Number Of Syllables In Sentence(Returns Float):

Example #19

Source File: data_processing.py From Sarcasm-Detection with MIT License

5 votes

def split_hashtag_long_version(hashtag):
    """Pick the best split of a hashtag into words and prefix each part with '#'.

    Two strategies are compared.  The first enumerates every possible split
    of the lower-cased hashtag over the NLTK words (minus single letters
    other than 'a' and 'i') and prefers the split with the fewest parts
    whose parts are all in the project word list; before such a split is
    found, the split with the most known parts is kept.  The second is
    ``split_hashtag``.  When both give a result the shorter one wins; when
    neither does, the hashtag itself is returned as the only part.
    """
    known_words = utils.load_file(path + "/res/word_list.txt").split()
    english_words = list(set(words.words()))
    for letter in "bcdefghjklmnopqrstuvwxyz":
        english_words.remove(letter)
    candidates = split_hashtag_to_words_all_possibilities(hashtag.lower(), english_words)

    best_partial_hits = 0
    fewest_parts = 1000
    full_match_seen = False
    best_p = []
    for candidate in candidates:
        hits = sum(1 for part in candidate if part in known_words)
        if hits == len(candidate) and fewest_parts > hits:
            # Every part is known and the split is shorter than the best so far.
            full_match_seen = True
            fewest_parts = hits
            best_p = candidate
        elif hits > best_partial_hits and not full_match_seen:
            best_partial_hits = hits
            best_p = candidate

    greedy = split_hashtag(hashtag, known_words)
    if best_p != [] and greedy != []:
        chosen = best_p if len(best_p) < len(greedy) else greedy
    elif best_p == [] and greedy == []:
        chosen = [hashtag]
    else:
        chosen = best_p if greedy == [] else greedy
    # NOTE(review): if split_hashtag already returns '#'-prefixed parts,
    # parts taken from it get a second '#' here - confirm this is intended.
    return ['#' + str(part) for part in chosen]

Example #20

Source File: data_processing.py From Sarcasm-Detection with MIT License

5 votes

def camel_case_split(term):
    """Split a camel-case term into its parts.

    A space is inserted before every run of digits and after ordinals such
    as '1st' or '4th'; the text is then cut wherever a lower-case letter is
    followed by an upper-case one, or an upper-case run is followed by a
    capitalised word.

    :param term: text to split.
    :return: list of parts, empty for an empty term.
    """
    spaced = re.sub(r'([0-9]+)', r' \1', term)
    spaced = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', spaced)
    # No capturing groups, so findall yields the full text of each match.
    boundary = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return re.findall(boundary, spaced)


# Split a long, compound hash tag into its component tags. Given the character limit of tweets,
# people would stick words together to save space so this is a useful tool.
# Examples of hash splits from real data (train set) are in /stats/hashtag_splits.txt
# Implementation adapted from https://github.com/matchado/HashTagSplitter

Example #21

Source File: art.py From Text-Recognition with GNU Lesser General Public License v2.1

5 votes

def __init__(self, config, abc):
		"""Prepare the text generator: alphabet, word list, fonts and images.

		:param config: configuration mapping; the 'augmentation' entry must
			provide 'font_range', 'border_range' and 'imagenet_no'
		:param abc: iterable of characters the generator may use
		"""
		self.config = config
		self.seed()

		self.all_words = words.words()

		# Characters of abc that are lower-case letters or digits, followed
		# by the full upper-case alphabet; everything else is a symbol.
		lower_and_digits = 'abcdefghijklmnopqrstuvwxyz0123456789'
		kept = ''.join(ch for ch in abc if ch in lower_and_digits)
		self.english_alpha = kept+"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		self.english_symbols = ''.join(ch for ch in abc if ch not in lower_and_digits)
		self.english = self.english_alpha + self.english_symbols

		self.transparent_mean = 0.8
		self.transparent_gaussian = 0.06

		self.prob_lexi = 0.5
		self.symbol_word = 1

		augmentation = self.config['augmentation']
		self.art_font_size_range = augmentation['font_range']
		self.border_range = augmentation['border_range']

		# NOTE(review): machine-specific absolute path.
		self.font_dir_name='/home/Common/Datasets_SSD/Dataset_Text/ART/fonts_for_text'

		# Relative weights for the length of words, normalised to sum to 1.
		length_weights = np.array([0.1, 0.6, 2.6, 5.2, 8.5, 12.2, 14, 14, 12.6, 10.1, 7.5])
		self.probability_dist = length_weights/np.sum(length_weights)

		# Keep the raw bytes of every font file in memory.
		self.all_fonts = []
		for font_path in self.get_list_of_files(self.font_dir_name):
			with open(font_path,"rb") as handle:
				self.all_fonts.append(handle.read())

		# NOTE(review): machine-specific absolute path.
		self.image_net_location = "/media/mayank/0b40607e-7efc-4216-b12f-8bb86facfaed/Dataset_HDD/Image_Net/ILSVRC/Data/CLS-LOC/test/"
		self.images_orig = self.get_imagenet_images(self.config['augmentation']['imagenet_no'])

Example #22

Source File: named_entity.py From luscan-devel with GNU General Public License v2.0

5 votes

def postag_tree(tree):
    """Return a copy of ``tree`` whose leaves are paired with their POS tags.

    Uses the older API found in this file (``Tree.node`` for the label and
    the generator's ``.next()`` method).  The result is a ``Tree`` labelled
    'S'; direct ``Tree`` children keep their label and hold ``(child, tag)``
    tuples, other direct children become ``(child, tag)`` tuples.
    """
    leaves = tree.leaves()
    # Tags come out in leaf order as the tree is traversed.
    tags = (tag for (_leaf, tag) in pos_tag(leaves))
    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append( (node, tags.next()) )
            continue
        subtree = Tree(node.node, [])
        tagged.append(subtree)
        for leaf in node:
            subtree.append( (leaf, tags.next()) )
    return tagged

Example #23

Source File: named_entity.py From luscan-devel with GNU General Public License v2.0

5 votes

def _english_wordlist(self):
        """Return the set of 'en-basic' words, creating and caching it on demand.

        The set is kept in ``self._en_wordlist``; the NLTK corpus is only
        imported and read when that attribute does not exist yet.
        """
        if not hasattr(self, '_en_wordlist'):
            from nltk.corpus import words
            self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist

Example #24

Source File: rd_ft.py From DL-text with MIT License

5 votes

def WPS(text):
    """Return how many whitespace-separated tokens of ``text`` are NLTK words.

    Tokens are looked up unchanged in the lower-cased NLTK word list, so a
    token with upper-case letters is never counted.

    :param text: string to analyse.
    :return: number of tokens found in the word list (int).
    """
    tokens = text.split()
    if not tokens:
        # No tokens: skip loading the corpus.
        return 0
    # The vocabulary does not depend on the token, so build it a single
    # time instead of once per loop iteration.
    known = {w.lower() for w in words.words()}
    return sum(1 for token in tokens if token in known)

#Average Number Of Syllables In Sentence(Returns Float):

Example #25

Source File: wordfinder.py From luscan-devel with GNU General Public License v2.0

4 votes

def wordfinder(words, rows=20, cols=20, attempts=50,
               alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """
    Attempt to arrange words into a letter-grid with the specified
    number of rows and columns.  Try each word in several positions
    and directions, until it can be fitted into the grid, or the
    maximum number of allowable attempts is exceeded.  Returns a tuple
    consisting of the grid and the words that were successfully
    placed.

    The caller's ``words`` list is left untouched: a sorted copy is used.

    :param words: the list of words to be put into the grid
    :type words: list
    :param rows: the number of rows in the grid
    :type rows: int
    :param cols: the number of columns in the grid
    :type cols: int
    :param attempts: the number of times to attempt placing a word
    :type attempts: int
    :param alph: the alphabet, to be used for filling blank cells
    :type alph: str or list
    :rtype: tuple
    """

    # Place longer words first.  sorted(key=len) works on Python 2 and 3 and
    # does not reorder the list owned by the caller, unlike the former
    # in-place words.sort(cmp=...).
    ordered = sorted(words, key=len, reverse=True)

    grid = [[""] * cols for _ in range(rows)]  # the letter grid
    used = []                                  # the words we used

    # try to place each word
    for word in ordered:
        word = word.strip().upper()  # normalize
        save = word                  # keep a record of the word
        word = revword(word)
        for attempt in range(attempts):
            r = random.randint(0, len(word))
            direction = random.choice([1,2,3,4])
            # randint is inclusive, so x or y can start out of range;
            # such attempts are rejected by the bounds check below.
            x = random.randint(0,rows)
            y = random.randint(0,cols)
            if direction == 1:
                x += r
                y += r
            elif direction == 2:
                x += r
            elif direction == 3:
                x += r
                y -= r
            elif direction == 4:
                y += r
            if 0 <= x < rows and 0 <= y < cols:
                if check(word, direction, x, y, grid, rows, cols):
                    used.append(save)
                    break

    # Fill up the remaining spaces
    for i in range(rows):
        for j in range(cols):
            if grid[i][j] == '':
                grid[i][j] = random.choice(alph)

    return grid, used

Example #26

Source File: suggest.py From gitsuggest with MIT License

4 votes

def __clean_and_tokenize(self, doc_list):
        """Turn each usable document into a list of meaningful tokens.

        Documents that are None or longer than ``GitSuggest.MAX_DESC_LEN``
        are dropped.  Every remaining document is lower-cased, split into
        alphabetic tokens, and reduced to the tokens that are in the set of
        words to consider and not in the set of words to ignore.

        :param doc_list: Document list to clean and tokenize.
        :return: List with one token list per retained document.
        """
        # Some repositories fill entire documentation in description. We
        # ignore such repositories for cleaner tokens.
        short_docs = (
            doc for doc in doc_list
            if doc is not None and len(doc) <= GitSuggest.MAX_DESC_LEN
        )

        # Purely alphabetic tokens: drops punctuation, numbers, emojis etc.
        word_tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

        ignored = self.__get_words_to_ignore()
        allowed = self.__get_words_to_consider()

        def keep(token):
            # Meaningful word, not a stop word, and not None.
            return token in allowed and token not in ignored and token is not None

        return [
            [tok for tok in word_tokenizer.tokenize(doc.lower()) if keep(tok)]
            for doc in short_docs
        ]

Example #27

Source File: wordfinder.py From razzy-spinner with GNU General Public License v3.0

4 votes

def wordfinder(words, rows=20, cols=20, attempts=50,
               alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """
    Build a rows x cols letter grid that hides as many of the given words
    as possible.  Words are tried longest first; each one gets up to
    ``attempts`` random positions and directions and is recorded once
    ``check`` accepts a placement.  Cells still empty at the end are filled
    with random letters from ``alph``.

    :param words: the list of words to be put into the grid
    :type words: list
    :param rows: the number of rows in the grid
    :type rows: int
    :param cols: the number of columns in the grid
    :type cols: int
    :param attempts: the number of times to attempt placing a word
    :type attempts: int
    :param alph: the alphabet, to be used for filling blank cells
    :type alph: str or list
    :return: the grid and the (stripped, upper-cased) words that were placed
    :rtype: tuple
    """
    # (x step, y step) applied to the random offset for each direction code.
    steps = {1: (1, 1), 2: (1, 0), 3: (1, -1), 4: (0, 1)}

    grid = [[""] * cols for _ in range(rows)]  # the letter grid
    used = []                                  # the words we used

    # Longer words first.
    for raw in sorted(words, key=len, reverse=True):
        normalized = raw.strip().upper()
        candidate = revword(normalized)
        for _ in range(attempts):
            r = random.randint(0, len(candidate))
            direction = random.choice([1,2,3,4])
            x = random.randint(0,rows)
            y = random.randint(0,cols)
            dx, dy = steps[direction]
            x += dx * r
            y += dy * r
            in_bounds = 0 <= x < rows and 0 <= y < cols
            if in_bounds and check(candidate, direction, x, y, grid, rows, cols):
                used.append(normalized)
                break

    # Fill up the remaining spaces
    for i in range(rows):
        row = grid[i]
        for j in range(cols):
            if row[j] == '':
                row[j] = random.choice(alph)

    return grid, used

Example #28

Source File: data_processing.py From Sarcasm-Detection with MIT License

4 votes

def replace_contracted_form(contracted_word, pos, dictionary):
    """Expand a contracted word into the list of words it stands for.

    :param contracted_word: the token to expand.
    :param pos: single-character part-of-speech tag of the token.
    :param dictionary: container of known lower-case words, used to split
        tokens that look like two words joined by a stray apostrophe.
    :return: list of words.  A token without an apostrophe, or one that
        cannot be expanded, is returned unchanged as a one-element list; an
        apostrophe tagged as punctuation (',') yields an empty list.

    Expansions are looked up in the module-level ``contractions`` mapping.
    """
    if "'" not in contracted_word:
        # The result of list.append() (always None) used to be returned here.
        return [contracted_word]

    lowered = contracted_word.lower()
    split_words = contracted_word.split("'")
    long_form = []
    check_if_in_dict = False
    # Tags are compared with == / in; identity ('is') on str literals relies
    # on interning and raises a SyntaxWarning on Python 3.8+.
    # If the contraction is a nominal + verbal or a proper noun + verbal
    if pos in ('L', 'M'):
        long_form.append(split_words[0])
        suffix = split_words[1].lower()
        if suffix in contractions:
            long_form.extend(contractions[suffix].split())
    # If the contraction is a whole verb (like let's or isn't)
    elif pos in ('V', 'Y', 'O') and lowered in contractions:
        long_form.extend(contractions[lowered].split())
    # If the contraction is proper noun with possessive or a nominal with a
    # possessive or even a (proper) noun
    elif pos in ('S', 'Z', 'D', 'N', '^'):
        if lowered in contractions:
            long_form.extend(contractions[lowered].split())
        elif split_words[1].lower() == 's':
            # Possessive: keep only the owner.
            long_form.append(split_words[0])
        else:
            check_if_in_dict = True
    # Can skip ' which are just punctuation marks (usually used to emphasize
    # or quote something)
    elif pos == ',':
        return []
    # Never replace contractions in emojis or emoticons (will be translated later)
    elif pos == 'E':
        long_form.append(contracted_word)
    else:
        check_if_in_dict = True

    if check_if_in_dict:
        # Attempt to separate words which have been separated by ' by human error
        clean0 = re.findall("[a-zA-Z]+", split_words[0])
        clean1 = re.findall("[a-zA-Z]+", split_words[1])
        if clean0 != [] and clean0[0].lower() in dictionary and clean1 != [] and clean1[0].lower() in dictionary:
            long_form.extend([clean0[0], clean1[0]])
        else:
            # Word couldn't be de-contracted; keep it as it is.
            long_form.append(contracted_word)
    return long_form


# Cannot do lemmatization with NLTK without changing the case - which we don't want
# So lemmatize but remember if upper case or starting with upper letter
# This will be needed when performing CMU pos-tagging or when extracting pragmatic features