Python nltk.corpus.words.words() Examples
The following are 28 code examples of nltk.corpus.words.words().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.corpus.words, or try the search function.
Example #1
Source File: wordfinder.py From razzy-spinner with GNU General Public License v3.0 | 7 votes |
def word_finder():
    """Demo: build and print a word-search puzzle from random NLTK words.

    The NLTK ``words`` corpus is shuffled, the 3-12 letter words among the
    first 200 entries are handed to ``wordfinder``, and the resulting grid is
    printed followed by a numbered list of the words that were placed.
    """
    from nltk.corpus import words

    candidates = words.words()
    random.shuffle(candidates)
    candidates = [w for w in candidates[:200] if 3 <= len(w) <= 12]
    grid, used = wordfinder(candidates)

    print("Word Finder\n")
    for row in grid:
        for letter in row:
            print(letter, end=' ')
        print()
    print()

    for number, placed in enumerate(used, start=1):
        print("%d:" % number, placed)
Example #2
Source File: data_processing.py From Sarcasm-Detection with MIT License | 6 votes |
def split_hashtag(hashtag, word_list):
    """Split a hashtag into '#'-prefixed component words.

    A mixed-case hashtag is split by ``camel_case_split``. Any other hashtag
    is scanned left to right: at each position the longest substring whose
    lower-cased form is in ``word_list`` is taken, and positions where no
    known word starts are skipped one character at a time.

    :param hashtag: hashtag text to split.
    :param word_list: container of known lower-case words.
    :return: list of the pieces found, each prefixed with '#'.
    """
    is_mixed_case = hashtag != hashtag.lower() and hashtag != hashtag.upper()
    if is_mixed_case:
        pieces = camel_case_split(hashtag)
    else:
        pieces = []
        length = len(hashtag)
        start = 0
        while start <= length:
            # The last matching end index wins, i.e. the longest known word.
            end = start
            for stop in range(start + 1, length + 1):
                if hashtag[start:stop].lower() in word_list:
                    end = stop
            if end == start:
                start += 1
            else:
                pieces.append(hashtag[start:end])
                start = end
    return ['#' + str(piece) for piece in pieces]


# Select the best possible hashtag split based on upper-case
# or component words maximizing the length of the possible word split
Example #3
Source File: wordfinder.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def word_finder():
    """Print a demo word-search puzzle made from random NLTK words.

    Shuffles the NLTK ``words`` corpus, keeps the words of 3 to 12 letters
    found among the first 200 entries, lets ``wordfinder`` arrange them, then
    prints the grid and a numbered list of the words that were used.
    """
    from nltk.corpus import words

    pool = words.words()
    random.shuffle(pool)
    pool = [entry for entry in pool[:200] if 3 <= len(entry) <= 12]
    grid, used = wordfinder(pool)

    print("Word Finder\n")
    for line in grid:
        for cell in line:
            print(cell, end=' ')
        print()
    print()

    for position, entry in enumerate(used):
        print("%d:" % (position + 1), entry)
Example #4
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 6 votes |
def __init__(self, w2v, sif, log=None):
    """Set up the sentence-vector builder.

    :param w2v: word-vector model, kept as ``self.word2vec``
    :param sif: word-count provider, kept as ``self.sif``
    :param log: optional logger; the module logger is used when omitted
    """
    # A NLPWord2Vec to get the vec for a word
    self.word2vec = w2v
    # A ResSIF used to get word count
    self.sif = sif
    # Helper used to pre-process data
    self.utils = WordSentenceUtils()
    self.log = log or logging.getLogger(__name__)

    self.sentence_vectors = []
    self.feature_size = 0

    # Download the nltk 'words' resource if necessary, then keep its
    # entries as a set for membership tests
    nltk.download('words', quiet=True)
    self.setofwords = set(nltk_words.words())

    # PCA vector
    self.pca_u = []
Example #5
Source File: suggest.py From gitsuggest with MIT License | 6 votes |
def __get_words_to_ignore(self):
    """Collect every word that should be excluded from the tokens.

    Combines the English stop words with the entries of the
    ``gitlang/languages.txt`` and ``gitlang/others.txt`` files located next
    to this module (one entry per line, surrounding whitespace stripped).

    :return: Set of words to ignore.
    """
    # Stop words in English.
    english_stopwords = stopwords.words("english")

    base_dir = path.abspath(path.dirname(__file__))

    def read_entries(relative_name):
        # One entry per line; strip the newline and any padding.
        with open(path.join(base_dir, relative_name), "r") as handle:
            return [entry.strip() for entry in handle]

    # Languages in git repositories.
    git_languages = read_entries("gitlang/languages.txt")
    # Other words to avoid in git repositories.
    words_to_avoid = read_entries("gitlang/others.txt")

    combined = itertools.chain(english_stopwords, git_languages, words_to_avoid)
    return set(combined)
Example #6
Source File: data_processing.py From Sarcasm-Detection with MIT License | 6 votes |
def build_vocabulary(vocab_filename, lines, minimum_occurrence=1):
    """Return the vocabulary stored in ``vocab_filename``, building it first if needed.

    When the file does not exist, token frequencies are counted over
    ``lines`` (whitespace-split, tokens found in the loose stop-word list are
    skipped, the rest are lower-cased), tokens occurring fewer than
    ``minimum_occurrence`` times are dropped and the remaining words are
    saved to the file. The vocabulary is then loaded from the file.

    :param vocab_filename: path of the vocabulary file.
    :param lines: iterable of text lines to count tokens in.
    :param minimum_occurrence: minimum frequency for a word to be kept.
    :return: set of vocabulary words loaded from ``vocab_filename``.
    """
    if not os.path.exists(vocab_filename):
        stopwords = get_stopwords_list(filename="stopwords_loose.txt")
        print("Building vocabulary...")
        counts = Counter()
        for line in lines:
            counts.update(token.lower() for token in line.split() if token not in stopwords)
        print("The top 10 most common words: ", counts.most_common(10))
        # Keep only the words that occur often enough to be conclusive
        kept = {word: freq for word, freq in counts.items() if freq >= minimum_occurrence}
        utils.save_file(kept.keys(), vocab_filename)
        print("Vocabulary saved to file \"%s\"" % vocab_filename)
    vocabulary = set(utils.load_file(vocab_filename))
    print("Loaded vocabulary of size ", len(vocabulary))
    return vocabulary
Example #7
Source File: wordfinder.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def word_finder():
    """Demo (Python 2 syntax): print a word-search puzzle built from NLTK words.

    Shuffles the NLTK ``words`` corpus, keeps the 3-12 letter words among the
    first 200 entries, passes them to ``wordfinder`` and prints the grid
    followed by a numbered list of the words that were used.
    """
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    # Only the first 200 shuffled words are considered, then filtered by length
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print "Word Finder\n"
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            # Trailing comma keeps the cells of one row on the same output line
            print grid[i][j],
        # End the current row
        print
    print

    # Numbered list (starting at 1) of the words that were placed
    for i in range(len(used)):
        print "%d:" % (i+1), used[i]
Example #8
Source File: suggest.py From gitsuggest with MIT License | 5 votes |
def __get_words_to_consider(self):
    """Build the collection of words that are worth keeping.

    :return: Set of the entries returned by ``words.words()``.
    """
    vocabulary = words.words()
    return set(vocabulary)
Example #9
Source File: named_entity.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def postag_tree(tree):
    """Return a new 'S' tree in which every leaf is paired with its POS tag.

    The leaves of ``tree`` are tagged in one ``pos_tag`` call. Children that
    are ``Tree`` instances are rebuilt with the same label and with
    ``(leaf, tag)`` pairs as their children; other children become
    ``(child, tag)`` pairs directly under the new root.
    """
    # Part-of-speech tagging.
    tags = (tag for (_, tag) in pos_tag(tree.leaves()))

    tagged = Tree('S', [])
    for node in tree:
        if not isinstance(node, Tree):
            tagged.append((node, next(tags)))
            continue
        subtree = Tree(node.label(), [])
        tagged.append(subtree)
        for leaf in node:
            subtree.append((leaf, next(tags)))
    return tagged
Example #10
Source File: named_entity.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def _english_wordlist(self):
    """Return the 'en-basic' word set, loading and caching it on first use.

    The set is stored on the instance as ``_en_wordlist``; later calls return
    the stored set without touching the corpus again.
    """
    try:
        return self._en_wordlist
    except AttributeError:
        # First call: read the corpus once and remember the result.
        from nltk.corpus import words
        self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist
Example #11
Source File: named_entity.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _english_wordlist(self):
    """Lazily load the 'en-basic' word list as a set and cache it.

    The set is kept in ``self._en_wordlist``; once present it is returned
    as-is on every later call.
    """
    try:
        return self._en_wordlist
    except AttributeError:
        # Not cached yet: build the set from the corpus and store it.
        from nltk.corpus import words
        self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist
Example #12
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 5 votes |
def find_keywords(self, sentence, closest):
    """
    For each incident in closest, find words in sentence with highest contribution.

    Every word of the sentence is scored against the incident vector (a
    frequency-derived weight times the cosine similarity between the
    adjusted word vector and the incident vector); the five best-scoring
    words are stored, comma separated, under the incident's "keywords" key.

    :param sentence: description of the input (new) incident
    :param closest: the top closest incidents found; each item must provide a "vec" entry
    :return: None; "keywords" is added to each closest incident in place
    """
    w_util = WordSentenceUtils()
    words = w_util.get_words(sentence)

    for cl in closest:
        v2_1 = cl["vec"]
        v2_norm_1 = np.linalg.norm(v2_1)
        word_sim = []
        for w in words:
            try:
                wc = self.sif.get_word_count(w)
                # Raise very low counts halfway towards the threshold
                if wc < self.COUNT_THRESHOLD:
                    wc = self.COUNT_THRESHOLD - (self.COUNT_THRESHOLD - wc)/2
                a_value = ResSen2Vec.SIF_A / (ResSen2Vec.SIF_A + wc)
                w_v = self.word2vec[w]
                sub = np.multiply(self.pca_u, w_v)
                w_v = np.subtract(w_v, sub)
                sim = a_value * np.dot(w_v, v2_1) / (np.linalg.norm(w_v) * v2_norm_1)
                word_sim.append((sim, w))
            except Exception as e:
                # Best effort: a word that cannot be scored is skipped.
                # Only Exception is caught, so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                self.log.debug("Skipping word {} in keyword search: {}".format(w, e))
        # Highest similarity first, so the slice below really is the top five
        word_sim.sort(key=lambda u: u[0], reverse=True)
        top_5 = word_sim[:5]
        cl["keywords"] = ', '.join([ws[1] for ws in top_5])
Example #13
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 5 votes |
def remove_pca(self, vecs):
    """
    Remove the principal component from the vectors.

    This follows the recommendation of "A SIMPLE BUT TOUGH-TO-BEAT BASELINE
    FOR SENTENCE EMBEDDINGS": according to the authors, all vectors share a
    common principal component that comes from common words. sk-learn PCA is
    used to find it. The first component is also written as JSON to
    ``FileManage.DEFAULT_PCA_FILE`` for later use when incidents are compared.

    :param vecs: input vectors
    :return: list of vectors with the principal component removed
    """
    model = PCA()
    model.fit(np.array(vecs))
    first_component = model.components_[0]

    # Need to store this PCA for later use when we compare incidents
    with open(FileManage.DEFAULT_PCA_FILE, 'w') as outfile:
        json.dump(list(first_component), outfile)

    weights = np.multiply(first_component, np.transpose(first_component))

    # Corner case for small number of dataset. Pad with 0
    missing = self.word2vec.vector_size - len(weights)
    if missing > 0:
        weights = np.append(weights, np.zeros(missing))

    return [np.subtract(v, np.multiply(weights, v)) for v in vecs]
Example #14
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 5 votes |
def get_vec_for_sentence(self, sentence):
    """
    Compute the vector for a sentence: the sentence is split into words by
    ``self.utils.get_words`` and the words are passed to ``get_vec_for_words``.

    :param sentence: the sentence to convert
    :return: the result of ``get_vec_for_words`` for the sentence's words
    """
    return self.get_vec_for_words(self.utils.get_words(sentence))
Example #15
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 5 votes |
def get_vec_for_words(self, words):
    """
    Sentence vector computed with SIF (Smooth Inverse Frequency) weighting.

    Only words present in ``self.setofwords`` with a positive count from
    ``self.sif`` contribute. Each contributing word vector is scaled by
    ``SIF_A / (SIF_A + count)`` and the scaled vectors are averaged. Words
    whose lookup or scaling raises are logged at debug level and skipped.
    ``self.feature_size`` is refreshed from the word2vec model on every call.

    :param words: words of the sentence
    :return: float64 vector of length ``self.feature_size`` (all zeros when no word contributes)
    """
    self.feature_size = self.word2vec.vector_size
    # Accumulator, starts as the zero vector
    total = np.zeros(self.feature_size, dtype="float64")
    used = 0  # number of words that contributed to the sum

    for word in words:
        # We only care about words in the nltk words set
        if word not in self.setofwords:
            continue
        word_count = self.sif.get_word_count(word)
        if not word_count > 0:
            continue
        # Some words have an unreasonably low count; adjust it a little bit
        if word_count < self.COUNT_THRESHOLD:
            word_count = self.COUNT_THRESHOLD - (self.COUNT_THRESHOLD - word_count)/2
        try:
            # This is the SIF method
            weight = self.SIF_A / (self.SIF_A + word_count)
            total = np.add(total, np.multiply(weight, self.word2vec[word]))
            used += 1
        except Exception:
            # Not an error if word is not in the vocab
            self.log.debug("{} is not in the vocab of the word2vec model".format(word))

    # Normalize by the number of contributing words
    if used != 0:
        total = np.divide(total, used)
    return total
Example #16
Source File: named_entity.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def postag_tree(tree):
    """Rebuild ``tree`` under an 'S' root with each leaf paired with its POS tag.

    All leaves are tagged by a single ``pos_tag`` call. A child that is a
    ``Tree`` is copied with the same label and ``(leaf, tag)`` children; any
    other child is added to the root as a ``(child, tag)`` pair.
    """
    # Part-of-speech tagging.
    remaining_tags = (tag for (_, tag) in pos_tag(tree.leaves()))

    result = Tree('S', [])
    for item in tree:
        if not isinstance(item, Tree):
            result.append((item, next(remaining_tags)))
            continue
        branch = Tree(item.label(), [])
        result.append(branch)
        for leaf in item:
            branch.append((leaf, next(remaining_tags)))
    return result
Example #17
Source File: suggest.py From gitsuggest with MIT License | 5 votes |
def __construct_lda_model(self):
    """Train the LDA model from which topics of interest are later read.

    The descriptions of the repositories of interest are fetched, cleaned
    and tokenized, and the resulting token lists are used to train a
    single-topic LDA model that is stored as ``self.lda_model``.
    """
    # Descriptions of the repositories of interest, as clean token lists.
    interests = self.__get_interests()
    token_docs = self.__clean_and_tokenize(interests)

    # An empty token list can make LDA generation raise. A meaningful
    # placeholder would produce suggestions without reason, so a random
    # string is used: it keeps LDA working without matching anything.
    if not token_docs:
        token_docs = [["zkfgzkfgzkfgzkfgzkfgzkfg"]]

    # Setup LDA requisites.
    vocabulary = corpora.Dictionary(token_docs)
    bags = [vocabulary.doc2bow(doc) for doc in token_docs]

    # Generate LDA model
    self.lda_model = models.ldamodel.LdaModel(
        bags, num_topics=1, id2word=vocabulary, passes=10
    )
Example #18
Source File: rd_ft.py From DeepLearn with MIT License | 5 votes |
def WPS(text):
    """Count the whitespace-separated tokens of ``text`` that are known words.

    A token counts when it equals the lower-cased form of an entry returned
    by ``words.words()``; tokens themselves are compared as given.

    :param text: text to inspect.
    :return: number of matching tokens (int).
    """
    tokens = text.split()
    if not tokens:
        # Nothing to look up, so the corpus is not read at all.
        return 0
    # Build the lookup set once instead of once per token.
    known_words = set(w.lower() for w in words.words())
    return sum(1 for word in tokens if word in known_words)


#Average Number Of Syllables In Sentence(Returns Float):
Example #19
Source File: data_processing.py From Sarcasm-Detection with MIT License | 5 votes |
def split_hashtag_long_version(hashtag):
    """Pick a split of ``hashtag`` into '#'-prefixed words from two strategies.

    All candidate splits from ``split_hashtag_to_words_all_possibilities``
    are scored by how many of their parts appear in the word list loaded from
    ``/res/word_list.txt``. The best candidate is then compared with the
    result of ``split_hashtag``: when both are non-empty the one with fewer
    parts is kept (``split_hashtag``'s result on a tie); when only one is
    non-empty it is kept; otherwise the hashtag itself is used.

    :param hashtag: hashtag text to split.
    :return: list of the chosen parts, each prefixed with '#'.
    """
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    # Single letters other than 'a' and 'i' are not treated as words
    for letter in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(letter)
    all_poss = split_hashtag_to_words_all_possibilities(hashtag.lower(), word_dictionary)

    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = sum(1 for part in poss if part in word_list)
        if counter == len(poss) and min_len > counter:
            # Every part is a known word: prefer the split with the fewest parts
            found = True
            min_len = counter
            best_p = poss
        elif counter > max_p and not found:
            # No fully-known split yet: keep the one with the most known parts
            max_p = counter
            best_p = poss

    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    elif best_p == [] and best_p_v2 == []:
        split_words = [hashtag]
    else:
        split_words = best_p if best_p_v2 == [] else best_p_v2
    return ['#' + str(s) for s in split_words]
Example #20
Source File: data_processing.py From Sarcasm-Detection with MIT License | 5 votes |
def camel_case_split(term):
    """Split ``term`` at its camel-case boundaries.

    A space is first inserted before every run of digits and after every
    ordinal such as '1st' or '2nd'. The text is then cut wherever a
    lower-case letter is followed by an upper-case one, or an upper-case run
    is followed by a capitalised word.

    :param term: text to split.
    :return: list of the pieces, in order.
    """
    spaced = re.sub(r'([0-9]+)', r' \1', term)
    spaced = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', spaced)
    # The pattern has no capturing groups, so findall yields whole matches.
    boundary = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return re.findall(boundary, spaced)


# Split a long, compound hash tag into its component tags. Given the character limit of tweets,
# people would stick words together to save space so this is a useful tool.
# Examples of hash splits from real data (train set) are in /stats/hashtag_splits.txt
# Implementation adapted from https://github.com/matchado/HashTagSplitter
Example #21
Source File: art.py From Text-Recognition with GNU Lesser General Public License v2.1 | 5 votes |
def __init__(self, config, abc):
    """Set up the generator: alphabet, word list, fonts and background images.

    :param config: configuration mapping; the 'augmentation' section supplies
        'font_range', 'border_range' and 'imagenet_no'.
    :param abc: iterable of characters making up the alphabet.
    """
    # def __init__(self):
    self.config = config
    self.seed()
    self.all_words = words.words()

    # Characters of abc that are lower-case letters or digits, followed by
    # the upper-case letters; everything else in abc counts as a symbol.
    lower_and_digits = 'abcdefghijklmnopqrstuvwxyz0123456789'
    self.english_alpha = ''.join([c for c in abc if c in lower_and_digits])+"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    self.english_symbols = ''.join([c for c in abc if c not in lower_and_digits])
    self.english = self.english_alpha + self.english_symbols

    self.transparent_mean = 0.8
    self.transparent_gaussian = 0.06
    self.prob_lexi = 0.5
    self.symbol_word = 1

    self.art_font_size_range = self.config['augmentation']['font_range']
    self.border_range = self.config['augmentation']['border_range']
    self.font_dir_name='/home/Common/Datasets_SSD/Dataset_Text/ART/fonts_for_text'

    # Probability distribution for length of words, normalised to sum to 1
    # (previously continued with: 5.2, 3.2, 2, 1, 0.6, 0.3, 0.2, 0.1, 0.1)
    raw_weights = np.array([0.1, 0.6, 2.6, 5.2, 8.5, 12.2, 14, 14, 12.6, 10.1, 7.5])
    self.probability_dist = raw_weights
    self.probability_dist = self.probability_dist/np.sum(self.probability_dist)

    # Read the raw bytes of every font file
    list_of_files = self.get_list_of_files(self.font_dir_name)
    self.all_fonts = []
    for font_path in list_of_files:
        with open(font_path,"rb") as font_file:
            font_bytes = font_file.read()
        self.all_fonts.append(font_bytes)

    self.image_net_location = "/media/mayank/0b40607e-7efc-4216-b12f-8bb86facfaed/Dataset_HDD/Image_Net/ILSVRC/Data/CLS-LOC/test/"
    # self.image_net_location = "/home/Common/ImageNet/test"
    self.images_orig = self.get_imagenet_images(self.config['augmentation']['imagenet_no'])#self.config['augmentation']['base_number']
    # self.image_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Images/'
    # self.label_save_location = '/home/Common/Mayank/Text/Segmentation/Dataset/ART/Labels/'
Example #22
Source File: named_entity.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def postag_tree(tree):
    """Rebuild ``tree`` under an 'S' root with each leaf paired with its POS tag.

    All leaves are tagged by a single ``pos_tag`` call. A child that is a
    ``Tree`` is copied with the same ``node`` value and ``(leaf, tag)``
    children; any other child is added to the root as a ``(child, tag)`` pair.
    """
    # Part-of-speech tagging.
    tags = (tag for (_, tag) in pos_tag(tree.leaves()))

    tagged = Tree('S', [])
    for item in tree:
        if not isinstance(item, Tree):
            tagged.append((item, next(tags)))
            continue
        branch = Tree(item.node, [])
        tagged.append(branch)
        for leaf in item:
            branch.append((leaf, next(tags)))
    return tagged
Example #23
Source File: named_entity.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _english_wordlist(self):
    """Return the cached 'en-basic' word set, creating it on the first call.

    The set lives in ``self._en_wordlist``; the corpus is only read when
    that attribute does not exist yet.
    """
    try:
        return self._en_wordlist
    except AttributeError:
        # No cache yet: load the word list once and keep it on the instance.
        from nltk.corpus import words
        self._en_wordlist = set(words.words('en-basic'))
        return self._en_wordlist
Example #24
Source File: rd_ft.py From DL-text with MIT License | 5 votes |
def WPS(text):
    """Count the whitespace-separated tokens of ``text`` that are known words.

    A token counts when it equals the lower-cased form of an entry returned
    by ``words.words()``; tokens themselves are compared as given.

    :param text: text to inspect.
    :return: number of matching tokens (int).
    """
    tokens = text.split()
    if not tokens:
        # Nothing to look up, so the corpus is not read at all.
        return 0
    # Build the lookup set once instead of once per token.
    vocabulary = set(w.lower() for w in words.words())
    return sum(1 for token in tokens if token in vocabulary)


#Average Number Of Syllables In Sentence(Returns Float):
Example #25
Source File: wordfinder.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def wordfinder(words, rows=20, cols=20, attempts=50, alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """
    Attempt to arrange words into a letter-grid with the specified number of
    rows and columns. Each word is tried in several random positions and
    directions until it fits into the grid or the maximum number of attempts
    is exceeded. Cells left empty are filled with random letters of ``alph``.
    The ``words`` list is sorted in place, longest first.

    :param words: the list of words to be put into the grid
    :type words: list
    :param rows: the number of rows in the grid
    :type rows: int
    :param cols: the number of columns in the grid
    :type cols: int
    :param attempts: the number of times to attempt placing a word
    :type attempts: int
    :param alph: the alphabet, to be used for filling blank cells
    :type alph: sequence of characters
    :return: the grid and the words that were successfully placed
    :rtype: tuple
    """
    # place longer words first
    words.sort(key=len, reverse=True)

    grid = [[""] * cols for _ in range(rows)]   # the letter grid
    used = []                                   # the words we used

    # (row step, column step) applied to the drawn position, per direction code
    shifts = {1: (1, 1), 2: (1, 0), 3: (1, -1), 4: (0, 1)}

    # try to place each word
    for word in words:
        word = strip(word).upper()  # normalize
        save = word                 # keep a record of the word
        word = revword(word)
        for attempt in range(attempts):
            r = random.randint(0, len(word))
            direction = random.choice([1,2,3,4])
            x = random.randint(0,rows)
            y = random.randint(0,cols)
            row_step, col_step = shifts[direction]
            x += row_step * r
            y += col_step * r
            if 0<=x<rows and 0<=y<cols and check(word, direction, x, y, grid, rows, cols):
                used.append(save)
                break

    # Fill up the remaining spaces
    for row in grid:
        for j in range(cols):
            if row[j] == '':
                row[j] = random.choice(alph)

    return grid, used
Example #26
Source File: suggest.py From gitsuggest with MIT License | 4 votes |
def __clean_and_tokenize(self, doc_list):
    """Method to clean and tokenize the document list.

    Documents that are ``None`` or longer than ``GitSuggest.MAX_DESC_LEN``
    are dropped. Every remaining document is lower-cased, split into runs of
    ASCII letters, and reduced to the tokens that are dictionary words and
    not in the ignore list.

    :param doc_list: Document list to clean and tokenize.
    :return: List with one token list per kept document.
    """
    # Some repositories fill entire documentation in description. We ignore
    # such repositories for cleaner tokens.
    short_docs = (
        doc for doc in doc_list
        if doc is not None and len(doc) <= GitSuggest.MAX_DESC_LEN
    )

    # Regular expression to remove out all punctuations, numbers and other
    # un-necessary text substrings like emojis etc.
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+")

    # Get stop words.
    stopwords = self.__get_words_to_ignore()

    # Get english words.
    dict_words = self.__get_words_to_consider()

    cleaned_doc_list = []
    for doc in short_docs:
        # Lowercase, then tokenize removing numbers and punctuation.
        tokens = tokenizer.tokenize(doc.lower())
        # Keep meaningful words, drop stop words and any None.
        cleaned_doc_list.append([
            tok for tok in tokens
            if tok in dict_words and tok not in stopwords and tok is not None
        ])

    return cleaned_doc_list
Example #27
Source File: wordfinder.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def wordfinder(words, rows=20, cols=20, attempts=50, alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
    """
    Attempt to arrange words into a letter-grid with the specified number of
    rows and columns. Each word is tried in several random positions and
    directions until it fits into the grid or the maximum number of attempts
    is exceeded. Cells left empty are filled with random letters of ``alph``.

    :param words: the list of words to be put into the grid
    :type words: list
    :param rows: the number of rows in the grid
    :type rows: int
    :param cols: the number of columns in the grid
    :type cols: int
    :param attempts: the number of times to attempt placing a word
    :type attempts: int
    :param alph: the alphabet, to be used for filling blank cells
    :type alph: sequence of characters
    :return: the grid and the words that were successfully placed
    :rtype: tuple
    """
    # place longer words first
    ordered = sorted(words, key=len, reverse=True)

    grid = [[""] * cols for _ in range(rows)]   # the letter grid
    used = []                                   # the words we used

    # (row step, column step) applied to the drawn position, per direction code
    shifts = {1: (1, 1), 2: (1, 0), 3: (1, -1), 4: (0, 1)}

    # try to place each word
    for raw in ordered:
        save = raw.strip().upper()  # normalized word, kept as the record
        word = revword(save)
        for attempt in range(attempts):
            r = random.randint(0, len(word))
            direction = random.choice([1,2,3,4])
            x = random.randint(0,rows)
            y = random.randint(0,cols)
            row_step, col_step = shifts[direction]
            x += row_step * r
            y += col_step * r
            if 0<=x<rows and 0<=y<cols and check(word, direction, x, y, grid, rows, cols):
                used.append(save)
                break

    # Fill up the remaining spaces
    for row in grid:
        for j in range(cols):
            if row[j] == '':
                row[j] = random.choice(alph)

    return grid, used
Example #28
Source File: data_processing.py From Sarcasm-Detection with MIT License | 4 votes |
def replace_contracted_form(contracted_word, pos, dictionary):
    """Expand a contracted word into its long form, guided by its POS tag.

    :param contracted_word: the token to expand.
    :param pos: part-of-speech tag of the token (single-character codes
        such as 'L', 'V', 'N', ',' or 'E').
    :param dictionary: container of known lower-case words, used to split
        tokens that were joined by an apostrophe by mistake.
    :return: list of words. ``[contracted_word]`` when the token has no
        apostrophe or cannot be expanded; ``[]`` when the tag is ',' (the
        apostrophe is plain punctuation).
    """
    if "'" not in contracted_word:
        # Always hand back a list: list.append() returns None, so returning
        # its result (as before) gave callers None instead of the word.
        return [contracted_word]

    long_form = []
    split_words = contracted_word.split("'")
    lowered = contracted_word.lower()
    check_if_in_dict = False

    # Tags are compared by value; identity comparison with string literals
    # depends on interning and raises a SyntaxWarning on Python 3.8+.
    # If the contraction is a nominal + verbal or a proper noun + verbal
    if pos in ('L', 'M'):
        long_form.append(split_words[0])
        suffix = split_words[1].lower()
        if suffix in contractions:
            long_form.extend(contractions[suffix].split())
    # If the contraction is a whole verb (like let's or isn't)
    elif pos in ['V', 'Y', 'O'] and lowered in contractions:
        long_form.extend(contractions[lowered].split())
    # If the contraction is proper noun with possessive or a nominal with a
    # possessive or even a (proper) noun
    elif pos in ['S', 'Z', 'D', 'N', '^']:
        if lowered in contractions:
            long_form.extend(contractions[lowered].split())
        elif split_words[1].lower() == 's':
            long_form.append(split_words[0])
        else:
            check_if_in_dict = True
    # Can skip ' which are just punctuation marks (usually used to emphasize
    # or quote something)
    elif pos == ',':
        return []
    # Never replace contractions in emojis or emoticons (will be translated later)
    elif pos == 'E':
        long_form.append(contracted_word)
    else:
        check_if_in_dict = True

    if check_if_in_dict:
        # Attempt to separate words which have been separated by ' by human error
        clean0 = re.findall("[a-zA-Z]+", split_words[0])
        clean1 = re.findall("[a-zA-Z]+", split_words[1])
        if clean0 and clean0[0].lower() in dictionary and clean1 and clean1[0].lower() in dictionary:
            long_form.extend([clean0[0], clean1[0]])
        else:
            # Word couldn't be de-contracted: keep it unchanged
            long_form.append(contracted_word)
    return long_form


# Cannot do lemmatization with NLTK without changing the case - which we don't want
# So lemmatize but remember if upper case or starting with upper letter
# This will be needed when performing CMU pos-tagging or when extracting pragmatic features