Python nltk.wordpunct_tokenize() Examples
The following are 19 code examples showing how to use nltk.wordpunct_tokenize(). They are extracted from open source projects; the project, author, source file, and license are noted above each example.
You may also want to check out all available functions/classes of the nltk module.
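Before the project examples, a quick illustration of what the tokenizer does: wordpunct_tokenize() is a regular-expression tokenizer (pattern \w+|[^\w\s]+) that splits text into alphabetic and non-alphabetic runs, so punctuation comes out as separate tokens. A minimal sketch:

    from nltk import wordpunct_tokenize

    print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
    # ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']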
Example 1
Project: product-classifier Author: georgetown-analytics File: features.py License: MIT License

def tokenize(self, text):
    """
    Returns a list of individual tokens from the text utilizing NLTK's
    tokenize built in utility (far better than split on space). It also
    removes any stopwords and punctuation from the text, as well as
    ensure that every token is normalized.

    For now, token = word as in bag of words (the feature we're using).
    """
    for token in wordpunct_tokenize(text):
        token = self.normalize(token)
        if token in self.punctuation:
            continue
        if token in self.stopwords:
            continue
        yield token
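The same filtering pattern can be sketched outside the classifier class using NLTK's stopword list and string.punctuation; here lowercasing stands in for the project's normalize() method, which is an assumption (the real method may do more):

    import string
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords   # requires the NLTK 'stopwords' corpus

    STOPWORDS = set(stopwords.words('english'))
    PUNCT = set(string.punctuation)

    def tokens(text):
        # Lowercasing stands in for the project's normalize();
        # drop stopwords and bare punctuation tokens.
        for token in wordpunct_tokenize(text):
            token = token.lower()
            if token in PUNCT or token in STOPWORDS:
                continue
            yield token

    print(list(tokens("The quick brown fox jumps over the lazy dog!")))
    # ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']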
Example 2
Project: atap Author: foxbook File: parse.py License: Apache License 2.0

def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
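The grammar object is defined elsewhere in the project's parse.py; a minimal self-contained sketch with a made-up toy CFG shows how the pieces fit together:

    import nltk

    # Hypothetical toy grammar; the project defines its own grammar elsewhere.
    grammar = nltk.CFG.fromstring("""
        S -> NP VP
        NP -> DT NN
        VP -> VB NP
        DT -> 'the'
        NN -> 'dog' | 'ball'
        VB -> 'chased'
    """)

    def parse(sent):
        parser = nltk.ChartParser(grammar)
        tokens = nltk.wordpunct_tokenize(sent)
        return parser.parse(tokens)

    for tree in parse("the dog chased the ball"):
        print(tree)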
Example 3
Project: atap Author: foxbook File: recommender.py License: Apache License 2.0

def recommend(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.vect.transform(wordpunct_tokenize(terms))
    distance_matches = self.knn.transform(vect_doc)
    # the result is a list with a 2-tuple of arrays
    matches = distance_matches[0][1][0]
    # the matches are the indices of documents
    return matches
Example 4
Project: atap Author: foxbook File: recommender.py License: Apache License 2.0

def query(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.transformer.named_steps['transform'].fit_transform(
        wordpunct_tokenize(terms)
    )
    dists, inds = self.tree.query(vect_doc, k=self.k)
    return inds[0]
Example 5
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example 6
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.corpus.paras(fileids=fileids):
        yield [
            pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example 7
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example 8
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.paras(fileids=fileids):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
Example 9
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    started = time.time()

    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in sent_tokenize(para):
            counts['sents'] += 1

            for word in wordpunct_tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self.resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self.resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files':  n_fileids,
        'topics': n_topics,
        'paras':  counts['paras'],
        'sents':  counts['sents'],
        'words':  counts['words'],
        'vocab':  len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc':  float(counts['paras']) / float(n_fileids),
        'sppar':  float(counts['sents']) / float(counts['paras']),
        'secs':   time.time() - started,
    }
Example 10
Project: atap Author: foxbook File: preprocess.py License: Apache License 2.0

def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn
    are lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
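The same segment/tokenize/tag pipeline can be exercised on a plain string without the corpus reader; a minimal sketch, assuming the sample paragraph below (made up for illustration) and the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages are available:

    from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

    # Sample paragraph for illustration only.
    paragraph = ("NLTK ships several tokenizers. "
                 "The word-punct tokenizer splits on punctuation.")

    tagged = [
        pos_tag(wordpunct_tokenize(sent))
        for sent in sent_tokenize(paragraph)
    ]
    for sent in tagged:
        print(sent)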
Example 11
Project: atap Author: foxbook File: am_reader.py License: Apache License 2.0

def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example 12
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example 13
Project: atap Author: foxbook File: reader.py License: Apache License 2.0

def tagged_tokens(self):
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            # pos_tag expects a list of tokens, so wrap the single word
            # and yield one (word, tag) pair per token.
            yield nltk.pos_tag([word])[0]
Example 14
Project: luscan-devel Author: blackye File: natural_language.py License: GNU General Public License v2.0

def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#------------------------------------------------------------------------------
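A hypothetical call to exercise the function, taking the highest-scoring language; note the snippet above is Python 2 code (it type-checks against basestring) and needs the NLTK 'stopwords' corpus:

    scores = calculate_language_scores("The quick brown fox jumps over the lazy dog")
    print(max(scores, key=scores.get))  # most likely language, e.g. 'english'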
Example 15
Project: dialog Author: edward-zhu File: cluster.py License: Apache License 2.0

def normalize(sent):
    return wordpunct_tokenize(sent.lower())
Example 16
Project: dialog Author: edward-zhu File: utils.py License: Apache License 2.0

def tokenize(sent):
    # `tokenizer` is assumed to be defined at module level elsewhere in utils.py;
    # tokens containing '<' (e.g. placeholder markup) are kept intact.
    tokens = tokenizer.tokenize(sent)
    ret = []
    for t in tokens:
        if '<' not in t:
            ret.extend(wordpunct_tokenize(t))
        else:
            ret.append(t)
    return ret
Example 17
Project: transferable_sent2vec Author: wasiahmad File: helper.py License: MIT License

def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens
                       if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list
Example 18
Project: transferable_sent2vec Author: wasiahmad File: helper.py License: MIT License

def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens
                       if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list
Example 19
Project: luscan-devel Author: blackye File: natural_language.py License: GNU General Public License v2.0

def get_words(text, min_length=None, max_length=None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required by each token. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed by each token. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)

    :raises: TypeError, ValueError
    """
    if min_length is not None:
        if not isinstance(min_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif min_length < 0:
            raise ValueError("Min length must be greater than 0, got %s." % min_length)
    if max_length is not None:
        if not isinstance(max_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(max_length))
        elif max_length < 0:
            raise ValueError("Max length must be greater than 0, got %s" % max_length)

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower()
        for word in wordpunct_tokenize(text)
        if (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }


#------------------------------------------------------------------------------
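A hypothetical call, keeping only alphabetic words of four or more characters:

    words = get_words("Tokenizers split raw text into words and punctuation marks.",
                      min_length=4)
    print(sorted(words))
    # ['into', 'marks', 'punctuation', 'split', 'text', 'tokenizers', 'words']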