Python nltk.tokenize() Examples
The following are 30 code examples showing how to use the nltk.tokenize module. The examples are extracted from open source projects; follow the link above each example to see the original project and source file. You may also want to check out the other available functions and classes of the nltk module.
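Before the project examples, here is a minimal, self-contained sketch of the two most common entry points in nltk.tokenize, sent_tokenize and word_tokenize. It assumes the Punkt models have already been fetched with nltk.download('punkt'); the sample string is invented for illustration.

# Minimal sketch: sentence splitting followed by word tokenization.
# Assumes the Punkt models are installed (nltk.download('punkt')).
from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK ships several tokenizers. This example uses two of them."
sentences = sent_tokenize(text)                 # ['NLTK ships several tokenizers.', 'This example uses two of them.']
tokens = [word_tokenize(s) for s in sentences]  # nested lists of word tokens
print(sentences)
print(tokens)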
Example 1
Project: razzy-spinner Author: rafasashi File: util.py License: GNU General Public License v3.0 | 6 votes |
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
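Example 1 pulls WhitespaceTokenizer out of nltk.tokenize.regexp. As a point of reference, here is a minimal standalone sketch of just that tokenizer, with an invented sample sentence (no corpus downloads are needed for it):

# Whitespace-only splitting: punctuation stays attached to the neighbouring word.
from nltk.tokenize import WhitespaceTokenizer

word_tokenizer = WhitespaceTokenizer()
print(word_tokenizer.tokenize("This movie was, frankly, terrible!"))
# ['This', 'movie', 'was,', 'frankly,', 'terrible!']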
Example 2
Project: sato Author: megagonlabs File: train_LDA.py License: Apache License 2.0 | 6 votes |
def process_col(col, **kwargs):
    numeric = kwargs['num']
    # process the cols to return a bags of word representation
    if col.dtype == 'int64' or col.dtype == 'float64':
        if numeric == 'directstr':
            return list(col.astype(str))
        elif numeric == 'placeholder':
            return [str(col.dtype)] * len(col)

    if col.dtype == 'object':
        return tokenize(list(col.astype(str)), **kwargs)
    else:
        return list(col.astype(str))

    return col
Example 3
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
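The comment in Example 3 notes that instantiating TreebankWordTokenizer directly sidesteps the downloader that nltk.word_tokenize would otherwise invoke for sentence splitting. A minimal sketch of that trick in isolation, with an invented sample string:

# PTB-style word tokenization is regex-based, so no NLTK data downloads are required.
from nltk.tokenize import TreebankWordTokenizer

tokenize = TreebankWordTokenizer().tokenize
print(tokenize("They'll tokenize this sentence, won't they?"))
# contractions are split into separate tokens, e.g. "They'll" -> "They", "'ll"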
Example 4
Project: clickbait Author: bhargaviparanjape File: experiments.py License: MIT License | 6 votes |
def handle_multiple_sentences(infile, outfile):
    titles = []
    f = open(infile, "r")
    f2 = codecs.open(outfile, "w+", "utf-8")
    for line in f:
        line = line.decode("utf-8")
        sentences = sent_detector.tokenize(line.strip())
        for i in range(len(sentences)):
            if i == 0:
                sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
            else:
                sentences[i] = sentences[i].replace(sentences[i].split()[0], sentences[i].split()[0].title())
                sentences[i-1] = sentences[i-1].replace(sentences[i-1].split()[-1][-1], " ::::")
        titles.append(" ".join(sentences))
    title_set = set(titles)
    for l in title_set:
        print >> f2, l
Example 5
Project: atap Author: foxbook File: reader.py License: Apache License 2.0 | 6 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
Example 6
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
Example 7
Project: flambe Author: asappresearch File: word.py License: MIT License | 6 votes |
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.

    Parameters
    ----------
    example : str
        The input example, as a string.

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    if self.exclude_stopwords and self.stop_words:
        example = ' '.join([word for word in word_tokenize(example)
                            if word not in self.stop_words])

    if isinstance(self.ngrams, List):
        ret: List[str] = []
        for i in self.ngrams:
            ret.extend(self._tokenize(example, i))
        return ret
    else:
        return NGramsTokenizer._tokenize(example, self.ngrams)
Example 8
Project: Dense-CoAttention-Network Author: cvlab-tohoku File: load_data.py License: MIT License | 6 votes |
def tokenize_mcb(sentence):
    """
    MCB tokenize implementation.
    --------------------
    Arguments:
        sentence (str): a sentence that will be tokenized.
    Return:
        A list of tokens from the sentence.
    """
    sen = sentence
    for i in [r"\?", r"\!", r"\'", r"\"", r"\$", r"\:", r"\@", r"\(", r"\)", r"\,", r"\.", r"\;"]:
        sen = re.sub(i, "", sen)

    for i in [r"\-", r"\/"]:
        sen = re.sub(i, " ", sen)

    q_list = re.sub(r"\?", "", sen.lower()).split()
    q_list = list(filter(lambda x: len(x) > 0, q_list))

    return q_list
Example 9
Project: modin Author: modin-project File: kaggle18.py License: Apache License 2.0 | 6 votes |
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e)
Example 10
Project: DOTA_models Author: ringringyi File: skip_thoughts_encoder.py License: Apache License 2.0 | 5 votes |
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
        tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized
Example 11
Project: steppy-toolkit Author: minerva-ml File: text.py License: MIT License | 5 votes |
def _use_stopwords(self, x):
    words = tokenizer.tokenize(x)
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example 12
Project: steppy-toolkit Author: minerva-ml File: text.py License: MIT License | 5 votes |
def _apostrophes(self, x):
    words = tokenizer.tokenize(x)
    words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if not w in eng_stopwords]
    x = " ".join(words)
    return x
Example 13
Project: razzy-spinner Author: rafasashi File: util.py License: GNU General Public License v3.0 | 5 votes |
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)  # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot == True:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
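A possible way to exercise the demo above; it assumes the opinion lexicon has been fetched with nltk.download('opinion_lexicon'), and the sentences are invented:

# Hypothetical calls to demo_liu_hu_lexicon; requires the opinion_lexicon corpus.
demo_liu_hu_lexicon("This is a wonderful and amazing movie")          # expected to print 'Positive'
demo_liu_hu_lexicon("The plot was dull and the acting was terrible")  # expected to print 'Negative'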
Example 14
Project: razzy-spinner Author: rafasashi File: chunked.py License: GNU General Public License v3.0 | 5 votes |
def read_block(self, stream):
    block = []
    for para_str in self._para_block_reader(stream):
        para = []
        for sent_str in self._sent_tokenizer.tokenize(para_str):
            sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
                                       target_tagset=self._target_tagset)

            # If requested, throw away the tags.
            if not self._tagged:
                sent = self._untag(sent)

            # If requested, throw away the chunks.
            if not self._chunked:
                sent = sent.leaves()

            # Add the sentence to `para`.
            if self._group_by_sent:
                para.append(sent)
            else:
                para.extend(sent)

        # Add the paragraph to `block`.
        if self._group_by_para:
            block.append(para)
        else:
            block.extend(para)

    # Return the block
    return block
Example 15
Project: razzy-spinner Author: rafasashi File: rte_classify.py License: GNU General Public License v3.0 | 5 votes |
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
Example 16
Project: sato Author: megagonlabs File: train_LDA.py License: Apache License 2.0 | 5 votes |
def tokenize(col, **kwargs):
    threshold = int(kwargs['thr'])
    ret = []
    for st in col:
        if len(st) > threshold:
            # tokenize the string if longer than threshold
            # and append a longstr tag
            ret.extend(clean(st))
            if threshold > 0:
                ret.append('longstr')
        else:
            ret.append(st.lower())
    return ret
Example 17
Project: StackGAN Author: hanzhanggit File: skipthoughts.py License: MIT License | 5 votes |
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
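The preprocess helper above (and several later examples) relies on the pre-trained Punkt model for sentence splitting. A minimal standalone sketch of that step, assuming the model has been fetched with nltk.download('punkt') and using an invented sample paragraph:

# Load the English Punkt sentence splitter directly and apply it to raw text.
import nltk

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
paragraph = "Punkt splits raw text into sentences. It is an unsupervised, pre-trained model."
print(sent_detector.tokenize(paragraph))
# ['Punkt splits raw text into sentences.', 'It is an unsupervised, pre-trained model.']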
Example 18
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 5 votes |
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example 19
Project: text-to-image Author: paarthneekhara File: skipthoughts.py License: MIT License | 5 votes |
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Example 20
Project: text-to-image Author: paarthneekhara File: skipthoughts.py License: MIT License | 5 votes |
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Example 21
Project: chirp Author: 9b File: helpers.py License: MIT License | 5 votes |
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text())
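A hypothetical call to the helper above, assuming the module's imports (BeautifulSoup and RegexpTokenizer) are in place; the HTML snippet is invented:

# Hypothetical usage of get_tokens: strip the markup first, then keep only \w+ runs.
html = "<p>Breaking: <b>NLTK</b> tokenizes web pages, too!</p>"
print(get_tokens(html))
# expected: ['Breaking', 'NLTK', 'tokenizes', 'web', 'pages', 'too']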
Example 22
Project: visDial.pytorch Author: jiasenlu File: prepro.py License: MIT License | 5 votes |
def tokenize(sentence):
    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence)
            if i != '' and i != ' ' and i != '\n']
Example 23
Project: gutenberg Author: pgcorpus File: tokenizer.py License: GNU General Public License v3.0 | 5 votes |
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.

    Use NLTK's TreebankWordTokenizer.
    Note that we first split into sentences using NLTK's sent_tokenize.
    We additionally call a filtering function to remove unwanted tokens.

    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []

    ## split text into sentences
    sentences = sent_tokenize(text, language=language)

    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## lowercase the tokens
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens
Example 24
Project: SOQAL Author: husseinmozannar File: tfidf_reader.py License: MIT License | 5 votes |
def stem_string(self, str):
    str_tokens = self.tokenizer.tokenize(str)
    str_processed = ""
    for token in str_tokens:
        has_symbol = False
        for s in self.SYMBOLS:
            if s in token:
                has_symbol = True
                break
        if not has_symbol:
            str_processed += token + " " + self.stemmer.stem(token) + " "
    return str_processed
Example 25
Project: SOQAL Author: husseinmozannar File: slidingwindow_distance.py License: MIT License | 5 votes |
def tokenize_string(self, str):
    str_tokens = self.tokenizer.tokenize(str)
    tokens_stemmed = []
    for token in str_tokens:
        has_symbol = False
        for s in self.SYMBOLS:
            if s in token:
                has_symbol = True
                break
        if not has_symbol:
            tokens_stemmed.append(self.stemmer.stem(token))
    return tokens_stemmed
Example 26
Project: SOQAL Author: husseinmozannar File: TfidfRetriever.py License: MIT License | 5 votes |
def stem_string(self, str):
    str_tokens = self.tokenizer.tokenize(str)
    str_processed = ""
    for token in str_tokens:
        has_symbol = False
        for s in self.SYMBOLS:
            if s in token:
                has_symbol = True
                break
        if not has_symbol:
            str_processed += token + " "
    return str_processed
Example 27
Project: tokenquery Author: ramtinms File: tokenizer.py License: GNU General Public License v3.0 | 5 votes |
def tokenize(self, text):
    """
    tokenize text into a list of Token objects

    :param text: text to be tokenized (might contain several sentences)
    :type text: str
    :return: List of Token objects
    :rtype: list(Token)
    """
    tokens = []

    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)

    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)

    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        counter = 0
        for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
            new_token = Token(counter, token, span[0], span[1])
            counter += 1
            tokens.append(new_token)

    return tokens
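Example 27 builds Token objects from character spans rather than from the token strings alone. A minimal sketch of span_tokenize on its own, which yields (start, end) offsets into the original string (sample text invented):

# span_tokenize returns character offsets, which is what the Token objects above record.
from nltk.tokenize import WhitespaceTokenizer

text = "Offsets matter for highlighting."
for counter, (start, end) in enumerate(WhitespaceTokenizer().span_tokenize(text)):
    print(counter, (start, end), text[start:end])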
Example 28
Project: PageRank Author: ashkonf File: textrank.py License: Apache License 2.0 | 5 votes |
def __tokenizeWords(sentence):
    return nltk.tokenize.word_tokenize(sentence)


## tests ########################################################################################
Example 29
Project: yolo_v2 Author: rky0930 File: skip_thoughts_encoder.py License: Apache License 2.0 | 5 votes |
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
        tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized
Example 30
Project: Natural-Language-Processing-with-Python-Cookbook Author: PacktPublishing File: IdentifyingTopic.py License: MIT License | 5 votes |
def cleanDocuments(self):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    en_stop = set(stopwords.words('english'))
    self.cleaned = []
    for doc in self.documents:
        lowercase_doc = doc.lower()
        words = tokenizer.tokenize(lowercase_doc)
        non_stopped_words = [i for i in words if not i in en_stop]
        self.cleaned.append(non_stopped_words)
    print("INFO: Cleaning {} documents completed".format(len(self.documents)))
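The cleaning routine above keeps alphabetic tokens only and then drops English stopwords. A minimal standalone sketch of those two steps, assuming the stopwords corpus is available via nltk.download('stopwords') and using an invented document:

# Lowercase, keep alphabetic runs only, then remove English stopwords.
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
en_stop = set(stopwords.words('english'))

doc = "The 2 quick experiments were finished in 10 minutes!"
words = tokenizer.tokenize(doc.lower())
print([w for w in words if w not in en_stop])
# digits and punctuation never appear, because the pattern matches letters only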