Python nltk.word_tokenize() Examples
The following are 30 code examples showing how to use nltk.word_tokenize(). They are extracted from open source projects; the project, author, file, and license of each snippet are listed above it. You may also want to check out the other available functions and classes of the nltk module.
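Before the project examples, here is a minimal sketch of the basic call. It assumes NLTK is installed and that the Punkt tokenizer data has been downloaded (for example with nltk.download('punkt')); the exact tokens can vary slightly between NLTK versions.

import nltk
# nltk.download('punkt')  # one-time download of the tokenizer data

text = "Good muffins cost $3.88 in New York."
tokens = nltk.word_tokenize(text)
print(tokens)
# ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']

nltk.word_tokenize() first splits the text into sentences with the Punkt sentence tokenizer and then applies the Treebank word tokenizer to each sentence, which is why punctuation and clitics such as "n't" come back as separate tokens.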
Example 1
Project: JARVIS | Author: abhi007tyagi | File: math_expression_calculator.py | License: Apache License 2.0

def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text
Example 2
Project: dl-models-for-qa | Author: sujitpal | File: kaggle.py | License: Apache License 2.0

def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example 3
Project: ConvLab | Author: ConvLab | File: Mem2Seq.py | License: MIT License

def predict(self, query):
    usr = query
    print('Mem2Seq usr:', usr)
    # example input: 'please find a restaurant called nusha .'
    self.t += 1
    print('Mem2Seq turn:', self.t)
    usr = ' '.join(word_tokenize(usr.lower()))
    self.memory += generate_memory(usr, '$u', self.t)
    src_plain = (self.memory + [['$$$$'] * MEM_TOKEN_SIZE],)
    src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
    words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
    row = np.transpose(words)[0].tolist()
    if '<EOS>' in row:
        row = row[:row.index('<EOS>')]
    sys = ' '.join(row)
    sys = denormalize(sys)
    print('Mem2Seq sys:', sys)
    self.memory += generate_memory(sys, '$s', self.t)
    return sys
Example 4
Project: Machine-Translation | Author: foamliu | File: analyze_data.py | License: Apache License 2.0

def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example 5
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- | Author: PacktPublishing | File: nlp-5-document-classification.py | License: MIT License

def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
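As a rough usage sketch (not part of the original project), extract_features() above could be exercised on a tiny corpus. It assumes "from sklearn import feature_extraction", "import nltk", and that the NLTK 'punkt' and 'stopwords' data are available; the toy corpus is made up for illustration.

corpus = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog slept all day.",
    "A quick brown dog is a happy dog.",
]
tfidf = extract_features(corpus)
print(tfidf.shape)  # (3, n_features): one row of TF-IDF weights per document

Because min_df=2 is an integer, only unigrams and bigrams that occur in at least two of the documents survive vectorization.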
Example 6
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- | Author: PacktPublishing | File: nlp-6.4-tfidf-svm.py | License: MIT License

def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
Example 7
Project: Snowball | Author: davidsbatista | File: VectorSpaceModel.py | License: GNU General Public License v3.0

def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents red")
    print(len(self.dictionary), " unique tokens")
Example 8
Project: Document-Classifier-LSTM | Author: AlexGidiotis | File: data_prep.py | License: MIT License

def preprocess(text):
    min_length = 3
    text = re.sub('\d+', '#', text)
    text = re.sub('\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non characters
    p = re.compile('[a-zA-Z#]+')
    # Filter tokens (we do not remove stopwords)
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length
                                  and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]

    return filtered_tokens


# Modify this path
Example 9
Project: scattertext | Author: JasonKessler | File: phrasemachine.py | License: Apache License 2.0

def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)


# http://www.nltk.org/book/ch05.html
Example 10
Project: lisc | Author: lisc-tools | File: utils.py | License: Apache License 2.0

def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]

    return words_cleaned
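A hypothetical call, assuming "from nltk import word_tokenize" and "from nltk.corpus import stopwords" plus the corresponding NLTK data; the exact result depends on the stopword list shipped with your NLTK version.

print(convert_string("The Quick Brown Fox jumped over the lazy dog!"))
# roughly: ['quick', 'brown', 'fox', 'jumped', 'lazy', 'dog']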
Example 11
Project: Projects | Author: iamshang1 | File: sentiwordnet.py | License: MIT License

def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example 12
Project: Projects | Author: iamshang1 | File: combined.py | License: MIT License

def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example 13
Project: self-attentive-parser | Author: nikitakit | File: nltk_plugin.py | License: MIT License

def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
Example 14
Project: Text-Classification-Models-Pytorch | Author: AnubhavGupta3377 | File: utils.py | License: MIT License

def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int) : maximum sentence length (in words)
    Returns:
        X (np.matrix) : matrix of shape (max_sen_len, embedding_size) after zero padding
    '''
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32)
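A toy illustration (not from the original repository), assuming "import numpy as np" and "from nltk import word_tokenize"; the 300-dimensional embeddings are random placeholders standing in for real pretrained vectors.

word_embeddings = {w: np.random.rand(300) for w in ["hello", "world"]}
X = encode_text("hello world !", word_embeddings, max_sen_len=10)
print(X.shape)  # (10, 300): known words embedded, unknown tokens and padding are zero vectors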
Example 15
Project: Text-Classification-Models-Pytorch | Author: AnubhavGupta3377 | File: utils.py | License: MIT License

def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sent_len (int) : maximum sentence length (in words)
    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    return embed / count
Example 16
Project: BERT | Author: yyht | File: evaluate.py | License: Apache License 2.0

def mixed_segmentation(in_str, rm_punc=False):
    in_str = str(in_str).decode('utf-8').lower().strip()
    segs_out = []
    temp_str = ""
    sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
               ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
               '「', '」', '(', ')', '-', '~', '『', '』']
    for char in in_str:
        if rm_punc and char in sp_char:
            continue
        if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char:
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char

    # handling last part
Example 17
Project: Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs | Author: kamalkraj | File: ner.py | License: GNU General Public License v3.0

def predict(self, Sentence):
    Sentence = words = word_tokenize(Sentence)
    Sentence = self.addCharInformation(Sentence)
    Sentence = self.padding(self.createTensor(Sentence, self.word2Idx, self.case2Idx, self.char2Idx))
    tokens, casing, char = Sentence
    tokens = np.asarray([tokens])
    casing = np.asarray([casing])
    char = np.asarray([char])
    pred = self.model.predict([tokens, casing, char], verbose=False)[0]
    pred = pred.argmax(axis=-1)
    pred = [self.idx2Label[x].strip() for x in pred]
    return list(zip(words, pred))
Example 18
Project: botbuilder-python | Author: microsoft | File: bidaf_model_runtime.py | License: MIT License

def _preprocess(text: str) -> Tuple[np.ndarray, np.ndarray]:
    tokens = word_tokenize(text)
    # split into lower-case word tokens, in numpy array with shape of (seq, 1)
    words = np.asarray([w.lower() for w in tokens]).reshape(-1, 1)
    # split words into chars, in numpy array with shape of (seq, 1, 1, 16)
    chars = [[c for c in t][:16] for t in tokens]
    chars = [cs + [""] * (16 - len(cs)) for cs in chars]
    chars = np.asarray(chars).reshape(-1, 1, 1, 16)
    return words, chars
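A rough illustration using the same imports as above (numpy as np, typing.Tuple, nltk's word_tokenize); the question string is made up.

words, chars = _preprocess("What color is the sky?")
print(words.shape)  # (6, 1)        -- one lower-cased token per row
print(chars.shape)  # (6, 1, 1, 16) -- each token truncated or padded to 16 characters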
Example 19
Project: razzy-spinner | Author: rafasashi | File: textcat.py | License: GNU General Public License v3.0

def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist

    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)

    fingerprint = FreqDist()

    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1

    return fingerprint
Example 20
Project: Machine-Translation | Author: foamliu | File: pre_process.py | License: Apache License 2.0

def build_wordmap_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    word_freq = Counter()

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
        # Update word frequency
        word_freq.update(tokens)

    # Create word map
    # words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    words = word_freq.most_common(input_lang_vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<start>'] = 1
    word_map['<end>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:10])

    with open('data/WORDMAP_en.json', 'w') as file:
        json.dump(word_map, file, indent=4)
Example 21
Project: Machine-Translation | Author: foamliu | File: pre_process.py | License: Apache License 2.0

def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder, train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder, train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder, valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder, valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))

            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
            output_en = encode_text(word_map_en, tokens)

            if len(input_zh) <= max_len and len(output_en) <= max_len \
                    and UNK_token not in input_zh and UNK_token not in output_en:
                samples.append({'input': list(input_zh), 'output': list(output_en)})

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage, filename))
Example 22
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- | Author: PacktPublishing | File: nlp-6.1-nlp-pipeline.py | License: MIT License

def tokenize_words(targets):
    while True:
        sentence = (yield)
        words = nltk.word_tokenize(sentence)
        for target in targets:
            target.send(words)
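This stage is a coroutine: it must be primed with next() before values can be sent into it. A minimal driver might look like the following sketch; the printer sink is hypothetical and only stands in for the downstream stages of the original pipeline ("import nltk" and the Punkt data are assumed).

def printer():
    while True:
        words = (yield)
        print(words)

sink = printer()
next(sink)                        # prime the sink coroutine
pipeline = tokenize_words([sink])
next(pipeline)                    # prime the tokenizer stage
pipeline.send("NLTK makes tokenization easy.")
# -> ['NLTK', 'makes', 'tokenization', 'easy', '.']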
Example 23
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- | Author: PacktPublishing | File: nlp-3-sentiment-analysis.py | License: MIT License

def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency, i.e. the word must appear more than once.
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
Example 24
Project: Hands-on-NLP-with-NLTK-and-scikit-learn- | Author: PacktPublishing | File: nlp-4-ngrams.py | License: MIT License

def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    sa_stop_words = nltk.corpus.stopwords.words("english")

    # words that might invert a sentence's meaning
    white_list = [
        'what', 'but', 'if', 'because', 'as', 'until', 'against',
        'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should']

    # take these out of the standard NLTK stop word list
    sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=sa_stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
Example 25
Project: NSCL-PyTorch-Release | Author: vacancy | File: datasets.py | License: MIT License

def _get_metainfo(self, index):
    question = gdef.translate_question(self.questions[index])
    scene = gdef.translate_scene(self.scenes[question['image_index']])
    question['scene'] = scene

    question['image_index'] = question['image_index']
    question['image_filename'] = gdef.get_image_filename(scene)
    question['question_index'] = index
    question['question_tokenized'] = nltk.word_tokenize(question['question'])

    # program section
    has_program = False
    if 'program_nsclseq' in question:
        question['program_raw'] = question['program_nsclseq']
        question['program_seq'] = question['program_nsclseq']
        has_program = True
    elif 'program' in question:
        question['program_raw'] = question['program']
        question['program_seq'] = gdef.program_to_nsclseq(question['program'], question)
        has_program = True

    if has_program:
        question['program_tree'] = nsclseq_to_nscltree(question['program_seq'])
        question['program_qsseq'] = nsclseq_to_nsclqsseq(question['program_seq'])
        question['program_qstree'] = nscltree_to_nsclqstree(question['program_tree'])
        question['question_type'] = question['program_seq'][-1]['op']
    else:
        question['question_type'] = None

    return question
Example 26
Project: sato | Author: megagonlabs | File: train_LDA.py | License: Apache License 2.0

def clean(s):
    tokens = nltk.word_tokenize(s.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return tokens_stemmed
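A hypothetical call, assuming "import nltk", "from nltk.corpus import stopwords", "from nltk.stem import PorterStemmer", and the required NLTK data; stemming and stopword output can differ slightly across NLTK versions.

print(clean("The cats were running quickly"))
# roughly: ['cat', 'run', 'quickli']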
Example 27
Project: mipsqa | Author: google | File: squad_prepro.py | License: Apache License 2.0

def _word_tokenize(text):
    # TODO(seominjoon): Consider using Stanford Tokenizer or othe tokenizers.
    return [
        word.replace('``', '"').replace("''", '"')
        for word in nltk.word_tokenize(text)
    ]
Example 28
Project: qb | Author: Pinafore | File: preprocess.py | License: MIT License

def tokenize_question(text: str) -> List[str]:
    return word_tokenize(clean_question(text))
Example 29
Project: cs224n-win18-squad | Author: abisee | File: squad_preprocess.py | License: Apache License 2.0

def tokenize(sequence):
    tokens = [token.replace("``", '"').replace("''", '"').lower() for token in nltk.word_tokenize(sequence)]
    return tokens
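A quick check of the quote handling (the input sentence is made up): NLTK's Treebank tokenizer converts opening and closing double quotes to `` and '', and this function maps both back to a plain " while lower-casing every token.

print(tokenize('He said "hello" to her.'))
# ['he', 'said', '"', 'hello', '"', 'to', 'her', '.']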
Example 30
Project: Snowball | Author: davidsbatista | File: Snowball.py | License: GNU General Public License v3.0

def generate_tuples(self, sentences_file):
    """
    Generate tuples instances from a text file with sentences
    where named entities are already tagged
    """
    try:
        os.path.isfile("processed_tuples.pkl")
        f = open("processed_tuples.pkl", "r")
        print("\nLoading processed tuples from disk...")
        self.processed_tuples = pickle.load(f)
        f.close()
        print(len(self.processed_tuples), "tuples loaded")

    except IOError:
        print("\nGenerating relationship instances from sentences")
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

            sentence = Sentence(line.strip(),
                                self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size)

            for rel in sentence.relationships:
                if rel.arg1type == self.config.e1_type and rel.arg2type == self.config.e2_type:
                    bef_tokens = word_tokenize(rel.before)
                    bet_tokens = word_tokenize(rel.between)
                    aft_tokens = word_tokenize(rel.after)
                    if not (bef_tokens == 0 and bet_tokens == 0 and aft_tokens == 0):
                        t = Tuple(rel.ent1, rel.ent2, rel.sentence, rel.before, rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)
        f_sentences.close()

        print("\n", len(self.processed_tuples), "relationships generated")
        print("Dumping relationships to file")
        f = open("processed_tuples.pkl", "wb")
        pickle.dump(self.processed_tuples, f)
        f.close()