Python nltk.word_tokenize() Examples

The following code examples show how to use nltk.word_tokenize(). They are drawn from open source Python projects.
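
A minimal usage sketch (not tied to any of the projects below), assuming the Punkt tokenizer models have been downloaded:

import nltk
# nltk.download('punkt')  # one-time download of the Punkt tokenizer models

tokens = nltk.word_tokenize("They'll tokenize this, won't they?")
print(tokens)
# typically: ['They', "'ll", 'tokenize', 'this', ',', 'wo', "n't", 'they', '?']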

Example 1
Project: Snowball   Author: davidsbatista   File: VectorSpaceModel.py    GNU General Public License v3.0
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens") 
Example 2
Project: domain_discovery_API   Author: VIDA-NYU   File: preprocess.py    GNU General Public License v3.0
def preprocess(self,text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2]
        
        tc = TextCollection([text])
        words = list(set(tc))
        
        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf 
Example 3
Project: ConvLab   Author: ConvLab   File: Mem2Seq.py    MIT License
def predict(self, query):
        usr = query
        print('Mem2Seq usr:', usr)
        #example input: 'please find a restaurant called nusha .'
        self.t += 1
        print('Mem2Seq turn:', self.t)
        usr = ' '.join(word_tokenize(usr.lower()))
        self.memory += generate_memory(usr, '$u', self.t)
        src_plain = (self.memory+[['$$$$']*MEM_TOKEN_SIZE],)
        src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
        words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
        row = np.transpose(words)[0].tolist()
        if '<EOS>' in row:
            row = row[:row.index('<EOS>')]
        sys = ' '.join(row)
        sys = denormalize(sys)
        print('Mem2Seq sys:', sys)
        self.memory += generate_memory(sys, '$s', self.t)
        return sys 
Example 4
Project: VMED   Author: thaihungle   File: data_util.py    MIT License
def load_lines_from_file(fpath, str2tok):
    all_sens=[]
    with open(fpath) as f:
        for line in f:
            sen=[]
            line = ''.join([ch if ch in EN_WHITELIST else ' ' for ch in line.lower()])
            tokens = nltk.word_tokenize(line)
            for tok in tokens:
                if tok in str2tok:
                    sen.append(str2tok[tok])
                else:
                    sen.append(str2tok['<unknown>'])
            all_sens.append([sen,[1]*10])
    # print(all_sens)
    # raise False
    return all_sens 
Example 5
Project: Machine-Translation   Author: foamliu   File: analyze_data.py    Apache License 2.0
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example 6
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-5-document-classification.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
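
The CountVectorizer-followed-by-TfidfTransformer pattern above can also be written with scikit-learn's TfidfVectorizer, which combines the two steps; a roughly equivalent sketch under the same assumptions (same imports and stop-word list):

from sklearn import feature_extraction
import nltk

def extract_features_tfidf(corpus, stop_words):
    '''Roughly equivalent TF-IDF extraction in a single vectorizer'''
    vectorizer = feature_extraction.text.TfidfVectorizer(
        lowercase=True,
        tokenizer=nltk.word_tokenize,
        min_df=2,
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    return vectorizer.fit_transform(corpus)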
Example 7
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.4-tfidf-svm.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 8
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 9
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 10
Project: formality_emnlp19   Author: jimth001   File: multi_process_tokenizer.py    MIT License
def tokenizer(strings:'list of string',type:str='word',join:bool=True)->'list of results':
    assert type=='word' or type=='sen'
    results=[]
    if type=='word':
        for s in strings:
            if join:
                results.append(' '.join(nltk.word_tokenize(s)))
            else:
                results.append(nltk.word_tokenize(s))
    else:
        for s in strings:
            if join:
                results.append(' '.join(nltk.sent_tokenize(s)))
            else:
                results.append(nltk.sent_tokenize(s))
    return results 
Example 11
Project: formality_emnlp19   Author: jimth001   File: classifier_em.py    MIT License
def preprocess(informal_src_list,formal_src_list,embedding_path,output_path=None,shuffle=True):
    vectors,vocab_hash=embedding_api.load_word_embedding(embedding_path)
    all_data=[]
    for src in informal_src_list:
        with open(src,'r',encoding='utf-8') as f:
            for line in f:
                d=Data(nltk.word_tokenize(line.strip()), 0, line.strip())
                d.str2index(vocab_hash,with_unk=False)
                all_data.append(d)
    for src in formal_src_list:
        with open(src,'r',encoding='utf-8') as f:
            for line in f:
                d=Data(nltk.word_tokenize(line.strip()), 1, line.strip())
                d.str2index(vocab_hash,with_unk=False)
                all_data.append(d)
    if shuffle:
        random.shuffle(all_data)
    if output_path is not None:
        pickle.dump(all_data,open(output_path,'wb'),protocol=True)
    return all_data 
Example 12
Project: Health-Checker   Author: KriAga   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 13
Project: branchLSTM   Author: kochkinaelena   File: preprocessing.py    MIT License
def str_to_wordlist(tweettext, tweet, remove_stopwords=False):

    #  Remove non-letters
    # NOTE: Is it helpful or not to remove non-letters?
    # str_text = re.sub("[^a-zA-Z]"," ", str_text)
    tweettext = cleantweet(tweettext, tweet)
    str_text = re.sub("[^a-zA-Z]", " ", tweettext)
    # Convert words to lower case and split them
    # words = str_text.lower().split()
    words = nltk.word_tokenize(str_text.lower())
    # Optionally remove stop words (false by default)
    # NOTE: generic list of stop words, should i remove them or not?
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # 5. Return a list of words
    return(words)
#%%
# Turn tweet into average of word vectors 
Example 14
Project: qb   Author: Pinafore   File: jmlr.py    MIT License
def qanta_2012_stats():
    """
    This computes and prints dataset statistics for prior versions from EMNLP 2012.
    Published results use private NAQT data, these stats are computed using only public data.
    Use nltk for word tokenization to be consistent with prior analysis.
    Use spacy for sentence tokenization to be consistent with qanta dataset preprocessing.
    (We don't use word tokenizations in dataset preprocessing, we consider it a model detail.)
    """
    with open('data/external/emnlp_2012_questions.csv') as f:
        questions_2012 = list(csv.reader(f))

    eprint('N EMNLP 2012 Questions', len(questions_2012))
    questions_2012 = [q[4] for q in questions_2012]
    tokenized_2012 = pseq(questions_2012).map(nltk.word_tokenize).list()
    n_tokens_2012 = sum(len(q) for q in tokenized_2012)
    eprint('N EMNLP 2012 Tokens', n_tokens_2012)
    n_sentences = [len(nlp(q)) for q in tqdm(questions_2012)]
    eprint('N EMNLP 2012 Sentences', sum(n_sentences)) 
Example 15
Project: cmu-ammml-project   Author: jayanthkoushik   File: extract_transc_feats.py    MIT License
def get_toks(fname):
    with open(fname) as f:
        transc_str = f.read()
    transc_str = transc_str.replace("\r\n", " ")
    transc_str = transc_str.replace("\r\r", " ")
    transc_str = transc_str.replace("\n", " ")
    transc_str = transc_str.replace("-", " ")

    # Convert the transcription to lower case, but make
    # filler markers (like "umm", "uhh" etc.) upper case.
    transc_str = transc_str.lower()
    m = re.findall("[{(]([^\s]*?)[)}]", transc_str)
    for word in m:
        transc_str = re.sub("[({{]{}[)}}]".format(word),
                            " " + word.upper() + " ", transc_str)

    # Construct the feature vector.
    return nltk.word_tokenize(transc_str) 
Example 16
Project: who-are-you   Author: PawelPamula   File: __init__.py    MIT License
def tweets2tags(text, hasht):
    tx=[]
    for line in text:
        tokens=word_tokenize(line)
        tags=nltk.pos_tag(tokens)
        text= [s[0] for s in tags if s[1].startswith('NN')]
        tx.extend(text)
    vectorizer = TfidfVectorizer(stop_words="english",min_df=1)
    X = vectorizer.fit_transform(tx)
    idf = vectorizer.idf_
    size=len(idf)
    idf[:size//5]=2
    idf[size//5:2*size//5]=3
    idf[2*size//5:3*size//5]=4
    idf[3*size//5:4*size//5]=5
    idf[4*size//5:]=7
    tags =  dict(zip(vectorizer.get_feature_names(), idf))
    for i in hasht:
        tags[i] = 6
    return tags 
Example 17
Project: humanitiesTutorial   Author: vierth   File: 20_corpusrep5_fast.py    Apache License 2.0
def tokenizeText(text):
    words = nltk.word_tokenize(text)
    filtered = [word for word in words if word.isalnum()]
    return filtered

# Create vectorizer object. Include use_idf=False to use frequencies 
Example 18
Project: Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs   Author: kamalkraj   File: ner.py    GNU General Public License v3.0
def predict(self,Sentence):
        Sentence = words =  word_tokenize(Sentence)
        Sentence = self.addCharInformation(Sentence)
        Sentence = self.padding(self.createTensor(Sentence,self.word2Idx,self.case2Idx,self.char2Idx))
        tokens, casing,char = Sentence
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = self.model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1)
        pred = [self.idx2Label[x].strip() for x in pred]
        return list(zip(words,pred)) 
Example 19
Project: weather_report   Author: deniederhut   File: classifiers.py    BSD 2-Clause "Simplified" License
def classify(self, text):
        """Count number/kind of emotion terms"""
        if type(text) == str:
            text = [text]
        if type(text) == tuple:
            text = list(text)
        for item in text:
            self.items += 1
            self.terms += len(item.split())
            for term in word_tokenize(item):
                for syn in wn.synsets(term):
                    for path in syn.hypernym_paths():
                        if self.emotion in path:
                            self.update_from_path(path)
        return self 
Example 20
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_positive_reviews_without_sw(self):
        positive_array_without_sw=[]
        for review in self.get_positive_reviews(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            positive_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return positive_array_without_sw 
Example 21
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_negative_reviews_without_sw(self):
        negative_array_without_sw=[]
        for review in self.get_negative_reviews(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            negative_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return negative_array_without_sw 
Example 22
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_positive_array_without_sw(self):
        test_positive_array_without_sw=[]
        for review in self.get_test_positive_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            test_positive_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return test_positive_array_without_sw 
Example 23
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_negative_array_without_sw(self):
        test_negative_array_without_sw = []
        for review in self.get_test_negative_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            test_negative_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return test_negative_array_without_sw 
Example 24
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_positive_reviews_stemmed_without_sw(self):
        positive_array_stemmed_without_sw = []
        stemmer = ISRIStemmer()
        for review in self.get_positive_reviews(self):
            review_words_stemmed_without_sw=[]
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw=[]
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            positive_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return positive_array_stemmed_without_sw 
Example 25
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_positive_array_stemmed_without_sw(self):
        stemmer = ISRIStemmer()
        test_positive_array_stemmed_without_sw=[]
        review_words_stemmed_without_sw=[]
        for review in self.get_test_positive_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw = []
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            test_positive_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return test_positive_array_stemmed_without_sw 
Example 26
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_negative_array_stemmed_without_sw(self):
        stemmer = ISRIStemmer()
        test_negative_array_stemmed_without_sw = []
        review_words_stemmed_without_sw = []
        for review in self.get_test_negative_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw = []
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            test_negative_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return test_negative_array_stemmed_without_sw 
Example 27
Project: semantic-pdf-splitter   Author: MtnFranke   File: extractor.py    MIT License
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) 
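
stem_tokens and remove_punctuation_map are defined elsewhere in that project; a plausible sketch of such helpers (an assumption for illustration, not the project's actual code), using NLTK's Porter stemmer:

import string
import nltk

# Assumed helpers, for illustration only.
remove_punctuation_map = dict((ord(ch), None) for ch in string.punctuation)
stemmer = nltk.stem.PorterStemmer()

def stem_tokens(tokens):
    # Reduce each token to its stem, e.g. 'running' -> 'run'
    return [stemmer.stem(token) for token in tokens]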
Example 28
Project: botbuilder-python   Author: microsoft   File: bidaf_model_runtime.py    MIT License
def _preprocess(text: str) -> Tuple[np.ndarray, np.ndarray]:
        tokens = word_tokenize(text)
        # split into lower-case word tokens, in numpy array with shape of (seq, 1)
        words = np.asarray([w.lower() for w in tokens]).reshape(-1, 1)
        # split words into chars, in numpy array with shape of (seq, 1, 1, 16)
        chars = [[c for c in t][:16] for t in tokens]
        chars = [cs + [""] * (16 - len(cs)) for cs in chars]
        chars = np.asarray(chars).reshape(-1, 1, 1, 16)
        return words, chars 
Example 29
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def read_corpus():
    """open corpus file and create word and sentence tokens
    corpus file is the base brain for Tobot which contains words/sentences
    used by nltk and sklearn to help Tobot respond to questions"""
    f = open('tobot_corpus.txt', 'r', errors='ignore')
    raw = f.read()
    f.close()
    raw = raw.lower()
    #nltk.download('punkt')
    #nltk.download('wordnet')
    #nltk.download('stopwords')
    sent_tokens = nltk.sent_tokenize(raw)
    word_tokens = nltk.word_tokenize(raw)

    return sent_tokens, word_tokens 
Example 30
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def is_question(text):
    sentences = get_sentences(text)
    isquestion = False
    for sent in sentences:
        words = nltk.word_tokenize(sent)
        if words[-1] == '?' or words[0].lower() in QUESTION_START_WORDS:
            isquestion = True
            break
    return isquestion

# end language function


# text cleaning functions 
Example 31
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def get_words(text):
    """Retrieve the words present in a given string of text.
    Filter out the most common and stop words.
    The return value is a list of tuples where the first member is a lowercase word,
    and the second member the number of times it is present in the text."""
    wordsList = nltk.word_tokenize(text)
    # remove most common words
    fdist = nltk.probability.FreqDist(wordsList)
    most_common_words = fdist.most_common(2)
    for word, count in most_common_words:
        word = word.lower()
        if word in wordsList:
            wordsList.remove(word)
    # remove stop words
    stop_words = nltk.corpus.stopwords.words("english")
    filtered_wordsList = []
    for word in wordsList:
        word = word.lower()
        if word not in stop_words:
            filtered_wordsList.append(word)
    wordsList = filtered_wordsList[:]
    del filtered_wordsList[:]
    # perform lemmatization
    text = " ".join(wordsList)
    filtered_wordsList = lem_normalize(text)
    return Counter(filtered_wordsList).items() 
Example 32
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def clean_words(text):
    """Simple text cleaner that removes all tokens that are not alphabetic
    Returns list of clean words."""
    text = text.split('.')
    text = '. '.join(text).strip()
    words = nltk.word_tokenize(text)
    words_clean = [word for word in words if word.isalpha()]
    return words_clean 
Example 33
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def lem_normalize(text):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    return lem_tokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

# end text cleaning


# database functions 
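
The lem_tokens helper called in lem_normalize above is defined elsewhere in airbnb_bot.py; a minimal sketch of what such a helper can look like (an assumption, not the project's code), using NLTK's WordNet lemmatizer:

import nltk
# nltk.download('wordnet')  # required once for the lemmatizer

lemmer = nltk.stem.WordNetLemmatizer()

def lem_tokens(tokens):
    # Assumed helper: map each token to its lemma, e.g. 'cats' -> 'cat'
    return [lemmer.lemmatize(token) for token in tokens]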
Example 34
Project: Pesquisas   Author: danilopcarlotti   File: statistical_analysis.py    Apache License 2.0
def ngramFreq(self,text,n):
		''' Returns a Counter object with the frequency of each ngram in the text '''
		token = nltk.word_tokenize(text)
		Ngram = ngrams(token,n)
		return Counter(Ngram) 
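
The method does not use self, so the same logic works standalone; a brief sketch, assuming ngrams is nltk.util.ngrams and Counter is collections.Counter, as in that project's imports:

from collections import Counter
import nltk
from nltk.util import ngrams

tokens = nltk.word_tokenize("to be or not to be")
print(Counter(ngrams(tokens, 2)))
# e.g. Counter({('to', 'be'): 2, ('be', 'or'): 1, ('or', 'not'): 1, ('not', 'to'): 1})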
Example 35
Project: cdc   Author: ckbjimmy   File: MLPipeline.py    MIT License
def tokenize(text):
    import re
    tokens = word_tokenize(text)
    output = [i for i in tokens if i not in string.punctuation and not re.match("^[0-9.].*$", i) and len(i) > 2]
    output = stem_tokens(output, stemmer)
    return output 
Example 36
Project: razzy-spinner   Author: rafasashi   File: textcat.py    GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 37
Project: RTX   Author: RTXteam   File: Question.py    MIT License
def find_edge_type(string):
	"""
	Extract edge type from string
	:param string: input string (chunk of text)
	:param edge_types: edge types in the KG (see dumpdata.py)
	:return: one of the edge types
	"""
	p = nltk.stem.snowball.SnowballStemmer("english")
	st_words = set(stopwords.words('english'))
	res = None
	# remove underscores
	edge_types_space = []
	for et in edge_types:
		edge_types_space.append(et.replace('_', ' '))
	# standardize the string by making it lowercase and removing stop words
	query = string.lower()
	query_tokens = nltk.word_tokenize(query, "english")
	query_no_stop = [w for w in query_tokens if w not in st_words]
	query_clean = ""
	for word in query_no_stop:
		query_clean += p.stem(word) + " "
	# see if it matches any of the standardized edge types
	for i in range(len(edge_types_space)):
		et = edge_types_space[i]
		et_tokens = nltk.word_tokenize(et)
		et_no_stop = [w for w in et_tokens if w not in st_words]
		et_clean = ""
		for word in et_no_stop:
			if word == "assoc":
				word = "associated"
			et_clean += p.stem(word) + " "
		if query_clean == et_clean:
			res = edge_types[i]
	return res

################################################
# The Question class to store the question templates
################################################ 
Example 38
Project: Machine-Translation   Author: foamliu   File: pre_process.py    Apache License 2.0
def build_wordmap_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    word_freq = Counter()

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
        # Update word frequency
        word_freq.update(tokens)

    # Create word map
    # words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    words = word_freq.most_common(input_lang_vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<start>'] = 1
    word_map['<end>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:10])

    with open('data/WORDMAP_en.json', 'w') as file:
        json.dump(word_map, file, indent=4) 
Example 39
Project: Machine-Translation   Author: foamliu   File: pre_process.py    Apache License 2.0
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder, train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder, train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder, valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder, valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))

            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
            output_en = encode_text(word_map_en, tokens)

            if len(input_zh) <= max_len and len(
                    output_en) <= max_len and UNK_token not in input_zh and UNK_token not in output_en:
                samples.append({'input': list(input_zh), 'output': list(output_en)})

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage, filename)) 
Example 40
Project: chattR   Author: patrickstocklin   File: np_extractors.py    GNU General Public License v2.0
def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens 
Example 41
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    MIT License
def tokenize_words(targets):
    while True:
        sentence = (yield)
        words = nltk.word_tokenize(sentence)
        for target in targets:
            target.send(words) 
Example 42
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-3-sentiment-analysis.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency, i.e. the word must appear more than once.
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 43
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-4-ngrams.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    sa_stop_words = nltk.corpus.stopwords.words("english")

    # words that might invert a sentence's meaning
    white_list = [
        'what', 'but', 'if', 'because', 'as', 'until', 'against',
        'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should']

    # take these out of the standard NLTK stop word list
    sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=sa_stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 44
Project: NSCL-PyTorch-Release   Author: vacancy   File: datasets.py    MIT License
def _get_metainfo(self, index):
        question = gdef.translate_question(self.questions[index])
        scene = gdef.translate_scene(self.scenes[question['image_index']])
        question['scene'] = scene

        question['image_index'] = question['image_index']
        question['image_filename'] = gdef.get_image_filename(scene)
        question['question_index'] = index
        question['question_tokenized'] = nltk.word_tokenize(question['question'])

        # program section
        has_program = False
        if 'program_nsclseq' in question:
            question['program_raw'] = question['program_nsclseq']
            question['program_seq'] = question['program_nsclseq']
            has_program = True
        elif 'program' in question:
            question['program_raw'] = question['program']
            question['program_seq'] = gdef.program_to_nsclseq(question['program'], question)
            has_program = True

        if has_program:
            question['program_tree'] = nsclseq_to_nscltree(question['program_seq'])
            question['program_qsseq'] = nsclseq_to_nsclqsseq(question['program_seq'])
            question['program_qstree'] = nscltree_to_nsclqstree(question['program_tree'])
            question['question_type'] = question['program_seq'][-1]['op']
        else:
            question['question_type'] = None

        return question 
Example 45
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_query_file(self):
        source=[]
        with open(self.query_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                source.append(self.sentence2indices(words))
        return source 
Example 46
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_reply_file(self):
        source=[]
        with open(self.reply_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                source.append(self.sentence2indices(words))
        return source 
Example 47
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_target_file(self):
        target_output = []
        target_input = []
        with open(self.target_file, 'r', encoding='utf-8') as f:
            for line in f:
                words = nltk.word_tokenize(line.strip())
                target_input.append(self.sentence2indices(words,with_sos=True))
                target_output.append(self.sentence2indices(words,with_eos=True))
        return target_input,target_output 
Example 48
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq_with_attention.py    MIT License
def process_query_file(self,max_len):
        source=[]
        total_num = 0
        cut_num = 0
        with open(self.query_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                if len(words)>max_len:
                    words=words[:max_len]
                    cut_num += 1
                total_num+=1
                source.append(self.sentence2indices(words))
        print('Total sentences:', total_num)
        print('Truncated sentences:', cut_num)
        return source 
Example 49
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq_with_attention.py    MIT License
def process_reply_file(self,max_len):
        source=[]
        total_num = 0
        cut_num = 0
        with open(self.reply_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                if len(words)>max_len:
                    words=words[:max_len]
                    cut_num += 1
                total_num += 1
                source.append(self.sentence2indices(words))
        print('Total sentences:', total_num)
        print('Truncated sentences:', cut_num)
        return source 
Example 50
Project: social_mind   Author: byeongkyu   File: train_classifier.py    Apache License 2.0
def dialogue_act_features(sentence):
    features = {}
    sentence_filtered = re.sub("[\'.,#!?:-]", '', sentence)

    for word in word_tokenize(sentence_filtered):
        if word not in STOPWORDS:
            features['contains({})'.format(word.lower())] = True
    return features 
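
A brief usage sketch for the function above (STOPWORDS is defined elsewhere in the project; here it is assumed to be NLTK's English stop-word list):

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))  # assumed definition
print(dialogue_act_features("What time is it?"))
# e.g. {'contains(what)': True, 'contains(time)': True}
# ('is' and 'it' are dropped as stop words; 'What' survives because the
#  stop-word check happens before lower-casing)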
Example 51
Project: social_mind   Author: byeongkyu   File: sentence_classifier.py    Apache License 2.0
def dialogue_act_features(sentence):
    features = {}
    sentence_filtered = re.sub("[\'.,#!?:-]", '', sentence)

    for word in word_tokenize(sentence_filtered):
        if word not in STOPWORDS:
            features['contains({})'.format(word.lower())] = True
    return features 
Example 52
Project: formality_emnlp19   Author: jimth001   File: pinc.py    MIT License
def load_file_and_tokenize(file):
    sens=[]
    with open(file,'r',encoding='utf-8') as f:
        for line in f:
            sens.append(nltk.word_tokenize(line.strip()))
    return sens 
Example 53
Project: formality_emnlp19   Author: jimth001   File: tools.py    MIT License
def tokenizer(sentence,join=False,only_split=True):
    if only_split:
        if join:
            return sentence
        else:
            return sentence.split()
    else:
        if join:
            return ' '.join(nltk.word_tokenize(sentence))
        else:
            return nltk.word_tokenize(sentence) 
Example 54
Project: formality_emnlp19   Author: jimth001   File: tools.py    MIT License
def break_sen_and_tokernize(para,break_sen=False):
    if break_sen:
        return nltk.word_tokenize(' '.join(break_sentence(para)))
    else:
        return nltk.word_tokenize(para) 
Example 55
Project: formality_emnlp19   Author: jimth001   File: tokenizer.py    MIT License
def file_tokenize(input,output):
    with open(input,'r',encoding='utf-8') as f:
        with open(output,'w',encoding='utf-8') as fw:
            for line in f:
                fw.write(' '.join(nltk.word_tokenize(line.strip()))+'\n') 
Example 56
Project: RottenCrawler   Author: kevin940726   File: main.py    MIT License
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems 
Example 57
Project: mipsqa   Author: google   File: squad_prepro.py    Apache License 2.0
def _word_tokenize(text):
  # TODO(seominjoon): Consider using Stanford Tokenizer or othe tokenizers.
  return [
      word.replace('``', '"').replace("''", '"')
      for word in nltk.word_tokenize(text)
  ] 
Example 58
Project: ChatBot   Author: subpath   File: Chatbot.py    MIT License
def reply(self, input_text):
        """
        Takes input_text and returns the predicted response
        :param input_text: string
        :return: predicted_text: string
        """
        if input_text == self.ultimate_question:
            return '42'
        input_seq = []
        input_wids = []
        for word in nltk.word_tokenize(input_text.lower()):
            idx = 1
            if word in self.input_word2idx:
                idx = self.input_word2idx[word]
            input_wids.append(idx)
        input_seq.append(input_wids)
        input_seq = pad_sequences(input_seq, self.max_encoder_seq_length)
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            if sample_word != 'START' and sample_word != 'END':
                target_text += ' ' + sample_word

            if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip().replace('UNK', '') 
Example 59
Project: qb   Author: Pinafore   File: jmlr.py    MIT License
def qanta_2014_stats():
    """
    This computes and prints dataset statistics for prior versions from EMNLP 2014.
    Published results use private NAQT data, these stats are computed using only public data.
    Use nltk for word tokenization to be consistent with prior analysis.
    Use spacy for sentence tokenization to be consistent with qanta dataset preprocessing.
    (We don't use word tokenizations in dataset preprocessing, we consider it a model detail.)
    """
    questions_2014 = pseq.jsonl('data/external/emnlp_2014_questions.jsonl').cache()
    eprint('N EMNLP 2014 Questions', questions_2014.len())
    n_tokens_2014 = questions_2014.map(lambda q: q['question']).map(nltk.word_tokenize).map(len).sum()
    eprint('N EMNLP 2014 Tokens', n_tokens_2014)
    n_sentences = [len(nlp(q['question'])) for q in tqdm(questions_2014.list())]
    eprint('N EMNLP 2014 Sentences', sum(n_sentences)) 
Example 60
Project: qb   Author: Pinafore   File: preprocess.py    MIT License
def tokenize_question(text: str) -> List[str]:
    return word_tokenize(clean_question(text)) 
Example 61
Project: qb   Author: Pinafore   File: dataset.py    MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
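
A brief usage sketch (regex_pattern is defined elsewhere in the qb code base, so pattern stripping is disabled here):

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize('name this capital of France'))
# ['name', 'this', 'capital', 'of', 'France',
#  'name++this', 'this++capital', 'capital++of', 'of++France']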
Example 62
Project: box_embeddings   Author: Lorraine333   File: DataLoader.py    Apache License 2.0
def read_entail(self, s, word_to_index):
        s1 = []
        s2 = []
        labels = []
        l1 = []
        l2 = []
        max_length = 0
        for line in open(s):
            if line.startswith("gold_label") or line.startswith("-"):
                continue
            tokens = line.strip().split("\t")
            t1 = []
            for a in word_tokenize(tokens[5]):
                a = a.lower()
                if a in word_to_index:
                    t1.append(word_to_index[a])
                else:
                    t1.append(word_to_index['oov'])
            s1.append(t1)
            t2 = []
            for a in word_tokenize(tokens[6]):
                a = a.lower()
                if a in word_to_index:
                    t2.append(word_to_index[a])
                else:
                    t2.append(word_to_index['oov'])
            s2.append(t2)
            l1.append(len(t1))
            l2.append(len(t2))
            labels.append(label_map[tokens[0]])
            max_length = max(max_length, len(t1))
            max_length = max(max_length, len(t2))
        return s1, s2, np.array(labels), np.array(l1), np.array(l2), max_length

    # Read predicted probabilities for pairs of phrases/sentences 
Example 63
Project: cs224n-win18-squad   Author: abisee   File: squad_preprocess.py    Apache License 2.0
def tokenize(sequence):
    tokens = [token.replace("``", '"').replace("''", '"').lower() for token in nltk.word_tokenize(sequence)]
    return tokens 
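
The ``/'' replacement undoes the Treebank-style quote conversion that nltk.word_tokenize applies to straight double quotes; a brief usage sketch:

print(tokenize('He said "Hello" to me.'))
# e.g. ['he', 'said', '"', 'hello', '"', 'to', 'me', '.']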
Example 64
Project: Snowball   Author: davidsbatista   File: Snowball.py    GNU General Public License v3.0
def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences
        where named entities are already tagged
        """
        try:
            os.path.isfile("processed_tuples.pkl")
            f = open("processed_tuples.pkl", "r")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        except IOError:
            print("\nGenerating relationship instances from sentences")
            f_sentences = codecs.open(sentences_file, encoding='utf-8')
            count = 0
            for line in f_sentences:
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")
                sentence = Sentence(line.strip(), self.config.e1_type, self.config.e2_type, self.config.max_tokens_away,
                                    self.config.min_tokens_away, self.config.context_window_size)

                for rel in sentence.relationships:
                    if rel.arg1type == self.config.e1_type and rel.arg2type == self.config.e2_type:
                        bef_tokens = word_tokenize(rel.before)
                        bet_tokens = word_tokenize(rel.between)
                        aft_tokens = word_tokenize(rel.after)
                        if not (bef_tokens == 0 and bet_tokens == 0 and aft_tokens == 0):
                            t = Tuple(rel.ent1, rel.ent2, rel.sentence, rel.before, rel.between, rel.after, self.config)
                            self.processed_tuples.append(t)
            f_sentences.close()

            print("\n", len(self.processed_tuples), "relationships generated")
            print("Dumping relationships to file")
            f = open("processed_tuples.pkl", "wb")
            pickle.dump(self.processed_tuples, f)
            f.close() 
Example 65
Project: Snowball   Author: davidsbatista   File: Tuple.py    GNU General Public License v3.0
def tokenize(self, text):
        return [word for word in word_tokenize(text.lower())
                if word not in self.config.stopwords] 
Example 66
Project: Snowball   Author: davidsbatista   File: Tuple.py    GNU General Public License v3.0
def construct_words_vectors(self, words, config):
        # split text into tokens and tag them using NLTK's default English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        text_tokens = word_tokenize(words)
        tags_ptb = pos_tag(text_tokens)
        pattern = [t[0] for t in tags_ptb if
                   t[0].lower() not in config.stopwords and t[1] not in self.filter_pos]
        if len(pattern) >= 1:
            vect_ids = self.config.vsm.dictionary.doc2bow(pattern)
            return self.config.vsm.tf_idf_model[vect_ids] 
Example 67
Project: tweets-preprocessor   Author: vasisouv   File: twitter_preprocessor.py    GNU General Public License v3.0
def remove_stopwords(self, extra_stopwords=None):
        if extra_stopwords is None:
            extra_stopwords = []
        text = nltk.word_tokenize(self.text)
        stop_words = set(stopwords.words('english'))

        new_sentence = []
        for w in text:
            if w not in stop_words and w not in extra_stopwords:
                new_sentence.append(w)
        self.text = ' '.join(new_sentence)
        return self 
Example 68
Project: geograpy2   Author: Corollarium   File: extraction.py    MIT License
def named_entities(self):
        # word_tokenize should work well for most non-CJK languages
        text = nltk.word_tokenize(self.text)
        
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
        # PT corpus http://aelius.sourceforge.net/manual.html
        # 
        pos_tag = nltk.pos_tag(text)
        
        nes = nltk.ne_chunk(pos_tag)
        return nes 
Example 69
Project: who-are-you   Author: PawelPamula   File: tst.py    MIT License
def tweets2tags(text,hasht):
        tx=[]
        for line in text:
            tokens=word_tokenize(line)
            tags=nltk.pos_tag(tokens)
            text= [s[0] for s in tags if s[1].startswith('NN')]
            tx.extend(text) 
Example 70
Project: EliIE   Author: Tian312   File: word2vec.py    MIT License
def tokenize_train(train_directory,tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences 
Example 71
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    MIT License
def remove_stop_words(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    for t in tokens:
        # ignore stop words
        if (t in nltk.corpus.stopwords.words('english') or len(t) < 2):
            continue
        processed_tokens.append(t)

    return " ".join(processed_tokens)

# digits removal 
Example 72
Project: ConvLab   Author: ConvLab   File: Sequicity.py    MIT License
def predict(self, usr):            
        print('usr:', usr)
        usr = word_tokenize(usr.lower())
        usr_words = usr + ['EOS_U']
        u_len = np.array([len(usr_words)])
        usr_indices = self.m.reader.vocab.sentence_encode(usr_words)
        u_input_np = np.array(usr_indices)[:, np.newaxis]
        u_input = cuda_(Variable(torch.from_numpy(u_input_np).long()))
        m_idx, z_idx, degree = self.m.m(mode='test', degree_input=None, z_input=None,
                                        u_input=u_input, u_input_np=u_input_np, u_len=u_len,
                                        m_input=None, m_input_np=None, m_len=None,
                                        turn_states=None, **self.kw_ret)
        venue = random.sample(degree, 1)[0] if degree else dict()
        l = [self.m.reader.vocab.decode(_) for _ in m_idx[0]]
        if 'EOS_M' in l:
            l = l[:l.index('EOS_M')]
        l_origin = []
        for word in l:
            if 'SLOT' in word:
                word = word[:-5]
                if word in venue.keys():
                    value = venue[word]
                    if value != '?':
                        l_origin.append(value)
            elif word.endswith('reference]'):
                if 'ref' in venue:
                    l_origin.append(venue['ref'])
            else:
                l_origin.append(word)
        sys = ' '.join(l_origin)
        sys = denormalize(sys)
        print('sys:', sys)
        if cfg.prev_z_method == 'separate':
            eob = self.m.reader.vocab.encode('EOS_Z2')
            if eob in z_idx[0] and z_idx[0].index(eob) != len(z_idx[0]) - 1:
                idx = z_idx[0].index(eob)
                z_idx[0] = z_idx[0][:idx + 1]
            for j, word in enumerate(z_idx[0]):
                if word >= cfg.vocab_size:
                    z_idx[0][j] = 2 #unk
            prev_z_input_np = pad_sequences(z_idx, cfg.max_ts, padding='post', truncating='pre').transpose((1, 0))
            prev_z_len = np.array([len(_) for _ in z_idx])
            prev_z_input = cuda_(Variable(torch.from_numpy(prev_z_input_np).long()))
            self.kw_ret['prev_z_len'] = prev_z_len
            self.kw_ret['prev_z_input'] = prev_z_input
            self.kw_ret['prev_z_input_np'] = prev_z_input_np
        return sys 
Example 73
Project: hora-de-decir-bye-bye   Author: lusy   File: annotate.py    GNU General Public License v3.0
def main():
    '''
    import dictionaries and tokenize them
    run annotate() for all articles in data/articles-plain/

    '''
    #esp_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

    # punctuation list; we need it later to annotate punctuation as such
    punctuation = [u'.', u'!', u'?', u':', u',', u';', u'-', u'"', u"'", u'(', u')']

    # ignore from English dictionary (words that are far more probable to be in esp)
    ignore_en = [u'para', u'las', u'nos', u'con', u'vive', u'persona', u'peso', u'lee', u'mis', u'dos', u'etc', u'hombre',
                 u'placer', u'hoy', u'perfecta', u'mil', u'dolor']

    # import dictionaries (as utf-8!) --> should be done here, not importing dictionaries 2000 times
    with codecs.open('data/dictionaries-common/es_ALL', encoding='utf-8') as common_dict_file:
        common_dict = common_dict_file.read()
        tokens_common = nltk.word_tokenize(common_dict)

    with codecs.open('data/dictionaries-common/en_US', encoding='utf-8') as en_dict_file:
        en_dict = en_dict_file.read()
        tokens_en = nltk.word_tokenize(en_dict)

    # import all the regional dicts
    reg_dicts_ids = [f for f in os.listdir('data/dictionaries-common/')
                    if path.isfile(path.join('data/dictionaries-common/', f)) and f.endswith('reg')]

    reg_dicts_tokens = {}
    for dict_id in reg_dicts_ids:
        path_reg_dict = 'data/dictionaries-common/%s' % dict_id
        with codecs.open(path_reg_dict, encoding='utf-8') as reg_dict_file:
            reg_dict = reg_dict_file.read()
            tokens_reg = nltk.word_tokenize(reg_dict)
            reg_dicts_tokens[dict_id] = tokens_reg # maybe prettify key: es_AR_reg -> es_AR

    #print reg_dicts_tokens.keys()
    #print reg_dicts_tokens['es_AR_reg']

    # annotate all articles
    articles_ids = [f for f in os.listdir('data/articles-plain/') if path.isfile(path.join('data/articles-plain/', f))]
    for a_id in articles_ids:
        print "Annotating article #%s" % a_id
        a_path = 'data/articles-plain/%s' % a_id
        annotate(a_path, punctuation, ignore_en, tokens_en, tokens_common, reg_dicts_tokens) 
Example 74
Project: RTX   Author: RTXteam   File: WordnetDistance.py    MIT License
def sentence_similarity(sentence1, sentence2):
	"""
	Compute sentence similarity based on WordNet
	:param sentence1: input string
	:param sentence2: input string
	:return: float between 0 and 1 giving similarity of sentences
	"""
	# Tokenize and tag
	sentence1_tagged = pos_tag(word_tokenize(sentence1))
	sentence2_tagged = pos_tag(word_tokenize(sentence2))

	# Get the synsets for the tagged words
	synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1_tagged]
	synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2_tagged]

	# Filter out the Nones
	synsets1 = [ss for ss in synsets1 if ss]
	synsets2 = [ss for ss in synsets2 if ss]

	score, count = 0.0, 0

	# For each word in the first sentence
	for synset in synsets1:
		# Get the similarity value of the most similar word in the other sentence
		vals = [synset.path_similarity(ss) for ss in synsets2]
		best_score = -1
		# Take the max, ignoring Nones
		for val in vals:
			if val:
				if val > best_score:
					best_score = val
		if best_score == -1:
			best_score = None

		# Check that the similarity could have been computed
		if best_score is not None:
			score += best_score
			count += 1

	# Average the values
	if count != 0:
		score /= count
		#score /= (len(sentence1) + len(sentence2)) / 2.0  # divide by the mean sentence length
	else:
		score = 0.0

	# If the number of synsets is small, we have little confidence in the similarity
	if count <= 3:
		sentence1_set = set([i.lower() for i in word_tokenize(sentence1)])
		sentence2_set = set([i.lower() for i in word_tokenize(sentence2)])
		jaccard = len(sentence1_set.intersection(sentence2_set)) / float(len(sentence1_set.union(sentence2_set)))
		score = jaccard
	#return max(score, jaccard)
	return score 
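
When count ends up at 3 or less, the function above falls back to the Jaccard overlap of the lowercased tokens. A self-contained sketch of just that fallback, which needs word_tokenize but not WordNet:

from nltk import word_tokenize

def jaccard_similarity(sentence1, sentence2):
    set1 = set(w.lower() for w in word_tokenize(sentence1))
    set2 = set(w.lower() for w in word_tokenize(sentence2))
    if not set1 or not set2:
        return 0.0
    return len(set1 & set2) / float(len(set1 | set2))

print(jaccard_similarity("what genes are related to asthma ?",
                         "which genes are associated with asthma ?"))
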
Example 75
Project: robot-navigation   Author: ronaldahmed   File: utils.py    MIT License 4 votes vote down vote up
def get_data():
	map_data = {
		'grid'  : MapData("grid" ,getMapGrid()),
		'jelly' : MapData("jelly",getMapJelly()),
		'l'	  : MapData("l",getMapL())
	}

	for map_name, data_obj in map_data.items():
		filename = map_name + '.settrc'
		sample_id = ''
		flag_toggle = False
		toggle = 0
		actions = path = tokens = []
		start_pos = end_pos = -1
		for line in open( os.path.join(data_dir,filename) ):
			line=line.strip("\n")
			if line=='':
				#ipdb.set_trace()
				# reset variables
				flag_toggle = False
				toggle = 0
				actions = path = tokens = []
				start_pos = end_pos = -1
				sample_id = ''
				continue
			if line.startswith("Cleaned-"):
				prex = "Cleaned-"
				sample_id = line[len(prex):]
			if line.find('map=')!=-1:
				# header line: y=...  map=...  x=...; record start/end positions, don't tokenize
				flag_toggle=True
				temp = line.split('\t')
				start_pos = int(temp[0][2:])	# y=...
				end_pos = int(temp[-1][2:])	# x=...
				continue
			if flag_toggle:
				if toggle==0:
					# read instructions
					tokens = word_tokenize(line)
				else:
					# read actions and path
					actions,path = get_actions_and_path(line,data_obj.map)
					# save new single-sentence sample
					data_obj.add_sample(tokens, actions, path, sample_id, start_pos, end_pos, map_name)
					# reset variables
					actions = path = tokens = []
				toggle = (toggle+1)%2
			#END-IF-TOGGLE
		#END-FOR-READ-FILE
	#END-FOR-MAPS

	return map_data

##########################################################################################
########################################################################################## 
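
A minimal sketch of the toggle pattern used above, on a hypothetical two-line block: even turns are natural-language instructions tokenized with word_tokenize, odd turns are the matching action/path line.

from nltk import word_tokenize

lines = [
    "turn left and walk to the end of the hall",  # toggle == 0: instruction
    "LEFT FORWARD FORWARD STOP",                  # toggle == 1: actions (hypothetical format)
]

toggle = 0
tokens = []
for line in lines:
    if toggle == 0:
        tokens = word_tokenize(line)
    else:
        actions = line.split()
        print(tokens, actions)
    toggle = (toggle + 1) % 2
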
Example 76
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_punctuation(self):
        # Initialize POS graph
        self.punctuation_graph = {} 

        def _add_message_to_punctuation(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len+1):
                # Include the final gram position as well; its follower is
                # recorded as the ENDMSG marker below
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable string.
                    tags = " ".join([t[1] for t in gram])

                    next_word = None
                    
                    if i == len(tagged_words) - gram_len:
                        next_word = 'ENDMSG'
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len][1]

                    if tags not in self.punctuation_graph:
                        self.punctuation_graph[tags] = {}

                    if next_word not in self.punctuation_graph[tags]:
                        self.punctuation_graph[tags][next_word] = 0

                    self.punctuation_graph[tags][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.punctuation_dataset, self.user)

        for message in messages:
            _add_message_to_punctuation(message) 
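
A minimal, unigram-only sketch of the counting step above: each POS tag maps to a weighted count of the tag (or the ENDMSG marker) that follows it.

from nltk import word_tokenize, pos_tag

def tag_transitions(message, score=1.0):
    graph = {}
    tagged = pos_tag(word_tokenize(message))
    for i, (_, tag) in enumerate(tagged):
        nxt = 'ENDMSG' if i == len(tagged) - 1 else tagged[i + 1][1]
        graph.setdefault(tag, {}).setdefault(nxt, 0)
        graph[tag][nxt] += score
    return graph

print(tag_transitions("are you coming tonight ?"))
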
Example 77
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_style(self):
        # Initialize POS graph
        self.style_graph = {} 

        def _add_message_to_style(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len):
                # Include the final gram position as well; its follower is
                # recorded as the ENDMSG marker below
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable tuple.
                    words = " ".join([t[0] for t in gram])
                    tags = " ".join([t[1] for t in gram])
                    gram_tuple = (words,tags)

                    if i == len(tagged_words) - gram_len:
                        next_word = ('ENDMSG', 'ENDMSG')
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len]

                    if gram_tuple not in self.style_graph:
                        self.style_graph[gram_tuple] = {}

                    if next_word not in self.style_graph[gram_tuple]:
                        self.style_graph[gram_tuple][next_word] = 0

                    self.style_graph[gram_tuple][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.style_dataset, self.user)

        for message in messages:
            _add_message_to_style(message) 
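
Unlike train_punctuation(), the style graph keys on (words, tags) pairs, so it can propose concrete next words. A minimal sketch of querying such a graph; the graph literal and its weights are hypothetical.

style_graph = {
    ("how are", "WRB VBP"): {("you", "PRP"): 3.0, ("things", "NNS"): 1.0},
}

def most_likely_next(graph, gram_tuple):
    # return the highest-weighted successor of the gram, or None if unseen
    successors = graph.get(gram_tuple, {})
    if not successors:
        return None
    return max(successors, key=successors.get)

print(most_likely_next(style_graph, ("how are", "WRB VBP")))  # ('you', 'PRP')
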
Example 78
Project: RMDL   Author: eric-erki   File: text_feature_extraction.py    GNU General Public License v3.0 4 votes vote down vote up
def text_cleaner(text,
                 deep_clean=False,
                 stem= True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # blank line after </p> and heading tags
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning

    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
    return text.lower() 
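
A minimal sketch of the deep_clean stopword step in isolation, assuming the NLTK 'punkt' and 'stopwords' data have been downloaded.

from nltk import word_tokenize
from nltk.corpus import stopwords

def drop_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join(w for w in word_tokenize(text) if w.lower() not in stop_words)

print(drop_stopwords("this is a short example of the cleaning step"))
# -> "short example cleaning step"
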
Example 79
Project: social_mind   Author: byeongkyu   File: sentence_classifier.py    Apache License 2.0 4 votes vote down vote up
def handle_domain_reply(self, msg):
        sents = self.sent_detector.tokenize(msg.reply.strip())

        msg = ReplyAnalyzed()
        msg.header.stamp = rospy.Time.now()

        for sent in sents:
            # separate tags and text
            sent_tags = re.findall('(%[^}]+%)', sent)
            sent_text = re.sub('(%[^}]+%)', '', sent).strip()

            # if the task manager selected an intent, use it; otherwise use the classifier to pick one
            result = ''
            remain_tags = ''
            if not any('sm=' in tag for tag in sent_tags):
                feature = dialogue_act_features(sent_text)
                result = self.classifier.classify(feature)

                if sent_tags != []:
                    remain_tags = sent_tags[0]
            else:
                tag_text = sent_tags[0].strip('{}').split('|')
                matching = [s for s in tag_text if "sm=" in s]
                if len(matching) > 1:
                    rospy.logwarn('Only one sm tags allowed...')
                result = matching[0].split('=')[1]
                for s in tag_text:
                    if not "sm=" in s:
                        remain_tags += s + '|'
                if remain_tags != '':
                    remain_tags = '{' + remain_tags.rstrip('|') + '}'

            # select entities
            entity = EntitiesIndex()
            for i in pos_tag(word_tokenize(sent_text)):
                if(i[1] in ['RB', 'PRP', 'NN', 'PRP$']):
                    entity.entity.append(i[0])
                    entity.entity_index.append(sent_text.index(i[0]))

            msg.entities.append(entity)
            msg.sents.append(remain_tags + ' ' + sent_text)
            msg.act_type.append(result + '/%d'%len(sent_text))

        self.pub_reply_analyzed.publish(msg) 
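
A minimal, ROS-free sketch of the entity-selection step above: keep tokens tagged RB, PRP, NN or PRP$ together with their character offset (index() returns the first occurrence, as in the original).

from nltk import word_tokenize, pos_tag

def select_entities(sent_text):
    entities = []
    for word, tag in pos_tag(word_tokenize(sent_text)):
        if tag in ('RB', 'PRP', 'NN', 'PRP$'):
            entities.append((word, sent_text.index(word)))
    return entities

print(select_entities("I really like your robot"))
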
Example 80
Project: Digital-Aristotle   Author: Gabighz   File: processing.py    GNU General Public License v3.0 4 votes vote down vote up
def pre_processing(raw_data):
    # Converts raw_data to a numpy array
    raw_data = np.array(raw_data, dtype=object)

    # Iterates through raw XML data and concatenates all text to a string
    sentence = ' '.join(raw_data[:, TEXT_INDEX])

    # Converts all words to lowercase to prevent duplication of same word with different cases.
    lowercase_string = sentence.lower()

    # Cleans each word of non-alphanumeric characters
    # e.g. so 'sensors)' and 'sensors' are not considered different words
    filtered_string = re.sub("[^a-zA-Z]", " ", lowercase_string)

    # Further filtering to keep only nouns; thus filtering stopwords as well
    # Also filters out words which have a character count of 1 or less.
    tokens = nltk.word_tokenize(filtered_string)
    tags = nltk.pos_tag(tokens)

    filtered_string = ' '.join([word for word, pos in tags
                                if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')
                                and (len(word) > 1)])

    # Compiles the filtered words to an array which contains
    # each word and its XML features
    filtered_data = []
    clean_words = filtered_string.split()

    # Appends only filtered words from raw data to an array which contains
    # each word and its XML features that were in raw data
    for word_array in raw_data:

        filtered_sentence = ""
        raw_sentence = re.sub("[^a-zA-Z]", " ", word_array[0]).split()

        for i in range(len(raw_sentence)):
            if raw_sentence[i].lower() in clean_words:
                filtered_sentence = filtered_sentence + " " + raw_sentence[i]
        # !to be improved
        if len(filtered_sentence.lstrip()) > 0:
            filtered_data.append([filtered_sentence.lstrip().lower(), word_array[1], word_array[2], word_array[3]])

    return filtered_data
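
A minimal sketch of the lowercasing, cleaning and noun-filtering steps above applied to a single string; the XML-feature bookkeeping is omitted.

import re
import nltk

raw_text = "The (cheap) sensors measure temperature and humidity!"
filtered = re.sub("[^a-zA-Z]", " ", raw_text.lower())
tags = nltk.pos_tag(nltk.word_tokenize(filtered))
nouns = [word for word, pos in tags
         if pos in ('NN', 'NNP', 'NNS', 'NNPS') and len(word) > 1]
print(nouns)  # e.g. ['sensors', 'temperature', 'humidity']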


# Computes the F1-score of our classifier
# Takes in a 2D array which contains each observation and their label
# Compares that to the ground truth (correct) value of each observation