Python nltk.word_tokenize() Examples

The following are 30 code examples showing how to use nltk.word_tokenize(). The examples are extracted from open source projects; the project, author, file, and license are listed above each example.

You may also want to check out the other available functions and classes of the nltk module.
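
Before the project examples, here is a minimal, self-contained sketch of calling nltk.word_tokenize directly (not taken from any of the projects below). It assumes the Punkt tokenizer models are available; depending on your NLTK version they are fetched with nltk.download('punkt').

import nltk

nltk.download('punkt', quiet=True)  # one-time download of the tokenizer models

text = "Mr. Smith doesn't like the cold. It's freezing!"
tokens = nltk.word_tokenize(text)
print(tokens)
# e.g. ['Mr.', 'Smith', 'does', "n't", 'like', 'the', 'cold', '.', 'It', "'s", 'freezing', '!']

# A different language model can be selected explicitly:
print(nltk.word_tokenize("Bonjour tout le monde.", language='french'))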

Example 1
Project: ConvLab   Author: ConvLab   File: Mem2Seq.py   License: MIT License
def predict(self, query):
        usr = query
        print('Mem2Seq usr:', usr)
        #example input: 'please find a restaurant called nusha .'
        self.t += 1
        print('Mem2Seq turn:', self.t)
        usr = ' '.join(word_tokenize(usr.lower()))
        self.memory += generate_memory(usr, '$u', self.t)
        src_plain = (self.memory+[['$$$$']*MEM_TOKEN_SIZE],)
        src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
        words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
        row = np.transpose(words)[0].tolist()
        if '<EOS>' in row:
            row = row[:row.index('<EOS>')]
        sys = ' '.join(row)
        sys = denormalize(sys)
        print('Mem2Seq sys:', sys)
        self.memory += generate_memory(sys, '$s', self.t)
        return sys 
Example 2
Project: Machine-Translation   Author: foamliu   File: analyze_data.py   License: Apache License 2.0
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Record the English sentence length (in tokens)
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example 3
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-5-document-classification.py   License: MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency: the word must appear in at least 2 documents
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
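
As a quick, hypothetical usage sketch (not part of the original file), the function above can be exercised on a tiny corpus; this assumes sklearn.feature_extraction is imported as feature_extraction and the NLTK punkt and stopwords data are available:

corpus = [
    "Python is great for NLP.",
    "NLP in Python is fun.",
    "I like writing Python code.",
]
features = extract_features(corpus)
print(features.shape)  # (3, n_features): one TF-IDF row per document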
Example 4
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.4-tfidf-svm.py   License: MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency: the word must appear in at least 2 documents
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 5
Project: Snowball   Author: davidsbatista   File: VectorSpaceModel.py   License: GNU General Public License v3.0
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), " unique tokens") 
Example 6
Project: Document-Classifier-LSTM   Author: AlexGidiotis   File: data_prep.py   License: MIT License
def preprocess(text):
	min_length = 3
	text = re.sub(r'\d+', '#', text)
	text = re.sub(r'\.', ' eos ', text)
	# Tokenize
	words = map(lambda word: word.lower(), word_tokenize(text))
	tokens = words
	# Remove non characters
	p = re.compile('[a-zA-Z#]+')
	# Filter tokens (we do not remove stopwords)
	filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length and (token not in english_stopwords), tokens))
	# Encode to ascii
	filtered_tokens = [token.encode('ascii','ignore') for token in filtered_tokens]

	return filtered_tokens


# Modify this path 
Example 7
Project: JARVIS   Author: abhi007tyagi   File: math_expression_calculator.py   License: Apache License 2.0
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text 
Example 8
Project: scattertext   Author: JasonKessler   File: phrasemachine.py   License: Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
Example 9
Project: dl-models-for-qa   Author: sujitpal   File: kaggle.py   License: Apache License 2.0
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples 
Example 10
Project: lisc   Author: lisc-tools   File: utils.py   License: Apache License 2.0
def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]

    return words_cleaned 
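
For illustration only (assuming the NLTK punkt and stopwords data are installed, as the module's imports require), a call might look like this:

print(convert_string("The Quick Brown fox jumps over the lazy dog!"))
# expected output: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']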
Example 11
Project: Projects   Author: iamshang1   File: sentiwordnet.py   License: MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />',' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example 12
Project: Projects   Author: iamshang1   File: combined.py   License: MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example 13
Project: self-attentive-parser   Author: nikitakit   File: nltk_plugin.py   License: MIT License
def _nltk_process_sents(self, sents):
        for sentence in sents:
            if isinstance(sentence, STRING_TYPES):
                if self._tokenizer_lang is None:
                    raise ValueError(
                        "No word tokenizer available for this language. "
                        "Please tokenize before calling the parser."
                        )
                sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

            if IS_PY2:
                sentence = [
                    word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                    for word in sentence
                    ]

            if not self._provides_tags:
                sentence = nltk.pos_tag(sentence)
                yield [word for word, tag in sentence], sentence
            else:
                yield sentence, sentence 
Example 14
Project: Text-Classification-Models-Pytorch   Author: AnubhavGupta3377   File: utils.py   License: MIT License
def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int) : maximum sentence length (in words)
    Returns:
        X (np.ndarray) : array of shape (max_sen_len, embedding_size) after zero padding
    '''
    
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32) 
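
A hypothetical usage sketch (toy embedding table, not from the original project) shows the zero-padding behaviour described in the docstring; it assumes word_tokenize is imported and the NLTK punkt data is installed:

import numpy as np

toy_embeddings = {'hello': np.ones(300), 'world': np.ones(300) * 2}
X = encode_text('hello world', toy_embeddings, max_sen_len=10)
print(X.shape)  # (10, 300): 2 real token vectors followed by 8 zero-padding rows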
Example 15
Project: Text-Classification-Models-Pytorch   Author: AnubhavGupta3377   File: utils.py   License: MIT License
def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    # Avoid division by zero when none of the words have an embedding
    return embed / count if count > 0 else embed
Example 16
Project: BERT   Author: yyht   File: evaluate.py   License: Apache License 2.0
def mixed_segmentation(in_str, rm_punc=False):
	in_str = str(in_str).decode('utf-8').lower().strip()
	segs_out = []
	temp_str = ""
	sp_char = ['-',':','_','*','^','/','\\','~','`','+','=',
			   ',','。',':','?','!','“','”',';','’','《','》','……','·','、',
			   '「','」','(',')','-','~','『','』']
	for char in in_str:
		if rm_punc and char in sp_char:
			continue
		if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char:
			if temp_str != "":
				ss = nltk.word_tokenize(temp_str)
				segs_out.extend(ss)
				temp_str = ""
			segs_out.append(char)
		else:
			temp_str += char

	#handling last part
	if temp_str != "":
		ss = nltk.word_tokenize(temp_str)
		segs_out.extend(ss)

	return segs_out
Example 17
def predict(self,Sentence):
        Sentence = words =  word_tokenize(Sentence)
        Sentence = self.addCharInformation(Sentence)
        Sentence = self.padding(self.createTensor(Sentence,self.word2Idx,self.case2Idx,self.char2Idx))
        tokens, casing,char = Sentence
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = self.model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1)
        pred = [self.idx2Label[x].strip() for x in pred]
        return list(zip(words,pred)) 
Example 18
Project: botbuilder-python   Author: microsoft   File: bidaf_model_runtime.py   License: MIT License
def _preprocess(text: str) -> Tuple[np.ndarray, np.ndarray]:
        tokens = word_tokenize(text)
        # split into lower-case word tokens, in numpy array with shape of (seq, 1)
        words = np.asarray([w.lower() for w in tokens]).reshape(-1, 1)
        # split words into chars, in numpy array with shape of (seq, 1, 1, 16)
        chars = [[c for c in t][:16] for t in tokens]
        chars = [cs + [""] * (16 - len(cs)) for cs in chars]
        chars = np.asarray(chars).reshape(-1, 1, 1, 16)
        return words, chars 
Example 19
Project: razzy-spinner   Author: rafasashi   File: textcat.py   License: GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 20
Project: Machine-Translation   Author: foamliu   File: pre_process.py   License: Apache License 2.0
def build_wordmap_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    word_freq = Counter()

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
        # Update word frequency
        word_freq.update(tokens)

    # Create word map
    # words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    words = word_freq.most_common(input_lang_vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<start>'] = 1
    word_map['<end>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:10])

    with open('data/WORDMAP_en.json', 'w') as file:
        json.dump(word_map, file, indent=4) 
Example 21
Project: Machine-Translation   Author: foamliu   File: pre_process.py   License: Apache License 2.0
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder, train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder, train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder, valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder, valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))

            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
            output_en = encode_text(word_map_en, tokens)

            if len(input_zh) <= max_len and len(
                    output_en) <= max_len and UNK_token not in input_zh and UNK_token not in output_en:
                samples.append({'input': list(input_zh), 'output': list(output_en)})

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage, filename)) 
Example 22
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py   License: MIT License
def tokenize_words(targets):
    while True:
        sentence = (yield)
        words = nltk.word_tokenize(sentence)
        for target in targets:
            target.send(words) 
Example 23
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-3-sentiment-analysis.py   License: MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency: keep words that appear in at least 1 document
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 24
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-4-ngrams.py   License: MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    sa_stop_words = nltk.corpus.stopwords.words("english")

    # words that might invert a sentence's meaning
    white_list = [
        'what', 'but', 'if', 'because', 'as', 'until', 'against',
        'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should']

    # take these out of the standard NLTK stop word list
    sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency: the word must appear in at least 2 documents
        ngram_range=(1, 2),
        stop_words=sa_stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 25
Project: NSCL-PyTorch-Release   Author: vacancy   File: datasets.py   License: MIT License
def _get_metainfo(self, index):
        question = gdef.translate_question(self.questions[index])
        scene = gdef.translate_scene(self.scenes[question['image_index']])
        question['scene'] = scene

        question['image_index'] = question['image_index']
        question['image_filename'] = gdef.get_image_filename(scene)
        question['question_index'] = index
        question['question_tokenized'] = nltk.word_tokenize(question['question'])

        # program section
        has_program = False
        if 'program_nsclseq' in question:
            question['program_raw'] = question['program_nsclseq']
            question['program_seq'] = question['program_nsclseq']
            has_program = True
        elif 'program' in question:
            question['program_raw'] = question['program']
            question['program_seq'] = gdef.program_to_nsclseq(question['program'], question)
            has_program = True

        if has_program:
            question['program_tree'] = nsclseq_to_nscltree(question['program_seq'])
            question['program_qsseq'] = nsclseq_to_nsclqsseq(question['program_seq'])
            question['program_qstree'] = nscltree_to_nsclqstree(question['program_tree'])
            question['question_type'] = question['program_seq'][-1]['op']
        else:
            question['question_type'] = None

        return question 
Example 26
Project: sato   Author: megagonlabs   File: train_LDA.py   License: Apache License 2.0
def clean(s):
    tokens = nltk.word_tokenize(s.lower())
    tokens_clean = [token for token in tokens if token not in stopwords.words('english')]
    tokens_stemmed = [PorterStemmer().stem(token) for token in tokens_clean]
    return tokens_stemmed 
Example 27
Project: mipsqa   Author: google   File: squad_prepro.py   License: Apache License 2.0
def _word_tokenize(text):
  # TODO(seominjoon): Consider using Stanford Tokenizer or other tokenizers.
  return [
      word.replace('``', '"').replace("''", '"')
      for word in nltk.word_tokenize(text)
  ] 
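
As a quick illustration (assuming the NLTK tokenizer data is installed): nltk.word_tokenize emits Penn Treebank style `` and '' tokens for ASCII double quotes, which the helper above maps back to plain double quotes.

print(_word_tokenize('He said "hello" to everyone.'))
# expected: ['He', 'said', '"', 'hello', '"', 'to', 'everyone', '.']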
Example 28
Project: qb   Author: Pinafore   File: preprocess.py   License: MIT License
def tokenize_question(text: str) -> List[str]:
    return word_tokenize(clean_question(text)) 
Example 29
Project: qb   Author: Pinafore   File: dataset.py   License: MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
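
A hypothetical usage sketch follows; strip_qb_patterns=False is passed so the example does not depend on the module-level regex_pattern, which is not shown in this snippet.

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize('name this novel'))
# expected: ['name', 'this', 'novel', 'name++this', 'this++novel']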
Example 30
Project: cs224n-win18-squad   Author: abisee   File: squad_preprocess.py   License: Apache License 2.0
def tokenize(sequence):
    tokens = [token.replace("``", '"').replace("''", '"').lower() for token in nltk.word_tokenize(sequence)]
    return tokens