Python nltk.tokenize Examples

The following are 30 code examples showing how to use the nltk.tokenize module. They are extracted from open source projects; the project, author, file, and license are listed above each example.

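Before the project excerpts, here is a minimal, self-contained sketch of the two most common entry points in nltk.tokenize; it assumes the Punkt sentence models have already been fetched (for example with nltk.download('punkt')).

from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK ships several tokenizers. Each one splits text differently."
sent_tokenize(text)
# ['NLTK ships several tokenizers.', 'Each one splits text differently.']
word_tokenize(text)
# ['NLTK', 'ships', 'several', 'tokenizers', '.', 'Each', 'one', 'splits', 'text', 'differently', '.']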

Example 1
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text)) 
Example 2
Project: sato   Author: megagonlabs   File: train_LDA.py    License: Apache License 2.0
def process_col(col, **kwargs):

    numeric = kwargs['num']
    # Process the column into a bag-of-words representation.
    if col.dtype == 'int64' or col.dtype == 'float64':
        if numeric == 'directstr':
            return list(col.astype(str))
        elif numeric == 'placeholder':
            return [str(col.dtype)] * len(col)

    if col.dtype == 'object':
        return tokenize(list(col.astype(str)), **kwargs)
    else:
        return list(col.astype(str))
Example 3
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
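The comment above is worth isolating: nltk.word_tokenize routes through the Punkt sentence splitter (and therefore the downloader), whereas TreebankWordTokenizer on its own is pure regex work. A minimal sketch of the direct route, assuming only that NLTK is installed:

from nltk.tokenize import TreebankWordTokenizer

# No model download needed: the Treebank tokenizer is regex-based.
TreebankWordTokenizer().tokenize("Don't tokenize this, please.")
# ['Do', "n't", 'tokenize', 'this', ',', 'please', '.']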
Example 4
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    License: MIT License
def handle_multiple_sentences(infile, outfile):
	titles = []
	f = open(infile, "r")
	f2 = codecs.open(outfile, "w+", "utf-8")
	for line in f:
		line = line.decode("utf-8")
		sentences = sent_detector.tokenize(line.strip())
		for i in range(len(sentences)):
			if i == 0:
				sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title())
			else:
				sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title())
				sentences[i-1] = sentences[i-1].replace(sentences[i-1].split()[-1][-1], " ::::")

		titles.append(" ".join(sentences))
	title_set = set(titles)
	for l in title_set:
		print >> f2, l 
Example 5
Project: atap   Author: foxbook   File: reader.py    License: Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            for sent in para:
                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Return data structure with information
        return {
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
        } 
Example 6
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html 
Example 7
Project: flambe   Author: asappresearch   File: word.py    License: MIT License
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, List):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams) 
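The `_tokenize` helper used above is not part of this excerpt; purely as an illustration of the n-gram idea (this is not flambe's implementation), NLTK's own utilities can produce comparable n-gram tuples from word tokens:

from nltk import word_tokenize
from nltk.util import ngrams

tokens = word_tokenize("the quick brown fox")
list(ngrams(tokens, 2))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]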
Example 8
Project: Dense-CoAttention-Network   Author: cvlab-tohoku   File: load_data.py    License: MIT License
def tokenize_mcb(sentence):
	"""
	MCB tokenize implementation.
	--------------------
	Arguments:
		sentence (str): a setence that will be tokenized.
	Return:
		A list of tokens from the sentence.
	"""
	for i in [r"\?", r"\!", r"\'", r"\"", r"\$", r"\:", r"\@", r"\(", r"\)", r"\,", r"\.", r"\;"]:
		sen = re.sub(i, "", sen)

	for i in [r"\-", r"\/"]:
		sen = re.sub(i, " ", sen)
	q_list = re.sub(r"\?", "", sen.lower()).split()
	q_list = list(filter(lambda x: len(x) > 0, q_list))

	return q_list 
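A short usage sketch of the function above (it assumes `re` is imported at module level, as in the original file; the sample sentence and output are illustrative):

tokenize_mcb("What's in the basket, then?")
# ['whats', 'in', 'the', 'basket', 'then']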
Example 9
Project: modin   Author: modin-project   File: kaggle18.py    License: Apache License 2.0
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e) 
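The division of labour described in the docstring can be seen in a self-contained sketch (assumes the Punkt models are available):

from nltk.tokenize import sent_tokenize, word_tokenize

text = "First sentence here. Second one follows."
[word_tokenize(s) for s in sent_tokenize(text)]
# [['First', 'sentence', 'here', '.'], ['Second', 'one', 'follows', '.']]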
Example 10
Project: DOTA_models   Author: ringringyi   File: skip_thoughts_encoder.py    License: Apache License 2.0
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
      tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized 
Example 11
Project: steppy-toolkit   Author: minerva-ml   File: text.py    License: MIT License
def _use_stopwords(self, x):
        words = tokenizer.tokenize(x)
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example 12
Project: steppy-toolkit   Author: minerva-ml   File: text.py    License: MIT License
def _apostrophes(self, x):
        words = tokenizer.tokenize(x)
        words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
        words = [lem.lemmatize(word, "v") for word in words]
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example 13
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent))) # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1) # negative
        else:
            y.append(0) # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
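A hypothetical call to the demo above; it assumes the opinion_lexicon corpus has been downloaded (for example with nltk.download('opinion_lexicon')), and the sentence and printed label are illustrative:

demo_liu_hu_lexicon("The acting was wonderful and the plot was great.")
# expected to print: Positive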
Example 14
Project: razzy-spinner   Author: rafasashi   File: chunked.py    License: GNU General Public License v3.0
def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset,
                                           target_tagset=self._target_tagset)

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block 
Example 15
Project: razzy-spinner   Author: rafasashi   File: rte_classify.py    License: GNU General Public License v3.0
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as single tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            # Note: the boolean `lemmatize` parameter shadows the module-level
            # lemmatize() helper, so this branch cannot work as written.
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
Example 16
Project: sato   Author: megagonlabs   File: train_LDA.py    License: Apache License 2.0
def tokenize(col, **kwargs):
    threshold = int(kwargs['thr'])
    ret = []
    for st in col:
        if len(st)> threshold:
            # tokenize the string if longer than threshold
            # and append a longstr tag
            ret.extend(clean(st))
            if threshold > 0:
                ret.append('longstr')
        else:
            ret.append(st.lower())
    return ret 
Example 17
Project: StackGAN   Author: hanzhanggit   File: skipthoughts.py    License: MIT License
def preprocess(text):
	"""
	Preprocess text for encoder
	"""
	X = []
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	for t in text:
		sents = sent_detector.tokenize(t)
		result = ''
		for s in sents:
			tokens = word_tokenize(s)
			result += ' ' + ' '.join(tokens)
		X.append(result)
	return X 
Example 18
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def tag_text(self, text):
		'''take input text and return tokens w/ part of speech tags using NLTK'''
		# putting import here instead of top of file b.c. not all will have nltk installed

		sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
		word_pos_pairs = []

		all_tokens = []
		for sent in sents:
			tokens = self.tokenize(sent)
			all_tokens = all_tokens + tokens
			word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
		return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]} 
Example 19
Project: text-to-image   Author: paarthneekhara   File: skipthoughts.py    License: MIT License
def preprocess(text):
	"""
	Preprocess text for encoder
	"""
	X = []
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	for t in text:
		sents = sent_detector.tokenize(t)
		result = ''
		for s in sents:
			tokens = word_tokenize(s)
			result += ' ' + ' '.join(tokens)
		X.append(result)
	return X 
Example 20
Project: text-to-image   Author: paarthneekhara   File: skipthoughts.py    License: MIT License
def preprocess(text):
	"""
	Preprocess text for encoder
	"""
	X = []
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	for t in text:
		sents = sent_detector.tokenize(t)
		result = ''
		for s in sents:
			tokens = word_tokenize(s)
			result += ' ' + ' '.join(tokens)
		X.append(result)
	return X 
Example 21
Project: chirp   Author: 9b   File: helpers.py    License: MIT License
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text()) 
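For context, RegexpTokenizer(r'\w+') keeps only runs of word characters, which is what discards the punctuation left after BeautifulSoup strips the markup; a minimal sketch independent of the chirp project:

from nltk.tokenize import RegexpTokenizer

RegexpTokenizer(r'\w+').tokenize("Hello, world! It's 2021.")
# ['Hello', 'world', 'It', 's', '2021']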
Example 22
Project: visDial.pytorch   Author: jiasenlu   File: prepro.py    License: MIT License
def tokenize(sentence):
    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence) if i != '' and i != ' ' and i != '\n']
Example 23
Project: gutenberg   Author: pgcorpus   File: tokenizer.py    License: GNU General Public License v3.0
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.
    Use NLTK's TreebankWordTokenizer.
    Note that we first split into sentences using NLTK's sent_tokenize.
    We additionally call a filtering function to remove unwanted tokens.
    
    IN:
    - text, str
    OUT:
    - list of strings
    '''
    ## list of tokens
    list_tokens = []
    
    ## split text into sentences
    sentences = sent_tokenize(text, language=language)
    
    ## define the tokenizer
    tokenizer = TreebankWordTokenizer()
    ## loop over all sentences
    for sent in sentences:
        ## tokenize the sentence
        sent_tokenized = tokenizer.tokenize(sent)
        ## add tokens to list of tokens
        list_tokens += sent_tokenized
    list_tokens = filter_tokens(list_tokens)
    return list_tokens 
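The sentence-then-word pipeline above can be reproduced without the project's filter_tokens helper; the snippet below is a standalone sketch of the first two stages only (assumes the Punkt models are available):

from nltk.tokenize import sent_tokenize, TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokens = []
for sent in sent_tokenize("One sentence. And another one."):
    tokens += tokenizer.tokenize(sent)
# tokens == ['One', 'sentence', '.', 'And', 'another', 'one', '.']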
Example 24
Project: SOQAL   Author: husseinmozannar   File: tfidf_reader.py    License: MIT License
def stem_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        str_processed = ""
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                str_processed += token + " " + self.stemmer.stem(token) + " "
        return str_processed 
Example 25
Project: SOQAL   Author: husseinmozannar   File: slidingwindow_distance.py    License: MIT License
def tokenize_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        tokens_stemmed = []
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                tokens_stemmed.append(self.stemmer.stem(token))
        return tokens_stemmed 
Example 26
Project: SOQAL   Author: husseinmozannar   File: TfidfRetriever.py    License: MIT License
def stem_string(self, str):
        str_tokens = self.tokenizer.tokenize(str)
        str_processed = ""
        for token in str_tokens:
            has_symbol = False
            for s in self.SYMBOLS:
                if s in token:
                    has_symbol = True
                    break
            if not has_symbol:
                str_processed += token + " "
        return str_processed 
Example 27
Project: tokenquery   Author: ramtinms   File: tokenizer.py    License: GNU General Public License v3.0
def tokenize(self, text):
        """
           tokenize text into a list of Token objects

            :param text: text to be tokenized (might contains several sentences)
            :type text: str
            :return: List of Token objects
            :rtype: list(Token)
        """
        tokens = []

        if self.tokenizer_type == "SpaceTokenizer":
            operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
            operator = WhitespaceTokenizer()
            for counter, span in enumerate(operator.span_tokenize(text)):
                new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
                tokens.append(new_token)

        elif self.tokenizer_type == "PTBTokenizer":
            ptb_tokens = word_tokenize(text)
            counter = 0
            for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
                new_token = Token(counter, token, span[0], span[1])
                counter += 1
                tokens.append(new_token)

        return tokens 
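The span-based branches above rely on span_tokenize, which yields (start, end) character offsets rather than token strings; a minimal sketch of that behaviour with NLTK's WhitespaceTokenizer:

from nltk.tokenize import WhitespaceTokenizer

text = "good muffins cost $3.88"
list(WhitespaceTokenizer().span_tokenize(text))
# [(0, 4), (5, 12), (13, 17), (18, 23)]
text[0:4]
# 'good'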
Example 28
Project: PageRank   Author: ashkonf   File: textrank.py    License: Apache License 2.0
def __tokenizeWords(sentence):
    return nltk.tokenize.word_tokenize(sentence)

## tests ######################################################################################## 
Example 29
Project: yolo_v2   Author: rky0930   File: skip_thoughts_encoder.py    License: Apache License 2.0
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
      tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized 
Example 30
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: IdentifyingTopic.py    License: MIT License
def cleanDocuments(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [i for i in words if not i in en_stop]
            self.cleaned.append(non_stopped_words)
        print("INFO: Clearning {} documents completed".format(len(self.documents)))