Python nltk.tokenize() Examples

The following are 30 code examples of nltk.tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
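Before diving into the examples, here is a minimal sketch of the two entry points most of them build on, sent_tokenize() and word_tokenize(); it assumes the punkt model has already been fetched with nltk.download('punkt').

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Minimal sketch of the most common nltk.tokenize entry points.
# Assumes the 'punkt' model has been downloaded via nltk.download('punkt').
text = "NLTK makes tokenization easy. It splits text into sentences and words."
sentences = sent_tokenize(text)                 # sentence segmentation
words = [word_tokenize(s) for s in sentences]   # word tokenization per sentence
print(sentences)
print(words)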
Example #1
Source File: word.py    From flambe with MIT License
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, List):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams) 
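The NGramsTokenizer._tokenize helper is not shown above; a rough, hypothetical sketch of what n-gram tokenization on top of word_tokenize could look like (the actual flambe implementation may differ):

from nltk import word_tokenize
from nltk.util import ngrams

def ngram_tokenize(example, n):
    # Hypothetical stand-in for NGramsTokenizer._tokenize: space-joined n-grams
    # over the word tokens of `example`. Assumes 'punkt' is downloaded.
    tokens = word_tokenize(example)
    return [' '.join(gram) for gram in ngrams(tokens, n)]

print(ngram_tokenize("the quick brown fox", 2))
# ['the quick', 'quick brown', 'brown fox']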
Example #2
Source File: reader.py    From atap with Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            for sent in para:
                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Return data structure with information
        return {
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
        } 
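For context, nltk.FreqDist behaves like a counter, which is why len() gives the vocabulary size and the ratio above gives lexical diversity; a quick illustration (assuming punkt is available):

import nltk

tokens = nltk.word_tokenize("the cat sat on the mat")
fdist = nltk.FreqDist(tokens)
print(fdist['the'])          # 2 occurrences of 'the'
print(len(fdist))            # 5 distinct word types
print(fdist.most_common(2))  # most frequent (word, count) pairs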
Example #3
Source File: util.py    From razzy-spinner with GNU General Public License v3.0
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text)) 
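The WhitespaceTokenizer used here splits purely on whitespace, so punctuation stays attached to words and no punkt download is needed; for example:

from nltk.tokenize import WhitespaceTokenizer

tok = WhitespaceTokenizer()
print(tok.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']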
Example #4
Source File: phrasemachine.py    From phrasemachine with MIT License
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html 
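A simplified sketch of the same tokenizer/tagger pair, loading NLTK's standard models through the downloader instead of the pickles bundled with phrasemachine:

from nltk.tokenize import TreebankWordTokenizer
from nltk.tag import PerceptronTagger

# Assumes the 'averaged_perceptron_tagger' resource has been downloaded; the
# tagger loads it by default (load=True), unlike the packaged-pickle route above.
tokenize = TreebankWordTokenizer().tokenize
tagger = PerceptronTagger()

tokens = tokenize("Phrasemachine extracts noun phrases from raw text.")
print(tagger.tag(tokens))   # list of (token, Penn Treebank POS tag) pairs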
Example #5
Source File: kaggle18.py    From modin with Apache License 2.0
def tokenize(text):
    """
    sent_tokenize(): segment text into sentences
    word_tokenize(): break sentences into words
    """
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e) 
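This snippet relies on module-level names (re, string, word_tokenize, sent_tokenize and a stop set) defined elsewhere in kaggle18.py; a plausible minimal setup, not necessarily identical to the original, would be:

import re
import string

from nltk.corpus import stopwords                        # needs nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize   # needs nltk.download('punkt')

# Guessed reconstruction of the module-level 'stop' set used by tokenize() above.
stop = set(stopwords.words('english'))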
Example #6
Source File: load_data.py    From Dense-CoAttention-Network with MIT License
def tokenize_mcb(sentence):
	"""
	MCB tokenize implementation.
	--------------------
	Arguments:
		sentence (str): a sentence that will be tokenized.
	Return:
		A list of tokens from the sentence.
	"""
	sen = sentence
	for i in [r"\?", r"\!", r"\'", r"\"", r"\$", r"\:", r"\@", r"\(", r"\)", r"\,", r"\.", r"\;"]:
		sen = re.sub(i, "", sen)

	for i in [r"\-", r"\/"]:
		sen = re.sub(i, " ", sen)
	q_list = re.sub(r"\?", "", sen.lower()).split()
	q_list = list(filter(lambda x: len(x) > 0, q_list))

	return q_list 
Example #7
Source File: train_LDA.py    From sato with Apache License 2.0
def process_col(col, **kwargs):

    numeric = kwargs['num']
    # process the cols to return a bag-of-words representation
    if col.dtype == 'int64' or col.dtype == 'float64':
        if numeric == 'directstr':
            return list(col.astype(str))
        elif numeric == 'placeholder':
            return [str(col.dtype)] * len(col)

    if col.dtype == 'object':
        return tokenize(list(col.astype(str)), **kwargs)
    else:
        return list(col.astype(str))
Example #8
Source File: experiments.py    From clickbait with MIT License
def handle_multiple_sentences(infile, outfile):
	titles = []
	f = open(infile, "r")
	f2 = codecs.open(outfile, "w+", "utf-8")
	for line in f:
		line = line.decode("utf-8")
		sentences = sent_detector.tokenize(line.strip())
		for i in range(len(sentences)):
			if i == 0:
				sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title())
			else:
				sentences[i] = sentences[i].replace(sentences[i].split()[0],sentences[i].split()[0].title())
				sentences[i-1] = sentences[i-1].replace(sentences[i-1].split()[-1][-1], " ::::")

		titles.append(" ".join(sentences))
	title_set = set(titles)
	for l in title_set:
		print >> f2, l 
Example #9
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
Example #10
Source File: skip_thoughts_encoder.py    From object_detection_kitti with Apache License 2.0
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
      tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized 
Example #11
Source File: phrasemachine.py    From phrasemachine with MIT License
def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # putting import here instead of top of file b.c. not all will have nltk installed
        
        sents = self.sent_detector.tokenize(text)    # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens, 'pos': [tag for (w,tag) in word_pos_pairs]} 
Example #12
Source File: common.py    From acl-anthology with Apache License 2.0
def tokenize(s):
    """Splits tokens (hyphens/slashes count as separate tokens)."""
    tokens = []
    # NLTK tokenizer uses PTB standard, which doesn't split on hyphens or slashes
    for tok in nltk.tokenize.word_tokenize(s):
        # tokenizer normalizes quotes etc., so we need to detokenize later
        tokens.extend([t for t in re.split(r"([-–/])", tok) if t != ""])
    return tokens 
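To see the effect, note that NLTK's PTB-style word_tokenize keeps a hyphenated word as a single token, while the re.split() above re-emits the hyphens as separate tokens; a standalone check (assuming punkt is available):

import re
import nltk

toks = []
for tok in nltk.tokenize.word_tokenize("state-of-the-art NLP"):
    # keep the hyphen/slash delimiters as their own tokens
    toks.extend([t for t in re.split(r"([-–/])", tok) if t != ""])
print(toks)
# ['state', '-', 'of', '-', 'the', '-', 'art', 'NLP']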
Example #13
Source File: preprocessing.py    From open-solution-toxic-comments with MIT License
def _use_stopwords(self, x):
        words = tokenizer.tokenize(x)
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example #14
Source File: kaggle18.py    From modin with Apache License 2.0
def tokenize(text):
    try:
        regex = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        text = regex.sub(" ", text)  # remove punctuation
        tokens_ = [word_tokenize(s) for s in sent_tokenize(text)]
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent
        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        filtered_tokens = [w for w in tokens if re.search("[a-zA-Z]", w)]
        filtered_tokens = [w.lower() for w in filtered_tokens if len(w) >= 3]
        return filtered_tokens
    except TypeError as e:
        print(text, e) 
Example #15
Source File: rake.py    From rake-nltk with MIT License
def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.

        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = nltk.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences) 
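For reference, typical rake-nltk usage that ends up calling this method looks roughly like this (assuming the package is installed and the NLTK stopwords and punkt resources are downloaded):

from rake_nltk import Rake

r = Rake()  # defaults to NLTK English stopwords and sentence tokenization
r.extract_keywords_from_text(
    "Keyword extraction with RAKE splits text into candidate phrases."
)
print(r.get_ranked_phrases())   # highest-scoring phrases first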
Example #16
Source File: ner.py    From metadoc with MIT License
def __init__(self, text):
    self.perceptron_tagger = AveragedPerceptronTagger(autoload=True)
    self.stopwords = set(nltk.corpus.stopwords.words())
    self.top_fraction = 70 # consider top candidate keywords only
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    self.sentences = self.sent_detector.tokenize(text) 
Example #17
Source File: text.py    From open-solution-mapping-challenge with MIT License
def _use_stopwords(self, x):
        words = tokenizer.tokenize(x)
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example #18
Source File: text.py    From open-solution-mapping-challenge with MIT License
def _apostrophes(self, x):
        words = tokenizer.tokenize(x)
        words = [APPO[word] if word in APPO else word for word in words]
        words = [lem.lemmatize(word, "v") for word in words]
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example #19
Source File: word.py    From flambe with MIT License
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        return example.split() 
Example #20
Source File: data.py    From dong_iccv_2017 with MIT License
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower()) 
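RegexpTokenizer(r'\w+') keeps only alphanumeric runs and drops punctuation entirely (so contractions are split), with no model download required; for example:

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize("A man's best friend."))
# ['A', 'man', 's', 'best', 'friend']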
Example #21
Source File: rte_classify.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                              'have', 'is', 'are', 'were', 'and', 'very', '.',','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            # NOTE: the boolean `lemmatize` parameter shadows any function of the
            # same name here, so calling lemmatize(token) fails as written.
            self.text_words = set([lemmatize(token) for token in self.text_tokens])
            self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
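As a rough illustration of what that pattern is meant to keep together, the sketch below uses a non-capturing group, which avoids surprises if the tokenizer matches with findall-style semantics in newer NLTK releases:

from nltk.tokenize import RegexpTokenizer

# Non-capturing variant of the pattern above; abbreviations and dollar
# amounts survive as single tokens.
tokenizer = RegexpTokenizer(r'(?:[A-Z]\.)+|\w+|\$[\d\.]+')
print(tokenizer.tokenize("The U.S. spent $23.00 on it."))
# ['The', 'U.S.', 'spent', '$23.00', 'on', 'it']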
Example #22
Source File: chunked.py    From luscan-devel with GNU General Public License v2.0
def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(sent_str)

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block 
Example #23
Source File: text.py    From open-solution-data-science-bowl-2018 with MIT License
def _apostrophes(self, x):
        words = tokenizer.tokenize(x)
        words = [APPO[word] if word in APPO else word for word in words]
        words = [lem.lemmatize(word, "v") for word in words]
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example #24
Source File: text.py    From open-solution-data-science-bowl-2018 with MIT License
def _use_stopwords(self, x):
        words = tokenizer.tokenize(x)
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example #25
Source File: text.py    From pliers with BSD 3-Clause "New" or "Revised" License
def _filter(self, stim):
        if self.tokenizer:
            tokens = self.tokenizer.tokenize(stim.text)
        else:
            tokens = word_tokenize(stim.text)
        stims = [TextStim(stim.filename, token, order=i)
                 for i, token in enumerate(tokens)]
        return stims 
Example #26
Source File: text.py    From pliers with BSD 3-Clause "New" or "Revised" License
def _filter(self, stim):
        pos_map = {
            'ADJ': 'a',
            'ADJ_SAT': 's',
            'ADV': 'r',
            'NOUN': 'n',
            'VERB': 'v'
        }

        def pos_wordnet(txt):
            pos_tagged = dict(nltk.pos_tag(txt, tagset='universal'))
            pos_tagged = {t: pos_map[tag] if tag in pos_map else 'n'
                          for t, tag in pos_tagged.items()}
            return pos_tagged

        tokens = [stim.text]
        if self.tokenize:
            tokens = nltk.word_tokenize(tokens[0])
        tokens = [t if self.case_sensitive else t.lower() for t in tokens]
        if not isinstance(self.stemmer, stem.WordNetLemmatizer):
            stemmed = ' '.join([self.stemmer.stem(t) for t in tokens])
        else:
            pos_tagged = pos_wordnet(tokens)
            stemmed = ' '.join([self.stemmer.lemmatize(t, pos=pos_tagged[t])
                                for t in tokens])
        return TextStim(stim.filename, stemmed, stim.onset, stim.duration,
                        stim.order, stim.url) 
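The universal-tagset mapping above exists to feed WordNet-style POS codes to the lemmatizer; a small standalone illustration (assuming the punkt, averaged_perceptron_tagger, universal_tagset and wordnet resources are downloaded):

import nltk
from nltk.stem import WordNetLemmatizer

tokens = nltk.word_tokenize("The cats were running")
tags = dict(nltk.pos_tag(tokens, tagset='universal'))
print(tags)   # e.g. {'The': 'DET', 'cats': 'NOUN', 'were': 'VERB', 'running': 'VERB'}

lem = WordNetLemmatizer()
print(lem.lemmatize('running', pos='v'))  # 'run'
print(lem.lemmatize('cats', pos='n'))     # 'cat'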
Example #27
Source File: text.py    From pliers with BSD 3-Clause "New" or "Revised" License
def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
                 *args, **kwargs):
        if isinstance(stemmer, str):
            if stemmer not in self._stemmers:
                valid = list(self._stemmers.keys())
                raise ValueError("Invalid stemmer '%s'; please use one of %s."
                                 % (stemmer, valid))
            stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
        elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
            raise ValueError("stemmer must be either a valid string, or an "
                             "instance of class StemmerI.")
        self.stemmer = stemmer
        self.tokenize = tokenize
        self.case_sensitive = case_sensitive
        super().__init__() 
Example #28
Source File: reader.py    From atap with Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files': n_fileids,
            'topics': n_topics,
            'paras': counts['paras'],
            'sents': counts['sents'],
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc': float(counts['paras']) / float(n_fileids),
            'sppar': float(counts['sents']) / float(counts['paras']),
            'secs': time.time() - started,
        } 
Example #29
Source File: reader.py    From atap with Apache License 2.0
def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.corpus.paras(fileids=fileids):
            yield [
                pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ] 
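The same segment/tokenize/tag pipeline can be tried on a single paragraph string (assuming punkt and averaged_perceptron_tagger are downloaded):

import nltk
from nltk import pos_tag

paragraph = "NLTK tags words. It is widely used."
tagged = [
    pos_tag(nltk.wordpunct_tokenize(sent))
    for sent in nltk.sent_tokenize(paragraph)
]
print(tagged)   # one list of (word, POS tag) pairs per sentence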
Example #30
Source File: s2v_encoder.py    From S2V with Apache License 2.0
def _tokenize(self, item):
    """Tokenizes an input string into a list of words."""
    tokenized = []
    for s in self._sentence_detector.tokenize(item):
      tokenized.extend(nltk.tokenize.word_tokenize(s))

    return tokenized
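A standalone equivalent of this helper, with the punkt sentence detector loaded explicitly from nltk_data (assumes the 'punkt' resource is downloaded), might look like this:

import nltk

sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def tokenize_to_words(item):
    # Sentence-split first, then word-tokenize each sentence, as in _tokenize() above.
    tokenized = []
    for s in sentence_detector.tokenize(item):
        tokenized.extend(nltk.tokenize.word_tokenize(s))
    return tokenized

print(tokenize_to_words("Skip-thought style encoders consume word lists. NLTK provides them."))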