Python nltk.wordpunct_tokenize() Examples

The following are 19 code examples of nltk.wordpunct_tokenize(), drawn from open-source projects. Each example is preceded by a reference to the original project and source file. You may also want to check out the other available functions and classes of the nltk module.
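Before the project examples, a minimal sketch of the tokenizer itself: wordpunct_tokenize() splits text with the regular expression \w+|[^\w\s]+, so runs of word characters and runs of punctuation come out as separate tokens.

from nltk import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']

# Contractions are split at the apostrophe, unlike word_tokenize().
print(wordpunct_tokenize("They'll come back."))
# ['They', "'", 'll', 'come', 'back', '.']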
Example #1
Source File: preprocess.py    From atap with Apache License 2.0
def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part-of-speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ] 
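For orientation, the pos_tag(wordpunct_tokenize(sent)) idiom used above turns one sentence into a list of (token, tag) pairs; a quick sketch (the exact tags depend on the tagger model you have installed):

from nltk import pos_tag, wordpunct_tokenize

pos_tag(wordpunct_tokenize("The cat sat."))
# Output will look something like:
# [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ('.', '.')]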
Example #2
Source File: helper.py    From transferable_sent2vec with MIT License
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list 
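A quick check of what this helper returns, assuming the module's usual imports (re, string, and nltk's wordpunct_tokenize): punctuation-only tokens are dropped and everything is lowercased.

tokenize_and_normalize("Hello, World !!")
# ['hello', 'world']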
Example #3
Source File: helper.py    From transferable_sent2vec with MIT License
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list 
Example #4
Source File: utils.py    From dialog with Apache License 2.0
def tokenize(sent):
    tokens = tokenizer.tokenize(sent)
    ret = []
    for t in tokens:
        if '<' not in t:
            ret.extend(wordpunct_tokenize(t))
        else:
            ret.append(t)
    return ret 
Example #5
Source File: cluster.py    From dialog with Apache License 2.0
def normalize(sent):
    return wordpunct_tokenize(sent.lower()) 
Example #6
Source File: natural_language.py    From luscan-devel with GNU General Public License v2.0
def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#------------------------------------------------------------------------------ 
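A usage sketch of my own (not from luscan-devel), assuming the function above is in scope and the NLTK stopwords corpus has been downloaded; note that the function as written targets Python 2 (it checks against basestring). The language with the highest stopword overlap is the best guess.

import nltk
nltk.download('stopwords')  # one-time download of the stopwords corpus

scores = calculate_language_scores("Ceci est un petit texte et il est ecrit en francais.")
best_guess = max(scores, key=scores.get)  # e.g. 'french' when French stopwords dominate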
Example #7
Source File: reader.py    From atap with Apache License 2.0
def tagged_tokens(self):
        for sent in self.sents():
            # pos_tag expects a list of tokens, so tag the whole sentence at
            # once and yield the individual (token, tag) pairs.
            for token in nltk.pos_tag(nltk.wordpunct_tokenize(sent)):
                yield token
Example #8
Source File: reader.py    From atap with Apache License 2.0
def words(self):
        """
        Returns a generator of words.
        """
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word 
Example #9
Source File: am_reader.py    From atap with Apache License 2.0
def words(self):
        """
        Returns a generator of words.
        """
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word 
Example #10
Source File: features.py    From product-classifier with MIT License
def tokenize(self, text):
        """
        Returns a list of individual tokens from the text using NLTK's
        built-in tokenizer (far better than splitting on whitespace). It
        also removes any stopwords and punctuation from the text, and
        ensures that every token is normalized.

        For now, token = word as in bag of words (the feature we're using).
        """
        for token in wordpunct_tokenize(text):
            token = self.normalize(token)
            if token in self.punctuation: continue
            if token in self.stopwords: continue
            yield token 
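The same filtering pattern works outside a class; a minimal standalone sketch (the names stop_set, punct_set, and clean_tokens are mine, and normalization here is just lowercasing), assuming the stopwords corpus is available:

import string

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

stop_set = set(stopwords.words('english'))
punct_set = set(string.punctuation)

def clean_tokens(text):
    # Lowercase each token and drop stopwords and bare punctuation.
    for token in wordpunct_tokenize(text):
        token = token.lower()
        if token in punct_set or token in stop_set:
            continue
        yield token

list(clean_tokens("The quick, brown fox!"))
# ['quick', 'brown', 'fox']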
Example #11
Source File: reader.py    From atap with Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        started = time.time()

        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in sent_tokenize(para):
                counts['sents'] += 1

                for word in wordpunct_tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        } 
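The 'lexdiv' entry is just the token count divided by the vocabulary size; the same arithmetic on a toy string:

from nltk import FreqDist, wordpunct_tokenize

tokens = wordpunct_tokenize("the cat sat on the mat")   # 6 tokens
vocab  = FreqDist(tokens)                               # 5 distinct types ('the' repeats)
lexdiv = float(len(tokens)) / float(len(vocab))         # 6 / 5 = 1.2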
Example #12
Source File: reader.py    From atap with Apache License 2.0
def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.paras(fileids=fileids):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ] 
Example #13
Source File: reader.py    From atap with Apache License 2.0
def words(self, fileids=None, categories=None):
        """
        Uses the built-in word tokenizer to extract tokens from sentences.
        Note that this method uses BeautifulSoup to parse HTML content.
        """
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token 
Example #14
Source File: reader.py    From atap with Apache License 2.0
def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.corpus.paras(fileids=fileids):
            yield [
                pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ] 
Example #15
Source File: reader.py    From atap with Apache License 2.0
def words(self, fileids=None, categories=None):
        """
        Uses the built-in word tokenizer to extract tokens from sentences.
        Note that this method uses BeautifulSoup to parse HTML content.
        """
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token 
Example #16
Source File: recommender.py    From atap with Apache License 2.0
def query(self, terms):
        """
        Given input list of ingredient terms,
        return the k closest matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        vect_doc = self.transformer.named_steps['transform'].fit_transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0] 
Example #17
Source File: recommender.py    From atap with Apache License 2.0
def recommend(self, terms):
        """
        Given input list of ingredient terms,
        return the k closest matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        vect_doc = self.vect.transform(wordpunct_tokenize(terms))
        distance_matches = self.knn.transform(vect_doc)
        # the result is a list with a 2-tuple of arrays
        matches = distance_matches[0][1][0]
        # the matches are the indices of documents
        return matches 
Example #18
Source File: parse.py    From atap with Apache License 2.0
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens) 
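parse() depends on a grammar object defined elsewhere in parse.py; a self-contained sketch with a toy grammar of my own shows the full round trip from raw sentence to parse tree.

import nltk

# Toy grammar for illustration only; the project's real `grammar` lives in parse.py.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> DT NN
VP -> VB NP
DT -> 'the'
NN -> 'dog' | 'ball'
VB -> 'chased'
""")

parser = nltk.ChartParser(grammar)
tokens = nltk.wordpunct_tokenize("the dog chased the ball")
for tree in parser.parse(tokens):
    print(tree)
# (S (NP (DT the) (NN dog)) (VP (VB chased) (NP (DT the) (NN ball))))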
Example #19
Source File: natural_language.py    From luscan-devel with GNU General Public License v2.0
def get_words(text, min_length = None, max_length = None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required by each token. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed by each token. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)

    :raises: TypeError, ValueError
    """
    if min_length is not None:
        if not isinstance(min_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif min_length < 0:
            raise ValueError("Min length must be greater than 0, got %s." % min_length)

    if max_length is not None:
        if not isinstance(max_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif max_length < 0:
            raise ValueError("Min length must be greater than 0, got %s" % max_length)

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower() for word in wordpunct_tokenize(text) if
        (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }


#------------------------------------------------------------------------------
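A quick check of get_words() behaviour (my own example, not from the project): only purely alphabetic tokens survive, lowercased and length-filtered.

get_words("The quick brown fox's 2 jumps!", min_length=4)
# {'quick', 'brown', 'jumps'}   ('the' and 'fox' are too short; '2' and '!' are not alphabetic)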