Python nltk.wordpunct_tokenize() Examples

The following code examples show how to use nltk.wordpunct_tokenize(). They are taken from open source Python projects.
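
For orientation, wordpunct_tokenize() tokenizes text with the regular expression \w+|[^\w\s]+, so runs of alphanumeric characters and runs of punctuation become separate tokens. A minimal, self-contained demonstration:

from nltk.tokenize import wordpunct_tokenize

# Words and punctuation are returned as separate tokens; "$3.88" is split apart.
print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']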

Example 1
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma 
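
The tokenizer above delegates to a self.lemmatize helper that is not shown. A minimal sketch of what such a helper might look like, assuming it maps Penn Treebank tags onto WordNet parts of speech before calling NLTK's WordNetLemmatizer (an assumption, not this project's exact code):

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize(token, tag, lemmatizer=WordNetLemmatizer()):
    # Map the first letter of the Penn Treebank tag to a WordNet POS,
    # defaulting to noun for unfamiliar tags (an assumption).
    pos = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[0], wn.NOUN)
    return lemmatizer.lemmatize(token, pos)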
Example 2
Project: Molly   Author: Sulstice   File: app.py    Mozilla Public License 2.0
def get_message(facebook_message):


    # First process the message
    message_with_punctuation = facebook_message
    table = str.maketrans(dict.fromkeys(string.punctuation))
    stripped_punctuation_message = message_with_punctuation.translate(table)

    import nltk
    words = set(nltk.corpus.words.words())

    # Treat any token that is not an English dictionary word as the protein code
    for word in nltk.wordpunct_tokenize(stripped_punctuation_message):
        if word.lower() in words:
            continue
        else:
            protein_code = word

    pdb_file = get_pdb_file(protein_code, filetype='pdb', compression=False)

    return "Here is your protein %s" % pdb_file 
Example 3
Project: pdf-hacks   Author: AnnaMag   File: language.py    BSD 2-Clause "Simplified" License
def get_languages(text):
    '''
    nltk.wordpunct_tokenize() splits all punctuation into separate tokens

    >>> wordpunct_tokenize("My name's Anna.End.")
    ['My', 'name', "'", 's', 'Anna', '.', 'End', '.']
    '''
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the unique stopwords appearing in the analyzed text (Afrikaans text may be scored as Dutch)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios 
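
A hypothetical way to use the returned ratios is to pick the language with the highest stopword count:

ratios = get_languages("This is a short English sentence with some common words.")
most_likely = max(ratios, key=ratios.get)
print(most_likely)  # expected to be 'english' for this input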
Example 4
Project: honours_project   Author: JFriel   File: NLTKPreprocessor.py    GNU General Public License v3.0
def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma 
Example 5
Project: trolling_detection   Author: rafaharo   File: feature_extraction.py    Apache License 2.0
def tokenize_document(document, lowercase=True, stopwords=None, min_length=3):
    import nltk
    if not document or len(document) == 0:
        raise ValueError("Can't tokenize null or empty texts")

    if lowercase:
        document = document.lower()

    tokens = nltk.wordpunct_tokenize(document)

    if stopwords and isinstance(stopwords, str):
        stops = set(nltk.corpus.stopwords.words(stopwords))
    elif stopwords and isinstance(stopwords, list):
        stops = set(stopwords)
    else:
        stops = set()

    result = [token for token in tokens if token not in stops and len(token) >= min_length]
    return result 
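
For example, with NLTK's English stopword list the call below should drop stopwords and keep the remaining tokens of length three or more (illustrative output):

tokens = tokenize_document("The quick brown fox jumps over the lazy dog", stopwords='english')
print(tokens)
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']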
Example 6
Project: product-classifier   Author: georgetown-analytics   File: features.py    MIT License
def tokenize(self, text):
        """
        Returns a list of individual tokens from the text using NLTK's
        built-in tokenization utility (far better than splitting on spaces).
        It also removes any stopwords and punctuation from the text and
        ensures that every token is normalized.

        For now, token = word as in bag of words (the feature we're using).
        """
        for token in wordpunct_tokenize(text):
            token = self.normalize(token)
            if token in self.punctuation: continue
            if token in self.stopwords: continue
            yield token 
Example 7
Project: domain_discovery_API   Author: VIDA-NYU   File: concat_nltk.py    GNU General Public License v3.0
def get_language(text):
    words = set(nltk.wordpunct_tokenize(text.lower()))
    return max(((lang, len(words & stopwords)) for lang, stopwords in STOPWORDS_DICT.items()), key = lambda x: x[1])[0] 
Example 8
Project: domain_discovery_API   Author: VIDA-NYU   File: concat_nltk.py    GNU General Public License v3.0
def is_english(text):
    text = text.lower()
    words = set(nltk.wordpunct_tokenize(text))
    return len(words & ENGLISH_STOPWORDS) > len(words & NON_ENGLISH_STOPWORDS) 
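
Both concat_nltk.py snippets above rely on module-level constants that are not shown. One plausible way to build them from NLTK's stopword corpus (an assumption about the project's setup, not its exact code):

from nltk.corpus import stopwords

# One stopword set per language shipped with NLTK.
STOPWORDS_DICT = {lang: set(stopwords.words(lang)) for lang in stopwords.fileids()}
ENGLISH_STOPWORDS = STOPWORDS_DICT['english']
NON_ENGLISH_STOPWORDS = set().union(
    *(words for lang, words in STOPWORDS_DICT.items() if lang != 'english')
)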
Example 9
Project: deep-nlp   Author: alfredolainez   File: language.py    Apache License 2.0
def _calculate_languages_ratios(text):
    """
    Score how likely the given text is to be written in each language known to
    NLTK and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of its unique stopwords seen in the analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    '''
    nltk.wordpunct_tokenize() splits all punctuation into separate tokens
    
    >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    '''

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios


#---------------------------------------------------------------------- 
Example 10
Project: twitter_nltk_volkswagen   Author: edlectrico   File: language_detector.py    Apache License 2.0
def _calculate_languages_ratios(text):
    text = str(text)  # ensure we are working with a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios 
Example 11
Project: minke   Author: DistrictDataLabs   File: normalize.py    MIT License
def tokenize(self, text):
        """
        Performs tokenization in addition to normalization.
        """
        return self.normalize(nltk.wordpunct_tokenize(text)) 
Example 12
Project: content   Author: demisto   File: WordTokenizeTest.py    MIT License
def tokenize_text(text):
    if not text:
        return ''
    text = text.lower()
    if TOKENIZE_TYPE == 'word':
        word_tokens = nltk.word_tokenize(text)
    elif TOKENIZE_TYPE == 'punkt':
        word_tokens = nltk.wordpunct_tokenize(text)
    else:
        raise Exception("Unsupported tokenize type: %s" % TOKENIZE_TYPE)
    if HASH_SEED:
        word_tokens = map(str, map(lambda x: hash_djb2(x, int(HASH_SEED)), word_tokens))
    return (' '.join(word_tokens)).strip() 
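
The snippet calls hash_djb2, which is defined elsewhere in the project. A djb2-style hash with a configurable seed could be sketched as follows (an assumption, not the project's exact implementation):

def hash_djb2(s, seed=5381):
    # Classic djb2: hash = hash * 33 + ord(c), starting from the given seed.
    hash_value = seed
    for c in s:
        hash_value = ((hash_value << 5) + hash_value) + ord(c)
    return hash_value & 0xFFFFFFFF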
Example 13
Project: atap   Author: foxbook   File: parse.py    Apache License 2.0
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens) 
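
The parse() function assumes a grammar object defined elsewhere in the chapter. A toy context-free grammar is enough to exercise it (purely illustrative, not the book's actual grammar):

import nltk

grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> DT N
    VP -> V NP
    DT -> 'the'
    N -> 'dog' | 'ball'
    V -> 'chased'
""")

for tree in parse("the dog chased the ball"):
    print(tree)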
Example 14
Project: atap   Author: foxbook   File: recommender.py    Apache License 2.0
def recommend(self, terms):
        """
        Given an input string of ingredient terms,
        return the k closest matching recipes.

        :param terms: string of ingredient terms
        :return: list of indices of the matching documents
        """
        vect_doc = self.vect.transform(wordpunct_tokenize(terms))
        distance_matches = self.knn.transform(vect_doc)
        # the result is a list with a 2-tuple of arrays
        matches = distance_matches[0][1][0]
        # the matches are the indices of documents
        return matches 
Example 15
Project: atap   Author: foxbook   File: recommender.py    Apache License 2.0
def query(self, terms):
        """
        Given an input string of ingredient terms,
        return the k closest matching recipes.

        :param terms: string of ingredient terms
        :return: list of indices of the matching documents
        """
        vect_doc = self.transformer.named_steps['transform'].fit_transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0] 
Example 16
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def words(self, fileids=None, categories=None):
        """
        Uses the built in word tokenizer to extract tokens from sentences.
        Note that this method uses BeautifulSoup to parse HTML content.
        """
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token 
Example 17
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.corpus.paras(fileids=fileids):
            yield [
                pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ] 
Example 18
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def words(self, fileids=None, categories=None):
        """
        Uses the built in word tokenizer to extract tokens from sentences.
        Note that this method uses BeautifulSoup to parse HTML content.
        """
        for sentence in self.sents(fileids, categories):
            for token in wordpunct_tokenize(sentence):
                yield token 
Example 19
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.paras(fileids=fileids):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ] 
Example 20
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        started = time.time()

        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in sent_tokenize(para):
                counts['sents'] += 1

                for word in wordpunct_tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        } 
Example 21
Project: atap   Author: foxbook   File: preprocess.py    Apache License 2.0
def tokenize(self, fileid):
        """
        Segments, tokenizes, and tags a document in the corpus. Returns a
        generator of paragraphs, which are lists of sentences, which in turn
        are lists of part of speech tagged words.
        """
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(paragraph)
            ] 
Example 22
Project: atap   Author: foxbook   File: am_reader.py    Apache License 2.0
def words(self):
        """
        Returns a generator of words.
        """
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word 
Example 23
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def words(self):
        """
        Returns a generator of words.
        """
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word 
Example 24
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def tagged_tokens(self):
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                # pos_tag expects a list of tokens; tag the single word and
                # yield the resulting (word, tag) pair
                yield nltk.pos_tag([word])[0]
Example 25
Project: luscan-devel   Author: blackye   File: natural_language.py    GNU General Public License v2.0
def calculate_language_scores(text):
    """
    Score how likely the given text is to be written in each language known to
    NLTK and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary mapping each language to the number of its unique stopwords seen in the analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#------------------------------------------------------------------------------ 
Example 26
Project: nltk_sentiment_analysis   Author: edlectrico   File: language_detector.py    GNU General Public License v2.0
def _calculate_languages_ratios(text):
    text = str(text)  # ensure we are working with a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios 
Example 27
Project: twitter-gen-classifier-pt   Author: mvicente93   File: pre_processing.py    MIT License
def tokenize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(string.lower().strip())
    else:
        return nltk.wordpunct_tokenize(string.strip()) 
Example 28
Project: twitter-gen-classifier-pt   Author: mvicente93   File: pre_processing.py    MIT License
def tokenize_and_normalize(string, lower=True):
    if lower:
        return nltk.wordpunct_tokenize(normalize(string).lower().strip())
    else:
        return nltk.wordpunct_tokenize(normalize(string).strip()) 
Example 29
Project: tagbot   Author: emre   File: utils.py    MIT License
def tokenize(text):
    porter = PorterStemmer()
    words = wordpunct_tokenize(text)
    stemmed = [porter.stem(word) for word in words]
    return [
        word.lower()
        for word in stemmed
        if word.isalpha()
        and len(word) > 2
    ] 
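
A hypothetical call showing how stemming and the alphabetic/length filters interact (exact stems depend on the Porter implementation):

print(tokenize("Running dogs barked!"))
# roughly ['run', 'dog', 'bark']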
Example 30
Project: dialog   Author: edward-zhu   File: cluster.py    Apache License 2.0
def normalize(sent):
    return wordpunct_tokenize(sent.lower()) 
Example 31
Project: dialog   Author: edward-zhu   File: utils.py    Apache License 2.0
def tokenize(sent):
    tokens = tokenizer.tokenize(sent)
    ret = []
    for t in tokens:
        if '<' not in t:
            ret.extend(wordpunct_tokenize(t))
        else:
            ret.append(t)
    return ret 
Example 32
Project: transferable_sent2vec   Author: wasiahmad   File: helper.py    MIT License
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list 
Example 33
Project: transferable_sent2vec   Author: wasiahmad   File: helper.py    MIT License
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([x for x in tokens if not re.fullmatch('[' + string.punctuation + ']+', x)])
    return token_list 
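
For example, punctuation-only tokens produced by wordpunct_tokenize are filtered out while word tokens are kept (illustrative call):

print(tokenize_and_normalize("Hello, world!!"))
# ['hello', 'world']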
Example 34
Project: luscan-devel   Author: blackye   File: natural_language.py    GNU General Public License v2.0
def get_words(text, min_length = None, max_length = None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required for each token. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed for each token. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)

    :raises: TypeError, ValueError
    """
    if min_length is not None:
        if not isinstance(min_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif min_length < 0:
            raise ValueError("Min length must be greater than 0, got %s." % min_length)

    if max_length is not None:
        if not isinstance(max_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif max_length < 0:
            raise ValueError("Min length must be greater than 0, got %s" % max_length)

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower() for word in wordpunct_tokenize(text) if
        (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }


#------------------------------------------------------------------------------
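
A hypothetical call illustrating the filters: non-alphabetic tokens and tokens shorter than min_length are dropped, and the result is a set of lowercase words (set ordering may vary):

print(get_words("Python 3 is a great language!", min_length=3))
# {'python', 'great', 'language'}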