Python gensim.utils.tokenize() Examples

The following are 30 code examples of gensim.utils.tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.utils, or try the search function.
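Before the examples, here is a minimal sketch of what gensim.utils.tokenize() itself does: it lazily yields unicode tokens from a piece of text, with optional lowercasing and accent removal. The sample sentence and the printed result below are illustrative only.

from gensim import utils

text = "Human Déjà-vu: 2 cats & 10 dogs."
# tokenize() returns a generator, so wrap it in list() to materialize the tokens
tokens = list(utils.tokenize(text, lowercase=True, deacc=True, errors='ignore'))
print(tokens)
# expected (approximately): ['human', 'deja', 'vu', 'cats', 'dogs']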
Example #1
Source File: textcorpus.py    From topical_word_embeddings with MIT License    6 votes
def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        lineno = -1
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno,)
                else:
                    yield utils.tokenize(line, lowercase=True)
            self.length = lineno + 1 # will be 0 if loop never executes 
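As a rough illustration of how a get_texts() stream like the one above is typically consumed, the following self-contained sketch (the sample lines are invented) tokenizes one document per line with utils.tokenize and feeds the result to a gensim Dictionary:

from gensim import corpora, utils

# hypothetical in-memory corpus: one document per line, standing in for getstream()
lines = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
]

# same tokenization as get_texts() above, materialized into lists
texts = [list(utils.tokenize(line, lowercase=True)) for line in lines]

dictionary = corpora.Dictionary(texts)                    # word <-> id mapping
bows = [dictionary.doc2bow(tokens) for tokens in texts]   # documents as bag-of-words
print(dictionary)
print(bows[0])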
Example #2
Source File: textcorpus.py    From topical_word_embeddings with MIT License    6 votes
def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        lineno = -1
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno,)
                else:
                    yield utils.tokenize(line, lowercase=True)
            self.length = lineno + 1 # will be 0 if loop never executes 
Example #3
Source File: text_utils.py    From pytorch-widedeep with MIT License    6 votes
from typing import List

from gensim.utils import tokenize


def simple_preprocess(
    doc: str,
    lower: bool = False,
    deacc: bool = False,
    min_len: int = 2,
    max_len: int = 15,
) -> List[str]:
    r"""
    Gensim's simple_preprocess with an added 'lower' param to indicate whether or
    not to lowercase all the tokens in the texts.

    For more information see: https://radimrehurek.com/gensim/utils.html
    """
    tokens = [
        token
        # forward the caller's 'lower' flag to gensim's tokenize
        for token in tokenize(doc, lower=lower, deacc=deacc, errors="ignore")
        if min_len <= len(token) <= max_len and not token.startswith("_")
    ]
    return tokens
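A quick, made-up call to the wrapper above might look like this (the printed result is approximate):

doc = "The Quick BROWN Fox jumped over 2 lazy dogs!"
print(simple_preprocess(doc, lower=True))
# expected (approximately): ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'lazy', 'dogs']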
Example #4
Source File: textcorpus.py    From topical_word_embeddings with MIT License    6 votes
def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        lineno = -1
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno,)
                else:
                    yield utils.tokenize(line, lowercase=True)
            self.length = lineno + 1 # will be 0 if loop never executes 
Example #5
Source File: textcorpus.py    From topical_word_embeddings with MIT License    6 votes
def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        lineno = -1
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno,)
                else:
                    yield utils.tokenize(line, lowercase=True)
            self.length = lineno + 1 # will be 0 if loop never executes 
Example #6
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def characters(self, text):
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] == 'p':
                tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()]
                self.tokens.extend(tokens)
    #endclass ArxmlivHandler 
Example #7
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        raise NotImplementedError('Abstract Base Class') 
Example #8
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] 
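For orientation, the comprehension used in the override above can be exercised directly; the input string is invented and the printed result is approximate:

from gensim import utils

content = "Version 2 of the parser handles 1000 documents"
print([token.encode('utf8')
       for token in utils.tokenize(content, errors='ignore')
       if not token.isdigit()])
# expected (approximately): [b'Version', b'of', b'the', b'parser', b'handles', b'documents']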
Example #9
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def characters(self, text):
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] == 'p':
                tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()]
                self.tokens.extend(tokens)
    #endclass ArxmlivHandler 
Example #10
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        """
        Parse tokens out of xml. There are two types of token: normal text and
        mathematics. Both are returned interspersed in a single list, in the same
        order as they appeared in the content.

        The math tokens will be returned in the form $tex_expression$, i.e. with
        a dollar sign prefix and suffix.
        """
        handler = ArxmlivSource.ArxmlivContentHandler()
        xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler())
        return handler.tokens 
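To see how the handler-based examples on this page fit together, here is a minimal, self-contained sketch. The class name, tag names and sample XML are simplified stand-ins for the original ArxmlivContentHandler, not the project's actual implementation; it only shows the idea of collecting utils.tokenize output from <p> elements:

import xml.sax

from gensim import utils


class ParagraphHandler(xml.sax.ContentHandler):
    """Collect tokens that appear directly inside <p> elements."""

    def __init__(self):
        super().__init__()
        self.path = []      # stack of currently open tags
        self.tokens = []

    def startElement(self, name, attrs):
        self.path.append(name)

    def endElement(self, name):
        self.path.pop()

    def characters(self, text):
        # only tokenize character data whose innermost enclosing tag is <p>
        if self.path and self.path[-1] == 'p':
            self.tokens.extend(
                token for token in utils.tokenize(text, errors='ignore')
                if not token.isdigit()
            )


content = b"<doc><title>skip this 123</title><p>Hello tokenized world 42</p></doc>"
handler = ParagraphHandler()
xml.sax.parseString(content, handler)
print(handler.tokens)   # expected (approximately): ['Hello', 'tokenized', 'world']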
Example #11
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid 
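A hypothetical call to process_article above (the wiki markup, title and page id are invented, and lemmatize is off so the tokenize() branch is taken); it assumes filter_wiki and tokenize from the same wikicorpus module are in scope:

raw = "'''Anarchism''' is a [[political philosophy]] that rejects [[hierarchy|hierarchies]]."
tokens, title, pageid = process_article((raw, False, "Anarchism", 12))
print(title, pageid, tokens[:3])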
Example #12
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        raise NotImplementedError('Abstract Base Class') 
Example #13
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] 
Example #14
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] 
Example #15
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        """
        Parse tokens out of xml. There are two types of token: normal text and
        mathematics. Both are returned interspersed in a single list, in the same
        order as they appeared in the content.

        The math tokens will be returned in the form $tex_expression$, i.e. with
        a dollar sign prefix and suffix.
        """
        handler = ArxmlivSource.ArxmlivContentHandler()
        xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler())
        return handler.tokens 
Example #16
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Return a list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
    than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')] 
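A short, made-up call to the tokenize() helper above illustrates the utf8 bytestring output and the 2-15 character length filter (the printed result is approximate):

content = "a Stochastic gradient descent converges to extraordinarilylongtokens eventually"
print(tokenize(content))
# expected (approximately):
# [b'stochastic', b'gradient', b'descent', b'converges', b'to', b'eventually']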
Example #17
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid 
Example #18
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        raise NotImplementedError('Abstract Base Class') 
Example #19
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] 
Example #20
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def characters(self, text):
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] == 'p':
                tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()]
                self.tokens.extend(tokens)
    #endclass ArxmlivHandler 
Example #21
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Return a list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
    than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')] 
Example #22
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid 
Example #23
Source File: shakescorpus.py    From twip with MIT License    5 votes
def tokenize(self, line, **kwargs):
        return list(utils.tokenize(line, **kwargs)) 
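Since the wrapper above simply forwards its keyword arguments to utils.tokenize and materializes the generator, it behaves roughly like the following standalone sketch (the quoted line is illustrative):

from gensim import utils

line = "Shall I compare thee to a summer's day?"
gen = utils.tokenize(line, lowercase=True)   # generator: can only be consumed once
tokens = list(gen)                           # the wrapper returns this list form instead
print(len(tokens), tokens[:4])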
Example #24
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        raise NotImplementedError('Abstract Base Class') 
Example #25
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def characters(self, text):
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] == 'p':
                tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()]
                self.tokens.extend(tokens)
    #endclass ArxmlivHandler 
Example #26
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        """
        Parse tokens out of xml. There are two types of token: normal text and
        mathematics. Both are returned interspersed in a single list, in the same
        order as they appeared in the content.

        The math tokens will be returned in the form $tex_expression$, i.e. with
        a dollar sign prefix and suffix.
        """
        handler = ArxmlivSource.ArxmlivContentHandler()
        xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler())
        return handler.tokens 
Example #27
Source File: wikicorpus.py    From topical_word_embeddings with MIT License    5 votes
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid 
Example #28
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        raise NotImplementedError('Abstract Base Class') 
Example #29
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def tokenize(self, content):
        return [token.encode('utf8') for token in utils.tokenize(content, errors = 'ignore') if not token.isdigit()] 
Example #30
Source File: sources.py    From topical_word_embeddings with MIT License    5 votes
def characters(self, text):
            # for text, we only care about tokens directly within the <p> tag
            if self.path[-1] == 'p':
                tokens = [token.encode('utf8') for token in utils.tokenize(text, errors = 'ignore') if not token.isdigit()]
                self.tokens.extend(tokens)
    #endclass ArxmlivHandler