Python gensim.utils.simple_preprocess() Examples

The following are 11 code examples of gensim.utils.simple_preprocess(), drawn from open-source projects. Each example notes the project and source file it comes from, along with that project's license. You may also want to check out the other available functions and classes of the gensim.utils module.
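simple_preprocess() lowercases a document, tokenizes it into alphabetic tokens, and drops tokens shorter than min_len or longer than max_len characters (2 and 15 by default); passing deacc=True additionally removes accent marks. A minimal standalone call, for reference:

from gensim.utils import simple_preprocess

tokens = simple_preprocess("Hello there, General Kenobi!")
print(tokens)  # ['hello', 'there', 'general', 'kenobi']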
Example #1
Source File: doc2vec.py    From asreview with Apache License 2.0
def fit(self, texts):

        model_param = {
            "vector_size": self.vector_size,
            "epochs": self.epochs,
            "min_count": self.min_count,
            "workers": self.n_jobs,
            "window": self.window,
            "dm_concat": self.dm_concat,
            "dbow_words": self.dbow_words,
        }

        corpus = [TaggedDocument(simple_preprocess(text), [i])
                  for i, text in enumerate(texts)]

        # If self.dm is 2, train both models and concatenate the feature
        # vectors later. Resulting vector size should be the same.
        if self.dm == 2:
            model_param["vector_size"] = int(model_param["vector_size"]/2)
            self.model_dm = _train_model(corpus, **model_param, dm=1)
            self.model_dbow = _train_model(corpus, **model_param, dm=0)
        else:
            self.model = _train_model(corpus, **model_param, dm=self.dm) 
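The _train_model() helper is a private function defined elsewhere in asreview's doc2vec.py, not part of gensim. A minimal sketch of what such a helper could look like, assuming it simply builds and trains a gensim Doc2Vec model with the parameters passed in:

from gensim.models.doc2vec import Doc2Vec

def _train_model(corpus, **kwargs):
    # Hypothetical stand-in for asreview's private helper: build the vocabulary
    # over the tagged corpus, then train a Doc2Vec model with the given parameters.
    model = Doc2Vec(**kwargs)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model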
Example #2
Source File: transformations.py    From keras-pandas with MIT License
def fit(self, X, y=None):
        # Format text for processing, by creating a list of strings
        observations = self.prepare_input(X)

        # Preprocess & tokenize
        observations = list(map(lambda x: simple_preprocess(x), observations))

        # Generate embedding_sequence_length, if necessary
        if self.max_sequence_length is None:
            self.max_sequence_length = self.generate_embedding_sequence_length(observations)

        # Update index_lookup
        tokens = set()
        for observation in observations:
            tokens.update(observation)

        logging.debug('Fitting with tokens: {}'.format(tokens))

        current_max_index = max(self.token_index_lookup.values())
        index_range = range(current_max_index, len(tokens) + current_max_index)
        learned_token_index_lookup = dict(zip(tokens, index_range))
        self.token_index_lookup.update(learned_token_index_lookup)
        new_max_token_index = max(self.token_index_lookup.values())
        logging.info('Learned tokens, new_max_token_index: {}'.format(new_max_token_index))
        return self 
Example #3
Source File: parseundp.py    From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def read_corpus(path = '.', exclude = [], targets = None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file: # ensure file is an english txt file
            print(file)
            with open(os.path.join(path, file),  encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words: # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # keep the line only if fewer than half its words are that short and it has more than 10 words
                    if count < len(words)/2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
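A possible way to use this generator (the path and training parameters below are illustrative, not taken from the project):

from gensim.models import doc2vec

corpus = list(read_corpus(path='.'))  # materialise the TaggedDocument stream
model = doc2vec.Doc2Vec(corpus, vector_size=100, min_count=2, epochs=20)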
Example #4
Source File: CustomParVec.py    From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def inferVector1(self, line):
        '''
        Given a new line, infer a custom vector representation using the corpus tfidf.
 
        Args: 
            line : new sentence to be inferred

        Returns: 
            numpy.ndarray : vector representation of the line
        '''
        line = ' '.join(simple_preprocess(line)) # pre-process the line
        line_tf_idf = self.tf_idf_obj.transform([line]) # infer the tf-idf values for the words in the line
        rows, cols = line_tf_idf.nonzero()
        
        new_vec = np.zeros(self.dimensions)
        # Apply the same sentence to vector conversion as above. 
        for col in cols:
            try:
                new_vec += self.word2vec_model[self.word_index[col]] * line_tf_idf[0, col]
            except KeyError:  # skip words that are missing from the word2vec vocabulary
                continue
        return np.asarray(new_vec) 
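The method above implements a tf-idf-weighted sum of word vectors. A self-contained sketch of the same idea outside the CustomParVec class, assuming a fitted scikit-learn TfidfVectorizer and a gensim KeyedVectors model (the names below are placeholders):

import numpy as np
from gensim.utils import simple_preprocess

def tfidf_weighted_vector(line, tfidf, keyed_vectors, dimensions):
    # Sum the word vectors of the line's tokens, each weighted by its tf-idf score.
    weights = tfidf.transform([' '.join(simple_preprocess(line))])  # sparse 1 x |vocab| row
    vocab = tfidf.get_feature_names_out()
    vec = np.zeros(dimensions)
    for col in weights.nonzero()[1]:
        word = vocab[col]
        if word in keyed_vectors:  # skip words without a word vector
            vec += keyed_vectors[word] * weights[0, col]
    return vec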
Example #5
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def __iter__(self):
        with open(datapath('lee_background.cor')) as f:
            for line in f:
                yield utils.simple_preprocess(line) 
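This __iter__ belongs to a small corpus class in the test module (the surrounding class definition is not shown). A sketch of the same pattern as a standalone class, streamed straight into Word2Vec; the class name and training parameters here are illustrative:

from gensim import utils
from gensim.models import Word2Vec
from gensim.test.utils import datapath

class LineCorpus:
    """Stream a text file as one simple_preprocess'ed token list per line."""
    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        with open(self.fname) as f:
            for line in f:
                yield utils.simple_preprocess(line)

model = Word2Vec(LineCorpus(datapath('lee_background.cor')), vector_size=50, min_count=2)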
Example #6
Source File: transformations.py    From keras-pandas with MIT License
def process_string(self, input_string):
        """
        Turn a string into padded sequences, consistent with Keras's Embedding layer

         - Simple preprocess & tokenize
         - Convert tokens to indices
         - Pad sequence to be the correct length

        :param input_string: A string, to be converted into a padded sequence of token indices
        :type input_string: str
        :return: A padded, fixed-length array of token indices
        :rtype: [int]
        """
        logging.debug('Processing string: {}'.format(input_string))

        # Convert to tokens
        tokens = simple_preprocess(input_string)
        logging.debug('Tokens: {}'.format(tokens))

        # Convert to indices
        indices = list(map(lambda x: self.token_index_lookup[x], tokens))
        logging.debug('Indices: {}'.format(indices))

        # Pad indices
        padding_index = self.token_index_lookup['__PAD__']
        padding_length = self.max_sequence_length
        padded_indices = self.pad(indices, length=padding_length, pad_char=padding_index)
        logging.debug('Padded indices: {}'.format(padded_indices))

        return padded_indices 
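self.pad() is defined elsewhere in keras-pandas. A minimal sketch of one way such a helper could behave; the actual truncation and padding direction used by keras-pandas may differ:

def pad(indices, length, pad_char):
    # Hypothetical helper: truncate to at most `length` items, then left-pad with
    # `pad_char` so every sequence comes out exactly `length` items long.
    indices = list(indices)[:length]
    return [pad_char] * (length - len(indices)) + indices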
Example #7
Source File: CustomParVec.py    From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def inferVector2(self, line):
        '''
        Given a new line, infer a custom vector representation using the ground truth tfidf.
 
        Args: 
            line : new sentence to be inferred

        Returns: 
            numpy.ndarray : vector representation of the line
        '''
        line = ' '.join(simple_preprocess(line)) # pre-process the line
        
        replacement_words = []
        for word in line.split():
            if word not in self.extra_tf_idf_obj.vocabulary_:
                try:
                    similar_words = self.word2vec_model.similar_by_word(word, topn=10, restrict_vocab=None)
                    for sim in similar_words:
                        if sim[0] in self.extra_tf_idf_obj.vocabulary_:
                            replacement_words.append((word, sim[0]))
                            break
                except KeyError:  # word has no word2vec vector, so no replacement can be suggested
                    continue
                    
        for old, new in replacement_words:
            line = line.replace(old, new)
            
        line_tf_idf = self.extra_tf_idf_obj.transform([line]) # infer the tf-idf values for the words in the line
        rows, cols = line_tf_idf.nonzero()
        
        new_vec = np.zeros(self.dimensions)
        # Apply the same sentence to vector conversion as above. 
        for col in cols:
            try:
                new_vec += self.word2vec_model[self.extra_word_index[col]] * line_tf_idf[0, col]
            except KeyError:  # skip words that are missing from the word2vec vocabulary
                continue
                            
        return np.asarray(new_vec) 
Example #8
Source File: test_vec4ir.py    From vec4ir with MIT License
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1 
Example #9
Source File: test_vec4ir.py    From vec4ir with MIT License
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1 
Example #10
Source File: doc2vec.py    From asreview with Apache License 2.0
def transform(self, texts):
        corpus = [TaggedDocument(simple_preprocess(text), [i])
                  for i, text in enumerate(texts)]

        if self.dm == 2:
            X_dm = _transform_text(self.model_dm, corpus)
            X_dbow = _transform_text(self.model_dbow, corpus)
            X = np.concatenate((X_dm, X_dbow), axis=1)
        else:
            X = _transform_text(self.model, corpus)
        return X 
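Like _train_model() in Example #1, _transform_text() is a private helper in asreview's doc2vec.py. A hedged sketch, assuming it infers one vector per tagged document and stacks the results into a matrix:

import numpy as np

def _transform_text(model, corpus):
    # Hypothetical stand-in: infer a document vector for each TaggedDocument's
    # words and return an (n_docs, vector_size) array.
    return np.array([model.infer_vector(doc.words) for doc in corpus])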
Example #11
Source File: wordtwovec.py    From aristo-mini with Apache License 2.0
def tokenizer(sentence: str) -> List[str]:
    """use gensim's `simple_preprocess` and `STOPWORDS` list"""
    return [stem(token) for token in simple_preprocess(sentence) if token not in STOPWORDS]
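This snippet relies on imports from the surrounding module. A sketch of the imports it needs plus an illustrative call; the stem function here is assumed to be gensim's Porter stemmer, which may not match the one aristo-mini actually uses:

from typing import List

from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

stem = PorterStemmer().stem  # assumption: any word -> stem callable would work here

print(tokenizer("The scientists are running new experiments"))
# stopwords such as 'the' and 'are' are dropped; the remaining tokens are lowercased and stemmed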