# Copyright 2019 The ASReview Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy as np try: from gensim.utils import simple_preprocess from gensim.models.doc2vec import TaggedDocument except ImportError: raise ImportError("Install gensim package (`pip install gensim`) to use" " 'doc2vec' model.") from asreview.feature_extraction.base import BaseFeatureExtraction def _train_model(corpus, *args, **kwargs): import gensim model = gensim.models.doc2vec.Doc2Vec(*args, **kwargs) model.build_vocab(corpus) model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs) return model def _transform_text(model, corpus): X = [] for doc_id in range(len(corpus)): doc_vec = model.infer_vector(corpus[doc_id].words) X.append(doc_vec) return np.array(X) class Doc2Vec(BaseFeatureExtraction): """Base class for doc2vec feature extraction.""" name = "doc2vec" def __init__(self, *args, vector_size=40, epochs=33, min_count=1, n_jobs=1, window=7, dm_concat=0, dm=2, dbow_words=0, **kwargs): """Initialize the doc2vec model. Arguments --------- vector_size: int Output size of the vector. epochs: int Number of epochs to train the doc2vec model. min_count: int Minimum number of occurences for a word in the corpus for it to be included in the model. n_jobs: int Number of threads to train the model with. window: int Maximum distance over which word vectors influence each other. dm_concat: int Whether to concatenate word vectors or not. See paper for more detail. dm: int Model to use. 0: Use distribute bag of words (DBOW). 1: Use distributed memory (DM). 2: Use both of the above with half the vector size and concatenate them. dbow_words: int Whether to train the word vectors using the skipgram method. """ super(Doc2Vec, self).__init__(*args, **kwargs) self.vector_size = int(vector_size) self.epochs = int(epochs) self.min_count = int(min_count) self.n_jobs = int(n_jobs) self.window = int(window) self.dm_concat = int(dm_concat) self.dm = int(dm) self.dbow_words = int(dbow_words) self._model = None self._model_dm = None self._model_dbow = None def fit(self, texts): model_param = { "vector_size": self.vector_size, "epochs": self.epochs, "min_count": self.min_count, "workers": self.n_jobs, "window": self.window, "dm_concat": self.dm_concat, "dbow_words": self.dbow_words, } corpus = [TaggedDocument(simple_preprocess(text), [i]) for i, text in enumerate(texts)] # If self.dm is 2, train both models and concatenate the feature # vectors later. Resulting vector size should be the same. if self.dm == 2: model_param["vector_size"] = int(model_param["vector_size"]/2) self.model_dm = _train_model(corpus, **model_param, dm=1) self.model_dbow = _train_model(corpus, **model_param, dm=0) else: self.model = _train_model(corpus, **model_param, dm=self.dm) def transform(self, texts): corpus = [TaggedDocument(simple_preprocess(text), [i]) for i, text in enumerate(texts)] if self.dm == 2: X_dm = _transform_text(self.model_dm, corpus) X_dbow = _transform_text(self.model_dbow, corpus) X = np.concatenate((X_dm, X_dbow), axis=1) else: X = _transform_text(self.model, corpus) return X def full_hyper_space(self): from hyperopt import hp eps = 1e-7 hyper_space, hyper_choices = super(Doc2Vec, self).full_hyper_space() hyper_space.update({ "fex_vector_size": hp.quniform( "fex_vector_size", 31.5, 127.5-eps, 8), "fex_epochs": hp.quniform("fex_epochs", 20, 50, 1), "fex_min_count": hp.quniform("fex_min_count", 0.5, 2.499999, 1), "fex_window": hp.quniform("fex_window", 4.5, 9.4999999, 1), "fex_dm_concat": hp.randint("fex_dm_concat", 2), "fex_dm": hp.randint("fex_dm", 3), "fex_dbow_words": hp.randint("fex_dbow_words", 2), }) return hyper_space, hyper_choices