Python sklearn.mixture.GaussianMixture() Examples

The following are 28 code examples showing how to use sklearn.mixture.GaussianMixture(), extracted from open source projects. The project, author, source file and license are listed above each example.
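
As a quick orientation before the project examples, here is a minimal, self-contained sketch of the typical fit/predict workflow (the synthetic data and parameter values are illustrative only, not taken from any project below):

import numpy as np
from sklearn.mixture import GaussianMixture

# two well-separated blobs of synthetic 2-D data
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, size=(100, 2)),
               rng.normal(5, 1, size=(100, 2))])

gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(X)                    # EM estimation of weights_, means_ and covariances_
labels = gmm.predict(X)       # hard cluster assignments
probs = gmm.predict_proba(X)  # soft (posterior) assignments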

Example 1
Project: adversarial-policies   Author: HumanCompatibleAI   File: fit_density.py    License: MIT License
def gen_exp_name(model_class, model_kwargs):
    """Generates experiment name from model class and parameters.

    :param model_class: (type) the class, one of GaussianMixture, PCAPreDensity or KernelDensity.
    :param model_kwargs: (dict) constructor arguments to the class.
    :return: A string succinctly encoding the class and parameters."""
    if model_class == GaussianMixture:
        n_components = model_kwargs.get("n_components", 1)
        covariance_type = model_kwargs.get("covariance_type", "full")
        return f"gmm_{n_components}_components_{covariance_type}"
    elif model_class == PCAPreDensity:
        if model_kwargs["density_class"] == KernelDensity:
            return "pca_kde"
        elif model_kwargs["density_class"] == GaussianMixture:
            return "pca_gmm"
        else:
            return "pca_unknown"
    elif model_class == KernelDensity:
        return "kde"
    else:
        return "default" 
Example 2
Project: scVI   Author: YosefLab   File: posterior.py    License: MIT License
def clustering_scores(self, prediction_algorithm: str = "knn") -> Tuple:
        if self.gene_dataset.n_labels > 1:
            latent, _, labels = self.get_latent()
            if prediction_algorithm == "knn":
                labels_pred = KMeans(
                    self.gene_dataset.n_labels, n_init=200
                ).fit_predict(
                    latent
                )  # n_jobs>1 ?
            elif prediction_algorithm == "gmm":
                gmm = GMM(self.gene_dataset.n_labels)
                gmm.fit(latent)
                labels_pred = gmm.predict(latent)

            asw_score = silhouette_score(latent, labels)
            nmi_score = NMI(labels, labels_pred)
            ari_score = ARI(labels, labels_pred)
            uca_score = unsupervised_clustering_accuracy(labels, labels_pred)[0]
            logger.debug(
                "Clustering Scores:\nSilhouette: %.4f\nNMI: %.4f\nARI: %.4f\nUCA: %.4f"
                % (asw_score, nmi_score, ari_score, uca_score)
            )
            return asw_score, nmi_score, ari_score, uca_score 
Example 3
Project: geosketch   Author: brianhie   File: differential_entropies.py    License: MIT License
def differential_entropies(X, labels):
    n_samples, n_features = X.shape
    
    labels = np.array(labels)
    names = sorted(set(labels))

    entropies = []
    
    for name in names:
        name_idx = np.where(labels == name)[0]

        gm = GaussianMixture().fit(X[name_idx, :])

        mn = multivariate_normal(
            mean=gm.means_.flatten(),
            cov=gm.covariances_.reshape(n_features, n_features)
        )

        entropies.append(mn.entropy())

    probs = softmax(entropies)

    for name, entropy, prob in zip(names, entropies, probs):
        #print('{}\t{}\t{}'.format(name, entropy, prob))
        print('{}\t{}'.format(name, entropy)) 
Example 4
Project: Speech_Signal_Processing_and_Classification   Author: gionanide   File: gmm.py    License: MIT License
def determineComponents(data):
	X,Y = preparingData(data)
	n_components = np.arange(1,10)
	bic = np.zeros(n_components.shape)

	for i,n in enumerate(n_components):
		#fit gmm to data for each value of components
		gmm = GaussianMixture(n_components=n,max_iter=200, covariance_type='diag' ,n_init=3)
		gmm.fit(X)
		#store BIC scores
		bic[i] = gmm.bic(X)

	#The Bayesian Information Criterion (BIC) is a cost function with two terms:
	#1) the negative log-likelihood and 2) a model-complexity penalty. BIC favours
	#models that fit well while staying simple, so the model with the smallest BIC wins.
	#plot the results
	plt.plot(n_components, bic)
	plt.show() 
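
The loop above only plots the BIC curve. As a minimal follow-up sketch (reusing the X, bic and n_components variables from the function above, which are assumptions about the surrounding file), the component count with the lowest BIC can be selected and refit:

best_n = n_components[np.argmin(bic)]   # the model with the smallest BIC is preferred
best_gmm = GaussianMixture(n_components=best_n, max_iter=200, covariance_type='diag', n_init=3).fit(X)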
Example 5
Project: ImageSetCleaner   Author: GuillaumeErhard   File: predicting.py    License: GNU General Public License v3.0
def detection_with_gaussian_mixture(image_set):
    """

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector
    """

    # Might achieve better results by initializing weights or means, since we know when we introduce noisy labels
    clf = mixture.GaussianMixture(n_components=2)

    clf.fit(image_set)

    predictions = clf.predict(image_set)
    predictions = normalize_predictions(predictions)

    return predictions 
Example 6
Project: platform-resource-manager   Author: intel   File: gmmfense.py    License: Apache License 2.0
def __init__(self, data, max_mixture=10, threshold=0.1):
        """
        Class constructor, arguments include:
            data - data to build GMM model
            max_mixture - max number of Gaussian mixtures
            threshold - probability threshold to determine the fence
        """
        self.data = data
        self.thresh = threshold
        lowest_bic = np.infty
        components = 1
        bic = []
        n_components_range = range(1, max_mixture + 1)
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          random_state=1005)
            gmm.fit(data)
            bic.append(gmm.bic(data))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
                components = n_components
        log.debug('best gmm components number: %d, bic %f ', components, lowest_bic)
        self.gmm = best_gmm 
Example 7
Project: kenchi   Author: Y-oHr-N   File: statistical.py    License: BSD 3-Clause "New" or "Revised" License
def _fit(self, X):
        self.estimator_     = GaussianMixture(
            covariance_type = self.covariance_type,
            init_params     = self.init_params,
            max_iter        = self.max_iter,
            means_init      = self.means_init,
            n_components    = self.n_components,
            n_init          = self.n_init,
            precisions_init = self.precisions_init,
            random_state    = self.random_state,
            reg_covar       = self.reg_covar,
            tol             = self.tol,
            warm_start      = self.warm_start,
            weights_init    = self.weights_init
        ).fit(X)

        return self 
Example 8
Project: DivideMix   Author: LiJunnan1992   File: Train_webvision_parallel.py    License: MIT License
def eval_train(eval_loader,model,device,whichnet,queue):   
    CE = nn.CrossEntropyLoss(reduction='none')
    model.eval()
    num_iter = (len(eval_loader.dataset)//eval_loader.batch_size)+1
    losses = torch.zeros(len(eval_loader.dataset))    
    with torch.no_grad():
        for batch_idx, (inputs, targets, index) in enumerate(eval_loader):
            inputs, targets = inputs.to(device), targets.to(device,non_blocking=True) 
            outputs = model(inputs) 
            loss = CE(outputs, targets)  
            for b in range(inputs.size(0)):
                losses[index[b]]=loss[b]       
            sys.stdout.write('\n')
            sys.stdout.write('|%s Evaluating loss Iter[%3d/%3d]\t' %(whichnet,batch_idx,num_iter)) 
            sys.stdout.flush()    
                                    
    losses = (losses-losses.min())/(losses.max()-losses.min())    

    # fit a two-component GMM to the loss
    input_loss = losses.reshape(-1,1)
    gmm = GaussianMixture(n_components=2,max_iter=10,tol=1e-2,reg_covar=1e-3)
    gmm.fit(input_loss)
    prob = gmm.predict_proba(input_loss) 
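    # keep the posterior of the component with the smaller mean loss,
    # i.e. the estimated probability that each sample is clean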
    prob = prob[:,gmm.means_.argmin()]         
    queue.put(prob) 
Example 9
Project: DivideMix   Author: LiJunnan1992   File: Train_webvision.py    License: MIT License
def eval_train(model,all_loss):    
    model.eval()
    num_iter = (len(eval_loader.dataset)//eval_loader.batch_size)+1
    losses = torch.zeros(len(eval_loader.dataset))    
    with torch.no_grad():
        for batch_idx, (inputs, targets, index) in enumerate(eval_loader):
            inputs, targets = inputs.cuda(), targets.cuda() 
            outputs = model(inputs) 
            loss = CE(outputs, targets)  
            for b in range(inputs.size(0)):
                losses[index[b]]=loss[b]       
            sys.stdout.write('\r')
            sys.stdout.write('| Evaluating loss Iter[%3d/%3d]\t' %(batch_idx,num_iter)) 
            sys.stdout.flush()    
                                    
    losses = (losses-losses.min())/(losses.max()-losses.min())    
    all_loss.append(losses)

    # fit a two-component GMM to the loss
    input_loss = losses.reshape(-1,1)
    gmm = GaussianMixture(n_components=2,max_iter=10,tol=1e-2,reg_covar=5e-4)
    gmm.fit(input_loss)
    prob = gmm.predict_proba(input_loss) 
    prob = prob[:,gmm.means_.argmin()]         
    return prob,all_loss 
Example 10
Project: celeb-detection-oss   Author: Giphy   File: clustering.py    License: Mozilla Public License 2.0
def clusterize(points, n_components=2, covariance_type='tied',
               centers=None, weights=None, output=None, random_state=1000):
    if centers is not None:
        n_components = len(centers)

    if output is None:
        output = points

    if len(points) < 2:
        return [list(output)]

    gmm = GaussianMixture(n_components=n_components,
                          covariance_type=covariance_type,
                          means_init=centers,
                          weights_init=weights,
                          random_state=random_state)
    gmm.fit(points)
    labels = gmm.predict(points)

    clusters = defaultdict(list)
    for label, point in zip(labels, output):
        clusters[label].append(point)

    return sorted(clusters.values(), key=lambda x: len(x), reverse=True) 
Example 11
Project: UnsupervisedDeepLearning-Pytorch   Author: eelxpeng   File: vade.py    License: MIT License
def initialize_gmm(self, dataloader):
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()

        self.eval()
        data = []
        for batch_idx, (inputs, _) in enumerate(dataloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            z, outputs, mu, logvar = self.forward(inputs)
            data.append(z.data.cpu().numpy())
        data = np.concatenate(data)
        gmm = GaussianMixture(n_components=self.n_centroids,covariance_type='diag')
        gmm.fit(data)
        self.u_p.data.copy_(torch.from_numpy(gmm.means_.T.astype(np.float32)))
        self.lambda_p.data.copy_(torch.from_numpy(gmm.covariances_.T.astype(np.float32))) 
Example 12
Project: SCDV   Author: dheeraj7596   File: SCDV.py    License: MIT License
def cluster_GMM(num_clusters, word_vectors):
    # Initialize a GMM object and use it for clustering.
    clf = GaussianMixture(n_components=num_clusters,
                          covariance_type="tied", init_params='kmeans', max_iter=50)
    # Get cluster assignments.
    clf.fit(word_vectors)
    idx = clf.predict(word_vectors)
    print("Clustering Done...", time.time() - start, "seconds")
    # Get probabilities of cluster assignments.
    idx_proba = clf.predict_proba(word_vectors)
    # Dump cluster assignments and probability of cluster assignments. 
    joblib.dump(idx, 'gmm_latestclusmodel_len2alldata.pkl')
    print("Cluster Assignments Saved...")

    joblib.dump(idx_proba, 'gmm_prob_latestclusmodel_len2alldata.pkl')
    print("Probabilities of Cluster Assignments Saved...")
    return (idx, idx_proba) 
Example 13
def gmm(n_clusters, samples):

    """
    Run GMM clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Fit Gaussian Mixture Model
    gmm = mixture.GaussianMixture(
        n_components=n_clusters, covariance_type='tied', max_iter=1000,
        init_params='kmeans', verbose=0)
    gmm.fit(samples)

    labels = gmm.predict(samples)
    labels = labels.astype(np.int32)+1

    return labels 
Example 14
Project: SDGym   Author: sdv-dev   File: utils.py    License: MIT License
def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        self.meta = self.get_metadata(data, categorical_columns, ordinal_columns)
        model = []

        self.output_info = []
        self.output_dim = 0
        for id_, info in enumerate(self.meta):
            if info['type'] == CONTINUOUS:
                gm = GaussianMixture(self.n_clusters)
                gm.fit(data[:, id_].reshape([-1, 1]))
                model.append(gm)
                self.output_info += [(1, 'tanh'), (self.n_clusters, 'softmax')]
                self.output_dim += 1 + self.n_clusters
            else:
                model.append(None)
                self.output_info += [(info['size'], 'softmax')]
                self.output_dim += info['size']

        self.model = model 
Example 15
Project: SDGym   Author: sdv-dev   File: evaluate.py    License: MIT License
def _evaluate_gmm_likelihood(train, test, metadata, components=[10, 30]):
    results = list()
    for n_components in components:
        gmm = GaussianMixture(n_components, covariance_type='diag')
        LOGGER.info('Evaluating using %s', gmm)
        gmm.fit(test)
        l1 = gmm.score(train)

        gmm.fit(train)
        l2 = gmm.score(test)

        results.append({
            "name": repr(gmm),
            "syn_likelihood": l1,
            "test_likelihood": l2,
        })

    return pd.DataFrame(results) 
Example 16
Project: Conditional_Density_Estimation   Author: freelunchtheorem   File: BaseNNMixtureEstimator.py    License: MIT License
def _sample_rows_same(self, X):
    """ uses efficient sklearn implementation to sample from gaussian mixture -> only works if all rows of X are the same"""
    weights, locs, scales = self._get_mixture_components(np.expand_dims(X[0], axis=0))

    # normalize the weights so they sum to 1
    weights = weights.astype(np.float64)
    weights = weights / np.sum(weights)

    gmm = GaussianMixture(n_components=self.n_centers, covariance_type='diag', max_iter=5, tol=1e-1)
    gmm.fit(np.random.normal(size=(100,self.ndim_y))) # dummy fit so sklearn marks the estimator as fitted
    # overriding the GMM parameters with own params
    gmm.converged_ = True
    gmm.weights_ = weights[0]
    gmm.means_ = locs[0]
    gmm.covariances_ = scales[0]
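    # note: GaussianMixture.sample() draws from weights_, means_ and covariances_
    # directly, so the stale precisions_cholesky_ from the dummy fit is harmless here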
    y_sample, _ = gmm.sample(X.shape[0])
    assert y_sample.shape == (X.shape[0], self.ndim_y)
    return X, y_sample 
Example 17
Project: scanobjectnn   Author: hkust-vgd   File: utils.py    License: MIT License
def get_3d_grid_gmm(subdivisions=[5,5,5], variance=0.04):
    """
    Compute the weight, mean and covariance of a gmm placed on a 3D grid
    :param subdivisions: 3 element list of the number of subdivisions of the 3D space in each axis to form the grid
    :param variance: scalar variance of each spherical Gaussian
    :return gmm: instance of sklearn's GaussianMixture (GMM) with parameters set on the grid
    """
    # n_gaussians = reduce(lambda x, y: x*y,subdivisions)
    n_gaussians = np.prod(np.array(subdivisions))
    step = [1.0/(subdivisions[0]),  1.0/(subdivisions[1]),  1.0/(subdivisions[2])]

    means = np.mgrid[ step[0]-1: 1.0-step[0]: complex(0, subdivisions[0]),
                      step[1]-1: 1.0-step[1]: complex(0, subdivisions[1]),
                      step[2]-1: 1.0-step[2]: complex(0, subdivisions[2])]
    means = np.reshape(means, [3, -1]).T
    covariances = variance*np.ones_like(means)
    weights = (1.0/n_gaussians)*np.ones(n_gaussians)
    gmm = GaussianMixture(n_components=n_gaussians, covariance_type='diag')
    gmm.weights_ = weights
    gmm.covariances_ = covariances
    gmm.means_ = means
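    # note: in scikit-learn >= 0.22 this private helper moved to
    # sklearn.mixture._gaussian_mixture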
    from sklearn.mixture.gaussian_mixture import _compute_precision_cholesky
    gmm.precisions_cholesky_ = _compute_precision_cholesky(covariances, 'diag')
    return gmm 
Example 18
Project: scanobjectnn   Author: hkust-vgd   File: utils.py    License: MIT License
def get_2d_grid_gmm(subdivisions=[5, 5], variance=0.04):
    """
    Compute the weight, mean and covariance of a 2D gmm placed on a 2D grid

    :param subdivisions: 2 element list of the number of subdivisions of the 2D space in each axis to form the grid
    :param variance: scalar variance of each spherical Gaussian
    :return gmm: instance of sklearn's GaussianMixture (GMM) with parameters set on the grid
    """
    # n_gaussians = reduce(lambda x, y: x*y,subdivisions)
    n_gaussians = np.prod(np.array(subdivisions))
    step = [1.0/(subdivisions[0]),  1.0/(subdivisions[1])]

    means = np.mgrid[step[0]-1: 1.0-step[0]: complex(0, subdivisions[0]),
            step[1]-1: 1.0-step[1]: complex(0, subdivisions[1])]
    means = np.reshape(means, [2,-1]).T
    covariances = variance*np.ones_like(means)
    weights = (1.0/n_gaussians)*np.ones(n_gaussians)
    gmm = GaussianMixture(n_components=n_gaussians, covariance_type='diag')
    gmm.weights_ = weights
    gmm.covariances_ = covariances
    gmm.means_ = means
    from sklearn.mixture.gaussian_mixture import _compute_precision_cholesky
    gmm.precisions_cholesky_ = _compute_precision_cholesky(covariances, 'diag')
    return gmm 
Example 19
Project: teachDeepRL   Author: flowersteam   File: alp_gmm.py    License: MIT License
def sample_task(self):
        if (len(self.tasks) < self.nb_random) or (np.random.random() < self.random_task_ratio):
            # Random task sampling
            new_task = self.random_task_generator.sample()
        else:
            # ALP-based task sampling

            # 1 - Retrieve the mean ALP value of each Gaussian in the GMM
            self.alp_means = []
            for pos, _, w in zip(self.gmm.means_, self.gmm.covariances_, self.gmm.weights_):
                self.alp_means.append(pos[-1])

            # 2 - Sample Gaussian proportionally to its mean ALP
            idx = proportional_choice(self.alp_means, eps=0.0)

            # 3 - Sample task in Gaussian, without forgetting to remove ALP dimension
            new_task = np.random.multivariate_normal(self.gmm.means_[idx], self.gmm.covariances_[idx])[:-1]
            new_task = np.clip(new_task, self.mins, self.maxs).astype(np.float32)

        return new_task 
Example 20
Project: fishervector   Author: jonasrothfuss   File: FisherVector.py    License: MIT License
def _fit(self, X, model_dump_path=None, verbose=True):
    """
    :param X: shape (n_videos, n_frames, n_descriptors_per_image, n_dim_descriptor)
    :param model_dump_path: (optional) path where the fitted model shall be dumped
    :param verbose: boolean that controls the verbosity
    :return: fitted Fisher vector object
    """
    assert X.ndim == 4
    self.feature_dim = X.shape[-1]

    X = X.reshape(-1, X.shape[-1])

    # fit GMM and store params of fitted model
    self.gmm = gmm = GaussianMixture(n_components=self.n_kernels, covariance_type=self.covariance_type, max_iter=1000).fit(X)
    self.covars = gmm.covariances_
    self.means = gmm.means_
    self.weights = gmm.weights_

    # if cov_type is diagonal - make sure that covars holds a diagonal matrix
    if self.covariance_type == 'diag':
      cov_matrices = np.empty(shape=(self.n_kernels, self.covars.shape[1], self.covars.shape[1]))
      for i in range(self.n_kernels):
        cov_matrices[i, :, :] = np.diag(self.covars[i, :])
      self.covars = cov_matrices

    assert self.covars.ndim == 3
    self.fitted = True
    if verbose:
      print('fitted GMM with %i kernels'%self.n_kernels)

    if model_dump_path:
      with open(model_dump_path, 'wb') as f:
        pickle.dump(self,f, protocol=4)
      if verbose:
        print('Dumped fitted model to', model_dump_path)

    return self 
Example 21
Project: adversarial-policies   Author: HumanCompatibleAI   File: fit_density.py    License: MIT License
def base_config():
    ray_server = None  # by default will launch a server
    activation_glob = None  # directory of generated activations
    output_root = None  # directory to write output
    data_type = "ff_policy"  # key into activations
    max_timesteps = None  # if specified, maximum number of timesteps of activations to use
    seed = 0
    model_class = GaussianMixture  # density model to use
    model_kwargs = {"n_components": 10}  # parameters for density model
    train_opponent = "zoo_1"  # opponent ID to use for fitting density model (extracted from path)
    train_percentage = 0.5  # percentage of data to use for training (remainder is validation)
    _ = locals()  # quieten flake8 unused variable warning
    del _ 
Example 22
Project: adversarial-policies   Author: HumanCompatibleAI   File: fit_density.py    License: MIT License
def gmm():
    model_class = GaussianMixture
    _ = locals()  # quieten flake8 unused variable warning
    del _ 
Example 23
Project: adversarial-policies   Author: HumanCompatibleAI   File: fit_density.py    License: MIT License
def pca_gmm():
    model_class = PCAPreDensity
    model_kwargs = {"density_class": GaussianMixture}
    _ = locals()  # quieten flake8 unused variable warning
    del _ 
Example 24
Project: speaker-recognition-py3   Author: crouchred   File: skgmm.py    License: Apache License 2.0
def fit_new(self, x, label):
        self.y.append(label)
        gmm = GaussianMixture(self.gmm_order)
        gmm.fit(x)
        self.gmms.append(gmm) 
Example 25
Project: bert-extractive-summarizer   Author: dmmiller612   File: cluster_features.py    License: MIT License
def __get_model(self, k: int):
        """
        Retrieve clustering model

        :param k: number of clusters
        :return: Clustering model

        """

        if self.algorithm == 'gmm':
            return GaussianMixture(n_components=k, random_state=self.random_state)
        return KMeans(n_clusters=k, random_state=self.random_state) 
Example 26
Project: redshells   Author: m3dev   File: scdv.py    License: MIT License
def __init__(self, documents: List[List[str]], cluster_size: int, sparsity_percentage: float, gaussian_mixture_kwargs: Dict[Any, Any],
                 dictionary: gensim.corpora.Dictionary, w2v: Union[FastText, Word2Vec]) -> None:
        """

        :param documents: documents for training.
        :param cluster_size:  word cluster size.
        :param sparsity_percentage: sparsity percentage. This must be in [0, 1].
        :param gaussian_mixture_kwargs: Arguments to build `sklearn.mixture.GaussianMixture` except cluster_size. Please see `sklearn.mixture.GaussianMixture.__init__` for details.
        :param dictionary: `gensim.corpora.Dictionary`.
        :param w2v: trained `FastText` or `Word2Vec` model used to build the word embeddings.
        """
        logger.info('_build_dictionary...')
        self._dictionary = dictionary
        vocabulary_size = len(self._dictionary.token2id)
        embedding_size = w2v.wv.vector_size

        logger.info('_build_word_embeddings...')
        self._word_embeddings = self._build_word_embeddings(self._dictionary, w2v)
        assert self._word_embeddings.shape == (vocabulary_size, embedding_size)

        logger.info('_build_word_cluster_probabilities...')
        self._word_cluster_probabilities = self._build_word_cluster_probabilities(self._word_embeddings, cluster_size, gaussian_mixture_kwargs)
        assert self._word_cluster_probabilities.shape == (vocabulary_size, cluster_size)

        logger.info('_build_idf...')
        self._idf = self._build_idf(self._dictionary)
        assert self._idf.shape == (vocabulary_size, )

        logger.info('_build_word_cluster_vectors...')
        word_cluster_vectors = self._build_word_cluster_vectors(self._word_embeddings, self._word_cluster_probabilities)
        assert word_cluster_vectors.shape == (vocabulary_size, cluster_size, embedding_size)

        logger.info('_build_word_topic_vectors...')
        word_topic_vectors = self._build_word_topic_vectors(self._idf, word_cluster_vectors)
        assert word_topic_vectors.shape == (vocabulary_size, (cluster_size * embedding_size))

        logger.info('_build_sparsity_threshold...')
        self._sparse_threshold = self._build_sparsity_threshold(word_topic_vectors, self._dictionary, documents, sparsity_percentage) 
Example 27
Project: redshells   Author: m3dev   File: scdv.py    License: MIT License
def _build_word_cluster_probabilities(word_embeddings: np.ndarray, cluster_size: int, gaussian_mixture_parameters: Dict[Any, Any]) -> np.ndarray:
        gm = GaussianMixture(n_components=cluster_size, **gaussian_mixture_parameters)
        gm.fit(word_embeddings)
        return gm.predict_proba(word_embeddings) 
Example 28
Project: Speech_Signal_Processing_and_Classification   Author: gionanide   File: gmm.py    License: MIT License
def GaussianMixtureModel(data,gender):
	#A GMM attempts to find a mixture of multidimensional Gaussian probability distributions that best model any input dataset.
	#In the simplest case, GMMs can be used for finding clusters in the same manner as k-means.
	X,Y = preparingData(data)
	#print(data.head(n=5))

	#we do not split into training and testing because that was already done on a per-file basis, so the X,Y in this
	#function train the model; another file's X,Y is used in the testModels function to assess the model

	#takes only the first feature to redefine the problem as a 1-D problem
	#dataFeature1 =  data.as_matrix(columns=data.columns[0:1])
	#plot histogram
	#sns.distplot(dataFeature1,bins=20,kde=False)
	#plt.show()

	
	
	#Y = target variable
	gmm =  GaussianMixture(n_components=8,max_iter=200,covariance_type='diag',n_init=3)
	gmm.fit(X)
	
		

	#save the model to disk
	filename = 'finalizedModel_'+gender+'.gmm'
	pickle.dump(gmm,open(filename,'wb'))
	print('Model saved in path: PATH_TO' + filename)


	return X
	#load the model from disk
	'''loadedModel = pickle.load(open(filename,'rb'))
	result = loadedModel.score(X)
	print(result)'''