Python sklearn.metrics.pairwise.pairwise_distances() Examples

The following code examples show how to use sklearn.metrics.pairwise.pairwise_distances(). They are taken from open source Python projects.
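Before the examples, a minimal self-contained sketch of the basic call (shapes and metric are illustrative): with one argument the function returns the square matrix of distances between rows of X; with two arguments it returns the rectangular matrix of distances between rows of X and rows of Y.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.rand(4, 3)  # 4 samples, 3 features
Y = np.random.rand(2, 3)  # 2 samples, 3 features

D_xx = pairwise_distances(X)                      # (4, 4), euclidean by default
D_xy = pairwise_distances(X, Y, metric='cosine')  # (4, 2), cosine distance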

Example 1
Project: fmralign   Author: Parietal-INRIA   File: alignment_methods.py    BSD 3-Clause "New" or "Revised" License
def optimal_permutation(X, Y):
    """Compute the optmal permutation matrix of X toward Y

    Parameters
    ----------
    X: (n_samples, n_features) nd array
        source data
    Y: (n_samples, n_features) nd array
        target data

    Returns
    ----------
    permutation : (n_features, n_features) nd array
        transformation matrix
    """
    dist = pairwise_distances(X.T, Y.T)
    u = linear_assignment(dist)
    permutation = scipy.sparse.csr_matrix(
        (np.ones(X.shape[1]), (u[:, 0], u[:, 1]))).T
    return permutation 
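A side note on this snippet: linear_assignment here is the old scikit-learn helper that returns an (n, 2) array of matched index pairs, since removed from scikit-learn. A hedged equivalent using scipy's solver, which returns the matched row and column indices as two separate arrays:

import numpy as np
import scipy.sparse
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.pairwise import pairwise_distances

def optimal_permutation_scipy(X, Y):
    # Same construction as above, with scipy's assignment solver.
    dist = pairwise_distances(X.T, Y.T)
    row_ind, col_ind = linear_sum_assignment(dist)
    permutation = scipy.sparse.csr_matrix(
        (np.ones(X.shape[1]), (row_ind, col_ind))).T
    return permutation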
Example 2
Project: kuaa   Author: rafaelwerneck   File: plugin_tfidf-cosine.py    GNU General Public License v3.0
def distance(seq1, seq2, extra={}):
    """
    Performs TF-IDF on the sequences and calculates the cosine distance
    between them.
    """
    
    idf_matrix = extra["idf_matrix"]
    
    #Seq1
    string_seq1 = " ".join(seq1)
    seq1_fv = idf_matrix.transform([string_seq1]).todense()
    
    #Seq2
    string_seq2 = " ".join(seq2)
    seq2_fv = idf_matrix.transform([string_seq2]).todense()
    
    #Distance
    return pairwise.pairwise_distances(seq1_fv, seq2_fv, 'cosine')[INDEX_ZERO][INDEX_ZERO] 
Example 3
Project: kuaa   Author: rafaelwerneck   File: plugin_tfidf.py    GNU General Public License v3.0
def distance(seq1, seq2, extra={}):
    """
    Performs TF-IDF on the sequences and calculates the euclidean distance
    between them.
    """
    
    idf_matrix = extra["idf_matrix"]
    
    #Seq1
    string_seq1 = " ".join(seq1)
    seq1_fv = idf_matrix.transform([string_seq1])
    
    #Seq2
    string_seq2 = " ".join(seq2)
    seq2_fv = idf_matrix.transform([string_seq2])
    
    #Distance
    return pairwise.pairwise_distances(seq1_fv, seq2_fv)[INDEX_ZERO][INDEX_ZERO] 
Example 4
Project: smote_variants   Author: gykovacs   File: _smote_variants.py    MIT License
def _set_reach_dist(self, point_index, processed, X, nbrs):
        P = X[point_index:point_index + 1]
        indices = nbrs.radius_neighbors(P, radius=self.max_eps,
                                        return_distance=False)[0]

        # Getting indices of neighbors that have not been processed
        unproc = np.compress((~np.take(processed, indices)).ravel(),
                             indices, axis=0)
        # Keep n_jobs = 1 in the following lines...please
        if not unproc.size:
            # Everything is already processed. Return to main loop
            return point_index

        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
                                   self.metric, n_jobs=1).ravel()

        rdists = np.maximum(dists, self.core_distances_[point_index])
        new_reach = np.minimum(np.take(self.reachability_, unproc), rdists)
        self.reachability_[unproc] = new_reach

        # Define return order based on reachability distance
        return (unproc[self.quick_scan(np.take(self.reachability_, unproc),
                                  dists)]) 
Example 5
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def canopy(X, T1, T2, distance_metric='euclidean', filemap=None):
    canopies = dict()
    X1_dist = pairwise_distances(X, metric=distance_metric)
    canopy_points = set(range(X.shape[0]))
    while canopy_points:
        point = canopy_points.pop()
        i = len(canopies)
        canopies[i] = {"c":point, "points": list(np.where(X1_dist[point] < T2)[0])}
        canopy_points = canopy_points.difference(set(np.where(X1_dist[point] < T1)[0]))
    if filemap:
        for canopy_id in canopies.keys():
            canopy = canopies.pop(canopy_id)
            canopy2 = {"c":filemap[canopy['c']], "points":list()}
            for point in canopy['points']:
                canopy2["points"].append(filemap[point])
            canopies[canopy_id] = canopy2
    return canopies 
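A hypothetical call of the helper above (thresholds illustrative). Note that in this implementation T2 is the inclusion radius and T1 the removal radius, so choosing T1 <= T2 ensures every removed point is covered by some canopy:

import numpy as np

X = np.random.rand(100, 2)
canopies = canopy(X, T1=0.3, T2=0.5)  # maps canopy id -> center index and member indices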
Example 6
Project: progressivis   Author: jdfekete   File: test_03_pairwise.py    BSD 2-Clause "Simplified" License
def NOtest_vec_distances(self):
        s = self.scheduler()
        vec = VECLoader(get_dataset('warlogs'), scheduler=s)
#        dis=PairwiseDistances(metric='cosine',scheduler=s)
#        dis.input.df = vec.output.df
#        dis.input.array = vec.output.array
        cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
#        cnt.input.df = dis.output.dist
        cnt.input.df = vec.output.table
        global times
        times = 0
        s.start()
        table = vec.table()
        #print(table)
#        computed = dis.dist()
#        self.assertEquals(computed.shape[0], len(df))
#        truth = pairwise_distances(vec.toarray(), metric=dis._metric)
#        self.assertTrue(np.allclose(truth, computed)) 
Example 7
Project: progressivis   Author: jdfekete   File: test_03_pairwise.py    BSD 2-Clause "Simplified" License
def test_csv_distances(self):
        s = self.scheduler()
        vec = CSVLoader(get_dataset('smallfile'), index_col=False, header=None, scheduler=s)
#        dis=PairwiseDistances(metric='euclidean',scheduler=s)
#        dis.input.df = vec.output.df
        cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
#        cnt.input.df = dis.output.dist
        cnt.input.df = vec.output.table
        global times
        times = 0
        s.start(ten_times)
        s.join()
        table = vec.table()
        #print(repr(table))
#        computed = dis.dist()
        #self.assertEquals(computed.shape[0], len(df))

#        offset=0
#        size=offset+5000
#        truth = pairwise_distances(df.iloc[offset:size], metric=dis._metric)
#        dist = computed[offset:size,offset:size]
#        self.assertTrue(np.allclose(truth, dist,atol=1e-7)) # reduce tolerance 
Example 8
Project: scikit-feature   Author: jundongl   File: UDFS.py    GNU General Public License v2.0
def construct_M(X, k, gamma):
    """
    This function constructs the M matrix described in the paper
    """
    n_sample, n_feature = X.shape
    Xt = X.T
    D = pairwise_distances(X)
    # sort the distance matrix D in ascending order
    idx = np.argsort(D, axis=1)
    # choose the k-nearest neighbors for each instance
    idx_new = idx[:, 0:k+1]
    H = np.eye(k+1) - 1.0/(k+1) * np.ones((k+1, k+1))  # 1.0 guards against integer division
    I = np.eye(k+1)
    Mi = np.zeros((n_sample, n_sample))
    for i in range(n_sample):
        Xi = Xt[:, idx_new[i, :]]
        Xi_tilde = np.dot(Xi, H)
        Bi = np.linalg.inv(np.dot(Xi_tilde.T, Xi_tilde) + gamma*I)
        Si = np.zeros((n_sample, k+1))
        for q in range(k+1):
            Si[idx_new[i, q], q] = 1
        Mi = Mi + np.dot(np.dot(Si, np.dot(np.dot(H, Bi), H)), Si.T)
    M = np.dot(np.dot(X.T, Mi), X)
    return M 
Example 9
Project: MCSAuditing   Author: spring-epfl   File: defense_mechanisms.py    GNU General Public License v3.0
def compute_geometric_median(probabilities_input, values_input):
    # Computes the geometric median using Weiszfeld's algorithm
    probabilities = np.copy(probabilities_input)
    values = np.copy(values_input)

    values = values[probabilities > 0]  # remove entries of values with probability of 0
    probabilities = probabilities[probabilities > 0]  # remove zero entries
    geo_median_old = np.array([float("inf"), float("inf")])
    geo_median = np.dot(probabilities.transpose(), values)  # Initial estimation is the mean
    nIter = 0
    while (check_condition(geo_median, geo_median_old)) and (nIter < 200):

        distance_matrix = pairwise_distances([geo_median], values)
        distance_matrix = distance_matrix[0]
        # Return if there is a zero value in distance_matrix
        if np.any(distance_matrix == 0):
            #print "emerg brake", nIter
            return geo_median
        # print len(distance_matrix)
        geo_median_old = geo_median
        div = np.divide(probabilities, distance_matrix)
        geo_median = np.divide(np.dot(div, values), np.dot(div, np.ones_like(values)))
        nIter += 1
    return geo_median 
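A hypothetical call, assuming the external check_condition() compares successive estimates against a convergence tolerance: with uniform weights over the corners of the unit square, the iteration converges to the center.

import numpy as np

points = np.array([[0., 0.], [1., 0.], [0., 1.], [1., 1.]])
probs = np.full(4, 0.25)
median = compute_geometric_median(probs, points)  # approximately [0.5, 0.5]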
Example 10
Project: redis_stats   Author: mapado   File: stats.py    GNU General Public License v3.0
def clusterize_keys(keys_vector, dbname):
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(keys_vector)

    if dbname == 'kmeans':
        db = cluster.KMeans(n_clusters=10)
    else:
        X = pairwise_distances(X, metric='cosine')
        db = cluster.DBSCAN(min_samples=1)

    print "Feature len: {}".format(len(vectorizer.get_feature_names()))
    db.fit(X)

    labels = db.labels_
    nb_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('Number of clusters found: {}'.format(nb_clusters))

    return labels 
Example 11
Project: modAL   Author: modAL-python   File: density.py    MIT License
def information_density(X: modALinput, metric: Union[str, Callable] = 'euclidean') -> np.ndarray:
    """
    Calculates the information density metric of the given data using the given metric.

    Args:
        X: The data for which the information density is to be calculated.
        metric: The metric to be used. Should take two 1d numpy.ndarrays for argument.

    Todo:
        Should work with all possible modALinput.
        Perhaps refactor the module to use some stuff from sklearn.metrics.pairwise

    Returns:
        The information density for each sample.
    """
    # inf_density = np.zeros(shape=(X.shape[0],))
    # for X_idx, X_inst in enumerate(X):
    #     inf_density[X_idx] = sum(similarity_measure(X_inst, X_j) for X_j in X)
    #
    # return inf_density/X.shape[0]

    similarity_mtx = 1/(1+pairwise_distances(X, X, metric=metric))

    return similarity_mtx.mean(axis=1) 
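A minimal sketch of calling this function (pool data illustrative); the result is one score per sample, higher for samples lying in dense regions:

import numpy as np

X_pool = np.random.rand(100, 5)
densities = information_density(X_pool, metric='cosine')  # shape (100,)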
Example 12
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_t_sne.py    Apache License 2.0
def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
                     verbose=False, perplexity=0.1, skip_num_points=0):
    distances = pairwise_distances(pos_input).astype(np.float32)
    args = distances, perplexity, verbose
    pos_output = pos_output.astype(np.float32)
    neighbors = neighbors.astype(np.int64)
    pij_input = _joint_probabilities(*args)
    pij_input = squareform(pij_input).astype(np.float32)
    grad_bh = np.zeros(pos_output.shape, dtype=np.float32)

    from scipy.sparse import csr_matrix
    P = csr_matrix(pij_input)

    neighbors = P.indices.astype(np.int64)
    indptr = P.indptr.astype(np.int64)

    _barnes_hut_tsne.gradient(P.data, pos_output, neighbors, indptr,
                              grad_bh, 0.5, 2, 1, skip_num_points=0)
    assert_array_almost_equal(grad_bh, grad_output, decimal=4) 
Example 13
Project: design_embeddings_jmd_2016   Author: IDEALLab   File: metrics.py    MIT License
def geo_dist_inconsistency(X, F, X_precomputed=False, verbose=0):
    ''' Geodesic distance inconsistency '''
    if X_precomputed:
        geo_X = X # geodesic distance matrix for X
    else:
        geo_X = get_geo_dist(X, verbose=verbose)
    geo_X[geo_X==0] = np.inf # if two points are not connected
    np.fill_diagonal(geo_X, 0)
    
    dist_F = pairwise.pairwise_distances(F) # distance matrix for F
    
    gdi = 1-pearsonr(geo_X.flatten(), dist_F.flatten())[0]**2
    
#    from matplotlib import pyplot as plt
#    plt.figure()
#    plt.scatter(geo_X.flatten(), dist_F.flatten())
#    plt.show()
    
#    def cost(alpha):
#        return smape(geo_X*alpha, dist_F)
#    
#    # Cost increases when min(geo_X*alpha) > max(dist_F) or max(geo_X*alpha) < min(dist_F)
#    # But min could be very close to 0 (for points close to each other), so use mean instead
#    bounds=((np.mean(dist_F)/np.max(geo_X), np.max(dist_F)/np.mean(geo_X)),)
#    res = differential_evolution(cost, bounds)
#    
#    gdi = res.fun
    
    return gdi 
Example 14
Project: ABRW   Author: houchengbin   File: utils.py    MIT License
def pairwise_similarity(mat, type='cosine'):
    # XXX: possible to integrate pairwise_similarity with top_k to enhance performance? 
    # we'll use it elsewhere. if really needed, write a new method for this purpose
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPUs for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = euclidean_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = manhattan_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result


# ---------------------------------utils for preprocessing--------------------------------
Example 15
Project: deepSVDD   Author: GSRS   File: svm.py    MIT License
def train(self, **kwargs):

        if self.data._X_train.ndim > 2:
            X_train_shape = self.data._X_train.shape
            X_train = self.data._X_train.reshape(X_train_shape[0],
                                                 np.prod(X_train_shape[1:]))
        else:
            X_train = self.data._X_train

        print("Starting training...")
        self.start_clock()

        if self.loss == 'SVC':

            if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
                self.get_kernel_matrix(kernel=self.kernel, which_set='train',
                                       **kwargs)
                self.svm.fit(self.K_train, self.data._y_train)
            else:
                self.svm.fit(X_train, self.data._y_train)

        if self.loss == 'OneClassSVM':

            if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
                self.get_kernel_matrix(kernel=self.kernel, which_set='train',
                                       **kwargs)
                self.svm.fit(self.K_train)
            else:
                # if rbf-kernel, re-initialize svm with gamma minimizing the
                # numerical error
                if self.kernel == 'rbf':
                    gamma = 1 / (np.max(pairwise_distances(X_train)) ** 2)
                    self.svm = svm.OneClassSVM(kernel='rbf', nu=Cfg.svm_nu,
                                               gamma=gamma)

                self.svm.fit(X_train)

        self.stop_clock()
        self.train_time = self.clocked 
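The gamma heuristic above is easy to check in isolation: with gamma = 1 / max(d)^2, the RBF kernel entry for the farthest pair of training points is exp(-1) (about 0.37), so the kernel matrix stays away from numerical zero. A minimal sketch (data illustrative):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, rbf_kernel

X = np.random.rand(30, 5)
gamma = 1.0 / (np.max(pairwise_distances(X)) ** 2)
K = rbf_kernel(X, gamma=gamma)
assert np.isclose(K.min(), np.exp(-1))  # the farthest pair attains exp(-1)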
Example 16
Project: deepSVDD   Author: GSRS   File: kde.py    MIT License
def train(self, bandwidth_GridSearchCV=True):

        if self.data._X_train.ndim > 2:
            X_train_shape = self.data._X_train.shape
            X_train = self.data._X_train.reshape(X_train_shape[0], -1)
        else:
            X_train = self.data._X_train

        print("Starting training...")
        self.start_clock()

        if bandwidth_GridSearchCV:
            # use grid search cross-validation to select bandwidth
            print("Using GridSearchCV for bandwidth selection...")

            d = X_train.shape[1]
            grid = np.logspace(-9, 20, num=30, base=2)
            params = {'bandwidth': (d / (2.0 * grid)) ** 0.5}

            hyper_kde = GridSearchCV(KernelDensity(kernel=self.kernel), params,
                                     n_jobs=10, cv=20, verbose=1)
            hyper_kde.fit(X_train)

            self.bandwidth = hyper_kde.best_estimator_.bandwidth
            self.kde = hyper_kde.best_estimator_
        else:
            # if exponential kernel, re-initialize kde with bandwidth minimizing
            # the numerical error
            if self.kernel == 'exponential':
                bandwidth = np.max(pairwise_distances(X_train)) ** 2
                self.kde = KernelDensity(kernel=self.kernel,
                                         bandwidth=bandwidth)

            self.kde.fit(X_train)

        self.stop_clock()
        self.train_time = self.clocked 
Example 17
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def _intra_cluster_distances_block_(subX, metric, **kwds):
    distances = pairwise_distances(subX, metric=metric, **kwds)
    return distances.sum(axis=1) / (distances.shape[0] - 1) 
Example 18
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def _intra_cluster_distances_block(X, labels, metric, n_jobs=1, **kwds):
    """Calculate the mean intra-cluster distance for sample i.

    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.

    labels : array, shape = [n_samples]
        label values for each sample

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by metrics.pairwise.pairwise_distances. If X is the distance
        array itself, use "precomputed" as the metric.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    a : array [n_samples_a]
        Mean intra-cluster distance
    """
    intra_dist = np.zeros(labels.size, dtype=float)
    values = Parallel(n_jobs=n_jobs)(
            delayed(_intra_cluster_distances_block_)
                (X[np.where(labels == label)[0]], metric, **kwds)
                for label in np.unique(labels))
    for label, values_ in zip(np.unique(labels), values):
        intra_dist[np.where(labels == label)[0]] = values_
    return intra_dist 
Example 19
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def _nearest_cluster_distance_block_(subX_a, subX_b, metric, **kwds):
    dist = pairwise_distances(subX_a, subX_b, metric=metric, **kwds)
    dist_a = dist.mean(axis=1)
    dist_b = dist.mean(axis=0)
    return dist_a, dist_b 
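Examples 17-19 are the building blocks of a silhouette-style score, s = (b - a) / max(a, b), where a is the mean intra-cluster distance and b the mean nearest-cluster distance. A hedged sketch of exercising the helpers directly (toy data; the helpers, numpy, and joblib's Parallel/delayed must be in scope):

import numpy as np

X = np.random.rand(20, 3)
labels = np.repeat([0, 1], 10)

a = _intra_cluster_distances_block(X, labels, metric='euclidean')  # shape (20,)
dist_a, dist_b = _nearest_cluster_distance_block_(X[labels == 0], X[labels == 1], 'euclidean')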
Example 20
Project: geosketch   Author: brianhie   File: sketch.py    MIT License
def gs_exact(X, N, k='auto', seed=None, replace=False,
             tol=1e-3, n_iter=300, verbose=1):
    ge_idx = gs(X, N, replace=replace)
    
    dist = pairwise_distances(X, n_jobs=-1)
    
    cost = dist.max()

    iter_i = 0
    
    while iter_i < n_iter:

        if verbose:
            log('iter_i = {}'.format(iter_i))

        labels = np.argmin(dist[ge_idx, :], axis=0)

        ge_idx_new = []
        for cluster in range(N):
            cluster_idx = np.nonzero(labels == cluster)[0]
            if len(cluster_idx) == 0:
                ge_idx_new.append(ge_idx[cluster])
                continue
            X_cluster = dist[cluster_idx, :]
            X_cluster = X_cluster[:, cluster_idx]
            within_idx = np.argmin(X_cluster.max(0))
            ge_idx_new.append(cluster_idx[within_idx])
        ge_idx = ge_idx_new

        cost, prev_cost = dist[ge_idx, :].min(0).max(), cost
        assert(cost <= prev_cost)

        if prev_cost - cost < tol:
            break

        iter_i += 1

    return ge_idx 
Example 21
Project: clr_prediction   Author: Kipok   File: clr_regressors.py    MIT License
def predict(self, X):
    if self.weight_mode == 'size':
      probs = self.predict_proba(X)
      return np.argmax(probs)
    dst = cdist(self.centers_, X)
    return np.argmin(dst, axis=0) 
Example 22
Project: clr_prediction   Author: Kipok   File: clr_regressors.py    MIT License
def predict_proba(self, X):
    if self.weight_mode == 'size':
      return self.weights
    dst = cdist(self.centers_, X)
    return dst.T / np.sum(dst.T, axis=1, keepdims=True) 
Example 23
Project: DeepID2   Author: chenzeyuczy   File: feat_test.py    MIT License
def getDist(feat1, feat2, metric):
	pair_num = len(feat1)
	import sklearn.metrics.pairwise as pw
	mt = pw.pairwise_distances(feat1, feat2, metric=metric)
	distance = np.empty((pair_num,))
	for i in range(pair_num):
		distance[i] = mt[i, i]
	return distance

# Extract feature via network. 
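The loop above computes a full pair_num x pair_num matrix only to read its diagonal. A lighter, hedged alternative for the metrics paired_distances supports ('euclidean', 'manhattan', 'cosine', or a callable) computes just the row-to-row distances:

from sklearn.metrics.pairwise import paired_distances

def getDistPaired(feat1, feat2, metric):
    # Distance between feat1[i] and feat2[i] for each i, without the full matrix.
    return paired_distances(feat1, feat2, metric=metric)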
Example 24
Project: few   Author: lacava   File: evaluation.py    GNU General Public License v3.0
def fisher(yhat,y,samples=False):
    """Fisher criterion"""
    classes = np.unique(y)
    mu = np.zeros(len(classes))
    v = np.zeros(len(classes))
    # pdb.set_trace()
    for c in classes.astype(int):
        mu[c] = np.mean(yhat[y==c])
        v[c] = np.var(yhat[y==c])

    if not samples:
        fisher = 0
        for c1,c2 in pairwise(classes.astype(int)):
            fisher += np.abs(mu[c1] - mu[c2])/np.sqrt(v[c1]+v[c2])
    else:
        # lexicase version
        fisher = np.zeros(len(yhat))
        # get closest classes to each class (min mu distance)
        mu_d = pairwise_distances(mu.reshape(-1,1))
        min_mu=np.zeros(len(classes),dtype=int)
        for i in np.arange(len(min_mu)):
            min_mu[i] = np.argsort(mu_d[i])[1]
        # for c1, pairwise(classes.astype(int)):
        #     min_mu[c1] = np.argmin()
        for i,l in enumerate(yhat.astype(int)):
            fisher[i] = np.abs(l - mu[min_mu[y[i]]])/np.sqrt(v[y[i]]+v[min_mu[y[i]]])

    # pdb.set_trace()
    return fisher 
Example 25
Project: 3d-vehicle-tracking   Author: ucbdrive   File: tracking_utils.py    BSD 3-Clause "New" or "Revised" License
def compute_cos_dis(featA, featB):
    return np.exp(-skp.pairwise_distances(featA, featB)) 
Example 26
Project: nldrp   Author: etzinis   File: distance_matrix.py    MIT License
def compute_distance_matrix(X,
                            norm='euclidean',
                            n_jobs = 1
                            ):
    """!
    \brief """
    return pairwise_distances(X, Y=X, metric=norm, n_jobs=n_jobs) 
Example 27
Project: lumberjack   Author: robertwatkins   File: apache_trainer.py    GNU General Public License v3.0
def calculateDistanceMatrix(ngram_as_path):
    sample_n_gram_list_as_ids = n_gram_list_to_ids(ngram_as_path)
    return pairwise_distances(sample_n_gram_list_as_ids, metric=get_levenshtein_distance) 
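As this snippet shows, pairwise_distances also accepts a plain Python callable as the metric; it is invoked on pairs of 1-D rows and must return a float. A minimal sketch with an illustrative callable:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def hamming_like(a, b):
    # a and b are 1-D rows of X; return their number of mismatches.
    return float(np.sum(a != b))

X = np.array([[1, 2, 3], [1, 2, 4], [9, 9, 9]])
D = pairwise_distances(X, metric=hamming_like)  # (3, 3) symmetric matrix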
Example 28
Project: BioCompoundML   Author: sandialabs   File: cluster.py    BSD 3-Clause "New" or "Revised" License
def pairwise_distance_matrix(self, feature_matrix, dist_matrix_type):
        pair_distance_matrix = pairwise_distances(feature_matrix,
                                                  metric=dist_matrix_type)
        return pair_distance_matrix 
Example 29
Project: BioCompoundML   Author: sandialabs   File: distance.py    BSD 3-Clause "New" or "Revised" License
def build_matrix(self, feature_dict):
        self.distance = pairwise_distances(X=feature_dict, metric=self.metric,
                                           n_jobs=self.n_jobs) 
Example 30
Project: didyprog   Author: arthurmensch   File: test_dtw.py    MIT License
def make_data():
    rng = np.random.RandomState(0)
    m, n = 2, 2
    X = rng.randn(m, 3)
    Y = rng.randn(n, 3)
    return pairwise_distances(X, Y) / 10 
Example 31
Project: Deep-SVDD   Author: ErikKratzCth   File: kde.py    MIT License
def train(self, bandwidth_GridSearchCV=True):

        if self.data._X_train.ndim > 2:
            X_train_shape = self.data._X_train.shape
            X_train = self.data._X_train.reshape(X_train_shape[0], -1)
        else:
            X_train = self.data._X_train

        print("Starting training...")
        self.start_clock()

        if bandwidth_GridSearchCV:
            # use grid search cross-validation to select bandwidth
            print("Using GridSearchCV for bandwidth selection...")

            # params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)}
            params = {'bandwidth': np.logspace(-4.5, 5, num=20, base=2)}

            hyper_kde = GridSearchCV(KernelDensity(kernel=self.kernel), params, n_jobs=-1, cv=5, verbose=0)
            hyper_kde.fit(X_train)

            self.bandwidth = hyper_kde.best_estimator_.bandwidth
            self.kde = hyper_kde.best_estimator_
        else:
            # if exponential kernel, re-initialize kde with bandwidth minimizing
            # the numerical error
            if self.kernel == 'exponential':
                bandwidth = np.max(pairwise_distances(X_train)) ** 2
                self.kde = KernelDensity(kernel=self.kernel,
                                         bandwidth=bandwidth)

            self.kde.fit(X_train)

        self.stop_clock()
        self.train_time = self.clocked 
Example 32
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License
def compare_pic(self, feature1, feature2):
	predicts = pw.pairwise_distances(feature2, feature1, 'cosine')
	#predicts = pw.cosine_similarity(feature1, feature2)
	return predicts
Example 33
Project: image-classifier   Author: gustavkkk   File: eval-all.py    MIT License
def compare_pic(self, feature1, feature2):
	predicts = pw.pairwise_distances(feature2, feature1, 'cosine')
	#predicts = pw.cosine_similarity(feature1, feature2)
	return predicts
Example 34
Project: apachecn_ml   Author: ys1305   File: sklearn-RS-demo-cf-item-test.py    GNU General Public License v3.0
def calc_similarity(self):
        # Build the user-item matrices: one for the training data and one for the test data:
        self.train_mat = np.zeros((self.n_users, self.n_items))
        for line in self.train_data.itertuples():
            self.train_mat[int(line.user_id) - 1,
                           int(line.item_id) - 1] = float(line.rating)
        self.test_mat = np.zeros((self.n_users, self.n_items))
        for line in self.test_data.itertuples():
            # print "line", line.user_id-1, line.item_id-1, line.rating
            self.test_mat[int(line.user_id) - 1,
                          int(line.item_id) - 1] = float(line.rating)

        # Use sklearn's pairwise_distances to compute cosine similarity.
        print("1:", np.shape(np.mat(self.train_mat).T))  # rows: movies, columns: users
        # movie-to-movie distances, shape (1682, 1682)
        self.item_mat_similarity = pairwise_distances(
            np.mat(self.train_mat).T, metric='cosine')
        print('item_mat_similarity=', np.shape(
            self.item_mat_similarity), file=sys.stderr)

        print('Counting the number of popular items...', file=sys.stderr)

        # Count, across all users, how many times each movie occurs
        for i_index in range(self.n_items):
            if np.sum(self.train_mat[:, i_index]) != 0:
                self.item_popular[i_index] = np.sum(
                    self.train_mat[:, i_index] != 0)
                # print "pop=", i_index, self.item_popular[i_index]

        # save the total number of items
        self.item_count = len(self.item_popular)
        print('Total number of popular items = %d' % self.item_count, file=sys.stderr)

    # @profile 
Example 35
Project: apachecn_ml   Author: ys1305   File: RS-sklearn-rating.py    GNU General Public License v3.0
def calc_similarity(n_users, n_items, train_data, test_data):
    # Build the user-item matrices: one for the training data and one for the test data:
    train_data_matrix = np.zeros((n_users, n_items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((n_users, n_items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    # Use sklearn's pairwise_distances to compute cosine similarity.
    print("1:", np.shape(train_data_matrix))  # rows: users, columns: movies
    print("2:", np.shape(train_data_matrix.T))  # rows: movies, columns: users

    user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
    item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")

    print('Counting the number of popular items...', file=sys.stderr)
    item_popular = {}
    # Count, across all users, how many times each movie occurs
    for i_index in range(n_items):
        if np.sum(train_data_matrix[:, i_index]) != 0:
            item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
            # print "pop=", i_index, self.item_popular[i_index]

    # save the total number of items
    item_count = len(item_popular)
    print('Total number of popular items = %d' % item_count, file=sys.stderr)

    return train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular 
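One caveat on the two recommender snippets above: with metric='cosine', pairwise_distances returns the cosine distance, i.e. 1 - cosine similarity, so the *_similarity variables actually hold distances. If true similarities are intended, convert explicitly:

# cosine distance = 1 - cosine similarity, so invert when a similarity is wanted
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')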
Example 36
Project: kd-switch   Author: alherit   File: SeqKNN.py    MIT License
def predictKNNlambda(self, point, label, k):
        if k > 0:
            dists = pairwise.pairwise_distances(X=[point], Y=self.trainingPoints)[0]

            # 1 neighbor = taking the smallest distance = 0th element
            k -= 1
            kthElem = np.partition(dists, kth=k)[k]

            cond = dists <= kthElem

            dist = np.histogram(np.array(self.trainingLabels)[cond], bins=self.alpha_label, range=[0, self.alpha_label])[0]
            dist = dist / np.sum(dist, dtype=float)
            #print(dist)
            prob = dist[label]

            prob = lp.LogWeightProb(prob)

        else:
            prob = self.predictTheta0(label)

        # lambda mix as in the paper: mix with uniform prob (could be theta0 as well)
        prob = self.lmbda * prob + (lp.LogWeightProb(1.) - self.lmbda) * lp.LogWeightProb(1. / self.alpha_label)

        return prob
Example 37
Project: linear_neuron   Author: uglyboxer   File: test_pairwise.py    MIT License
def test_pairwise_parallel():
    wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1}
    metrics = [(pairwise_distances, 'euclidean', {}),
               (pairwise_distances, wminkowski, wminkowski_kwds),
               (pairwise_distances, 'wminkowski', wminkowski_kwds),
               (pairwise_kernels, 'polynomial', {'degree': 1}),
               (pairwise_kernels, callable_rbf_kernel, {'gamma': .1}),
               ]
    for func, metric, kwds in metrics:
        yield check_pairwise_parallel, func, metric, kwds 
Example 38
Project: linear_neuron   Author: uglyboxer   File: test_pairwise.py    MIT License
def test_pairwise_callable_nonstrict_metric():
    # paired_distances should allow callable metric where metric(x, x) != 0
    # Knowing that the callable is a strict metric would allow the diagonal to
    # be left uncalculated and set to 0.
    assert_equal(pairwise_distances([[1]], metric=lambda x, y: 5)[0, 0], 5) 
Example 39
Project: linear_neuron   Author: uglyboxer   File: test_pairwise.py    MIT License
def test_paired_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in iteritems(PAIRED_DISTANCES):
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        S3 = func(csr_matrix(X), csr_matrix(Y))
        assert_array_almost_equal(S, S3)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y) 
Example 40
Project: linear_neuron   Author: uglyboxer   File: test_dbscan.py    MIT License
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters) 
Example 41
Project: Weiss   Author: WangWenjun559   File: test_pairwise.py    Apache License 2.0
def test_pairwise_parallel():
    wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1}
    metrics = [(pairwise_distances, 'euclidean', {}),
               (pairwise_distances, wminkowski, wminkowski_kwds),
               (pairwise_distances, 'wminkowski', wminkowski_kwds),
               (pairwise_kernels, 'polynomial', {'degree': 1}),
               (pairwise_kernels, callable_rbf_kernel, {'gamma': .1}),
               ]
    for func, metric, kwds in metrics:
        yield check_pairwise_parallel, func, metric, kwds 
Example 42
Project: Weiss   Author: WangWenjun559   File: test_pairwise.py    Apache License 2.0
def test_pairwise_callable_nonstrict_metric():
    # paired_distances should allow callable metric where metric(x, x) != 0
    # Knowing that the callable is a strict metric would allow the diagonal to
    # be left uncalculated and set to 0.
    assert_equal(pairwise_distances([[1]], metric=lambda x, y: 5)[0, 0], 5) 
Example 43
Project: Weiss   Author: WangWenjun559   File: test_pairwise.py    Apache License 2.0
def test_paired_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in iteritems(PAIRED_DISTANCES):
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        S3 = func(csr_matrix(X), csr_matrix(Y))
        assert_array_almost_equal(S, S3)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y) 
Example 44
Project: Weiss   Author: WangWenjun559   File: test_dbscan.py    Apache License 2.0
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20, eps=eps, min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters) 
Example 45
Project: Weiss   Author: WangWenjun559   File: test_approximate.py    Apache License 2.0
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.") 
Example 46
Project: Weiss   Author: WangWenjun559   File: test_nearest_centroid.py    Apache License 2.0
def test_precomputed():
    clf = NearestCentroid(metric="precomputed")
    clf.fit(X, y)
    S = pairwise_distances(T, clf.centroids_)
    assert_array_equal(clf.predict(S), true_result) 
Example 47
Project: Weiss   Author: WangWenjun559   File: test_neighbors.py    Apache License 2.0
def test_non_euclidean_kneighbors():
    rng = np.random.RandomState(0)
    X = rng.rand(5, 5)

    # Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    np.sort(dist_array)
    radius = dist_array[15]

    # Test kneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.kneighbors_graph(
            X, 3, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
        assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())

    # Test radiusneighbors_graph
    for metric in ['manhattan', 'chebyshev']:
        nbrs_graph = neighbors.radius_neighbors_graph(
            X, radius, metric=metric).toarray()
        nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
        assert_array_equal(nbrs_graph,
                           nbrs1.radius_neighbors_graph(X).toarray())

    # Raise error when wrong parameters are supplied.
    X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
                  metric='euclidean')
    X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
    X_nbrs.fit(X)
    assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
                  radius, metric='euclidean') 
Example 48
Project: D-GEX   Author: uci-cbcl   File: nodup_idx.py    GNU General Public License v2.0
def main():
    data = np.load(BGEDV2_DATA)
    X = data.transpose()
    
    inlabel = open(LABEL)
    label = []
    for line in inlabel:
        label.append(int(line.strip('\n')))
    
    label = np.array(label)
    inlabel.close()
    
    idx_keep = []
    for k in range(0, K):
        print(k)
        sys.stdout.flush()
        idx_k = np.where(label == k)[0]
        X_k = X[idx_k, :]
        pd_k = pairwise_distances(X_k, metric='euclidean', n_jobs=10)
        idx_k_keep = keep(pd_k, idx_k)
        idx_keep.extend(idx_k_keep.tolist())
        
    idx_keep = np.sort(np.array(idx_keep)).astype('int')
    
    outfile = open('bgedv2_idx_nodup_K100_D1.0.txt', 'w')
    for idx in idx_keep:
        outfile.write(str(idx) + '\n')
    
    outfile.close() 
Example 49
Project: modAL   Author: modAL-python   File: batch.py    MIT License
def select_cold_start_instance(X: modALinput,
                               metric: Union[str, Callable],
                               n_jobs: Union[int, None]) -> Tuple[int, modALinput]:
    """
    Define what to do if our batch-mode sampling doesn't have any labeled data -- a cold start.

    If our ranked batch sampling algorithm doesn't have any labeled data to determine similarity among the uncertainty
    set, this function finds the element with highest average similarity to cold-start the batch selection.

    TODO:
        - Figure out how to test this! E.g. how to create modAL model without training data.
        - Think of optimizing pairwise_distance call for large matrix.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        X: The set of unlabeled records.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        Index of the best cold-start instance from `X` chosen to be labelled; record of the best cold-start instance
        from `X` chosen to be labelled.
    """
    # Compute all pairwise distances in our unlabeled data and obtain the row-wise average for each of our records in X.
    n_jobs = n_jobs if n_jobs else 1
    average_distances = np.mean(pairwise_distances(X, metric=metric, n_jobs=n_jobs), axis=0)

    # Isolate and return our best instance for labeling as the record with the least average distance.
    best_coldstart_instance_index = np.argmin(average_distances)
    return best_coldstart_instance_index, X[best_coldstart_instance_index].reshape(1, -1) 
Example 50
Project: modAL   Author: modAL-python   File: batch.py    MIT License
def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                               X: Union[np.ndarray, sp.csr_matrix],
                               n_instances: int = 20,
                               metric: Union[str, Callable] = 'euclidean',
                               n_jobs: Optional[int] = None,
                               **uncertainty_measure_kwargs
                               ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
    """
    Batch sampling query strategy. Selects the least sure instances for labelling.

    This strategy differs from :func:`~modAL.uncertainty.uncertainty_sampling` because, although it is supported,
    traditional active learning query strategies suffer from sub-optimal record selection when passing
    `n_instances` > 1. This sampling strategy extends the interactive uncertainty query sampling by allowing for
    batch-mode uncertainty query sampling. Furthermore, it also enforces a ranking -- that is, which records among the
    batch are most important for labeling?

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        classifier: One of modAL's supported active learning models.
        X: Set of records to be considered for our active learning model.
        n_instances: Number of records to return for labeling from `X`.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`
        n_jobs: If not set, :func:`~sklearn.metrics.pairwise.pairwise_distances_argmin_min` is used for calculation of
            distances between samples. Otherwise it is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        **uncertainty_measure_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
    """
    uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                 n_instances=n_instances, metric=metric, n_jobs=n_jobs)
    return query_indices, X[query_indices] 
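A hypothetical end-to-end use of this strategy with modAL (estimator and data are illustrative; uncertainty_batch_sampling is assumed importable from modAL.batch):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling

learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=np.random.rand(10, 4),
                        y_training=np.random.randint(0, 2, 10))
X_pool = np.random.rand(100, 4)
query_idx, query_records = uncertainty_batch_sampling(learner, X_pool, n_instances=5)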
Example 51
Project: DynWalks   Author: houchengbin   File: utils.py    MIT License
def pairwise_similarity(mat, type='cosine'):
    ''' pairwise similarity; can be used as score function;
        vectorized computation 
    '''
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPUs for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        result = euclidean_distances(mat)
        result = -result
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        result = manhattan_distances(mat)
        result = -result
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result 
Example 52
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_pairwise_boolean_distance():
    # test that we convert to boolean arrays for boolean distances
    rng = np.random.RandomState(0)
    X = rng.randn(5, 4)
    Y = X.copy()
    Y[0, 0] = 1 - Y[0, 0]

    for metric in PAIRWISE_BOOLEAN_FUNCTIONS:
        for Z in [Y, None]:
            res = pairwise_distances(X, Z, metric=metric)
            res[np.isnan(res)] = 0
            assert_true(np.sum(res != 0) == 0) 
Example 53
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_pairwise_precomputed():
    for func in [pairwise_distances, pairwise_kernels]:
        # Test correct shape
        assert_raises_regexp(ValueError, '.* shape .*',
                             func, np.zeros((5, 3)), metric='precomputed')
        # with two args
        assert_raises_regexp(ValueError, '.* shape .*',
                             func, np.zeros((5, 3)), np.zeros((4, 4)),
                             metric='precomputed')
        # even if shape[1] agrees (although thus second arg is spurious)
        assert_raises_regexp(ValueError, '.* shape .*',
                             func, np.zeros((5, 3)), np.zeros((4, 3)),
                             metric='precomputed')

        # Test not copied (if appropriate dtype)
        S = np.zeros((5, 5))
        S2 = func(S, metric="precomputed")
        assert_true(S is S2)
        # with two args
        S = np.zeros((5, 3))
        S2 = func(S, np.zeros((3, 3)), metric="precomputed")
        assert_true(S is S2)

        # Test always returns float dtype
        S = func(np.array([[1]], dtype='int'), metric='precomputed')
        assert_equal('f', S.dtype.kind)

        # Test converts list to array-like
        S = func([[1.]], metric='precomputed')
        assert_true(isinstance(S, np.ndarray)) 
Example 54
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_pairwise_parallel():
    wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1}
    metrics = [(pairwise_distances, 'euclidean', {}),
               (pairwise_distances, wminkowski, wminkowski_kwds),
               (pairwise_distances, 'wminkowski', wminkowski_kwds),
               (pairwise_kernels, 'polynomial', {'degree': 1}),
               (pairwise_kernels, callable_rbf_kernel, {'gamma': .1}),
               ]
    for func, metric, kwds in metrics:
        yield check_pairwise_parallel, func, metric, kwds 
Example 55
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_pairwise_callable_nonstrict_metric():
    # paired_distances should allow callable metric where metric(x, x) != 0
    # Knowing that the callable is a strict metric would allow the diagonal to
    # be left uncalculated and set to 0.
    assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5) 
Example 56
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_paired_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in iteritems(PAIRED_DISTANCES):
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        S3 = func(csr_matrix(X), csr_matrix(Y))
        assert_array_almost_equal(S, S3)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Check the callable implementation
    S = paired_distances(X, Y, metric='manhattan')
    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
    assert_array_almost_equal(S, S2)

    # Test that a ValueError is raised when the lengths of X and Y differ.
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y) 
Example 57
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_dbscan.py    Apache License 2.0
def test_dbscan_sparse_precomputed():
    D = pairwise_distances(X)
    nn = NearestNeighbors(radius=.9).fit(X)
    D_sparse = nn.radius_neighbors_graph(mode='distance')
    # Ensure it is sparse not merely on diagonals:
    assert D_sparse.nnz < D.shape[0] * (D.shape[0] - 1)
    core_sparse, labels_sparse = dbscan(D_sparse,
                                        eps=.8,
                                        min_samples=10,
                                        metric='precomputed')
    core_dense, labels_dense = dbscan(D, eps=.8, min_samples=10,
                                      metric='precomputed')
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse) 
Example 58
Project: scanorama   Author: brianhie   File: t_sne_approx.py    MIT License
def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    """Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    :math:`U^{(k)}_i` is the set of points that are in the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski
    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.

    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.

    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.

    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]

    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t 
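
Note: a minimal usage sketch for the trustworthiness() helper above, assuming numpy and pairwise_distances are imported as in the surrounding module (PCA is used here purely as an illustrative embedding, it is not part of the source):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.random_sample((50, 10))
X_embedded = PCA(n_components=2).fit_transform(X)
print('trustworthiness: %.3f' % trustworthiness(X, X_embedded, n_neighbors=5))
# values close to 1.0 mean the k-nearest-neighbor structure is well preserved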
Example 68
Project: Deep-SAD-PyTorch   Author: lukasruff   File: kde.py    MIT License 4 votes vote down vote up
def train(self, dataset: BaseADDataset, device: str = 'cpu', n_jobs_dataloader: int = 0,
              bandwidth_GridSearchCV: bool = True):
        """Trains the Kernel Density Estimation model on the training data."""
        logger = logging.getLogger()

        # do not drop last batch for non-SGD optimization shallow_ssad
        train_loader = DataLoader(dataset=dataset.train_set, batch_size=128, shuffle=True,
                                  num_workers=n_jobs_dataloader, drop_last=False)

        # Get data from loader
        X = ()
        for data in train_loader:
            inputs, _, _, _ = data
            inputs = inputs.to(device)
            if self.hybrid:
                inputs = self.ae_net.encoder(inputs)  # in hybrid approach, take code representation of AE as features
            X_batch = inputs.view(inputs.size(0), -1)  # X_batch.shape = (batch_size, n_channels * height * width)
            X += (X_batch.cpu().data.numpy(),)
        X = np.concatenate(X)

        # Training
        logger.info('Starting training...')
        start_time = time.time()

        if bandwidth_GridSearchCV:
            # use grid search cross-validation to select bandwidth
            logger.info('Using GridSearchCV for bandwidth selection...')
            params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)}
            hyper_kde = GridSearchCV(KernelDensity(kernel=self.kernel), params, n_jobs=self.n_jobs, cv=5, verbose=0)
            hyper_kde.fit(X)
            self.bandwidth = hyper_kde.best_estimator_.bandwidth
            logger.info('Best bandwidth: {:.8f}'.format(self.bandwidth))
            self.model = hyper_kde.best_estimator_
        else:
            # if exponential kernel, re-initialize kde with bandwidth minimizing the numerical error
            if self.kernel == 'exponential':
                self.bandwidth = np.max(pairwise_distances(X)) ** 2
                self.model = KernelDensity(kernel=self.kernel, bandwidth=self.bandwidth)

            self.model.fit(X)

        train_time = time.time() - start_time
        self.results['train_time'] = train_time

        logger.info('Training Time: {:.3f}s'.format(self.results['train_time']))
        logger.info('Finished training.') 
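
Note: the bandwidth grid search used above works because KernelDensity exposes a score() method (total log-likelihood) that GridSearchCV can maximize. A standalone sketch with synthetic data (all values illustrative):

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

X = np.random.RandomState(0).randn(200, 5)
params = {'bandwidth': np.logspace(0.5, 5, num=10, base=2)}
search = GridSearchCV(KernelDensity(kernel='gaussian'), params, cv=5)
search.fit(X)
print('best bandwidth: %.4f' % search.best_estimator_.bandwidth)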
Example 69
Project: jr-tools   Author: kingjr   File: artefact.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def find_reference(raw, n_cluster, pick_types=None, copy=True,
                   flat_threshold=1e-15, n_split=100, plot=True):
    """ Computes covariance on splits of the raw data, and apply KMeans
    clustering to find the number of disjoint references.
    n_cluster is found with PCA if float
    """
    import matplotlib.pyplot as plt
    from pyriemann.estimation import Covariances
    from sklearn.cluster import KMeans
    from sklearn.metrics.pairwise import pairwise_distances

    if copy:
        raw = raw.copy()
    # Remove flat lines
    flat = np.where(np.std(raw._data, axis=1) < flat_threshold)[0]
    for ch in flat:
        raw.info['bads'] += [raw.ch_names[ch]]

    # Pick data channels only
    if pick_types is None:
        pick_types = dict(seeg=True, exclude='bads')
    raw.pick_types(**pick_types)

    # Compute covariance on data splits
    n_time = len(raw.times)
    t_max = raw.times[n_time - n_time % n_split - 1]
    raw.crop(0, t_max, copy=False)  # ensure regularly sized splits
    X = np.array(np.array_split(raw._data, n_split, axis=1))
    covs = Covariances().fit_transform(X)

    # Compute cluster for each data split
    cluster = KMeans(n_cluster)
    all_kmeans = list()
    for cov in covs:
        dist = pairwise_distances(cov)
        all_kmeans.append(cluster.fit_predict(dist))

    # Combine clusters
    dist = pairwise_distances(np.array(all_kmeans).T)
    idx = cluster.fit_predict(dist)

    if plot:
        idx_ = np.argsort(idx)
        cov = np.median(covs, axis=0)
        plt.matshow(np.log10(cov)[idx_, :][:, idx_])

    clusters = [np.array(raw.ch_names)[idx == ii] for ii in np.unique(idx)]
    return clusters 
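
Note: the clustering-on-distances idiom used twice above can be reproduced without MNE or pyriemann; KMeans is fit on each channel's pairwise-distance profile rather than on the raw covariance (synthetic data, all parameters illustrative):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances

cov = np.cov(np.random.RandomState(0).randn(8, 200))  # 8 "channels"
dist = pairwise_distances(cov)  # distance between channel covariance profiles
labels = KMeans(n_clusters=2, n_init=10).fit_predict(dist)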
Example 70
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 4 votes vote down vote up
def silhouette_samples_slow(X, labels, metric='euclidean', **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each sample.
    The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.

    labels : array, shape = [n_samples]
             label values for each sample

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by metrics.pairwise.pairwise_distances. If X is the distance
        array itself, use "precomputed" as the metric.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : array, shape = [n_samples]
        Silhouette Coefficient for each sample.

    References
    ----------

    Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational
        and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.

    http://en.wikipedia.org/wiki/Silhouette_(clustering)

    """
    metric = distance_metrics()[metric]
    n = labels.shape[0]
    A = np.array([_intra_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    B = np.array([_nearest_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples) 
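
Note: the helpers _intra_cluster_distance_slow and _nearest_cluster_distance_slow are referenced above but not included in this snippet. A minimal sketch consistent with the docstring (mean intra-cluster distance a, mean nearest-cluster distance b):

def _intra_cluster_distance_slow(X, labels, metric, i):
    # Mean distance from sample i to the other members of its own cluster.
    # np.mean of an empty list yields nan for singleton clusters, which the
    # caller maps to 0 via np.nan_to_num.
    indices = np.where(labels == labels[i])[0]
    return np.mean([metric(X[i], X[j]) for j in indices if j != i])

def _nearest_cluster_distance_slow(X, labels, metric, i):
    # Smallest mean distance from sample i to the members of any other cluster.
    label = labels[i]
    return np.min([np.mean([metric(X[i], X[j])
                            for j in np.where(labels == cur_label)[0]])
                   for cur_label in set(labels) if cur_label != label])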
Example 71
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 4 votes vote down vote up
def silhouette_samples_block(X, labels, metric='euclidean', n_jobs=1, **kwds):
    """Compute the Silhouette Coefficient for each sample.

    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each sample.
    The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.

    This function returns the Silhouette Coefficient for each sample.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.

    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.

    labels : array, shape = [n_samples]
             label values for each sample

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by metrics.pairwise.pairwise_distances. If X is the distance
        array itself, use "precomputed" as the metric.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    silhouette : array, shape = [n_samples]
        Silhouette Coefficient for each sample.

    References
    ----------

    Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational
        and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.

    http://en.wikipedia.org/wiki/Silhouette_(clustering)

    """
    A = _intra_cluster_distances_block(X, labels, metric, n_jobs=n_jobs,
                                       **kwds)
    B = _nearest_cluster_distance_block(X, labels, metric, n_jobs=n_jobs,
                                        **kwds)
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples) 
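
Note: a quick sanity check for both variants above, assuming the distance helpers are defined; scikit-learn's built-in silhouette_samples should produce the same values on toy data:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

X, labels = make_blobs(n_samples=60, centers=3, random_state=0)
reference = silhouette_samples(X, labels, metric='euclidean')
# e.g. np.testing.assert_array_almost_equal(reference,
#          silhouette_samples_slow(X, labels))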
Example 72
Project: political-affiliation-prediction   Author: kirel   File: newsreader.py    MIT License 4 votes vote down vote up
def pairwise_dists(data, nneighbors=10, folder='model', dist='l2'):
    '''

    Computes pairwise distances between bag-of-words vectors of articles

    INPUT
    folder      model folder
    nneighbors  number of closest neighbors to include in distance list

    '''
    stopwords = codecs.open("stopwords.txt", "r", encoding="utf-8", errors='ignore').readlines()[5:]
    stops = map(lambda x:x.lower().strip(),stopwords)

    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)
    print 'Computing %s pairwise distances'%dist
    # KPCA transform bow vectors
    if dist == 'l2_kpca_zscore':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
        perc = 50.0
        width = percentile(K.flatten(),perc)
        Xc = zscore(KernelPCA(n_components=50,kernel='rbf',gamma=width).fit_transform(X))
        K = pairwise_distances(Xc,metric='l2',n_jobs=1)
    elif dist == 'l2_kpca':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
        perc = 100./len(data)
        width = percentile(K.flatten(),perc)
        Xc = KernelPCA(n_components=50,kernel='rbf',gamma=width).fit_transform(X)
        K = pairwise_distances(Xc,metric='l2',n_jobs=1)
    elif dist == 'l2':
        K = pairwise_distances(X,metric='l2',n_jobs=1)
    elif dist == 'l1':
        K = pairwise_distances(X,metric='l1',n_jobs=1)

    # collect closest neighbors
    distances = []
    for urlidx in range(len(data)):
        idx =  (K[urlidx,:]).argsort()[1:nneighbors+1]
        for sidx in idx:
            distances.append([urlidx,sidx,(idx==sidx).nonzero()[0][0]])

    return distances 
Example 73
Project: political-affiliation-prediction   Author: kirel   File: newsreader.py    MIT License 4 votes vote down vote up
def kpca_cluster(data,nclusters=100,ncomponents=40,topwhat=10,zscored=False):
    '''

    Computes clustering of bag-of-words vectors of articles

    INPUT
    folder      model folder
    nclusters   number of clusters

    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x:x.lower().strip(),open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords 
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print 'Computing pairwise distances' 
    K = pairwise_distances(X,metric='l2',n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(),perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents,kernel='rbf',gamma=width).fit_transform(X)
    
    if zscored:
        Xc = zscore(Xc)
    
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc==icluster).sum()
        if True:#nmembers < len(data) / 5.0 and nmembers > 1: # only group clusters big enough but not too big
            members = (Xc==icluster).nonzero()[0]
            topwordidx = array(X[members,:].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members,:],metric='l2',n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members))/2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name':'Cluster-%d'%icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
                })

    return clusters 
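
Note: the two functions above share a kernel-width heuristic: gamma for the RBF KernelPCA is set to a percentile of the pairwise L2 distances. In isolation (np.percentile stands in for the percentile name imported elsewhere in the source; all values illustrative):

import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.RandomState(0).rand(100, 20)
K = pairwise_distances(X, metric='l2')
width = np.percentile(K.flatten(), 50.0)
# the source passes the raw distance percentile as gamma; a common
# alternative parameterization is gamma = 1.0 / (2 * width ** 2)
Xc = KernelPCA(n_components=10, kernel='rbf', gamma=width).fit_transform(X)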
Example 74
Project: progressivis   Author: jdfekete   File: pairwise.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def run_step(self,run_number,step_size,howlong):
        dfslot = self.get_input_slot('table')
        df = dfslot.data()
        dfslot.update(run_number)
        if dfslot.updated.any() or dfslot.deleted.any():        
            dfslot.reset()
            logger.info('Resetting history because of changes in the input table')
            dfslot.update(run_number)
            #TODO: be smarter with changed values

        m = step_size
        
        indices = dfslot.created.next(m)
        m = indices_len(indices)

        i = None
        j = None
        Si = self._table['document']

        arrayslot = self.get_input_slot('array')
        if arrayslot is not None and arrayslot.data() is not None:
            array = arrayslot.data()
            logger.debug('Using array instead of DataFrame columns')
            if Si is not None:
                i = array[self._last_index]
            j = array[indices]
        if j is None:
            if self.columns is None:
                self.columns = df.columns.delete(np.where(df.columns==UPDATE_COLUMN))
            elif not isinstance(self.columns, pd.Index):
                self.columns = pd.Index(self.columns)
            rows = df[self.columns]
            if Si is not None:
                i = rows.loc[self._last_index]
                assert len(i)==len(self._last_index)
            j = rows.loc[fix_loc(indices)]
            assert len(j)==indices_len(indices)

        Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
        if Si is None:
            mat = self._buf.resize(Sj.shape[0])
            mat[:,:] = Sj
            self._last_index = dfslot.last_index[indices]
        else:
            Sij = pairwise_distances(i,j, metric=self._metric, n_jobs=self._n_jobs)
            n0 = i.shape[0]
            n1 = n0+j.shape[0]
            mat = self._buf.resize(n1)
            mat[0:n0,n0:n1] = Sij
            mat[n0:n1,0:n0] = Sij.T
            mat[n0:n1,n0:n1] = Sj
            self._last_index = self._last_index.append(df.index[indices])
            #truth = pairwise_distances(array[0:n1], metric=self._metric)
            #import pdb
            #pdb.set_trace()
            #assert np.allclose(mat,truth)
        return self._return_run_step(self.next_state(dfslot), steps_run=m) 
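
Note: the block update above grows a symmetric distance matrix incrementally: Sj is the new-vs-new block and Sij the old-vs-new block. A standalone numpy sketch of the same bookkeeping (the progressivis buffer machinery is omitted):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

old = np.random.RandomState(0).randn(5, 3)  # rows already in the matrix
new = np.random.RandomState(1).randn(2, 3)  # newly arrived rows

Sj = pairwise_distances(new)                # new-vs-new block
Sij = pairwise_distances(old, new)          # old-vs-new block
n0, n1 = old.shape[0], old.shape[0] + new.shape[0]

mat = np.zeros((n1, n1))
mat[:n0, :n0] = pairwise_distances(old)     # block computed in earlier steps
mat[0:n0, n0:n1] = Sij
mat[n0:n1, 0:n0] = Sij.T
mat[n0:n1, n0:n1] = Sj
assert np.allclose(mat, pairwise_distances(np.vstack([old, new])))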
Example 75
Project: image-classifier   Author: gustavkkk   File: eval-all.py    MIT License 4 votes vote down vote up
def evaluate2(self,metric='cosine'):
	feature1=np.fromfile('./features/' + model_name +'-features.dat',dtype=np.float64)
	feature1=np.reshape(feature1,(class_size,feature_size))
	#np.savetxt('feature1.txt', feature1, delimiter=',')
	
	class_index = 0
	image_index = 0
	total_count = 0.0
	accept_sum = 0
	actual = []
	predict = []
	for filename in filenames:
	    #query-feature
	    X=self.read_imagelist(filelist_path + filename + extension)
	    test_num=np.shape(X)[0]
	    out = self.forward_all(data=X)
	    feature2=np.float64(out['deepid'])
	    feature2=np.reshape(feature2,(test_num,feature_size))
	    #np.savetxt('feature2.txt', feature2, delimiter=',')
	    #mt=pw.pairwise_distances(feature2, feature1, metric=metric)
	    mt=pw.cosine_similarity(feature2, feature1)
	    false=0
	    for i in range(test_num):
		actual.append(class_index)
		for j in range(class_size):
		   if np.max(mt[i]) == mt[i][j]:
			confusion_array[j] += 1	
			predict.append(j)
		image_index += 1

	    total_count += test_num
	    accept_sum += confusion_array[class_index]
	    class_index += 1
	
	print 'total:%d' % (round(total_count))
	print 'accept:%d' % (accept_sum)
	print 'reject:%d' % (round(total_count) - accept_sum)
	print 'accuracy:%.4f' % (accept_sum / total_count)

	#conf_mat = confusion_matrix(actual,predict)
	#print(conf_mat)
	#actual = np.array(actual)
	#predict = np.array(predict)
	#y_actual = pd.Series(actual, name='Actual')
	#y_predict = pd.Series(predict, name='Predicted')
	#df_confusion = pd.crosstab(y_actual,y_predict, rownames=['Actual'], colnames=['Predicted'], margins=True)
	#print(df_confusion)
	#plot_confusion_matrix(df_confusion)
	return (accept_sum / total_count)
	
    #process a text file 
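
Note: the commented-out pairwise_distances call and the cosine_similarity call above are interchangeable up to an affine transform: scikit-learn's cosine distance is one minus cosine similarity, so an argmax over similarity equals an argmin over distance. A quick check:

import numpy as np
from sklearn.metrics import pairwise as pw

a = np.random.RandomState(0).rand(3, 4)
b = np.random.RandomState(1).rand(2, 4)
np.testing.assert_array_almost_equal(
    pw.pairwise_distances(a, b, metric='cosine'),
    1.0 - pw.cosine_similarity(a, b))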
Example 76
Project: linear_neuron   Author: uglyboxer   File: test_pairwise.py    MIT License 4 votes vote down vote up
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) 
Example 77
Project: Weiss   Author: WangWenjun559   File: test_pairwise.py    Apache License 2.0 4 votes vote down vote up
def test_pairwise_distances_argmin_min():
    # Check pairwise minimum distances computation for any metric
    X = [[0], [1]]
    Y = [[-1], [2]]

    Xsp = dok_matrix(X)
    Ysp = csr_matrix(Y, dtype=np.float32)

    # euclidean metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean")
    D2 = pairwise_distances_argmin(X, Y, metric="euclidean")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # sparse matrix case
    Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
    assert_array_equal(Dsp, D)
    assert_array_equal(Esp, E)
    # We don't want np.matrix here
    assert_equal(type(Dsp), np.ndarray)
    assert_equal(type(Esp), np.ndarray)

    # Non-euclidean sklearn metric
    D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan")
    D2 = pairwise_distances_argmin(X, Y, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(D2, [0, 1])
    assert_array_almost_equal(E, [1., 1.])
    D, E = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
    D2 = pairwise_distances_argmin(Xsp, Ysp, metric="manhattan")
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (callable)
    D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski,
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Non-euclidean Scipy distance (string)
    D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski",
                                         metric_kwargs={"p": 2})
    assert_array_almost_equal(D, [0, 1])
    assert_array_almost_equal(E, [1., 1.])

    # Compare with naive implementation
    rng = np.random.RandomState(0)
    X = rng.randn(97, 149)
    Y = rng.randn(111, 149)

    dist = pairwise_distances(X, Y, metric="manhattan")
    dist_orig_ind = dist.argmin(axis=0)
    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]

    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
        X, Y, axis=0, metric="manhattan", batch_size=50)
    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) 
Example 78
Project: Weiss   Author: WangWenjun559   File: test_approximate.py    Apache License 2.0 4 votes vote down vote up
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees to the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal from the query vector hence at a distance
    # exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear but with opposite sign to the query
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2]) 
Example 79
Project: modAL   Author: modAL-python   File: batch.py    MIT License 4 votes vote down vote up
def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
                 unlabeled: modALinput,
                 uncertainty_scores: np.ndarray,
                 n_instances: int,
                 metric: Union[str, Callable],
                 n_jobs: Union[int, None]) -> np.ndarray:
    """
    Query our top :n_instances: to request for labeling.

    Refer to Cardoso et al.'s "Ranked batch-mode active learning":
        https://www.sciencedirect.com/science/article/pii/S0020025516313949

    Args:
        classifier: One of modAL's supported active learning models.
        unlabeled: Set of records to be considered for our active learning model.
        uncertainty_scores: Our classifier's predictions over the response variable.
        n_instances: Limit on the number of records to query from our unlabeled set.
        metric: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.
        n_jobs: This parameter is passed to :func:`~sklearn.metrics.pairwise.pairwise_distances`.

    Returns:
        The indices of the top n_instances ranked unlabeled samples.
    """
    # Make a local copy of our classifier's training data.
    # Define our record container and record the best cold start instance in the case of cold start.
    if classifier.X_training is None:
        best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
        instance_index_ranking = [best_coldstart_instance_index]
    elif classifier.X_training.shape[0] > 0:
        labeled = classifier.X_training[:]
        instance_index_ranking = []
    
    # The maximum number of records to sample.
    ceiling = np.minimum(unlabeled.shape[0], n_instances) - len(instance_index_ranking)

    # mask for unlabeled initialized as transparent
    mask = np.ones(unlabeled.shape[0], dtype=bool)

    for _ in range(ceiling):

        # Receive the instance and corresponding index from our unlabeled copy that scores highest.
        instance_index, instance, mask = select_instance(X_training=labeled, X_pool=unlabeled,
                                                         X_uncertainty=uncertainty_scores, mask=mask,
                                                         metric=metric, n_jobs=n_jobs)

        # Add our instance we've considered for labeling to our labeled set. Although we don't
        # know it's label, we want further iterations to consider the newly-added instance so
        # that we don't query the same instance redundantly.
        labeled = data_vstack((labeled, instance))

        # Finally, append our instance's index to the bottom of our ranking.
        instance_index_ranking.append(instance_index)

    # Return numpy array, not a list.
    return np.array(instance_index_ranking) 
Example 80
Project: keras_cbof   Author: passalis   File: cbof.py    MIT License 4 votes vote down vote up
def initialize_bof_layers(model, data, n_samples=100, n_feature_samples=5000, batch_size=32, k_means_max_iters=300,
                          k_means_n_init=4):
    """
    Initializes the BoF layers of a keras model

    :param model: the keras model
    :param data: data to be used for initializing the model
    :param n_samples: number of data samples used for the initializes
    :param n_feature_samples: number of feature vectors to be used for the clustering process
    :param batch_size:
    :param k_means_max_iters: the maximum number of iterations for the clustering algorithm (k-means)
    :param k_means_n_init: defines how many times to run the k-means algorithm
    :return:
    """

    for i in range(len(model.layers)):
        if isinstance(model.layers[i], BoF_Pooling):
            print("Found BoF layer (layer %d), initializing..." % i)
            cur_layer = model.layers[i]

            # Compile a function for getting the feature vectors
            get_features = K.function([model.input] + [K.learning_phase()], [model.layers[i - 1].output])

            features = []
            for j in range(int(n_samples / batch_size)):
                cur_feats = get_features([data[j * batch_size:(j + 1) * batch_size], 0])[0]
                features.append(cur_feats.reshape((-1, cur_feats.shape[3])))
            features = np.concatenate(features)
            np.random.shuffle(features)
            features = features[:n_feature_samples]

            # Cluster the features
            kmeans = KMeans(n_clusters=cur_layer.N_k, n_init=k_means_n_init, max_iter=k_means_max_iters)
            kmeans.fit(features)
            V = kmeans.cluster_centers_.T
            V = V.reshape((1, 1, V.shape[0], V.shape[1]))

            # Set the value for the codebook
            K.set_value(cur_layer.V, np.float32(V))

            # Get the mean distance for initializing the sigmas
            mean_dist = np.mean(pairwise_distances(features[:100]))

            # Set the value for sigmas
            sigmas = np.ones((1, 1, 1, cur_layer.N_k)) * (mean_dist ** 2)
            K.set_value(cur_layer.sigmas, np.float32(sigmas))
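
Note: the sigma initialization above, in isolation: the RBF widths are seeded with the squared mean pairwise distance of a feature subsample (shapes illustrative; 32 stands in for the layer's codeword count N_k):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

features = np.random.RandomState(0).rand(500, 64)
mean_dist = np.mean(pairwise_distances(features[:100]))
sigmas = np.ones((1, 1, 1, 32)) * (mean_dist ** 2)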