Python sklearn.neighbors Examples

The following are 22 code examples of the sklearn.neighbors module, collected from open-source projects. The originating project and source file are noted above each example. You may also want to check out the other available functions and classes of the sklearn module.
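
Most of the examples below follow the same basic fit-then-query pattern exposed by sklearn.neighbors. A minimal sketch with made-up data (not taken from any of the projects listed here):

import numpy as np
import sklearn.neighbors

# Toy data (made up for illustration): five points in two dimensions.
X = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])

# Build an index and query the two nearest neighbors of each point.
nn = sklearn.neighbors.NearestNeighbors(n_neighbors=2)
nn.fit(X)
distances, indices = nn.kneighbors(X, n_neighbors=2, return_distance=True)
# Both arrays have shape (n_queries, n_neighbors); the nearest neighbor of each
# query point is the point itself, at distance 0.
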
Example #1
Source File: cub_utils.py    From omgh with MIT License 6 votes
def _pre_calculate(self, force=False):
        if self.final_storage.check_exists(self.final_storage.instance_path) and not force:
            self.NNS = self.final_storage.load_instance(self.final_storage.instance_path)
        else:
            self.ssfeature_loader.setup()
            self.Xtrain = self.ssfeature_loader.load_train()
            self.Xtest = self.ssfeature_loader.load_test()
            if self.normalize:
                self.Xtrain = utils.l2_feat_norm(self.Xtrain)
                self.Xtest = utils.l2_feat_norm(self.Xtest)

            self.nn_model = sklearn.neighbors.NearestNeighbors(n_neighbors=self.n_neighbors, algorithm='ball_tree', metric='minkowski', p=2)
            self.nn_model.fit(self.Xtrain)
            self.NNS = self.nn_model.kneighbors(self.Xtest, self.n_neighbors, return_distance=False)
            self.final_storage.save_instance(self.final_storage.instance_path, self.NNS)

        # this needs to change for larger n_neighbors
        if self.n_neighbors == 1:
            self.NNS = self.NNS.T[0]
        else:
            pass 
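
The shape handling at the end of Example #1 relies on kneighbors returning an index array of shape (n_queries, n_neighbors). A small standalone sketch with made-up data showing why the n_neighbors == 1 case is transposed and indexed:

import numpy as np
import sklearn.neighbors

Xtrain = np.random.rand(100, 8)  # made-up training features
Xtest = np.random.rand(10, 8)    # made-up test features

nn_model = sklearn.neighbors.NearestNeighbors(n_neighbors=1, algorithm='ball_tree',
                                              metric='minkowski', p=2)
nn_model.fit(Xtrain)
NNS = nn_model.kneighbors(Xtest, 1, return_distance=False)  # shape (10, 1)
NNS = NNS.T[0]  # flattens to shape (10,): one training index per test sample
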
Example #2
Source File: embeddings.py    From stanza-old with Apache License 2.0 6 votes
def k_nearest_approx(self, vec, k):
        """Get the k nearest neighbors of a vector (in terms of cosine similarity).

        :param (np.array) vec: query vector
        :param (int) k: number of top neighbors to return

        :return (list[tuple[str, float]]): a list of (word, cosine similarity) pairs, in descending order
        """
        if not hasattr(self, 'lshf'):
            self.lshf = self._init_lsh_forest()

        # TODO(kelvin): make this inner product score, to be consistent with k_nearest
        distances, neighbors = self.lshf.kneighbors([vec], n_neighbors=k, return_distance=True)
        scores = np.subtract(1, distances)
        nbr_score_pairs = self._word_to_score(np.squeeze(neighbors), np.squeeze(scores))

        return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True) 
Example #3
Source File: models.py    From jh-kaggle-util with Apache License 2.0 5 votes
def run_sklearn():
  n_trees = 100
  n_folds = 3

  # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
  alg_list = [
      ['rforest',RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1, max_depth=3)],
      ['extree',ExtraTreesClassifier(n_estimators = 1000,max_depth=3,n_jobs=-1)],
      ['adaboost',AdaBoostClassifier(base_estimator=None, n_estimators=600, learning_rate=1.0)],
      ['knn', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5,n_jobs=-1)]
  ]

  start_time = time.time()
  for name,alg in alg_list:
      train = jhkaggle.train_sklearn.TrainSKLearn("1",name,alg,False)
      train.run()
      train = None 
Example #4
Source File: baselines.py    From rmnist with MIT License 5 votes
def baselines(n):
    td, vd, ts = data_loader.load_data(n)
    classifiers = [
        sklearn.svm.SVC(C=1000),
        sklearn.svm.SVC(kernel="linear", C=0.1),
        sklearn.neighbors.KNeighborsClassifier(1),
        sklearn.tree.DecisionTreeClassifier(),
        sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1, hidden_layer_sizes=(500, 100))
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2)) 
Example #5
Source File: transfer.py    From rmnist with MIT License 5 votes
def transfer(n):
    td, vd, ts = data_loader.load_data(n, abstract=True, expanded=expanded)
    classifiers = [
        #sklearn.svm.SVC(),
        #sklearn.svm.SVC(kernel="linear", C=0.1),
        #sklearn.neighbors.KNeighborsClassifier(1),
        #sklearn.tree.DecisionTreeClassifier(),
        #sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1.0, hidden_layer_sizes=(300,), max_iter=500)
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2)) 
Example #6
Source File: embeddings.py    From stanza-old with Apache License 2.0 5 votes
def _init_lsh_forest(self):
        """Construct an LSH forest for nearest neighbor search."""
        import sklearn.neighbors
        lshf = sklearn.neighbors.LSHForest()
        lshf.fit(self.array)
        return lshf 
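
Note that LSHForest was deprecated and later removed from scikit-learn, so the snippet above only works on older releases. A hedged sketch of a drop-in exact-search replacement exposing the same kneighbors interface; the helper name _init_nn_index is hypothetical, and the cosine metric is an assumption chosen to match the cosine-similarity scoring in Example #2:

import sklearn.neighbors

def _init_nn_index(self):
    """Construct an exact nearest-neighbor index as a stand-in for LSHForest."""
    index = sklearn.neighbors.NearestNeighbors(metric='cosine', algorithm='brute')
    index.fit(self.array)
    return index
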
Example #7
Source File: embeddings.py    From stanza-old with Apache License 2.0 5 votes
def k_nearest(self, vec, k):
        """Get the k nearest neighbors of a vector (in terms of highest inner products).

        :param (np.array) vec: query vector
        :param (int) k: number of top neighbors to return

        :return (list[tuple[str, float]]): a list of (word, score) pairs, in descending order
        """
        nbr_score_pairs = self.inner_products(vec)
        return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)[:k] 
Example #8
Source File: test_cli.py    From mlflow with Apache License 2.0 5 votes
def sk_model(iris_data):
    x, y = iris_data
    knn_model = sklearn.neighbors.KNeighborsClassifier()
    knn_model.fit(x, y)
    return knn_model 
Example #9
Source File: k_neighbors_classifier.py    From lale with Apache License 2.0 5 votes
def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = sklearn.neighbors.KNeighborsClassifier(**self._hyperparams) 
Example #10
Source File: utils.py    From NeuroKit with MIT License 5 votes
def _get_embedded(signal, delay=1, dimension=2, r="default", distance="chebyshev", approximate=True, fuzzy=False):
    """Examples
    ----------
    >>> import neurokit2 as nk
    >>>
    >>> signal = nk.signal_simulate(duration=2, frequency=5)
    >>> delay = nk.complexity_delay(signal)
    >>>
    >>> embedded, count = _get_embedded(signal, delay, r=0.2 * np.std(signal, ddof=1), dimension=2,
    ...                                 distance='chebyshev', approximate=False)
    """
    # Sanity checks
    if distance not in sklearn.neighbors.KDTree.valid_metrics:
        raise ValueError(
            "NeuroKit error: _get_embedded(): The given metric (%s) is not valid."
            "The valid metric names are: %s" % (distance, sklearn.neighbors.KDTree.valid_metrics)
        )

    # Get embedded
    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    if approximate is False:
        embedded = embedded[:-1]  # Removes the last line

    if fuzzy is False:
        # Get neighbors count
        count = _get_count(embedded, r=r, distance=distance)
    else:
        # FuzzyEn: Remove the local baselines of vectors
        embedded -= np.mean(embedded, axis=1, keepdims=True)
        count = _get_count_fuzzy(embedded, r=r, distance=distance, n=1)

    return embedded, count


# =============================================================================
# Get Count
# ============================================================================= 
Example #11
Source File: graph.py    From TextCategorization with MIT License 5 votes
def distance_lshforest(z, k=4, metric='cosine'):
    """Return an approximation of the k-nearest cosine distances."""
    assert metric == 'cosine'
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(z)
    dist, idx = lshf.kneighbors(z, n_neighbors=k + 1)
    assert dist.min() < 1e-10
    dist[dist < 0] = 0
    return dist, idx

# TODO: other ANNs s.a. NMSLIB, EFANNA, FLANN, Annoy, sklearn neighbors, PANN 
Example #12
Source File: field_based_ml_field_detection.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0 5 votes
def init_classifier_impl(field_code: str, init_script: str):
    if init_script is not None:
        init_script = init_script.strip()

    if not init_script:
        from sklearn import tree as sklearn_tree
        return sklearn_tree.DecisionTreeClassifier()

    from sklearn import tree as sklearn_tree
    from sklearn import neural_network as sklearn_neural_network
    from sklearn import neighbors as sklearn_neighbors
    from sklearn import svm as sklearn_svm
    from sklearn import gaussian_process as sklearn_gaussian_process
    from sklearn.gaussian_process import kernels as sklearn_gaussian_process_kernels
    from sklearn import ensemble as sklearn_ensemble
    from sklearn import naive_bayes as sklearn_naive_bayes
    from sklearn import discriminant_analysis as sklearn_discriminant_analysis
    from sklearn import linear_model as sklearn_linear_model

    eval_locals = {
        'sklearn_linear_model': sklearn_linear_model,
        'sklearn_tree': sklearn_tree,
        'sklearn_neural_network': sklearn_neural_network,
        'sklearn_neighbors': sklearn_neighbors,
        'sklearn_svm': sklearn_svm,
        'sklearn_gaussian_process': sklearn_gaussian_process,
        'sklearn_gaussian_process_kernels': sklearn_gaussian_process_kernels,
        'sklearn_ensemble': sklearn_ensemble,
        'sklearn_naive_bayes': sklearn_naive_bayes,
        'sklearn_discriminant_analysis': sklearn_discriminant_analysis
    }
    return eval_script('classifier init script of field {0}'.format(field_code), init_script, eval_locals) 
Example #13
Source File: graph.py    From gconvRNN with MIT License 5 votes
def distance_lshforest(z, k=4, metric='cosine'):
    """Return an approximation of the k-nearest cosine distances."""
    assert metric == 'cosine'
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(z)
    dist, idx = lshf.kneighbors(z, n_neighbors=k+1)
    assert dist.min() < 1e-10
    dist[dist < 0] = 0
    return dist, idx

# TODO: other ANNs s.a. NMSLIB, EFANNA, FLANN, Annoy, sklearn neighbors, PANN 
Example #14
Source File: similarity_encoder.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License 5 votes
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128,
                          ngram_range=(3, 3), sparse=False, sample_weight=None,
                          random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than ' +
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes]) 
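
A hypothetical usage sketch of get_kmeans_prototypes; the category strings and parameter values below are made up:

import numpy as np

X = np.array(['london', 'londun', 'paris', 'pariss', 'berlin', 'berlim'])

# Hash char 3-grams, cluster with k-means, then keep the actual sample
# closest to each cluster center as a prototype.
prototypes = get_kmeans_prototypes(X, n_prototypes=3, random_state=0)
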
Example #15
Source File: knn_matting.py    From knn-matting with MIT License 5 votes
def knn_matte(img, trimap, mylambda=100):
    [m, n, c] = img.shape
    img, trimap = img/255.0, trimap/255.0
    foreground = (trimap > 0.99).astype(int)
    background = (trimap < 0.01).astype(int)
    all_constraints = foreground + background

    print('Finding nearest neighbors')
    a, b = np.unravel_index(np.arange(m*n), (m, n))
    feature_vec = np.append(np.transpose(img.reshape(m*n,c)), [ a, b]/np.sqrt(m*m + n*n), axis=0).T
    nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=10, n_jobs=4).fit(feature_vec)
    knns = nbrs.kneighbors(feature_vec)[1]

    # Compute Sparse A
    print('Computing sparse A')
    row_inds = np.repeat(np.arange(m*n), 10)
    col_inds = knns.reshape(m*n*10)
    vals = 1 - np.linalg.norm(feature_vec[row_inds] - feature_vec[col_inds], axis=1)/(c+2)
    A = scipy.sparse.coo_matrix((vals, (row_inds, col_inds)),shape=(m*n, m*n))

    D_script = scipy.sparse.diags(np.ravel(A.sum(axis=1)))
    L = D_script-A
    D = scipy.sparse.diags(np.ravel(all_constraints[:,:, 0]))
    v = np.ravel(foreground[:,:,0])
    c = 2*mylambda*np.transpose(v)
    H = 2*(L + mylambda*D)

    print('Solving linear system for alpha')
    warnings.filterwarnings('error')
    alpha = []
    try:
        alpha = np.minimum(np.maximum(scipy.sparse.linalg.spsolve(H, c), 0), 1).reshape(m, n)
    except Warning:
        x = scipy.sparse.linalg.lsqr(H, c)
        alpha = np.minimum(np.maximum(x[0], 0), 1).reshape(m, n)
    return alpha 
Example #16
Source File: advanced_supvervised_model_trainer.py    From healthcareai-py with MIT License 5 votes
def knn(self,
            scoring_metric='roc_auc',
            hyperparameter_grid=None,
            randomized_search=True,
            number_iteration_samples=10):
        """
        A light wrapper for Sklearn's knn classifier that performs randomized
        search over an overridable default hyperparameter grid.
        
        Args:
            scoring_metric (str): Any sklearn scoring metric appropriate for classification
            hyperparameter_grid (dict): hyperparameters by name
            randomized_search (bool): True for randomized search (default)

            number_iteration_samples (int): Number of models to train during the
            randomized search for exploring the hyperparameter space. More may
            lead to a better model, but will take longer.

        Returns:
            TrainedSupervisedModel: 
        """
        self.validate_classification('KNN')
        if hyperparameter_grid is None:
            neighbors = list(range(5, 26))
            hyperparameter_grid = {'n_neighbors': neighbors, 'weights': ['uniform', 'distance']}
            number_iteration_samples = 10

            print('KNN Grid: {}'.format(hyperparameter_grid))
        algorithm = get_algorithm(KNeighborsClassifier,
                                  scoring_metric,
                                  hyperparameter_grid,
                                  randomized_search,
                                  number_iteration_samples=number_iteration_samples)

        trained_supervised_model = self._create_trained_supervised_model(algorithm)

        return trained_supervised_model 
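
get_algorithm is a healthcareai helper; the randomized search described in the docstring corresponds roughly to scikit-learn's RandomizedSearchCV. A hedged sketch of that underlying pattern, not the library's actual implementation:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

hyperparameter_grid = {'n_neighbors': list(range(5, 26)),
                       'weights': ['uniform', 'distance']}

# Sample 10 configurations from the grid and keep the best by ROC AUC.
search = RandomizedSearchCV(KNeighborsClassifier(),
                            param_distributions=hyperparameter_grid,
                            n_iter=10,
                            scoring='roc_auc')
# search.fit(X_train, y_train) would then expose the winner as search.best_estimator_.
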
Example #17
Source File: mutual_information.py    From NeuroKit with MIT License 5 votes
def _entropy(X, k=1):
    """Returns the entropy of X. From https://gist.github.com/GaelVaroquaux/ead9898bd3c973c40429.

    Parameters
    -----------
    X : array-like or shape (n_samples, n_features)
        The data the entropy of which is computed
    k : int (optional)
        number of nearest neighbors for density estimation

    Returns
    -------
    float
        entropy of X.

    Notes
    ---------
    - Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy of a random vector. Probl. Inf. Transm.
    23, 95-101.
    - Evans, D. 2008 A computationally efficient estimator for mutual information, Proc. R. Soc. A 464 (2093),
    1203-1215.
    - Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual information. Phys Rev E 69(6 Pt 2):066138.

    """

    # Distance to kth nearest neighbor
    r = _nearest_distances(X, k)  # squared distances
    n, d = X.shape
    volume_unit_ball = (np.pi ** (0.5 * d)) / scipy.special.gamma(0.5 * d + 1)

    # Perez-Cruz et al. (2008). Estimation of Information Theoretic Measures for
    # Continuous Random Variables, suggests returning:
    # return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)

    return (
        d * np.mean(np.log(r + np.finfo(X.dtype).eps))
        + np.log(volume_unit_ball)
        + scipy.special.psi(n)
        - scipy.special.psi(k)
    ) 
Example #18
Source File: mutual_information.py    From NeuroKit with MIT License 5 votes
def _nearest_distances(X, k=1):
    """From https://gist.github.com/GaelVaroquaux/ead9898bd3c973c40429
    X = array(N,M)
    N = number of points
    M = number of dimensions
    returns the distance to the kth nearest neighbor for every point in X
    """
    knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k + 1)
    knn.fit(X)
    d, _ = knn.kneighbors(X)  # the first nearest neighbor is itself
    return d[:, -1]  # returns the distance to the kth nearest neighbor 
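
A small sketch of what this helper returns, with made-up data:

import numpy as np

X = np.random.rand(200, 3)      # made up: 200 points in 3 dimensions
r = _nearest_distances(X, k=5)  # r[i] is the distance from X[i] to its 5th nearest neighbor
assert r.shape == (200,)
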
Example #19
Source File: utils.py    From NeuroKit with MIT License 5 votes
def _get_count_fuzzy(embedded, r, distance="chebyshev", n=1):
    dist = sklearn.neighbors.DistanceMetric.get_metric(distance)
    dist = dist.pairwise(embedded)

    if n > 1:
        sim = np.exp(-(dist ** n) / r)
    else:
        sim = np.exp(-dist / r)
    # Return the count
    return np.sum(sim, axis=0)


# =============================================================================
# Get R
# ============================================================================= 
Example #20
Source File: utils.py    From NeuroKit with MIT License 5 votes
def _get_count(embedded, r, distance="chebyshev"):
    kdtree = sklearn.neighbors.KDTree(embedded, metric=distance)
    # Return the count
    return kdtree.query_radius(embedded, r, count_only=True).astype(np.float64) 
Example #21
Source File: runDBSCAN.py    From simsearch with MIT License 4 votes
def findEps(ssearch):
    """
    Find a good epsilon value to use.
    """
    ###########################################################################
    # Calculate nearest neighbors
    ###########################################################################
    
    # Create a nearest neighbors model--we need 2 nearest neighbors since the 
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(ssearch.index.index)
    
    t0 = time.time()
    
    # Find nearest neighbors.
    distances, indices = nbrs_model.kneighbors(ssearch.index.index)
    
    elapsed = time.time() - t0
    
    print 'Took %.2f seconds' % elapsed
    
    distances = [d[1] for d in distances]
    indeces = [ind[1] for ind in indices]
    
    ###########################################################################
    # Histogram the nearest neighbor distances.
    ###########################################################################
    
    import matplotlib.pyplot as plt
    
    counts, bins, patches = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")
    
    print '\n%d bins:' % len(counts)
    
    countAcc = 0
    num_points = len(ssearch.index.index)
    
    for i in range(0, len(counts)):
        countAcc += counts[i]
        
        # Calculate the percentage of values which fall below the upper limit 
        # of this bin.
        prcnt = float(countAcc) / float(num_points) * 100.0    
        
        print '  %.2f%% < %.2f' % (prcnt, bins[i + 1]) 
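
A common variant of the same heuristic is to sort the nearest-neighbor distances and look for the "elbow" in the resulting k-distance curve. A hedged, self-contained sketch (the function name k_distance_plot is made up):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

def k_distance_plot(X, k=2):
    """Plot sorted distances to each point's nearest non-self neighbor (k=2)."""
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine').fit(X)
    distances, _ = nbrs.kneighbors(X)
    kth = np.sort(distances[:, -1])  # farthest of the k returned neighbors per point
    plt.plot(kth)
    plt.xlabel('Points sorted by neighbor distance')
    plt.ylabel('Cosine distance to nearest neighbor')
    plt.title('k-distance curve: the elbow suggests eps')
    plt.show()
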
Example #22
Source File: runDBSCAN.py    From simsearch with MIT License 4 votes
def findMinPts(ssearch, eps):
    """
    Find a good value for MinPts.
    """
    
    ###########################################################################
    # Count neighbors within threshold
    ###########################################################################
    
    print 'Calculating pair-wise distances...'
    # Calculate pair-wise cosine distance for all documents.
    t0 = time.time()
    
    DD = sklearn.metrics.pairwise.cosine_distances(ssearch.index.index)
    
    elapsed = time.time() - t0
    
    print '    Took %.2f seconds' % elapsed
    
    print 'Counting number of neighbors...'
    
    t0 = time.time()
    
    # Create a list to hold the number of neighbors for each point.
    numNeighbors = [0]*len(DD)
    
    for i in range(0, len(DD)):
        dists = DD[i]
        
        count = 0
        for j in range(0, len(DD)):
            if (dists[j] < eps):
                count += 1
    
        numNeighbors[i] = count            
    
    elapsed = time.time() - t0
    
    print '    Took %.2f seconds' % elapsed
    
    ###############################################################################
    # Histogram the nearest neighbor distances.
    ###############################################################################
    
    import matplotlib.pyplot as plt
    
    counts, bins, patches = plt.hist(numNeighbors, bins=60)
    plt.title("Number of neighbors")
    plt.xlabel("Number of neighbors")
    plt.ylabel("Frequency")
    
    print '\n%d bins:' % (len(bins) - 1)
    binsStr = ''
    for b in bins:
        binsStr += '  %0.2f' % b
    
    print binsStr
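
The double loop above counts neighbors in pure Python; the same counts can be computed in one vectorized step over the distance matrix. A hedged sketch with a made-up matrix standing in for DD:

import numpy as np

DD = np.random.rand(100, 100)  # made-up stand-in for the pairwise cosine distances
eps = 0.3

# For each row, count how many entries (including the point itself) fall below eps.
numNeighbors = (DD < eps).sum(axis=1)  # equivalent to the loop in findMinPts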