Python sklearn.metrics.pairwise_distances() Examples

The following are 30 code examples of sklearn.metrics.pairwise_distances(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the module sklearn.metrics.
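Before the examples, here is a minimal usage sketch of the function itself on toy data:

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
Y = np.array([[0.0, 1.0]])

D_XX = pairwise_distances(X, metric='euclidean')  # (2, 2) matrix of self-distances
D_XY = pairwise_distances(X, Y, metric='cosine')  # (2, 1) matrix of cross-distances
print(D_XX)        # [[0. 5.] [5. 0.]]
print(D_XY.shape)  # (2, 1)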
Example #1
Source File: test_lof.py    From scikit-hubness with BSD 3-Clause "New" or "Revised" License
def test_lof_precomputed(algorithm, random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    local_rng = np.random.RandomState(random_state)
    X = local_rng.random_sample((10, 4))
    Y = local_rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3,
                                         algorithm=algorithm,
                                         novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
                                         metric='precomputed', novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y) 
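The pattern above (fitting on a precomputed distance matrix instead of raw features) also works outside tests. A minimal sketch, using NearestNeighbors in place of LocalOutlierFactor:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.random_sample((10, 4))
DXX = pairwise_distances(X, metric='euclidean')

nn = NearestNeighbors(n_neighbors=3, metric='precomputed')
nn.fit(DXX)                     # rows and columns both index the 10 samples
dist, ind = nn.kneighbors(DXX)  # queries are given as distances to the fitted samples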
Example #2
Source File: rank_knowledge_for_mc_qa.py    From OpenBookQA with Apache License 2.0
def combine_similarities(scores_per_feat, top=10, combine_feat_scores="mul"):
    """
    Combine similarity scores from multiple independent queries using combine_feat_scores
    :param scores_per_feat: Per-query distance arrays (one per vectorized query),
        e.g. pairwise_distances(q_feat, para_features, "cosine").ravel()
    :param top: Top N facts to keep
    :param combine_feat_scores: The way for combining the multiple scores
    :return: Ranked fact ids with scores List[tuple(id, weight)]
    """
    # scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats]  # this is distance - low is better!!!
    comb_func = comb_funcs[combine_feat_scores]

    smoothing_val = 0.000001
    dists = scores_per_feat[0] + smoothing_val
    if len(scores_per_feat) > 1:
        for i in range(1, len(scores_per_feat)):
            dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest first); ties keep the earlier paragraph

    max_val = max(np.max(dists), 1)
    return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top] 
Example #3
Source File: test_nca.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_simple_example():
    """Test on a simple example.

    Puts four points in the input space, where points with opposite labels are
    next to each other. After the transform, samples from the same class
    should be next to each other instead.

    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])
    nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
                                         random_state=42)
    nca.fit(X, y)
    X_t = nca.transform(X)
    assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
                       np.array([2, 3, 0, 1])) 
Example #4
Source File: 2_kmeans-with-text-data_blank.py    From Coursera-UW-Machine-Learning-Clustering-Retrieval with MIT License
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    
    heterogeneity = 0.0
    for i in range(k):
        
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment==i, :]
        
        if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)
        
    return heterogeneity


# Let's compute the cluster heterogeneity for the 2-cluster example we've been considering based on our current cluster assignments and centroids.

# In[26]: 
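A hypothetical call of compute_heterogeneity on toy data (not the course's tf_idf matrix):

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
data = rng.randn(100, 5)
centroids = data[[0, 1]]  # two arbitrary rows as centroids
cluster_assignment = np.argmin(
    pairwise_distances(data, centroids, metric='euclidean'), axis=1)
print(compute_heterogeneity(data, 2, centroids, cluster_assignment))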
Example #5
Source File: k_means_clust.py    From Python with MIT License
def compute_heterogeneity(data, k, centroids, cluster_assignment):

    heterogeneity = 0.0
    for i in range(k):

        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(
                member_data_points, [centroids[i]], metric="euclidean"
            )
            squared_distances = distances ** 2
            heterogeneity += np.sum(squared_distances)

    return heterogeneity 
Example #6
Source File: rank_knowledge_for_mc_qa.py    From OpenBookQA with Apache License 2.0
def get_similarities(query_feats, para_features, top=10, combine_feat_scores="mul"):
    """
    Get similarities based on multiple independent queries that are then combined using combine_feat_scores
    :param query_feats: Multiple vectorized text queries
    :param para_features: Multiple vectorized text paragraphs that will be scored against the queries
    :param top: Top N facts to keep
    :param combine_feat_scores: The way for combining the multiple scores
    :return: Ranked fact ids with scores List[tuple(id, weight)]
    """
    scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats]  # this is distance - low is better!!!
    comb_func = comb_funcs[combine_feat_scores]

    smoothing_val = 0.000001
    max_val = pow((1 + smoothing_val), 2)
    dists = scores_per_feat[0] + smoothing_val
    if len(scores_per_feat) > 1:
        for i in range(1, len(scores_per_feat)):
            dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest first); ties keep the earlier paragraph

    return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top] 
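A hypothetical usage sketch; comb_funcs is a module-level dict not shown in this snippet, assumed here to be something like {"mul": np.multiply, "sum": np.add}:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# comb_funcs is assumed from the surrounding module, e.g.:
# comb_funcs = {"mul": np.multiply, "sum": np.add}
paras = ["the cat sat on the mat", "dogs bark loudly", "cats purr softly"]
tfidf = TfidfVectorizer().fit(paras)
para_features = tfidf.transform(paras)
query_feats = [tfidf.transform([q]) for q in ("cat mat", "purr")]

print(get_similarities(query_feats, para_features, top=2))  # [[id, weight], ...]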
Example #7
Source File: test_lof.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_lof_precomputed(random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
                                         metric='precomputed', novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y) 
Example #8
Source File: 2_kmeans-with-text-data_blank.py    From Coursera-UW-Machine-Learning-Clustering-Retrieval with MIT License
def assign_clusters(data, centroids):
    
    # Compute distances between each data point and the set of centroids:
    # Fill in the blank (RHS only)
    distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
        
    # Compute cluster assignments for each data point:
    # Fill in the blank (RHS only)
    cluster_assignment = np.argmin(distances_from_centroids, axis=1)
    
    return cluster_assignment


# **Checkpoint**. For the last time, let us check if Step 1 was implemented correctly. With rows 0, 2, 4, and 6 of `tf_idf` as an initial set of centroids, we assign cluster labels to rows 0, 10, 20, ..., and 90 of `tf_idf`. The resulting cluster labels should be `[0, 1, 1, 0, 0, 2, 0, 2, 2, 1]`.

# In[15]: 
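A hypothetical check of assign_clusters on toy data (the course's tf_idf matrix is not reproduced here):

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(100, 5)
centroids = data[[0, 2, 4, 6]]   # rows 0, 2, 4, 6 as the initial centroids
labels = assign_clusters(data, centroids)
print(labels[0:100:10])          # cluster labels for rows 0, 10, ..., 90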
Example #9
Source File: test_distances.py    From pynndescent with BSD 2-Clause "Simplified" License
def spatial_check(metric):
    dist_matrix = pairwise_distances(spatial_data, metric=metric)
    # scipy is bad sometimes
    if metric == "braycurtis":
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
    if metric in ("cosine", "correlation"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0
    dist_function = dist.named_distances[metric]
    test_matrix = np.array(
        [
            [
                dist_function(spatial_data[i], spatial_data[j])
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric {}".format(metric),
    ) 
Example #10
Source File: test_distances.py    From pynndescent with BSD 2-Clause "Simplified" License
def binary_check(metric):
    dist_matrix = pairwise_distances(binary_data, metric=metric)
    if metric in ("jaccard", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
    if metric in ("kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0
    dist_function = dist.named_distances[metric]
    test_matrix = np.array(
        [
            [
                dist_function(binary_data[i], binary_data[j])
                for j in range(binary_data.shape[0])
            ]
            for i in range(binary_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric {}".format(metric),
    ) 
Example #11
Source File: max_repr.py    From ViewAL with MIT License
def _max_representative_samples(self, image_features, candidate_image_features, selection_count):
        all_distances = pairwise_distances(image_features, candidate_image_features, metric='euclidean')
        selected_sample_indices = []
        print('Finding max representative candidates..')
        minimum_distances = np.ones((len(image_features))) * float('inf')
        for _ in tqdm(range(selection_count)):
            current_best_score = float("-inf")
            current_best_idx = None
            current_minimum_distances = None
            for i in range(len(candidate_image_features)):
                if i not in selected_sample_indices:
                    selected_sample_indices.append(i)
                    tmp_distances = np.minimum(minimum_distances, all_distances[:, i])
                    tmp_score = np.sum(tmp_distances) * -1
                    if tmp_score > current_best_score:
                        current_best_score = tmp_score
                        current_minimum_distances = tmp_distances
                        current_best_idx = i
                    selected_sample_indices.pop()
            selected_sample_indices.append(current_best_idx)
            minimum_distances = current_minimum_distances
        return selected_sample_indices 
Example #12
Source File: test_distances.py    From pynndescent with BSD 2-Clause "Simplified" License
def test_seuclidean():
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    test_matrix = np.array(
        [
            [
                dist.standardised_euclidean(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric seuclidean",
    ) 
Example #13
Source File: test_umap_metrics.py    From umap with BSD 3-Clause "New" or "Revised" License
def sparse_spatial_check(metric, sparse_spatial_data):
    # Check that metric is supported for this test, otherwise, fail!
    assert (
        metric in spdist.sparse_named_distances
    ), f"{metric} not supported for sparse data"
    dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric)

    if metric in ("braycurtis", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0

    if metric in ("cosine", "correlation", "kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0

    run_test_sparse_metric(metric, sparse_spatial_data, dist_matrix) 
Example #14
Source File: class_separation.py    From SecuML with GNU General Public License v2.0
def computer_perf(self, instances):
        X = instances.features.get_values()
        labels = instances.ground_truth.get_labels()
        # For unsupervised projection methods,
        # the performance is always computed with the labels
        # (not the families).
        if hasattr(self.projection.conf, 'multiclass'):
            if self.projection.conf.multiclass:
                labels = instances.ground_truth.get_families()
        unique_labels, label_inds = np.unique(labels, return_inverse=True)
        ratio = 0
        for li in range(len(unique_labels)):
            Xc = X[label_inds == li]
            Xnc = X[label_inds != li]
            ratio += pairwise_distances(Xc).mean() / \
                pairwise_distances(Xc, Xnc).mean()
        self.class_separation = ratio / len(unique_labels) 
Example #15
Source File: test_nca.py    From scikit-hubness with BSD 3-Clause "New" or "Revised" License
def test_simple_example():
    """Test on a simple example.

    Puts four points in the input space, where points with opposite labels are
    next to each other. After the transform, samples from the same class
    should be next to each other instead.

    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])
    nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
                                         random_state=42)
    nca.fit(X, y)
    X_t = nca.transform(X)
    assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
                       np.array([2, 3, 0, 1])) 
Example #16
Source File: test_neighbors.py    From scikit-hubness with BSD 3-Clause "New" or "Revised" License
def test_precomputed_cross_validation():
    # Ensure array is split correctly
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        metric_score = cross_val_score(Est(algorithm_params={'n_candidates': 5}), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed',
                                            algorithm_params={'n_candidates': 5},
                                            ),
                                        D, y)
        assert_array_equal(metric_score, precomp_score) 
Example #17
Source File: pairwise.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def pairwise_distances(
    X: ArrayLike,
    Y: ArrayLike,
    metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
    n_jobs: Optional[int] = None,
    **kwargs: Any
):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y),))
    return X.map_blocks(
        metrics.pairwise_distances,
        Y,
        dtype=float,
        chunks=chunks,
        metric=metric,
        **kwargs
    ) 
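A hypothetical usage of this wrapper: X is a chunked dask array, while Y must fit in memory as a NumPy array:

import dask.array as da
import numpy as np

X = da.random.random((1000, 16), chunks=(250, 16))
Y = np.random.random((5, 16))
D = pairwise_distances(X, Y, metric='euclidean')  # lazy dask array of shape (1000, 5)
print(D.compute().shape)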
Example #18
Source File: stats.py    From nltools with MIT License
def _compute_isc(data, metric='median'):
    ''' Helper function to compute intersubject correlation from observations by subjects array.
        
        Args:
            data: (pd.DataFrame, np.array) observations by subjects where isc is computed across subjects
            metric: (str) how to summarize across subjects ['mean','median']
        
        Returns:
            isc: (float) intersubject correlation coefficient
            
    '''

    from nltools.data import Adjacency

    similarity = Adjacency(1 - pairwise_distances(data.T, metric='correlation'), matrix_type='similarity')
    if metric == 'mean':
        isc = np.tanh(similarity.r_to_z().mean())
    elif metric == 'median':
        isc = similarity.median()
    return isc 
Example #19
Source File: _utils.py    From hyppo with Apache License 2.0
def gaussian(x, workers=None):
    """Default medial gaussian kernel similarity calculation"""
    l1 = pairwise_distances(X=x, metric="l1", n_jobs=workers)
    n = l1.shape[0]
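    # The as_strided view reads the flattened (n x n) matrix as (n - 1) rows of
    # (n + 1) entries, so every diagonal element lands in column 0; slicing off
    # that column leaves exactly the off-diagonal distances for the median.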
    med = np.median(
        np.lib.stride_tricks.as_strided(
            l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
        )[:, 1:]
    )
    # prevents division by zero when used on label vectors
    med = med if med else 1
    gamma = 1.0 / (2 * (med ** 2))
    return rbf_kernel(x, gamma=gamma)


# p-value computation 
Example #20
Source File: kcenter_greedy.py    From active-learning with Apache License 2.0
def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
    """Update min distances given cluster centers.

    Args:
      cluster_centers: indices of cluster centers
      only_new: only calculate distance for newly selected points and update
        min_distances.
      reset_dist: whether to reset min_distances.
    """

    if reset_dist:
      self.min_distances = None
    if only_new:
      cluster_centers = [d for d in cluster_centers
                         if d not in self.already_selected]
    if cluster_centers:
      # Update min_distances for all examples given new cluster center.
      x = self.features[cluster_centers]
      dist = pairwise_distances(self.features, x, metric=self.metric)

      if self.min_distances is None:
        self.min_distances = np.min(dist, axis=1).reshape(-1,1)
      else:
        self.min_distances = np.minimum(self.min_distances, dist) 
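For context, a minimal standalone sketch of the same k-center greedy idea (a hypothetical free function, not the class method above):

import numpy as np
from sklearn.metrics import pairwise_distances

def kcenter_greedy(features, n_select, metric='euclidean'):
    # Start from an arbitrary seed point, then repeatedly add the point
    # farthest from the current set of centers.
    selected = [0]
    min_dist = pairwise_distances(features, features[selected], metric=metric).ravel()
    for _ in range(n_select - 1):
        idx = int(np.argmax(min_dist))
        selected.append(idx)
        new_dist = pairwise_distances(features, features[[idx]], metric=metric).ravel()
        min_dist = np.minimum(min_dist, new_dist)
    return selected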
Example #21
Source File: test_umap_metrics.py    From umap with BSD 3-Clause "New" or "Revised" License
def test_mahalanobis(spatial_data):
    v = np.cov(np.transpose(spatial_data))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array(
        [
            [
                dist.mahalanobis(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric mahalanobis",
    ) 
Example #22
Source File: test_umap_metrics.py    From umap with BSD 3-Clause "New" or "Revised" License
def test_weighted_minkowski(spatial_data):
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
    test_matrix = np.array(
        [
            [
                dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric weighted_minkowski",
    ) 
Example #23
Source File: test_umap_metrics.py    From umap with BSD 3-Clause "New" or "Revised" License
def sparse_binary_check(metric, sparse_binary_data):
    # Check that metric is supported for this test, otherwise, fail!
    assert (
        metric in spdist.sparse_named_distances
    ), f"{metric} not supported for sparse data"
    dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric)
    if metric in ("jaccard", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0

    if metric in ("kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0

    run_test_sparse_metric(metric, sparse_binary_data, dist_matrix)


# --------------------
# Spatial Metric Tests
# -------------------- 
Example #24
Source File: test_distances.py    From pynndescent with BSD 2-Clause "Simplified" License
def test_mahalanobis():
    v = np.cov(np.transpose(spatial_data))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array(
        [
            [
                dist.mahalanobis(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match " "for metric mahalanobis",
    ) 
Example #25
Source File: test_neighbors.py    From scikit-hubness with BSD 3-Clause "New" or "Revised" License
def test_pairwise_boolean_distance():
    # Non-regression test for #4523
    # 'brute': uses scipy.spatial.distance through pairwise_distances
    # 'ball_tree': uses sklearn.neighbors.dist_metrics
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(6, 5))
    NN = neighbors.NearestNeighbors

    nn1 = NN(metric="jaccard", algorithm='brute').fit(X)
    nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X)
    assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0]) 
Example #26
Source File: test_lower_bounds.py    From pyts with BSD 3-Clause "New" or "Revised" License
def test_lower_bounds_inequalities():
    """Test that the expected inequalities are verified."""
    # Toy dataset
    rng = np.random.RandomState(42)
    n_samples_train, n_samples_test, n_timestamps = 20, 30, 60
    window_size = 0.1
    X_train = rng.randn(n_samples_train, n_timestamps)
    X_test = rng.randn(n_samples_test, n_timestamps)

    # DTW
    X_dtw = pairwise_distances(X_test, X_train, dtw)
    region = sakoe_chiba_band(n_timestamps, window_size=window_size)
    X_dtw_window = pairwise_distances(X_test, X_train, _dtw_sakoechiba,
                                      window_size=window_size)

    # Lower bounds
    lb_yi = lower_bound_yi(X_train, X_test)
    lb_kim = lower_bound_kim(X_train, X_test)
    lb_keogh = lower_bound_keogh(X_train, X_test, region)
    lb_improved = lower_bound_improved(X_train, X_test, region)

    # Sanity check
    EPS = 1e-8
    np.testing.assert_array_less(lb_yi, X_dtw + EPS)
    np.testing.assert_array_less(lb_kim, X_dtw + EPS)
    np.testing.assert_array_less(lb_keogh, X_dtw_window + EPS)
    np.testing.assert_array_less(lb_improved, X_dtw_window + EPS)
    np.testing.assert_array_less(lb_keogh, lb_improved + EPS) 
Example #27
Source File: test_latentdistributiontest.py    From graspy with Apache License 2.0
def test_callable_metric(self):
        def metric_func(X, Y=None, workers=None):
            return pairwise_distances(X, metric="euclidean") * 0.5

        ldt = LatentDistributionTest("dcorr", metric_func, n_bootstraps=10)
        ldt.fit(self.A1, self.A2) 
Example #28
Source File: preprocessed_corpus.py    From RE3QA with Apache License 2.0
def prune(self, question: List[str], paragraphs: List[ExtractedParagraphWithAnswers]):
        tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
        text = []
        for para in paragraphs:
            text.append(" ".join(para.text))
        try:
            para_features = tfidf.fit_transform(text)
            q_features = tfidf.transform([" ".join(question)])
        except ValueError:
            return []

        dists = pairwise_distances(q_features, para_features, "cosine").ravel() # [N]
        sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))  # in case of ties, use the earlier paragraph [N]

        selection = [i for i in sorted_ix[:self.n_to_select]]
        selected_paras = [paragraphs[i] for i in selection]
        ir_hit = 0. if all(len(x.answer_spans) == 0 for x in selected_paras) else 1.

        if self.is_training and not ir_hit:
            gold_indexes = [i for i, x in enumerate(paragraphs) if len(x.answer_spans) != 0]
            gold_index = random.choice(gold_indexes)
            selection[-1] = gold_index

        if self.sort_passage:
            selection = np.sort(selection)

        return [paragraphs[i] for i in selection], ir_hit 
Example #29
Source File: embedding.py    From word-embeddings-benchmarks with MIT License
def nearest_neighbors(self, word, k=1, exclude=[], metric="cosine"):
        """
        Find nearest neighbor of given word

        Parameters
        ----------
          word: string or vector
            Query word or vector.

          k: int, default: 1
            Number of nearest neighbours to return.

          metric: string, default: 'cosine'
            Metric to use.

          exclude: list, default: []
            Words to omit in answer

        Returns
        -------
          n: list
            Nearest neighbors.
        """
        if isinstance(word, string_types):
            assert word in self, "Word not found in the vocabulary"
            v = self[word]
        else:
            v = word

        D = pairwise_distances(self.vectors, v.reshape(1, -1), metric=metric)

        if isinstance(word, string_types):
            D[self.vocabulary.word_id[word]] = D.max()

        for w in exclude:
            D[self.vocabulary.word_id[w]] = D.max()

        return [self.vocabulary.id_word[id] for id in D.argsort(axis=0).flatten()[0:k]] 
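A standalone sketch of the same lookup (hypothetical names; unlike the method above, it does not mask the query word itself):

import numpy as np
from sklearn.metrics import pairwise_distances

def nearest_words(vectors, id_word, query_vec, k=3):
    # vectors: (n_words, dim) array; id_word: dict mapping row index to word.
    D = pairwise_distances(vectors, query_vec.reshape(1, -1), metric='cosine').ravel()
    return [id_word[i] for i in np.argsort(D)[:k]]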
Example #30
Source File: omnibus.py    From mvlearn with Apache License 2.0
def fit(self, Xs):
        """
        Fit the model with Xs and apply the embedding on Xs.
        The embeddings are saved as a class attribute.

        Parameters
        ==========
        Xs : list of array-likes or numpy.ndarray
             - Xs length: n_views
             - Xs[i] shape: (n_samples, n_features_i)
            The data to fit and embed. Each X in Xs receives its own
            embedding.
        """
        Xs = check_Xs(Xs)
        dissimilarities = []
        for X in Xs:
            if self.normalize is not None:
                X = normalize(X, norm=self.normalize)
            dissimilarity = pairwise_distances(X, metric=self.distance_metric)

            dissimilarities.append(dissimilarity)

        embedder = OmnibusEmbed(n_components=self.n_components,
                                algorithm=self.algorithm,
                                n_iter=self.n_iter)

        self.embeddings_ = embedder.fit_transform(dissimilarities)