python source code of object_ranking_data

import numpy as np
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.datasets import make_regression
from sklearn.datasets.samples_generator import make_blobs
from sklearn.gaussian_process.kernels import Matern
from sklearn.utils import check_random_state

from csrank.constants import OBJECT_RANKING
from csrank.numpy_util import scores_to_rankings
from ..synthetic_dataset_generator import SyntheticDatasetGenerator
from ..util import create_pairwise_prob_matrix
from ..util import quicksort

# `hypervolume` from pygmo is used by `make_hv_dataset` below. When pygmo cannot
# be imported, fail at import time with csrank's MissingExtraError instead of
# the bare ImportError.
# NOTE(review): "data" is presumably the name of the optional-dependency group
# that ships pygmo -- confirm against MissingExtraError's signature.
try:
    from pygmo import hypervolume
except ImportError:
    from csrank.util import MissingExtraError

    raise MissingExtraError("pygmo", "data")


class ObjectRankingDatasetGenerator(SyntheticDatasetGenerator):
    def __init__(self, dataset_type="medoid", **kwargs):
        """Select the generator function that produces the ranking data.

        Parameters
        ----------
        dataset_type : str, default "medoid"
            Name of the synthetic problem. One of "linear", "medoid",
            "gp_transitive", "gp_non_transitive" or "hyper_volume".
        **kwargs
            Forwarded unchanged to the parent class constructor.

        Raises
        ------
        ValueError
            If ``dataset_type`` is not one of the supported names.
        """
        super(ObjectRankingDatasetGenerator, self).__init__(
            learning_problem=OBJECT_RANKING, **kwargs
        )
        dataset_function_options = {
            "linear": self.make_linear_transitive,
            "medoid": self.make_intransitive_medoids,
            "gp_transitive": self.make_gp_transitive,
            "gp_non_transitive": self.make_gp_non_transitive,
            "hyper_volume": self.make_hv_dataset,
        }
        # Unknown names are rejected outright; there is no silent fallback to
        # the default dataset type.
        if dataset_type not in dataset_function_options:
            raise ValueError(
                f"dataset_type must be one of {set(dataset_function_options.keys())}"
            )
        self.dataset_function = dataset_function_options[dataset_type]

    def make_linear_transitive(
        self,
        n_instances=1000,
        n_objects=5,
        noise=0.0,
        n_features=100,
        n_informative=10,
        seed=42,
        **kwd,
    ):
        """Create a ranking problem whose latent scores come from a linear model.

        ``n_instances * n_objects`` samples are drawn with scikit-learn's
        ``make_regression`` and grouped into instances of ``n_objects`` objects.
        The regression targets of each instance are converted to a ranking with
        ``scores_to_rankings``.

        Parameters
        ----------
        n_instances : int
            Number of ranking instances.
        n_objects : int
            Number of objects per instance.
        noise : float
            Passed to ``make_regression`` as ``noise``.
        n_features : int
            Number of features per object.
        n_informative : int
            Passed to ``make_regression`` as ``n_informative``.
        seed : int
            Seed handed to ``check_random_state``.
        **kwd
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` has shape
            ``(n_instances, n_objects, n_features)`` and ``Y`` is the output of
            ``scores_to_rankings`` for the ``(n_instances, n_objects)`` scores.
        """
        rng = check_random_state(seed=seed)
        n_total = n_instances * n_objects
        # coef=True makes make_regression return the coefficients as a third
        # value; they are not needed here.
        flat_features, flat_scores, _ = make_regression(
            n_samples=n_total,
            n_features=n_features,
            n_informative=n_informative,
            coef=True,
            noise=noise,
            random_state=rng,
        )
        features = flat_features.reshape(n_instances, n_objects, n_features)
        scores = flat_scores.reshape(n_instances, n_objects)
        return features, scores_to_rankings(scores)

    def make_gp_transitive(
        self,
        n_instances=1000,
        n_objects=5,
        noise=0.0,
        n_features=100,
        kernel_params=None,
        seed=42,
        **kwd,
    ):
        """Create a nonlinear ranking problem from a Gaussian process sample.

        Object features are drawn uniformly from [0, 1). A latent utility is
        sampled from a Gaussian process with a Matern kernel, Gaussian noise
        with standard deviation ``noise`` is added, and the utilities of each
        instance are converted to a ranking with ``scores_to_rankings``.

        Note that the kernel matrix has (n_instances * n_objects) ** 2 entries,
        which can take up a large amount of memory.

        Parameters
        ----------
        n_instances : int
            Number of ranking instances.
        n_objects : int
            Number of objects per instance.
        noise : float
            Standard deviation of the additive Gaussian noise.
        n_features : int
            Number of features per object.
        kernel_params : dict or None
            Keyword arguments for the ``Matern`` kernel; ``None`` means no
            arguments.
        seed : int
            Seed handed to ``check_random_state``.
        **kwd
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` has shape
            ``(n_instances, n_objects, n_features)`` and ``Y`` is the output of
            ``scores_to_rankings`` for the ``(n_instances, n_objects)``
            utilities.
        """
        rng = check_random_state(seed=seed)
        matern_kwargs = dict() if kernel_params is None else kernel_params

        n_points = n_instances * n_objects
        points = rng.rand(n_points, n_features)

        # Sample the GP through the Cholesky factor of the kernel matrix.
        covariance = Matern(**matern_kwargs)(points)
        cholesky_factor = np.linalg.cholesky(covariance)
        # The standard-normal draw must happen before the noise draw so the
        # random stream is consumed in the same order.
        latent = cholesky_factor.dot(rng.randn(n_points))
        observation_noise = rng.normal(scale=noise, size=n_points)
        utilities = latent + observation_noise

        features = points.reshape(n_instances, n_objects, n_features)
        utilities = utilities.reshape(n_instances, n_objects)
        rankings = scores_to_rankings(utilities)

        return features, rankings

    def make_gp_non_transitive(
        self,
        n_instances=1000,
        n_objects=5,
        n_features=100,
        center_box=(-10.0, 10.0),
        cluster_std=2.0,
        seed=42,
        **kwd,
    ):
        """Create a ranking problem with intransitive pairwise preferences.

        Object features are drawn with scikit-learn's ``make_blobs`` using
        ``n_objects`` clusters; every instance takes one sample from each
        cluster. For each instance a binary pairwise preference matrix is
        sampled from the probabilities returned by
        ``create_pairwise_prob_matrix`` and turned into an ordering with
        ``quicksort``; the ranking is the argsort of that ordering.

        The preference matrices are sampled from the seeded random state, so
        this sampling step is governed by ``seed``.
        NOTE(review): ``create_pairwise_prob_matrix`` is not given the random
        state; if it draws random numbers itself the result may still vary
        between calls -- confirm.

        Parameters
        ----------
        n_instances : int
            Number of ranking instances.
        n_objects : int
            Number of objects per instance (also the number of clusters).
        n_features : int
            Number of features per object.
        center_box : tuple of float
            Passed to ``make_blobs`` as ``center_box``.
        cluster_std : float
            Passed to ``make_blobs`` as ``cluster_std``.
        seed : int
            Seed handed to ``check_random_state``.
        **kwd
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` has shape
            ``(n_instances, n_objects, n_features)`` and ``Y`` has shape
            ``(n_instances, n_objects)``.
        """
        n_samples = n_instances * n_objects
        random_state = check_random_state(seed=seed)
        x, y = make_blobs(
            n_samples=n_samples,
            centers=n_objects,
            n_features=n_features,
            cluster_std=cluster_std,
            center_box=center_box,
            random_state=random_state,
            shuffle=True,
        )
        # Append the cluster label as the last column and sort the rows by it.
        # The indexing below assumes each cluster then occupies a contiguous
        # run of n_instances rows (make_blobs splits the samples evenly).
        labels = np.array([y])
        samples = np.append(x, labels.T, axis=1)
        samples = samples[samples[:, n_features].argsort()]
        pairwise_prob = create_pairwise_prob_matrix(n_objects)
        X = []
        Y = []
        for inst in range(n_instances):
            # One sample per cluster, with the label column stripped.
            feature = np.array(
                [samples[inst + i * n_instances, 0:-1] for i in range(n_objects)]
            )
            # Draw from the seeded generator rather than the global np.random
            # state so that `seed` actually controls the sampled preferences.
            matrix = random_state.binomial(1, pairwise_prob)
            objects = list(np.arange(n_objects))
            ordering = np.array(quicksort(objects, matrix))
            ranking = np.argsort(ordering)
            X.append(feature)
            Y.append(ranking)
        X = np.array(X)
        Y = np.array(Y)
        return X, Y

    def make_intransitive_medoids(
        self, n_instances=100, n_objects=5, n_features=100, seed=42, **kwd
    ):
        """Create a ranking problem based on the distance to the set's medoid.

        Object features are drawn uniformly from [0, 1). Within each instance
        the medoid is the object with the smallest mean Euclidean distance to
        the objects of that instance; objects are ranked by increasing distance
        to the medoid, so the medoid itself gets rank 0.

        Parameters
        ----------
        n_instances : int
            Number of ranking instances.
        n_objects : int
            Number of objects per instance.
        n_features : int
            Number of features per object.
        seed : int
            Seed handed to ``check_random_state``.
        **kwd
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` is a float array of shape
            ``(n_instances, n_objects, n_features)`` and ``Y`` is an integer
            array of shape ``(n_instances, n_objects)`` holding the rankings.
        """
        random_state = check_random_state(seed=seed)
        X = random_state.uniform(size=(n_instances, n_objects, n_features))
        # Rankings are positions, so store them as integers like the other
        # generators of this class do.
        Y = np.empty((n_instances, n_objects), dtype=int)
        for i, objects in enumerate(X):
            D = squareform(pdist(objects, metric="euclidean"))
            mean_dist = D.mean(axis=0)
            medoid = np.argmin(mean_dist)
            # argsort of the ordering turns "object at position k" into
            # "position of object k".
            ordering = np.argsort(D[medoid])
            Y[i] = np.argsort(ordering)
        return X, Y

    def make_hv_dataset(
        self, n_instances=1000, n_objects=5, n_features=5, seed=42, **kwd
    ):
        """Create a ranking problem from hypervolume contributions.

        Objects are standard-normal draws scaled to unit Euclidean norm with
        all coordinates made non-positive. For each instance, pygmo's
        ``hypervolume`` computes every object's contribution with respect to
        the origin as reference point; the object with the largest
        contribution gets rank 0.

        Parameters
        ----------
        n_instances : int
            Number of ranking instances.
        n_objects : int
            Number of objects per instance.
        n_features : int
            Number of features per object.
        seed : int
            Seed handed to ``check_random_state``.
        **kwd
            Accepted and ignored.

        Returns
        -------
        tuple
            ``(X, Y)`` where ``X`` has shape
            ``(n_instances, n_objects, n_features)`` and ``Y`` is an integer
            array of shape ``(n_instances, n_objects)``.
        """
        rng = check_random_state(seed=seed)
        points = rng.randn(n_instances, n_objects, n_features)
        # Scale every object to unit length, then mirror it so that all
        # coordinates are <= 0.
        norms = np.sqrt(np.power(points, 2).sum(axis=2, keepdims=True))
        points = -np.abs(points / norms)

        ref_point = np.zeros(n_features)
        rankings = np.empty((n_instances, n_objects), dtype=int)
        for idx, front in enumerate(points):
            contributions = hypervolume(front).contributions(ref_point)
            # Indices sorted by decreasing contribution; a second argsort
            # yields each object's rank.
            descending = np.argsort(contributions)[::-1]
            rankings[idx] = descending.argsort()

        return points, rankings

    def get_single_train_test_split(self):
        """Delegate to the parent class and return its result unchanged."""
        parent = super()
        return parent.get_single_train_test_split()

    def get_train_test_datasets(self, n_datasets=5):
        """Delegate to the parent class and return its result unchanged.

        Parameters
        ----------
        n_datasets : int, default 5
            Forwarded to the parent implementation as ``n_datasets``.
        """
        parent = super()
        return parent.get_train_test_datasets(n_datasets=n_datasets)

    def get_dataset_dictionaries(self, lengths=None):
        """Delegate to the parent class and return its result unchanged.

        Parameters
        ----------
        lengths : list of int, optional
            Forwarded to the parent implementation as ``lengths``. When
            omitted (or ``None``), a fresh ``[5, 6]`` list is used.
        """
        # Build the default per call: a list literal in the signature would be
        # one object shared, and possibly mutated, across all calls.
        if lengths is None:
            lengths = [5, 6]
        return super(ObjectRankingDatasetGenerator, self).get_dataset_dictionaries(
            lengths=lengths
        )