Python sklearn.datasets.load_svmlight_file() Examples

The following are 30 code examples of sklearn.datasets.load_svmlight_file(), drawn from open-source projects. You can go to the original project or source file by following the attribution line above each example, or check out all the other available functions and classes of the sklearn.datasets module.
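
Before diving in, a minimal sketch of the typical call (the file name and contents are illustrative, not from any project below): load_svmlight_file() parses lines of the form <label> <index>:<value> ... and returns a scipy.sparse CSR matrix together with a 1-D label array.

from sklearn.datasets import load_svmlight_file

# 'train.libsvm' is a hypothetical path; each line of the file looks like:
#   <label> <index1>:<value1> <index2>:<value2> ...
X, y = load_svmlight_file('train.libsvm')
print(X.shape, y.shape)  # X is a scipy.sparse CSR matrix, y a numpy array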
Example #1
Source File: data_io.py    From Kaggler with MIT License
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the return matrix
                         should be dense.  By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}

    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y 
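
A hedged usage sketch of load_data() above (the paths are hypothetical): thanks to the extension-based dispatch, the caller never names a specific loader.

X, y = load_data('train.sps')                    # routed to load_svmlight_file; X stays sparse
X_dense, y = load_data('train.sps', dense=True)  # sparse result converted via todense()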
Example #2
Source File: sklearn_test.py    From nni with MIT License
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    with bz2.open('train.bz2', 'rb') as f_zip, open('train.svm', 'wt') as f_svm:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))


    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train)) 
Example #3
Source File: datasets.py    From RFHO with MIT License
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)

    if as_tensor:
        for dat in res:
            dat.convert_to_tensor()

    return res


# noinspection PyPep8Naming 
Example #4
Source File: io.py    From dislib with Apache License 2.0
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a tmp file and calling load_svmlight_file on it should be more
    # efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)
    if not store_sparse:
        x = x.toarray()

    # Also tried converting to csc/ndarray first for faster splitting, but it's
    # not worth it. Position 0 contains the X blocks
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1) 
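
To make the column-slicing arithmetic above concrete, here is a small self-contained sketch with toy shapes (not dislib code): 10 features split with col_size=4 yield ceil(10/4) = 3 blocks of widths 4, 4 and 2.

import numpy as np
from math import ceil

x = np.arange(30).reshape(3, 10)  # toy dense matrix: 3 samples, 10 features
col_size = 4
blocks = [x[:, i * col_size:(i + 1) * col_size]
          for i in range(ceil(x.shape[1] / col_size))]
print([b.shape for b in blocks])  # [(3, 4), (3, 4), (3, 2)]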
Example #5
Source File: test_array.py    From dislib with Apache License 2.0
def test_load_svmlight_file(self):
        """ Tests loading a LibSVM file  """
        file_ = "tests/files/libsvm/1"

        x_np, y_np = load_svmlight_file(file_, n_features=780)

        # Load SVM and store in sparse
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=True)

        self.assertTrue(_equal_arrays(x.collect(), x_np))
        self.assertTrue(_equal_arrays(y.collect(), y_np))

        # Load SVM and store in dense
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=False)

        self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
        self.assertTrue(_equal_arrays(y.collect(), y_np)) 
Example #6
Source File: test_svmlight_format.py    From rankeval with Mozilla Public License 2.0
def test_dump_qid(self):
        tmpfile = "/tmp/tmp_dump.txt"
        try:
            # loads from file
            Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)

            # loads them as CSR MATRIX with scikit-learn
            X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
            assert_array_equal(q, q2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
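
For reference, a minimal self-contained sketch of the qid line format that query_id=True parses; the file contents and path below are illustrative, not taken from the test above.

from sklearn.datasets import load_svmlight_file

# Each line carries an optional qid:<query id> token after the label.
with open('/tmp/qid_sample.txt', 'w') as f:
    f.write('2 qid:1 1:0.5 3:1.0\n')
    f.write('1 qid:1 2:0.3\n')
    f.write('0 qid:2 1:0.1 2:0.2\n')

X, y, qid = load_svmlight_file('/tmp/qid_sample.txt', query_id=True)
print(qid)  # [1 1 2] -- one query id per row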
Example #7
Source File: test_svmlight_format.py    From rankeval with Mozilla Public License 2.0
def test_dump(self):
        tmpfile = "tmp_dump.txt"
        try:
            # loads from file
            Xs, y = load_svmlight_file(datafile)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, zero_based=False)

            # loads them as CSR MATRIX
            X2, y2 = sk_load_svmlight_file(tmpfile)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
Example #8
Source File: data_reader.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Example #9
Source File: python2libsvm.py    From AiLearning with GNU General Public License v3.0
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1] 
Example #10
Source File: pg2.py    From aca with MIT License
def homepage_xgb_model(model_path, training_set='True'):
    training_set = './data/%s_features.svm.txt'%(training_set)
    model = xgb.XGBClassifier(learning_rate=0.1,
                              n_estimators=200,
                              max_depth=5,
                              min_child_weight=1,
                              gamma=0.3,
                              subsample=0.7,
                              colsample_bytree=0.7,
                              objective='binary:logistic',
                              scale_pos_weight=1)
    X, y = load_svmlight_file(training_set)
    model.fit(X, y)
    pickle.dump(model, open(model_path, 'wb'))
    return model 
Example #11
Source File: datasets.py    From interpret-community with MIT License
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension) 
Example #12
Source File: transitionparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
            )

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameters are set according to the paper:
            # "Algorithms for Deterministic Incremental Dependency Parsing" by Joakim Nivre
            # TODO: probability=True makes training very slow because of the
            # internal cross-validation; the speed here needs to be improved
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True,
            )

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            remove(input_file.name) 
Example #13
Source File: io.py    From transfer with MIT License
def __init__(self, file_path, domain=0):
        self.file_path = file_path
        X, Y = load_svmlight_file(file_path) # Y is synthetic label, not used
        X = X.todense().astype(np.float32)
        self.X = torch.from_numpy(X)
        self.Y = torch.LongTensor([domain] * self.X.shape[0]) 
Example #14
Source File: io.py    From transfer with MIT License
def __init__(self, file_path):
        self.file_path = file_path
        X, Y = load_svmlight_file(file_path) # X is a sparse matrix
        # L = [X[i].nonzero()[0].shape[0] for i in range(X.shape[0])]
        X = X.todense().astype(np.float32)
        Y = np.array((Y + 1) / 2, dtype=int) # map labels from {-1, 1} to {0, 1}
        self.X = torch.from_numpy(X)
        self.Y = torch.from_numpy(Y) 
Example #15
Source File: dataset_loading.py    From allRank with Apache License 2.0
def from_svm_file(cls, svm_file_path, transform=None):
        """
        Instantiate a LibSVMDataset from a LibSVM file path.
        :param svm_file_path: LibSVM file path
        :param transform: a callable defining an optional transformation called on the dataset
        :return: LibSVMDataset instantiated from a given file and with an optional transformation defined
        """
        x, y, query_ids = load_svmlight_file(svm_file_path, query_id=True)
        logger.info("loaded dataset from {} and got x shape {}, y shape {} and query_ids shape {}".format(
            svm_file_path, x.shape, y.shape, query_ids.shape))
        return cls(x, y, query_ids, transform) 
Example #16
Source File: benchmark_test.py    From nni with MIT License
def run_test(self, pipeline, name, path):
        print("download " + name)
        update_name = self.download(name, path)
        X, y = load_svmlight_file(update_name)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
        
        pipeline.fit(X_train, y_train)
        print("[Benchmark "+ name + " Score]: ", pipeline.score(X_test, y_test)) 
Example #17
Source File: classifier.py    From Semantic-Texual-Similarity-Toolkits with MIT License
def load_file(self, file_path):
        data = load_svmlight_file(file_path)
        return data[0], data[1] 
Example #18
Source File: datasets.py    From interpret-text with MIT License
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile

        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse

        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets

        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json

        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd

        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension) 
Example #19
Source File: transitionparser.py    From razzy-spinner with GNU General Public License v3.0
def train(self, depgraphs, modelfile):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train',
                dir=tempfile.gettempdir(),
                delete=False)

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameters are set according to the paper:
            # "Algorithms for Deterministic Incremental Dependency Parsing" by Joakim Nivre
            # TODO: probability=True makes training very slow because of the
            # internal cross-validation; the speed here needs to be improved
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=True,
                probability=True)

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            remove(input_file.name) 
Example #20
Source File: fetch_url_dataset.py    From tick with BSD 3-Clause "New" or "Revised" License
def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `scipy.sparse` matrix
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None

    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file,
                                              n_features=_N_FEATURES)

        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y 
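
A design note on the loop above: calling scipy.sparse.vstack on every iteration rebuilds the whole matrix each time, which is quadratic in the number of days. A hedged alternative sketch, reusing the same names as the function above (tar_file, days, _N_FEATURES), collects the per-day blocks first and stacks once:

# Accumulate day blocks in lists and stack a single time at the end,
# avoiding one full copy per day.
X_parts, y_parts = [], []
for day in days:
    data_filename = 'url_svmlight/Day{}.svm'.format(day)
    with tar_file.extractfile(data_filename) as data_file:
        X_day, y_day = load_svmlight_file(data_file, n_features=_N_FEATURES)
    X_parts.append(X_day)
    y_parts.append(y_day)

X = scipy.sparse.vstack(X_parts)
y = np.hstack(y_parts)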
Example #21
Source File: download_helper.py    From tick with BSD 3-Clause "New" or "Revised" License
def load_dataset(dataset_path, data_home=None, n_features=None):
    """Load dataset from given path
    Parameters
    ----------
    dataset_path : `str`
        Dataset relative path
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with the TICK_DATASETS environment variable,
        all tick datasets are stored in '~/tick_datasets' subfolders.
    n_features : `int`, optional, default=None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have examples of
        every feature, hence the inferred shape might vary from one
        slice to another.
    Returns
    -------
    output : `np.ndarray` or `dict` or `tuple`
        Dataset. Its format will depend on queried dataset.
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    if cache_path.endswith(".npz"):
        dataset = np.load(cache_path, allow_pickle=True)
        # If we have only one numpy array we return it directly, otherwise
        # we return the raw dictionary
        if len(dataset) == 1:
            key_0 = list(dataset.keys())[0]
            dataset = dataset[key_0]
        else:
            dataset = dataset.items()
    else:
        dataset = load_svmlight_file(cache_path, n_features=n_features)

    return dataset 
Example #22
Source File: data_reader.py    From training_results_v0.6 with Apache License 2.0
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)
    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)
    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename,"wb") as fw:
            shutil.copyfileobj(fr,fw)
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Example #23
Source File: dataset.py    From libact with BSD 2-Clause "Simplified" License
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y) 
Example #24
Source File: run_sklearn.py    From recipy with Apache License 2.0
def load_svmlight_file(self):
        """
        Use sklearn.datasets.load_svmlight_file to load data.svmlight.
        """
        file_name = os.path.join(self.data_dir, "data.svmlight")
        datasets.load_svmlight_file(file_name) 
Example #25
Source File: data_utils.py    From pyxclib with MIT License
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        input file name
    header: bool, default=True
        If header is present or not
    dtype: str, default='float32'
        data type of values
    zero_based: boolean, default=True
        zero-based indices?

    Returns
    --------
    features: csr_matrix
        features matrix
    labels: csr_matrix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels 
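
For context, with multilabel=True the labels come back as a list of label tuples rather than a 1-D array. A minimal sketch (the file contents and path are illustrative): each line may carry several comma-separated labels before the features.

from sklearn.datasets import load_svmlight_file

with open('/tmp/multilabel_sample.txt', 'w') as f:
    f.write('1,3 1:0.5 4:1.0\n')  # this sample has labels 1 and 3
    f.write('2 2:0.3\n')          # this sample has a single label

X, y = load_svmlight_file('/tmp/multilabel_sample.txt', multilabel=True)
print(y)  # [(1.0, 3.0), (2.0,)]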
Example #26
Source File: test_multilabel_realdata.py    From libact with BSD 2-Clause "Simplified" License
def setUp(self):
        dataset_filepath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'datasets/yeast_train.svm')
        X, y = load_svmlight_file(dataset_filepath, multilabel=True)
        self.X = X.todense().tolist()
        self.y = MultiLabelBinarizer().fit_transform(y).tolist()
        self.quota = 10 
Example #27
Source File: io.py    From dislib with Apache License 2.0
def load_svmlight_file(path, block_size, n_features, store_sparse):
    """ Loads a SVMLight file into a distributed array.

    Parameters
    ----------
    path : string
        File path.
    block_size : tuple (int, int)
        Size of the blocks for the output ds-array.
    n_features : int
        Number of features.
    store_sparse : boolean
        Whether to use scipy.sparse data structures to store data. If False,
        numpy.array is used instead.

    Returns
    -------
    x, y : (ds-array, ds-array)
        A distributed representation (ds-array) of the X and y.
    """
    n, m = block_size
    lines = []
    x_blocks, y_blocks = [], []

    n_rows = 0
    with open(path, "r") as f:
        for line in f:
            n_rows += 1
            lines.append(line.encode())

            if len(lines) == n:
                # line 0 -> X, line 1 -> y
                out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
                out_blocks.append([object()])
                # out_blocks.append([])
                _read_svmlight(lines, out_blocks, col_size=m,
                               n_features=n_features,
                               store_sparse=store_sparse)
                # we append only the list forming the row (out_blocks depth=2)
                x_blocks.append(out_blocks[0])
                y_blocks.append(out_blocks[1])
                lines = []

    if lines:
        out_blocks = Array._get_out_blocks((1, ceil(n_features / m)))
        out_blocks.append([object()])
        _read_svmlight(lines, out_blocks, col_size=m,
                       n_features=n_features, store_sparse=store_sparse)
        # we append only the list forming the row (out_blocks depth=2)
        x_blocks.append(out_blocks[0])
        y_blocks.append(out_blocks[1])

    x = Array(x_blocks, top_left_shape=block_size, reg_shape=block_size,
              shape=(n_rows, n_features), sparse=store_sparse)

    # y has only a single line but it's treated as a 'column'
    y = Array(y_blocks, top_left_shape=(n, 1), reg_shape=(n, 1),
              shape=(n_rows, 1), sparse=False)

    return x, y 
Example #28
Source File: bidnn.py    From BiDNN with GNU Affero General Public License v3.0
def load_dataset(self, X=None):
        if self.conf.verbosity > 1:
            print("Loading dataset...")
        if X is None:
            self.X_train, self.tl = load_svmlight_file(self.conf.fname_in, dtype=np.float32, multilabel=False)
            # we're saving tl (target labels) just in case they exist and the user needs them - since
            # this is unsupervised learning, we completely ignore the labels and don't expect them to exist
        else:
            self.X_train = X
        
        self.X_train = self.X_train.todense()

        if (self.conf.mod1size + self.conf.mod2size) != self.X_train.shape[1]:
            raise ValueError("Provided dimensionality of 1st modality ("+str(self.conf.mod1size)+") and 2nd modality ("+str(self.conf.mod2size)+") " \
                             "does not sum to the dimensionality provided in the input file ("+str(self.X_train.shape[1])+")")

        # indices of missing modalities (stored for later)
        self.idxMissingFirst = []
        self.idxMissingSecond = []
        
        # generate training data for modality translation
        self.X_first = [] 
        self.X_second = []
        
        bothMissing = both = 0
        if self.conf.ignore_zeroes:
            # zeroes are not treated as missing modalities
            # I have no idea why this might be useful, but ok :D
            # since idxMissing* are left empty, this is the only
            # place where we should take care of this
            for i in range(self.X_train.shape[0]):
                both += 1
                self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
        else:
            # zero vectors are treated as missing modalities (default)
            for i in range(self.X_train.shape[0]):
                if not np.any(self.X_train[i, :self.conf.mod1size]): # first missing
                    if np.any(self.X_train[i, self.conf.mod1size:]): # second not missing
                        # second ok, need to reconstruct first
                        self.idxMissingFirst.append(i)
                    else:
                        bothMissing +=  1 # missing both
                else: # first ok
                    if not np.any(self.X_train[i, self.conf.mod1size:]): # second missing
                        self.idxMissingSecond.append(i)
                    else: #both ok -> use them to train translator
                        both += 1
                        self.X_first.append(np.ravel(self.X_train[i, :self.conf.mod1size]))
                        self.X_second.append(np.ravel(self.X_train[i, self.conf.mod1size:]))
            
        if self.conf.verbosity > 1:
            print("Both modalities present:", both, "\nMissing 1st:", len(self.idxMissingFirst), "\nMissing 2nd:", len(self.idxMissingSecond))
            print("Missing both modalities:", bothMissing, "\n")

        self.X_first = np.array(self.X_first)
        self.X_second = np.array(self.X_second) 
Example #29
Source File: libsvm.py    From celer with BSD 3-Clause "New" or "Revised" License
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exists as npz and npy, they are not redownloaded unless
    replace=True."""

    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(
                f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y

        else:
            np.save(y_path, y)

    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)

    return X, y 
Example #30
Source File: multiclass_soft_confidence_weighted_1_diag.py    From python-online-machine-learning-library with BSD 3-Clause "New" or "Revised" License
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname
    
    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]

    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()

    # learn
    model = MSCWIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in range(n_samples):
        if i % 1000 == 0:
            print("#samples = %d" % i)
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)
    
    # show result
    cm = confusion_matrix(y, y_pred)
    # print(cm)
    print("accuracy: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm)))