Python sklearn.datasets.load_svmlight_file() Examples

The following are 30 code examples showing how to use sklearn.datasets.load_svmlight_file(). These examples are extracted from open source projects; where available, the originating project, author, file, and license are listed above each example.


You may also want to check out all other available functions and classes of the sklearn.datasets module.
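Before the project examples, here is a minimal round-trip sketch showing the basic call pattern. It is not taken from any of the projects below; the file name example.svm and the toy values are made up purely for illustration.

import numpy as np
import scipy.sparse as sp
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

# Toy data: 2 samples, 3 features, stored sparsely (values are illustrative).
X = sp.csr_matrix(np.array([[0.0, 1.5, 0.0],
                            [2.0, 0.0, 3.0]]))
y = np.array([0, 1])

# Write the data out in LibSVM / svmlight text format.
dump_svmlight_file(X, y, "example.svm", zero_based=True)

# load_svmlight_file returns the features as a scipy.sparse CSR matrix and the
# targets as a numpy array; n_features fixes the width of X explicitly.
X_loaded, y_loaded = load_svmlight_file("example.svm", n_features=3)
print(X_loaded.shape, y_loaded)  # (2, 3) [0. 1.]

As many of the examples below do, you can densify the sparse result with X_loaded.toarray() (or .todense()) when a dense matrix is needed.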

Example 1
def read_year_prediction_data(fileName):
    feature_dim = 90
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(fileName, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Example 2
Project: Kaggler   Author: jeongyoonlee   File: data_io.py    License: MIT License
def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional flag indicating whether the returned
                         matrix should be dense. By default, it is False.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}

    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y 
Example 3
Project: rankeval   Author: hpclab   File: test_svmlight_format.py    License: Mozilla Public License 2.0
def test_dump(self):
        tmpfile = "tmp_dump.txt"
        try:
            # loads from file
            Xs, y = load_svmlight_file(datafile)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, zero_based=False)

            # loads them as CSR MATRIX
            X2, y2 = sk_load_svmlight_file(tmpfile)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
Example 4
Project: rankeval   Author: hpclab   File: test_svmlight_format.py    License: Mozilla Public License 2.0
def test_dump_qid(self):
        tmpfile = "/tmp/tmp_dump.txt"
        try:
            # loads from file
            Xs, y, q = load_svmlight_file(qid_datafile, query_id=True)

            # dumps to file
            dump_svmlight_file(Xs, y, tmpfile, query_id=list(q), zero_based=False)

            # loads them as CSR MATRIX with scikit-learn
            X2, y2, q2 = sk_load_svmlight_file(tmpfile, query_id=True)

            X3 = np.ndarray(shape=X2.shape, dtype=X2.dtype)
            X2.toarray(out=X3)

            # check assertions
            assert_array_almost_equal(Xs, X3)
            assert_array_almost_equal(y, y2)
            assert_array_equal(q, q2)
        finally:
            if os.path.exists(tmpfile):
                os.remove(tmpfile) 
Example 5
Project: nni   Author: microsoft   File: sklearn_test.py    License: MIT License
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()


    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train)) 
Example 6
Project: RFHO   Author: lucfra   File: datasets.py    License: MIT License
def load_realsim(folder=REALSIM, one_hot=True, partitions_proportions=None, shuffle=False, as_tensor=True):
    X, y = sk_dt.load_svmlight_file(folder + "/real-sim")
    y = np.array([int(yy) for yy in y])
    if one_hot:
        y = to_one_hot_enc(y)
    res = [Dataset(data=X, target=y)]
    if partitions_proportions:
        res = redivide_data(res, shuffle=shuffle, partition_proportions=partitions_proportions)
        res = Datasets.from_list(res)

    if as_tensor: [dat.convert_to_tensor() for dat in res]

    return res


# noinspection PyPep8Naming 
Example 7
Project: dislib   Author: bsc-wdc   File: io.py    License: Apache License 2.0
def _read_svmlight(lines, out_blocks, col_size, n_features, store_sparse):
    from tempfile import SpooledTemporaryFile
    from sklearn.datasets import load_svmlight_file

    # Creating a temporary file and loading it with load_svmlight_file should
    # be more efficient than parsing the lines manually
    tmp_file = SpooledTemporaryFile(mode="wb+", max_size=2e8)
    tmp_file.writelines(lines)
    tmp_file.seek(0)

    x, y = load_svmlight_file(tmp_file, n_features)
    if not store_sparse:
        x = x.toarray()

    # Also tried converting to csc/ndarray first for faster splitting, but it
    # is not worth it. Position 0 of out_blocks contains the X blocks
    for i in range(ceil(n_features / col_size)):
        out_blocks[0][i] = x[:, i * col_size:(i + 1) * col_size]

    # Position 1 contains the y block
    out_blocks[1][0] = y.reshape(-1, 1) 
Example 8
Project: dislib   Author: bsc-wdc   File: test_array.py    License: Apache License 2.0
def test_load_svmlight_file(self):
        """ Tests loading a LibSVM file  """
        file_ = "tests/files/libsvm/1"

        x_np, y_np = load_svmlight_file(file_, n_features=780)

        # Load SVM and store in sparse
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=True)

        self.assertTrue(_equal_arrays(x.collect(), x_np))
        self.assertTrue(_equal_arrays(y.collect(), y_np))

        # Load SVM and store in dense
        x, y = ds.load_svmlight_file(file_, (25, 100), n_features=780,
                                     store_sparse=False)

        self.assertTrue(_equal_arrays(x.collect(), x_np.toarray()))
        self.assertTrue(_equal_arrays(y.collect(), y_np)) 
Example 9
Project: interpret-text   Author: interpretml   File: datasets.py    License: MIT License
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = "datasets.1.17.2019"
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile

        zipfilename = outdirname + ".zip"
        urlretrieve(
            "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
            zipfilename,
        )
        with zipfile.ZipFile(zipfilename, "r") as unzip:
            unzip.extractall(".")
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == ".npz":
        # sparse format file
        import scipy.sparse as sparse

        return sparse.load_npz(filepath)
    elif extension == ".svmlight":
        from sklearn import datasets

        return datasets.load_svmlight_file(filepath)
    elif extension == ".json":
        import json

        with open(filepath, encoding="utf-8") as f:
            dataset = json.load(f)
        return dataset
    elif extension == ".csv":
        import pandas as pd

        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception("Unrecognized file extension: " + extension) 
Example 10
Project: razzy-spinner   Author: rafasashi   File: transitionparser.py    License: GNU General Public License v3.0
def train(self, depgraphs, modelfile):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train',
                dir=tempfile.gettempdir(),
                delete=False)

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameter is set according to the paper:
            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
            # Todo : because of probability = True => very slow due to
            # cross-validation. Need to improve the speed here
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=True,
                probability=True)

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            remove(input_file.name) 
Example 11
Project: libact   Author: ntucllab   File: dataset.py    License: BSD 2-Clause "Simplified" License
def import_libsvm_sparse(filename):
    """Imports dataset file in libsvm sparse format"""
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(filename)
    return Dataset(X.toarray(), y) 
Example 12
Project: libact   Author: ntucllab   File: test_multilabel_realdata.py    License: BSD 2-Clause "Simplified" License
def setUp(self):
        dataset_filepath = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'datasets/yeast_train.svm')
        X, y = load_svmlight_file(dataset_filepath, multilabel=True)
        self.X = X.todense().tolist()
        self.y = MultiLabelBinarizer().fit_transform(y).tolist()
        self.quota = 10 
Example 13
Project: pyxclib   Author: kunaldahiya   File: data_utils.py    License: MIT License
def read_data(filename, header=True, dtype='float32', zero_based=True):
    """Read data in sparse format

    Arguments
    ---------
    filename: str
        input file name
    header: bool, default=True
        If header is present or not
    dtype: str, default='float32'
        data type of values
    zero_based: boolean, default=True
        Zero-based indices?

    Returns
    --------
    features: csr_matrix
        features matrix
    labels: csr_matrix
        labels matrix
    num_samples: int
        #instances
    num_feat: int
        #features
    num_labels: int
        #labels
    """
    with open(filename, 'rb') as f:
        _l_shape = None
        if header:
            line = f.readline().decode('utf-8').rstrip("\n")
            line = line.split(" ")
            num_samples, num_feat, num_labels = int(
                line[0]), int(line[1]), int(line[2])
            _l_shape = (num_samples, num_labels)
        else:
            num_samples, num_feat, num_labels = None, None, None
        features, labels = load_svmlight_file(f, multilabel=True)
        labels = ll_to_sparse(
            labels, dtype=dtype, zero_based=zero_based, shape=_l_shape)
    return features, labels, num_samples, num_feat, num_labels 
Example 14
Project: recipy   Author: recipy   File: run_sklearn.py    License: Apache License 2.0
def load_svmlight_file(self):
        """
        Use sklearn.datasets.load_svmlight_file to load data.svmlight.
        """
        file_name = os.path.join(self.data_dir, "data.svmlight")
        datasets.load_svmlight_file(file_name) 
Example 15
Project: training_results_v0.6   Author: mlperf   File: data_reader.py    License: Apache License 2.0
def get_year_prediction_data(dirname=None):
    feature_dim = 90
    if dirname is None:
        dirname = os.path.join(os.path.dirname(__file__), 'data')
    filename = 'YearPredictionMSD'
    download_filename = os.path.join(dirname, "%s.bz2" % filename)
    extracted_filename = os.path.join(dirname, filename)
    if not os.path.isfile(download_filename):
        print("Downloading data...")
        mx.test_utils.download('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/%s.bz2' % filename, dirname=dirname)
    if not os.path.isfile(extracted_filename):
        print("Extracting data...")
        with bz2.BZ2File(download_filename) as fr, open(extracted_filename,"wb") as fw:
            shutil.copyfileobj(fr,fw)
    print("Reading data from disk...")
    train_features, train_labels = load_svmlight_file(extracted_filename, n_features=feature_dim, dtype=np.float32)
    train_features = train_features.todense()

    # normalize the data: subtract means and divide by standard deviations
    label_mean = train_labels.mean()
    label_std = np.sqrt(np.square(train_labels - label_mean).mean())
    feature_means = train_features.mean(axis=0)
    feature_stds = np.sqrt(np.square(train_features - feature_means).mean(axis=0))

    train_features = (train_features - feature_means) / feature_stds
    train_labels = (train_labels - label_mean) / label_std

    return feature_dim, train_features, train_labels 
Example 16
Project: tick   Author: X-DataInitiative   File: download_helper.py    License: BSD 3-Clause "New" or "Revised" License
def load_dataset(dataset_path, data_home=None, n_features=None):
    """Load dataset from given path
    Parameters
    ----------
    dataset_path : `str`
        Dataset relative path
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with the TICK_DATASETS environment variable,
        all tick datasets are stored in '~/tick_datasets' subfolders.
    n_features : `int`, optional, default=None
        The number of features to use. If None, it will be inferred. This
        argument is useful to load several files that are subsets of a
        bigger sliced dataset: each subset might not have examples of
        every feature, hence the inferred shape might vary from one
        slice to another.
    Returns
    -------
    output : `np.ndarray` or `dict` or `tuple`
        Dataset. Its format will depend on queried dataset.
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)

    if cache_path.endswith(".npz"):
        dataset = np.load(cache_path, allow_pickle=True)
        # If we have only one numpy array we return it directly; otherwise
        # we return the raw dictionary
        if len(dataset) == 1:
            key_0 = list(dataset.keys())[0]
            dataset = dataset[key_0]
        else:
            dataset = dataset.items()
    else:
        dataset = load_svmlight_file(cache_path, n_features=n_features)

    return dataset 
Example 17
Project: tick   Author: X-DataInitiative   File: fetch_url_dataset.py    License: BSD 3-Clause "New" or "Revised" License
def load_url_dataset_day(cache_path, days):
    """Loads url dataset from a tar file

    Parameters
    ----------
    cache_path : `str`
        Path to the tar file

    days : `list` or `range`
        Days to be loaded

    Returns
    -------
    X : `np.ndarray`
        A sparse matrix containing the features

    y : `np.ndarray`
        An array containing the labels
    """
    tar_file = tarfile.open(cache_path, "r:gz")

    X, y = None, None

    for day in days:
        data_filename = 'url_svmlight/Day{}.svm'.format(day)
        with tar_file.extractfile(data_filename) as data_file:
            X_day, y_day = load_svmlight_file(data_file,
                                              n_features=_N_FEATURES)

        if X is None:
            X, y = X_day, y_day
        else:
            X = scipy.sparse.vstack((X, X_day))
            y = np.hstack((y, y_day))

    return X, y 
Example 18
Project: Semantic-Texual-Similarity-Toolkits   Author: rgtjf   File: classifier.py    License: MIT License
def load_file(self, file_path):
        data = load_svmlight_file(file_path)
        return data[0], data[1] 
Example 19
Project: nni   Author: microsoft   File: benchmark_test.py    License: MIT License
def run_test(self, pipeline, name, path):
        print("download " + name)
        update_name = self.download(name, path)
        X, y = load_svmlight_file(update_name)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
        
        pipeline.fit(X_train, y_train)
        print("[Benchmark "+ name + " Score]: ", pipeline.score(X_test, y_test)) 
Example 20
Project: transfer   Author: jiangfeng1124   File: io.py    License: MIT License
def __init__(self, file_path):
        self.file_path = file_path
        X, Y = load_svmlight_file(file_path) # X is a sparse matrix
        # L = [X[i].nonzero()[0].shape[0] for i in range(X.shape[0])]
        X = X.todense().astype(np.float32)
        Y = np.array((Y + 1) / 2, dtype=int)
        self.X = torch.from_numpy(X)
        self.Y = torch.from_numpy(Y) 
Example 21
Project: transfer   Author: jiangfeng1124   File: io.py    License: MIT License
def __init__(self, file_path, domain=0):
        self.file_path = file_path
        X, Y = load_svmlight_file(file_path) # Y is synthetic label, not used
        X = X.todense().astype(np.float32)
        self.X = torch.from_numpy(X)
        self.Y = torch.LongTensor([domain] * self.X.shape[0]) 
Example 22
Project: AiLearning   Author: apachecn   File: python2libsvm.py    License: GNU General Public License v3.0
def get_data(file_input, separator='\t'):
    if 'libsvm' not in file_input:
        file_input = other2libsvm(file_input, separator)
    data = datasets.load_svmlight_file(file_input)
    return data[0], data[1] 
Example 23
Project: aca   Author: geekinglcq   File: pg2.py    License: MIT License
def homepage_xgb_model(model_path, training_set='True'):
    training_set = './data/%s_features.svm.txt'%(training_set)
    model = xgb.XGBClassifier(learning_rate=0.1,
                              n_estimators=200,
                              max_depth=5,
                              min_child_weight=1,
                              gamma=0.3,
                              subsample=0.7,
                              colsample_bytree=0.7,
                              objective='binary:logistic',
                              scale_pos_weight=1)
    X, y = load_svmlight_file(training_set)
    model.fit(X,y)
    pickle.dump(model, open(model_path, 'wb'))
    return model 
Example 24
Project: interpret-community   Author: interpretml   File: datasets.py    License: MIT License
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension) 
Example 25
def train(self, depgraphs, modelfile, verbose=True):
        """
        :param depgraphs : list of DependencyGraph as the training data
        :type depgraphs : DependencyGraph
        :param modelfile : file name to save the trained model
        :type modelfile : str
        """

        try:
            input_file = tempfile.NamedTemporaryFile(
                prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False
            )

            if self._algorithm == self.ARC_STANDARD:
                self._create_training_examples_arc_std(depgraphs, input_file)
            else:
                self._create_training_examples_arc_eager(depgraphs, input_file)

            input_file.close()
            # Using the temporary file to train the libsvm classifier
            x_train, y_train = load_svmlight_file(input_file.name)
            # The parameter is set according to the paper:
            # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre
            # Todo : because of probability = True => very slow due to
            # cross-validation. Need to improve the speed here
            model = svm.SVC(
                kernel='poly',
                degree=2,
                coef0=0,
                gamma=0.2,
                C=0.5,
                verbose=verbose,
                probability=True,
            )

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, 'wb'))
        finally:
            remove(input_file.name) 
Example 26
Project: allRank   Author: allegro   File: dataset_loading.py    License: Apache License 2.0
def from_svm_file(cls, svm_file_path, transform=None):
        """
        Instantiate a LibSVMDataset from a LibSVM file path.
        :param svm_file_path: LibSVM file path
        :param transform: a callable defining an optional transformation called on the dataset
        :return: LibSVMDataset instantiated from a given file and with an optional transformation defined
        """
        x, y, query_ids = load_svmlight_file(svm_file_path, query_id=True)
        logger.info("loaded dataset from {} and got x shape {}, y shape {} and query_ids shape {}".format(
            svm_file_path, x.shape, y.shape, query_ids.shape))
        return cls(x, y, query_ids, transform) 
Example 27
Project: celer   Author: mathurinm   File: libsvm.py    License: BSD 3-Clause "New" or "Revised" License
def get_X_y(dataset, compressed_path, multilabel, replace=False):
    """Load a LIBSVM dataset as sparse X and observation y/Y.
    If X and y already exist as npz and npy, they are not re-downloaded unless
    replace=True."""

    ext = '.npz' if multilabel else '.npy'
    y_path = pjoin(CELER_PATH, "%s_target%s" % (NAMES[dataset], ext))
    X_path = pjoin(CELER_PATH, "%s_data.npz" % NAMES[dataset])
    if replace or not os.path.isfile(y_path) or not os.path.isfile(X_path):
        tmp_path = pjoin(CELER_PATH, "%s" % NAMES[dataset])

        decompressor = BZ2Decompressor()
        print("Decompressing...")
        with open(tmp_path, "wb") as f, open(compressed_path, "rb") as g:
            for data in iter(lambda: g.read(100 * 1024), b''):
                f.write(decompressor.decompress(data))

        n_features_total = N_FEATURES[dataset]
        print("Loading svmlight file...")
        with open(tmp_path, 'rb') as f:
            X, y = load_svmlight_file(
                f, n_features_total, multilabel=multilabel)

        os.remove(tmp_path)
        X = sparse.csc_matrix(X)
        X.sort_indices()
        sparse.save_npz(X_path, X)

        if multilabel:
            indices = np.array([lab for labels in y for lab in labels])
            indptr = np.cumsum([0] + [len(labels) for labels in y])
            data = np.ones_like(indices)
            Y = sparse.csr_matrix((data, indices, indptr))
            sparse.save_npz(y_path, Y)
            return X, Y

        else:
            np.save(y_path, y)

    else:
        X = sparse.load_npz(X_path)
        y = np.load(y_path)

    return X, y 
Example 28
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname
    
    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]

    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()
    

    # learn
    model = MSCWIIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in range(0, n_samples):
        if i % 1000 == 0:
            print("#samples = %d" % i)
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)
    
    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accurary: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm)) 
Example 29
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    #fname = "/home/kzk/datasets/news20/news20.dat"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname
    
    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]

    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()
    
    # learn
    model = MCWVarDiag(eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in range(0, n_samples):
        if i % 1000 == 0:
            print("#samples = %d" % i)
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)
    
    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accurary: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm)) 
Example 30
def main():
    """
    Example of how to use
    """
    # data load
    #fname = "/home/kzk/datasets/uci_csv/iris.csv"
    fname = "/home/kzk/datasets/uci_csv/glass.csv"
    #fname = "/home/kzk/datasets/uci_csv/breast_cancer.csv"
    #fname = "/home/kzk/datasets/uci_csv/car.csv"
    #fname = "/home/kzk/datasets/uci_csv/credit.csv"
    #fname = "/home/kzk/datasets/uci_csv/usps.csv"
    #fname = "/home/kzk/datasets/uci_csv/liver.csv"
    #fname = "/home/kzk/datasets/uci_csv/haberman.csv"
    #fname = "/home/kzk/datasets/uci_csv/pima.csv"
    #fname = "/home/kzk/datasets/uci_csv/parkinsons.csv"
    #fname = "/home/kzk/datasets/uci_csv/ionosphere.csv"
    #fname = "/home/kzk/datasets/uci_csv/isolet.csv"
    #fname = "/home/kzk/datasets/uci_csv/magicGamaTelescope.csv"
    #fname = "/home/kzk/datasets/uci_csv/mammographic.csv"
    #fname = "/home/kzk/datasets/uci_csv/yeast.csv"
    fname = "/home/k_yoshiyama/datasets/news20/news20.dat"
    print "dataset is", fname
    
    #data = np.loadtxt(fname, delimiter=" ")
    #X = data[:, 1:]
    #y = data[:, 0]

    (X, y) = load_svmlight_file(fname)
    n_samples = X.shape[0]
    y_pred = np.ndarray(n_samples)
    #X = X.toarray()
    

    # learn
    model = MSCWIDiag(C=1, eta=0.9, epochs=1)
    model.learn(X, y)

    # predict
    st = time.time()
    for i in range(0, n_samples):
        if i % 1000 == 0:
            print("#samples = %d" % i)
            pass
        sample = X[i, :]
        y_pred[i] = model.predict(sample)
    et = time.time()
    print "prediction time: %f[s]" % (et - st)
    print "prediction time/sample: %f[s]" % ((et - st) / n_samples)
    
    # show result
    cm = confusion_matrix(y, y_pred)
    #print cm
    print "accurary: %d [%%]" % (np.sum(cm.diagonal()) * 100.0 / np.sum(cm))