python source code of data

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from io import open
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
from scipy import sparse

import csv
import datetime
import heapq
import json
import os
import pickle
import time
import logging

import h5py
import numpy as np


logger = logging.getLogger('Kaggler')


def is_number(s):
    """Check if a string is a number or not."""

    try:
        float(s)
        return True
    except ValueError:
        return False


def save_data(X, y, path):
    """Save data as a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector. If None, all zero vector will be saved.
        path (str): Path to the CSV, LibSVM or HDF5 file to save data.
    """
    catalog = {'.csv': save_csv, '.sps': save_libsvm, '.h5': save_hdf5}

    ext = os.path.splitext(path)[1]
    func = catalog[ext]

    if y is None:
        y = np.zeros((X.shape[0], ))

    func(X, y, path)


def save_csv(X, y, path):
    """Save data as a CSV file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the CSV file to save data.
    """

    if sparse.issparse(X):
        X = X.todense()

    np.savetxt(path, np.hstack((y.reshape((-1, 1)), X)), delimiter=',')


def save_libsvm(X, y, path):
    """Save data as a LibSVM file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the CSV file to save data.
    """

    dump_svmlight_file(X, y, path, zero_based=False)


def save_hdf5(X, y, path):
    """Save data as a HDF5 file.

    Args:
        X (numpy or scipy sparse matrix): Data matrix
        y (numpy array): Target vector.
        path (str): Path to the HDF5 file to save data.
    """

    with h5py.File(path, 'w') as f:
        is_sparse = 1 if sparse.issparse(X) else 0
        f['issparse'] = is_sparse
        f['target'] = y

        if is_sparse:
            if not sparse.isspmatrix_csr(X):
                X = X.tocsr()

            f['shape'] = np.array(X.shape)
            f['data'] = X.data
            f['indices'] = X.indices
            f['indptr'] = X.indptr
        else:
            f['data'] = X


def load_data(path, dense=False):
    """Load data from a CSV, LibSVM or HDF5 file based on the file extension.

    Args:
        path (str): A path to the CSV, LibSVM or HDF5 format file.
        dense (boolean): An optional variable indicating if the return matrix
                         should be dense.  By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    catalog = {'.csv': load_csv, '.sps': load_svmlight_file, '.h5': load_hdf5}

    ext = os.path.splitext(path)[1]
    func = catalog[ext]
    X, y = func(path)

    if dense and sparse.issparse(X):
        X = X.todense()

    return X, y


def load_csv(path):
    """Load data from a CSV file.

    Args:
        path (str): A path to the CSV format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
                         should be dense.  By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    with open(path) as f:
        line = f.readline().strip()

    X = np.loadtxt(path, delimiter=',',
                   skiprows=0 if is_number(line.split(',')[0]) else 1)

    y = np.array(X[:, 0]).flatten()
    X = X[:, 1:]

    return X, y


def load_hdf5(path):
    """Load data from a HDF5 file.

    Args:
        path (str): A path to the HDF5 format file containing data.
        dense (boolean): An optional variable indicating if the return matrix
                         should be dense.  By default, it is false.

    Returns:
        Data matrix X and target vector y
    """

    with h5py.File(path, 'r') as f:
        is_sparse = f['issparse'][...]
        if is_sparse:
            shape = tuple(f['shape'][...])
            data = f['data'][...]
            indices = f['indices'][...]
            indptr = f['indptr'][...]
            X = sparse.csr_matrix((data, indices, indptr), shape=shape)
        else:
            X = f['data'][...]

        y = f['target'][...]

    return X, y


def read_sps(path):
    """Read a LibSVM file line-by-line.

    Args:
        path (str): A path to the LibSVM file to read.

    Yields:
        data (list) and target (int).
    """

    for line in open(path):
        # parse x
        xs = line.rstrip().split(' ')

        yield xs[1:], int(xs[0])


def shuf_file(f, shuf_win):
    heap = []
    for line in f:
        key = hash(line)
        if len(heap) < shuf_win:
            heapq.heappush(heap, (key, line))
        else:
            _, out = heapq.heappushpop(heap, (key, line))
            yield out

    while len(heap) > 0:
        _, out = heapq.heappop(heap)
        yield out


class PathJoiner:
    """Load directory names from SETTINGS.json.

    Originally written by Baris Umog (https://www.kaggle.com/barisumog).

    Usage:
        # In SETTINGS.json, "data": "/path/to/data/".
        # To load "/path/to/data/targets.array" file to y:
        PATH = PathJoiner()
        y = load(PATH.data('targets.array'))
    """

    def __init__(self, filename='SETTINGS.json'):
        with open(filename) as file:
            self.subdirs = json.load(file)

    def __getattr__(self, attr):
        subdir = self.subdirs[attr]
        return lambda *dirs: os.path.join(subdir, *dirs)


def stream_lines(filename, encoding='utf-8', ignore_errors=False):
    errors = 'ignore' if ignore_errors else 'strict'
    with open(filename, encoding=encoding, errors=errors) as file:
        for line in file:
            yield line


def stream_csv(filename, encoding='utf-8', ignore_errors=False):
    stream = stream_lines(filename, encoding, ignore_errors)
    return csv.reader(stream)


def limit_stream(stream, count=1, skip=0):
    for i in range(skip):
        next(stream)
    for i in range(count):
        yield next(stream)


def save_obj(filename, obj):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info('saved : {}\t{}'.format(filename, type(obj)))


def load_obj(filename):
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
    logger.info('loaded : {}\t{}'.format(filename, type(obj)))
    return obj


def save_array(filename, X):
    with h5py.File(filename, 'w') as file:
        file['data'] = X
    logger.info('saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape))


def load_array(filename):
    with h5py.File(filename, 'r') as file:
        X = file['data'][...]
    logger.info('loaded : {}\t{}\t{}'.format(filename, X.dtype, X.shape))
    return X


def save_sparse(filename, X):
    with h5py.File(filename, 'w') as file:
        file['shape'] = np.array(X.shape)
        file['data'] = X.data
        file['indices'] = X.indices
        file['indptr'] = X.indptr
    logger.info('saved : {}\t{}\t{}'.format(filename, X.dtype, X.shape))


def load_sparse(filename):
    with h5py.File(filename, 'r') as file:
        shape = tuple(file['shape'][...])
        data = file['data'][...]
        indices = file['indices'][...]
        indptr = file['indptr'][...]
    X = sparse.csr_matrix((data, indices, indptr), shape=shape)
    logger.info('loaded : {}\t{}\t{}'.format(filename, X.dtype, X.shape))
    return X


def save(filename, X):
    catalog = {'obj': save_obj, 'array': save_array, 'sparse': save_sparse}
    extension = filename.split('.')[-1]
    func = catalog[extension]
    func(filename, X)


def load(filename):
    catalog = {'obj': load_obj, 'array': load_array, 'sparse': load_sparse}
    extension = filename.split('.')[-1]
    func = catalog[extension]
    X = func(filename)
    return X


class Clock(object):

    def __init__(self):
        self.start = time.time()
        self.last = self.start
        self.now = self.start
        self.report()

    def check(self):
        self.now = time.time()
        self.report()
        self.last = self.now

    def report(self):
        txt = '\n[CLOCK]  [ {} ]    '
        txt += 'since start: [ {} ]    since last: [ {} ]\n'
        current = time.asctime().split()[3]
        since_start = datetime.timedelta(seconds=round(self.now - self.start))
        since_last = datetime.timedelta(seconds=round(self.now - self.last))
        logger.info(txt.format(current, since_start, since_last))


def beep(n=1):
    for _ in range(n):
        os.system('beep')


def print_shape_type(*objs):
    for obj in objs:
        try:
            logger.info(obj.shape, obj.dtype, type(obj))
        except AttributeError:
            logger.error(obj.shape, type(obj))