python source code of data

from __future__ import absolute_import, division, print_function
import sys
import os
curr_path = os.path.abspath(os.path.dirname(__file__))
sys.path = [os.path.dirname(os.path.dirname(curr_path)), curr_path] + sys.path
curr_path = None
try:
    import cPickle as pickle
except:
    import pickle
import logging
import csv
import h5py
import numpy as np
import pandas as pd
import re
import auto_deepnet.utils.exceptions as exceptions

logger = logging.getLogger("auto_deepnet")
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


'''
function: save_pickle_data
inputs:
    - file_path: string pathname to save data to
    - data_frame: pandas data_frame to save to disk in any picklable format
    - pandas_format (optional): whether to save as a pandas dataframe or as a numpy array
    - append (optional): whether to append data to preexisting data. Requires data to be in the same format
    - mode (optional): The mode to open file as
description:
    helper function to save any data to disk via pickling
'''
def save_pickle_data(file_path, data_frame, **kwargs):
    logger.info("Opening pickle file {} to write data...".format(file_path))
    pandas_format = kwargs.get('pandas_format', True)
    append = kwargs.get('append', False)
    mode = kwargs.get('mode', 'wb')
    if append and os.path.isfile(file_path):
        logger.info("Opening file to append data...")
        try:
            data_frame = pd.concat((load_pickle_data(file_path), data_frame))
        except Exception as e:
            logger.exception("Error appending data from {}: {}".format(file_path), e)
    try:
        if 'pandas_format' not in kwargs or pandas_format:
            data_frame.to_pickle(file_path)
        else:
            with open(file_path, mode) as f:
                pickle.dump(data_frame.values, f)
    except Exception as e:
        logger.exception("Failed with Error {0}".format(e))
        raise exceptions.FileSaveError
    logger.info("Successfully saved pickle data")


'''
function: load_pickle_data
inputs:
    - file_path: string pathname to load data from
    - mode: the mode to open file as
    helper function to load any pickled data from disk
'''
def load_pickle_data(file_path, **kwargs):
    mode = kwargs.get('mode', 'rb')
    logger.info("Opening pickle file {} to read...".format(file_path))
    try:
        with open(file_path, mode) as f:
            data = pickle.load(f)
    except Exception as e:
        logger.exception("Failed with Error {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully read pickle data")
    return data


'''
function: save_hdf5_data
inputs:
    - file_path: string pathname to save data to
    - data_frame: the pandas dataframe to save to disk
    - key (optional): The name to call the dataset
    - pandas_format (optional): whether to save as a pandas structure or default hdf5
    - mode (optional): The mode to open file as
    - format (optional): whether to save as a table or fixed dataset
    - append (optional): Whether data should be appended or replaced
'''
def save_hdf5_data(file_path, data_frame, **kwargs):
    pandas_format = kwargs.get('pandas_format', True)
    key = kwargs.get('key', 'data')
    mode = kwargs.get('mode', 'a')
    format = kwargs.get('format', 'table')
    append = kwargs.get('append', False)
    logger.info("Opening HDF5 file {} to write data...".format(file_path))
    try:
        if pandas_format:
            with pd.HDFStore(file_path, mode=mode) as f:
                if key in f and not append:
                    f.remove(key)
                f.put(key=key, value=data_frame, format=format, append=append)
        else:
            if key == None:
                logger.error("Need a key when saving as default HDF5 format")
                raise exceptions.FileSaveError
            with h5py.File(file_path, mode) as f:
                if key in f:
                    if append:
                        data_frame = pd.concat((pd.DataFrame(f[key]), data_frame))
                    del f[key]
                f.create_dataset(key, data=data_frame.values)
    except Exception as e:
        logger.exception("Failed with Error {0}".format(e))
        raise exceptions.FileSaveError
    logger.info("Successfully saved hdf5 data")


'''
function: load_hdf5_file
inputs:
    - file_path: string pathname to load data from
    - key (optional): name of the dataset
    - pandas_format (optional): whether the file was saved in pandas format
    - mode (optional): The mode to open the file as
description:
    helper function to load an hdf5 file from disk
'''
def load_hdf5_data(file_path, **kwargs):
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError as e:
        logger.exception("Dataset {} does not exist".format(dataset))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data


'''
function: save_csv_data
inputs:
    - file_path: string pathname to load data from
    - data_frame: pandas data to save to csv
    - append: whether to append to preexisting data
    - mode (optional): The mode to open the file as
other inputs:
    - any inputs to pd.DataFrame.to_csv() (optional)
'''
def save_csv_data(file_path, data_frame, **kwargs):
    logger.info("Opening CSV file {} to write data".format(file_path))
    if 'index' not in kwargs:
        kwargs['index'] = False
    if 'mode' not in kwargs:
        kwargs['mode'] = 'w'
    append = kwargs.pop('append', False)
    kwargs.pop('pandas_format', None)
    kwargs.pop('format', None)
    try:
        if append:
            data_frame.to_csv(file_path, index=False, mode='a', header=False)
        else:
            data_frame.to_csv(file_path, **kwargs)
    except Exception as e:
        logger.exception("Problem saving dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully saved CSV data")


'''
function: load_csv_data
inputs:
    - file_path: string pathname to load data from
other inputs:
    - any inputs used by pd.read_csv() (optional)
'''
def load_csv_data(file_path, **kwargs):
    kwargs.pop('pandas_format', None)
    kwargs.pop('mode', None)
    logger.info("Opening CSV file {} to read...".format(file_path))
    try:
        data = pd.read_csv(file_path, **kwargs)
    except Exception as e:
        logger.exception("Problem reading CSV: {0}".format(e))
        raise exceptions.FileSaveError
    logger.info("Successfully loaded CSV data")
    return data


'''
function: save_data
inputs:
    - file_path: string pathname to save data to
    - data_frame: data to save to disk
    - save_format (optional): format to save to disk
    - overwrite (optional): whether to overwrite preexisting data
    - mode (optional): mode to open file in
    - key: The name to save the data as (required if hdf5 format, deprecated otherwise)
    - pandas_format (optional): whether to save as a pandas dataframe or as a numpy array
    - append (optional): whether to append data
additional inputs:
    - Any inputs that can be used by other saver functions
'''
def save_data(file_path, data_frame, save_format='hdf5', overwrite=False, mode='a', **kwargs):
    if 'key' not in kwargs and save_format == 'hdf5':
        logger.warning("No key specified, defaulting to 'data'")
        kwargs['key'] = 'data'
    if save_format != 'csv':
        if 'pandas_format' not in kwargs:
            kwargs['pandas_format'] = True
        if 'format' not in kwargs:
            kwargs['format'] = 'table'
    if 'append' not in kwargs:
        kwargs['append'] = False
    if 'index' not in kwargs:
        kwargs['index'] = False
    logger.info("Attempting to save data to {}...".format(file_path))
    try:
        dir_name, file_name = os.path.split(file_path)
    except Exception as e:
        logger.exception("Error with file path {}: {}".format(file_path, e))
        raise exceptions.FileSaveError("Invalid file path")
    if len(dir_name) > 0 and not os.path.isdir(dir_name):
        logger.info("Directory {} does not exist. Creating...".format(dir_name))
        os.makedirs(dir_name)
    if os.path.isfile(file_path):
        if not overwrite:
            logger.error("File {} already exists.".format(file_path))
            raise exceptions.FileSaveError
        if (mode == 'w' or save_format == 'pickle'):
            logger.warning("File {} will be overwritten".format(file_path))
            os.remove(file_path)
    if (mode == 'a' and save_format == 'pickle'):
        logger.warning("Can't use mode='a' for writing to pickle files. using mode='wb' instead...")
        mode = 'wb'
    
    saver = {
        'hdf5': save_hdf5_data,
        'csv': save_csv_data,
        'pickle': save_pickle_data
    }
    try:
        saver.get(save_format, save_hdf5_data)(file_path, data_frame, mode=mode, **kwargs)
    except Exception as e:
        logger.exception("Error saving file {}".format(file_path))
        raise exceptions.FileSaveError


'''
function: load_data
inputs:
    - file_path: string pathname to load data from
    - load_format: format to load data as
additional inputs:
    - any inputs used by other loader functions
'''
def load_data(file_path, load_format='hdf5', **kwargs):
    if 'key' not in kwargs and load_format == 'hdf5':
        kwargs['key'] = None
    if load_format != 'csv' and 'pandas_format' not in kwargs:
        kwargs['pandas_format'] = True
    if 'mode' not in kwargs:
        if load_format == 'pickle':
            kwargs['mode'] = 'rb'
        elif load_format == 'hdf5':
            kwargs['mode'] = 'r'
    logger.info("Attempting to load data from {}...".format(file_path))
    if not os.path.isfile(file_path):
        logger.error("File {} does not exist".format(file_path))
    loader = {
        'hdf5': load_hdf5_data,
        'csv': load_csv_data,
        'pickle': load_pickle_data
    }
    try:
        return loader.get(load_format, load_hdf5_data)(file_path, **kwargs)
    except Exception as e:
        logger.exception("Error loading file {}".format(file_path))
        raise exceptions.FileLoadError