#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import collections.abc
import csv
import functools
import json
import logging
import os.path
import pickle
import random
import re

import h5py
import numpy as np
import pandas as pd
from pandas.errors import ParserError
from sklearn.model_selection import KFold

from ludwig.constants import SPLIT

logger = logging.getLogger(__name__)


def get_abs_path(data_csv_path, file_path):
    """Resolve file_path relative to the data csv directory, if one is given."""
    if data_csv_path is not None:
        return os.path.join(data_csv_path, file_path)
    return file_path


def load_csv(data_fp):
    """Load an entire CSV file as a list of row lists.

    :param data_fp: path to the csv file
    :return: list of rows, each row a list of string fields
    """
    # The csv module in Python 3 requires a text-mode file opened with
    # newline='' (the previous 'rb' mode passed bytes and failed).
    with open(data_fp, 'r', newline='') as f:
        return list(csv.reader(f))


def read_csv(data_fp, header=0, nrows=None, skiprows=None):
    """Helper method to read a csv file.

    Wraps around pd.read_csv to handle some exceptions.
    Can extend to cover cases as necessary.

    :param data_fp: path to the csv file
    :param header: header argument for pandas to read the csv
    :param nrows: number of rows to read from the csv, None means all
    :param skiprows: number of rows to skip from the csv, None means no skips
    :return: Pandas dataframe with the data
    """
    separator = ','
    # Sniff the delimiter from the first ~100KB; fall back to comma.
    with open(data_fp, 'r', encoding="utf8") as csvfile:
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024 * 100),
                                          delimiters=[',', '\t', '|'])
            separator = dialect.delimiter
        except csv.Error:
            # Could not conclude the delimiter, defaulting to comma
            pass
    try:
        df = pd.read_csv(data_fp, sep=separator, header=header,
                         nrows=nrows, skiprows=skiprows)
    except ParserError:
        logger.warning('Failed to parse the CSV with pandas default way,'
                       ' trying \\ as escape character.')
        df = pd.read_csv(data_fp, sep=separator, header=header,
                         escapechar='\\', nrows=nrows, skiprows=skiprows)
    return df


def save_csv(data_fp, data):
    """Write rows to a CSV file.

    Non-iterable values and plain strings are wrapped into single-element
    rows so scalars serialize as one cell instead of one char per cell.
    """
    # newline='' prevents the csv writer from emitting blank lines on Windows.
    with open(data_fp, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for row in data:
            # collections.Iterable was removed in Python 3.10; use the
            # collections.abc alias instead.
            if not isinstance(row, collections.abc.Iterable) \
                    or isinstance(row, str):
                row = [row]
            writer.writerow(row)


def csv_contains_column(data_fp, column_name):
    """Return True if the CSV header contains column_name."""
    return column_name in read_csv(data_fp, nrows=0)  # only loads header


def load_json(data_fp):
    """Load and return the JSON document stored at data_fp."""
    with open(data_fp, 'r') as input_file:
        return json.load(input_file)


def save_json(data_fp, data, sort_keys=True, indent=4):
    """Serialize data to JSON at data_fp, handling numpy types."""
    with open(data_fp, 'w') as output_file:
        json.dump(data, output_file, cls=NumpyEncoder,
                  sort_keys=sort_keys, indent=indent)


# to be tested
# also, when loading an hdf5 file
# most of the times you don't want
# to put everything in memory
# like this function does
# it's just for convenience for relatively small datasets
def load_hdf5(data_fp):
    """Load every top-level dataset of an HDF5 file fully into memory."""
    data = {}
    with h5py.File(data_fp, 'r') as h5_file:
        for key in h5_file.keys():
            data[key] = h5_file[key][()]
    return data


def save_hdf5(data_fp, data, metadata=None):
    """Save a dict of arrays as HDF5 datasets.

    If the file already exists it is opened for update ('r+'), otherwise
    created ('w'). Per-feature preprocessing metadata, when provided,
    stores an 'in_memory' attribute on the matching dataset.

    :param data_fp: path to the hdf5 file
    :param data: dict mapping dataset name -> array-like value
    :param metadata: optional dict of per-feature metadata
    """
    if metadata is None:
        metadata = {}
    mode = 'w'
    if os.path.isfile(data_fp):
        mode = 'r+'
    with h5py.File(data_fp, mode) as h5_file:
        for key, value in data.items():
            dataset = h5_file.create_dataset(key, data=value)
            if key in metadata:
                if 'in_memory' in metadata[key]['preprocessing']:
                    if metadata[key]['preprocessing']['in_memory']:
                        dataset.attrs['in_memory'] = True
                    else:
                        dataset.attrs['in_memory'] = False


def load_object(object_fp):
    """Unpickle and return the object stored at object_fp.

    NOTE: pickle.load must never be used on untrusted files.
    """
    with open(object_fp, 'rb') as f:
        return pickle.load(f)


def save_object(object_fp, obj):
    """Pickle obj to object_fp."""
    with open(object_fp, 'wb') as f:
        pickle.dump(obj, f)


def load_array(data_fp, dtype=float):
    """Load a one-value-per-line text file into a 1-D numpy array."""
    list_num = []
    with open(data_fp, 'r') as input_file:
        for x in input_file:
            list_num.append(dtype(x.strip()))
    return np.array(list_num)


def load_matrix(data_fp, dtype=float):
    """Load a whitespace-separated text matrix into a (squeezed) numpy array."""
    list_num = []
    with open(data_fp, 'r') as input_file:
        for row in input_file:
            list_num.append([dtype(elem) for elem in row.strip().split()])
    return np.squeeze(np.array(list_num))


def save_array(data_fp, array):
    """Write every element of array to data_fp, one value per line."""
    with open(data_fp, 'w') as output_file:
        for x in np.nditer(array):
            output_file.write(str(x) + '\n')


def load_pretrained_embeddings(embeddings_path, vocab):
    """Build an embedding matrix for vocab from a GloVe-format file.

    Words missing from the pretrained file get the average embedding plus
    small uniform noise, so they start near the center of the cloud but
    are still distinguishable from each other.

    :param embeddings_path: path to a GloVe-format embeddings file
    :param vocab: iterable of words, one matrix row per word (in order)
    :return: numpy matrix of shape (len(vocab), embeddings_size)
    """
    embeddings = load_glove(embeddings_path)

    # find out the size of the embeddings
    embeddings_size = len(next(iter(embeddings.values())))

    # calculate an average embedding, to use for initializing missing words
    avg_embedding = np.zeros(embeddings_size)
    count = 0
    for word in vocab:
        if word in embeddings:
            avg_embedding += embeddings[word]
            count += 1
    if count > 0:
        avg_embedding /= count

    # create the embedding matrix
    embeddings_vectors = []
    for word in vocab:
        if word in embeddings:
            embeddings_vectors.append(embeddings[word])
        else:
            embeddings_vectors.append(
                avg_embedding +
                np.random.uniform(-0.01, 0.01, embeddings_size))
    embeddings_matrix = np.stack(embeddings_vectors)

    # let's help the garbage collector free some memory
    embeddings = None

    return embeddings_matrix


@functools.lru_cache(1)
def load_glove(file_path):
    """Parse a GloVe-format embeddings file into a dict word -> np.ndarray.

    Malformed lines (wrong token count or non-float values) are skipped
    with a warning. The result is cached, so repeated calls with the same
    path do not re-read the file.
    """
    logger.info(' Loading Glove format file {}'.format(file_path))
    embeddings = {}
    embedding_size = 0

    # collect embeddings size assuming the first line is correct
    with open(file_path, 'r', encoding='utf-8') as f:
        # iterate instead of looping on readline(): the previous
        # `while not found_line` loop never terminated on an empty file
        for line in f:
            if line:
                embedding_size = len(line.split()) - 1
                break

    # collect embeddings
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if line:
                try:
                    split = line.split()
                    if len(split) != embedding_size + 1:
                        raise ValueError
                    word = split[0]
                    embedding = np.array(
                        [float(val) for val in split[-embedding_size:]]
                    )
                    embeddings[word] = embedding
                except ValueError:
                    logger.warning(
                        'Line {} in the GloVe file {} is malformed, '
                        'skipping it'.format(
                            line_number, file_path
                        )
                    )
    logger.info(' {0} embeddings loaded'.format(len(embeddings)))
    return embeddings


def split_data(split, data):
    """Shuffle data in place and split it into two parts.

    :param split: fraction of the data assigned to the first part
    :param data: list to shuffle and split
    :return: (first split_length items, the remainder)
    """
    # type: (float, list) -> (list, list)
    split_length = int(round(split * len(data)))
    random.shuffle(data)
    return data[:split_length], data[split_length:]


def shuffle_unison_inplace(list_of_lists, random_state=None):
    """Shuffle several same-length arrays with one shared permutation."""
    if list_of_lists:
        assert all(len(l) == len(list_of_lists[0]) for l in list_of_lists)
        if random_state is not None:
            p = random_state.permutation(len(list_of_lists[0]))
        else:
            p = np.random.permutation(len(list_of_lists[0]))
        return [l[p] for l in list_of_lists]
    return None


def shuffle_dict_unison_inplace(np_dict, random_state=None):
    """Shuffle all arrays in a dict with one shared permutation.

    NOTE: despite the name, this returns a NEW dict with the shuffled
    arrays; np_dict itself is not mutated.
    """
    keys = list(np_dict.keys())
    list_of_lists = list(np_dict.values())

    # shuffle up the list of lists according to previous fct
    shuffled_list = shuffle_unison_inplace(list_of_lists, random_state)
    recon = {}
    for ii in range(len(keys)):
        dkey = keys[ii]
        recon[dkey] = shuffled_list[ii]

    return recon


def shuffle_inplace(np_dict):
    """Shuffle every array of np_dict in place with one shared permutation."""
    if len(np_dict) == 0:
        return
    size = np_dict[next(iter(np_dict))].shape[0]
    for k in np_dict:
        if np_dict[k].shape[0] != size:
            raise ValueError(
                'Invalid: dictionary contains variable length arrays')
    p = np.random.permutation(size)
    for k in np_dict:
        np_dict[k] = np_dict[k][p]


def split_dataset_tvt(dataset, split):
    """Split a dataset dict into train / test / validation subsets.

    The split array marks rows with 0 (train), 1 (validation), 2 (test).
    NOTE: return order is (training, test, validation) — callers rely
    on this order, so it is preserved here.
    """
    if SPLIT in dataset:
        del dataset[SPLIT]
    training_set = split_dataset(dataset, split, value_to_split=0)
    validation_set = split_dataset(dataset, split, value_to_split=1)
    test_set = split_dataset(dataset, split, value_to_split=2)
    return training_set, test_set, validation_set


def split_dataset(dataset, split, value_to_split=0):
    """Select the rows of every array where split == value_to_split.

    :return: dict of filtered arrays, or None if any selection is empty
    """
    splitted_dataset = {}
    for key in dataset:
        splitted_dataset[key] = dataset[key][split == value_to_split]
        if len(splitted_dataset[key]) == 0:
            return None
    return splitted_dataset


def collapse_rare_labels(labels, labels_limit):
    """Clamp label ids >= labels_limit to labels_limit (mutates labels)."""
    if labels_limit > 0:
        labels[labels >= labels_limit] = labels_limit
    return labels


def class_counts(dataset, labels_field):
    """Return per-class counts for the (integer) labels in labels_field."""
    return np.bincount(dataset[labels_field].flatten()).tolist()


def text_feature_data_field(text_feature):
    """Build the hdf5 data field name for a text feature: <name>_<level>."""
    return text_feature['name'] + '_' + text_feature['level']


def load_from_file(file_name, field=None, dtype=int, ground_truth_split=2):
    """Load experiment data from supported file formats.

    Experiment data can be test/train statistics, model predictions,
    probability, ground truth, ground truth metadata.

    :param file_name: Path to file to be loaded
    :param field: Target Prediction field.
    :param dtype:
    :param ground_truth_split: Ground truth split filter where 0 is train
        1 is validation and 2 is test split. By default test split is used
        when loading ground truth from hdf5.
    :return: Experiment data as array
    """
    if file_name.endswith('.hdf5') and field is not None:
        # context manager guarantees the file is closed even on errors
        with h5py.File(file_name, 'r') as hdf5_data:
            split = hdf5_data[SPLIT][()]
            column = hdf5_data[field][()]
        array = column[split == ground_truth_split]  # ground truth
    elif file_name.endswith('.npy'):
        array = np.load(file_name)
    elif file_name.endswith('.csv'):
        array = read_csv(file_name, header=None).values
    else:
        array = load_matrix(file_name, dtype)
    return array


def replace_file_extension(file_path, extension):
    """Return a file path for a file with same name but different format.

    a.csv, json -> a.json
    a.csv, hdf5 -> a.hdf5

    :param file_path: original file path
    :param extension: file extension
    :return: file path with same name but different format
    """
    if file_path is None:
        return None
    if '.' in extension:
        # Handle the case if the user calls with '.hdf5' instead of 'hdf5'
        extension = extension.replace('.', '').strip()
    return os.path.splitext(file_path)[0] + '.' + extension


def file_exists_with_diff_extension(file_path, extension):
    """True when file_path is None or a sibling with extension exists."""
    return file_path is None or \
           os.path.isfile(replace_file_extension(file_path, extension))


def add_sequence_feature_column(df, col_name, seq_length):
    """Adds a new column to the dataframe computed from an existing column.

    Values in the new column are space-delimited strings composed of
    preceding values of the same column up to seq_length. For example
    values of the i-th row of the new column will be a space-delimited
    string of df[col_name][i-seq_length].

    :param df: input dataframe
    :param col_name: column name containing sequential data
    :param seq_length: length of an array of preceding column values to use
    """
    if col_name not in df.columns.values:
        logger.error('{} column does not exist'.format(col_name))
        return

    new_col_name = col_name + '_feature'
    if new_col_name in df.columns.values:
        logger.warning(
            '{} column already exists, values will be overridden'.format(
                new_col_name
            )
        )

    # the first seq_length rows have no full history; they are filled
    # later by back-filling from the first complete value
    new_data = [None] * seq_length
    old_data = np.array(df[col_name])

    for i in range(seq_length, len(df)):
        new_data.append(' '.join(
            str(j) for j in old_data[i - seq_length: i]
        ))

    df[new_col_name] = new_data
    df[new_col_name] = df[new_col_name].fillna(method='backfill')


def override_in_memory_flag(input_features, override_value):
    """Force preprocessing['in_memory'] on every feature that defines it.

    :return: the number of features whose flag was overridden
    """
    num_overrides = 0
    for feature in input_features:
        if 'preprocessing' in feature:
            if 'in_memory' in feature['preprocessing']:
                feature['preprocessing']['in_memory'] = override_value
                num_overrides += 1
    return num_overrides


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands numpy scalars/arrays, sets and tuples."""

    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        elif isinstance(obj, tuple):
            return list(obj)
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return json.JSONEncoder.default(self, obj)


def generate_kfold_splits(data_df, num_folds, random_state):
    """Yield (train_indices, test_indices, fold_num) for each fold (1-based)."""
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    fold_gen = enumerate(kf.split(data_df), start=1)
    for fold_num, (train_indices, test_indices) in fold_gen:
        yield train_indices, test_indices, fold_num


def get_path_size(
        start_path,
        regex_accept=None,
        regex_reject=None
):
    """Total size in bytes of regular files under start_path.

    Symlinks are skipped. When provided, regex_accept must match (and
    regex_reject must not match) the bare filename for the file to count.
    """
    total_size = 0
    pattern_accept = re.compile(regex_accept) if regex_accept else None
    pattern_reject = re.compile(regex_reject) if regex_reject else None

    for dirpath, dirnames, filenames in os.walk(start_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            if not os.path.islink(filepath):
                accepted = True
                if pattern_accept:
                    accepted = accepted and pattern_accept.match(filename)
                if pattern_reject:
                    accepted = accepted and not pattern_reject.match(filename)
                if accepted:
                    total_size += os.path.getsize(filepath)

    return total_size