# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License'). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the 'license' file accompanying this file. This file is
# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import cgi
import csv
import logging
import os

import mlio
from mlio.integ.arrow import as_arrow_file
from mlio.integ.numpy import as_numpy
from mlio.integ.scipy import to_coo_matrix
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sagemaker_containers import _content_types
from scipy.sparse import vstack as scipy_vstack
import xgboost as xgb

from sagemaker_algorithm_toolkit import exceptions as exc
from sagemaker_xgboost_container.constants import xgb_content_types


# Number of examples requested per batch from the mlio readers.
BATCH_SIZE = 4000

# Canonical short names returned by get_content_type().
CSV = 'csv'
LIBSVM = 'libsvm'
PARQUET = 'parquet'
RECORDIO_PROTOBUF = 'recordio-protobuf'

# Every content type string accepted in the data configuration.
VALID_CONTENT_TYPES = [
    CSV,
    LIBSVM,
    PARQUET,
    RECORDIO_PROTOBUF,
    _content_types.CSV,
    xgb_content_types.LIBSVM,
    xgb_content_types.X_LIBSVM,
    xgb_content_types.X_PARQUET,
    xgb_content_types.X_RECORDIO_PROTOBUF,
]

# Content type strings accepted for pipe mode input (no LIBSVM entries).
VALID_PIPED_CONTENT_TYPES = [
    CSV,
    PARQUET,
    RECORDIO_PROTOBUF,
    _content_types.CSV,
    xgb_content_types.X_PARQUET,
    xgb_content_types.X_RECORDIO_PROTOBUF,
]

# Message template; '{invalid_content_type}' is filled in with str.format.
INVALID_CONTENT_TYPE_ERROR = (
    "{invalid_content_type} is not an accepted ContentType: "
    + ", ".join(str(c) for c in VALID_CONTENT_TYPES)
    + "."
)

# Message template; filled in with line_snippet, file_name and content_type.
INVALID_CONTENT_FORMAT_ERROR = (
    "First line '{line_snippet}...' of file '{file_name}' is not "
    "'{content_type}' format. Please ensure the file is in '{content_type}' format."
)
def _get_invalid_content_type_error_msg(invalid_content_type):
    """Return the error message for a content type that is not accepted.

    :param invalid_content_type: the rejected content type value
    :return: formatted INVALID_CONTENT_TYPE_ERROR message
    """
    return INVALID_CONTENT_TYPE_ERROR.format(invalid_content_type=invalid_content_type)


def _get_invalid_libsvm_error_msg(line_snippet, file_name):
    """Return the error message for a file whose content is not LIBSVM formatted.

    :param line_snippet: beginning of the offending line
    :param file_name: name of the offending file
    :return: formatted INVALID_CONTENT_FORMAT_ERROR message
    """
    return INVALID_CONTENT_FORMAT_ERROR.format(
        line_snippet=line_snippet, file_name=file_name, content_type='LIBSVM')


def _get_invalid_csv_error_msg(line_snippet, file_name):
    """Return the error message for a file whose content is not CSV formatted.

    :param line_snippet: beginning of the offending line
    :param file_name: name of the offending file
    :return: formatted INVALID_CONTENT_FORMAT_ERROR message
    """
    return INVALID_CONTENT_FORMAT_ERROR.format(
        line_snippet=line_snippet, file_name=file_name, content_type='CSV')


def get_content_type(content_type_cfg_val):
    """Get content type from data config.

    Assumes that training and validation data have the same content type.

    ['libsvm', 'text/libsvm ;charset=utf8', 'text/x-libsvm'] will return 'libsvm'
    ['csv', 'text/csv', 'text/csv; label_size=1'] will return 'csv'

    :param content_type_cfg_val: raw content type value, or None
    :return: Parsed content type; LIBSVM when content_type_cfg_val is None
    :raises exc.UserError: if the content type is not accepted, or if a csv
        content type carries a label_size parameter other than '1'
    """
    if content_type_cfg_val is None:
        return LIBSVM

    # cgi.parse_header extracts all arguments after ';' as key-value pairs
    # e.g. cgi.parse_header('text/csv;label_size=1;charset=utf8') returns
    # the tuple ('text/csv', {'label_size': '1', 'charset': 'utf8'})
    # NOTE(review): the cgi module is deprecated (PEP 594) and removed in
    # Python 3.13; this call needs a replacement before upgrading.
    content_type, params = cgi.parse_header(content_type_cfg_val.lower())

    if content_type in [CSV, _content_types.CSV]:
        # CSV content type allows a label_size parameter
        # that should be 1 for XGBoost
        if params and 'label_size' in params and params['label_size'] != '1':
            msg = "{} is not an accepted csv ContentType. "\
                "Optional parameter label_size must be equal to 1".format(content_type_cfg_val)
            raise exc.UserError(msg)
        return CSV
    elif content_type in [LIBSVM, xgb_content_types.LIBSVM, xgb_content_types.X_LIBSVM]:
        return LIBSVM
    elif content_type in [PARQUET, xgb_content_types.X_PARQUET]:
        return PARQUET
    elif content_type in [RECORDIO_PROTOBUF, xgb_content_types.X_RECORDIO_PROTOBUF]:
        return RECORDIO_PROTOBUF
    else:
        raise exc.UserError(_get_invalid_content_type_error_msg(content_type_cfg_val))


def _is_data_file(file_path, file_name):
    """Return true if file name is a valid data file name.

    A file is valid if:
    * It is a regular file inside file_path.
    * File name does not start with '.' or '_'.
    * File is not a XGBoost cache file.

    :param file_path: directory containing the file
    :param file_name: name of the file inside file_path
    :return: bool
    """
    if not os.path.isfile(os.path.join(file_path, file_name)):
        return False

    if file_name.startswith(('.', '_')):
        return False

    # avoid XGB cache file
    if '.cache' in file_name and ('dtrain' in file_name or 'dval' in file_name):
        return False

    return True


def _get_csv_delimiter(sample_csv_line):
    """Infer the delimiter of a CSV line with csv.Sniffer.

    :param sample_csv_line: one line of CSV text
    :return: the detected delimiter character
    :raises exc.UserError: if the delimiter cannot be determined
    """
    try:
        delimiter = csv.Sniffer().sniff(sample_csv_line).delimiter
        logging.info("Determined delimiter of CSV input is '{}'".format(delimiter))
    except Exception as e:
        raise exc.UserError("Could not determine delimiter on line {}:\n{}".format(sample_csv_line[:50], e))
    return delimiter


def _get_num_valid_libsvm_features(libsvm_line):
    """Get number of valid LIBSVM features.

    XGBoost expects the following LIBSVM format:
        <label>(:<instance weight>) <index>:<value> <index>:<value> <index>:<value> ...

    :param libsvm_line: one line of LIBSVM text
    :return: -1 if the line is not a valid LIBSVM line; otherwise, return number of correctly formatted features
    """
    split_line = libsvm_line.split(' ')

    if not _is_valid_libsvm_label(split_line[0]):
        logging.error("{} does not follow LIBSVM label format <label>(:<weight>).".format(split_line[0]))
        return -1

    num_sparse_features = 0
    for feature in split_line[1:]:
        # Validate the token being visited: it must be exactly
        # '<index>:<value>', i.e. split into two parts on ':'.
        if len(feature.split(':')) != 2:
            return -1
        num_sparse_features += 1

    return num_sparse_features


def _is_valid_libsvm_label(libsvm_label):
    """Check if LIBSVM label is formatted like so:

    <label> if just label
    <label>:<instance_weight> if label and instance weight both exist

    :param libsvm_label: first space-separated token of a LIBSVM line
    :return: True if every ':'-separated part (at most two) parses as a float
    """
    split_label = libsvm_label.split(':')

    if len(split_label) > 2:
        return False

    for label_part in split_label:
        try:
            float(label_part)
        except ValueError:
            return False

    return True


def _validate_csv_format(file_path):
    """Validate that data file is CSV format.

    Check that delimiter can be inferred.

    Note: This only validates the first line in the file. This is not a comprehensive file check,
    as XGBoost will have its own data validation.

    :param file_path: path of the file to check
    :raises exc.UserError: if no delimiter can be inferred from the first line
    """
    with open(file_path, 'r', errors='ignore') as read_file:
        line_to_validate = read_file.readline()
        _get_csv_delimiter(line_to_validate)


def _validate_libsvm_format(file_path):
    """Validate that data file is LIBSVM format.

    XGBoost expects the following LIBSVM format:
        <label>(:<instance weight>) <index>:<value> <index>:<value> <index>:<value> ...

    Note: This only validates the first line that has a feature. This is not a comprehensive file check,
    as XGBoost will have its own data validation.

    :param file_path: path of the file to check
    :raises exc.UserError: if a malformed line is met before the first line with features
    """
    file_name = os.path.basename(file_path)

    with open(file_path, 'r', errors='ignore') as read_file:
        for line_to_validate in read_file:
            num_sparse_libsvm_features = _get_num_valid_libsvm_features(line_to_validate)

            if num_sparse_libsvm_features > 0:
                # Return after first valid LIBSVM line with features
                return
            elif num_sparse_libsvm_features < 0:
                raise exc.UserError(_get_invalid_libsvm_error_msg(
                    line_snippet=line_to_validate[:50], file_name=file_name))

    # Only reached when every line held a label and no features.
    logging.warning("File {} is not an invalid LIBSVM file but has no features. Accepting simple validation.".format(
        file_name))


def validate_data_file_path(data_path, content_type):
    """Validate data in data_path are formatted correctly based on content_type.

    Note: This is not a comprehensive validation. XGBoost has its own content validation.

    :param data_path: file or directory to validate
    :param content_type: raw content type value, parsed with get_content_type
    :raises exc.UserError: if data_path does not exist or cannot be walked, the content
        type is not accepted, or a CSV/LIBSVM file fails its format check
    """
    parsed_content_type = get_content_type(content_type)

    if not os.path.exists(data_path):
        raise exc.UserError("{} is not a valid path!".format(data_path))

    if os.path.isfile(data_path):
        data_files = [data_path]
    else:
        # Use the first directory reported by os.walk that has no sub-directories.
        dir_path = None
        for root, dirs, files in os.walk(data_path):
            if dirs == []:
                dir_path = root
                break

        if dir_path is None:
            # os.walk produced nothing; os.listdir(None) would silently list
            # the current working directory instead.
            raise exc.UserError("{} is not a valid path!".format(data_path))

        data_files = [
            os.path.join(dir_path, file_name)
            for file_name in os.listdir(dir_path)
            if _is_data_file(dir_path, file_name)
        ]

    # get_content_type returns one of the canonical constants.
    if parsed_content_type == CSV:
        for data_file_path in data_files:
            _validate_csv_format(data_file_path)
    elif parsed_content_type == LIBSVM:
        for data_file_path in data_files:
            _validate_libsvm_format(data_file_path)
    # PARQUET and RECORDIO_PROTOBUF: no op


def _get_csv_dmatrix_file_mode(files_path, csv_weights):
    """Get Data Matrix from CSV data in file mode.

    Infer the delimiter of data from first line of first data file.

    :param files_path: File path where CSV formatted training data resides, either directory or file
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix
    :raises exc.UserError: if the directory holds no files, the delimiter cannot be
        inferred, or XGBoost fails to load the data
    """
    if os.path.isfile(files_path):
        # Use the path as given; joining it onto itself breaks relative paths.
        sample_file_path = files_path
    else:
        regular_files = [f for f in os.listdir(files_path) if os.path.isfile(os.path.join(files_path, f))]
        if not regular_files:
            raise exc.UserError(
                "Failed to load csv data with exception:\nNo files found in {}".format(files_path))
        sample_file_path = os.path.join(files_path, regular_files[0])

    # errors='ignore' matches _validate_csv_format: only the delimiter is sniffed here.
    with open(sample_file_path, errors='ignore') as read_file:
        sample_csv_line = read_file.readline()
    delimiter = _get_csv_delimiter(sample_csv_line)

    try:
        if csv_weights == 1:
            dmatrix = xgb.DMatrix(
                '{}?format=csv&label_column=0&delimiter={}&weight_column=1'.format(files_path, delimiter))
        else:
            dmatrix = xgb.DMatrix('{}?format=csv&label_column=0&delimiter={}'.format(files_path, delimiter))
    except Exception as e:
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e))

    return dmatrix


def _get_csv_dmatrix_pipe_mode(pipe_path, csv_weights):
    """Get Data Matrix from CSV data in pipe mode.

    :param pipe_path: SageMaker pipe path where CSV formatted training data is piped
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :return: xgb.DMatrix or None
    :raises exc.UserError: if reading or converting the piped data fails
    """
    try:
        dataset = [mlio.SageMakerPipe(pipe_path)]
        reader = mlio.CsvReader(dataset=dataset,
                                batch_size=BATCH_SIZE,
                                header_row_index=None)

        # Check if data is present in reader
        if reader.peek_example() is None:
            return None

        examples = []
        for example in reader:
            # Write each feature (column) of example into a single numpy array
            tmp = [as_numpy(feature).squeeze() for feature in example]
            tmp = np.array(tmp)
            if len(tmp.shape) > 1:
                # Columns are written as rows, needs to be transposed
                tmp = tmp.T
            else:
                # If tmp is a 1-D array, it needs to be reshaped as a matrix
                tmp = np.reshape(tmp, (1, tmp.shape[0]))
            examples.append(tmp)

        data = np.vstack(examples)
        del examples

        if csv_weights == 1:
            # Column 0 is the label, column 1 the instance weight.
            dmatrix = xgb.DMatrix(data[:, 2:], label=data[:, 0], weight=data[:, 1])
        else:
            dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])

        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load csv data with exception:\n{}".format(e))


def get_csv_dmatrix(path, csv_weights, is_pipe=False):
    """Get Data Matrix from CSV data.

    :param path: Path where CSV formatted training data resides, either directory, file, or SageMaker pipe
    :param csv_weights: 1 if instance weights are in second column of CSV data; else 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    if is_pipe:
        return _get_csv_dmatrix_pipe_mode(path, csv_weights)
    else:
        return _get_csv_dmatrix_file_mode(path, csv_weights)


def get_libsvm_dmatrix(files_path, is_pipe=False):
    """Get DMatrix from libsvm file path.

    Pipe mode not currently supported for libsvm.

    :param files_path: File path where LIBSVM formatted training data resides, either directory or file
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix
    :raises exc.UserError: if is_pipe is set or XGBoost fails to load the data
    """
    if is_pipe:
        raise exc.UserError("Pipe mode not supported for LibSVM.")

    try:
        dmatrix = xgb.DMatrix(files_path)
    except Exception as e:
        raise exc.UserError("Failed to load libsvm data with exception:\n{}".format(e))

    return dmatrix


def _get_parquet_dmatrix_file_mode(files_path):
    """Get Data Matrix from parquet data in file mode.

    :param files_path: File path where parquet formatted training data resides, either directory or file
    :return: xgb.DMatrix
    :raises exc.UserError: if reading or converting the parquet data fails
    """
    try:
        table = pq.read_table(files_path)

        data = table.to_pandas()
        del table

        if isinstance(data, pd.DataFrame):
            # pyarrow.Table.to_pandas may produce NumPy array or pandas DataFrame
            data = data.to_numpy()

        # Column 0 is the label; the remaining columns are features.
        dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        del data

        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))


def _get_parquet_dmatrix_pipe_mode(pipe_path):
    """Get Data Matrix from parquet data in pipe mode.

    :param pipe_path: SageMaker pipe path where parquet formatted training data is piped
    :return: xgb.DMatrix or None
    :raises exc.UserError: if reading or converting the piped data fails
    """
    try:
        f = mlio.SageMakerPipe(pipe_path)
        examples = []

        with f.open_read() as strm:
            reader = mlio.ParquetRecordReader(strm)

            for record in reader:
                table = pq.read_table(as_arrow_file(record))
                array = table.to_pandas()
                if isinstance(array, pd.DataFrame):
                    array = array.to_numpy()
                examples.append(array)

        if not examples:
            return None

        data = np.vstack(examples)
        del examples

        # Column 0 is the label; the remaining columns are features.
        dmatrix = xgb.DMatrix(data[:, 1:], label=data[:, 0])
        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load parquet data with exception:\n{}".format(e))


def get_parquet_dmatrix(path, is_pipe=False):
    """Get Data Matrix from parquet data.

    :param path: Path where parquet formatted training data resides, either directory, file, or SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    """
    if is_pipe:
        return _get_parquet_dmatrix_pipe_mode(path)
    else:
        return _get_parquet_dmatrix_file_mode(path)


def get_recordio_protobuf_dmatrix(path, is_pipe=False):
    """Get Data Matrix from recordio-protobuf data.

    :param path: Path where recordio-protobuf formatted training data resides, either directory, file, or
        SageMaker pipe
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    :raises exc.UserError: if reading or converting the data fails
    """
    try:
        # Only the dataset differs between pipe and file mode.
        dataset = [mlio.SageMakerPipe(path)] if is_pipe else mlio.list_files(path)
        reader = mlio.RecordIOProtobufReader(dataset=dataset, batch_size=BATCH_SIZE)

        if reader.peek_example() is None:
            return None

        # recordio-protobuf tensor may be dense (use numpy) or sparse (use scipy)
        if type(reader.peek_example()['values']) is mlio.core.DenseTensor:
            to_matrix = as_numpy
            vstack = np.vstack
        else:
            to_matrix = to_coo_matrix
            vstack = scipy_vstack

        all_features = []
        all_labels = []
        for example in reader:
            all_features.append(to_matrix(example['values']))
            all_labels.append(as_numpy(example['label_values']))

        all_features = vstack(all_features)
        # axis=None flattens the labels into a single 1-D array.
        all_labels = np.concatenate(all_labels, axis=None)
        dmatrix = xgb.DMatrix(all_features, label=all_labels)
        return dmatrix

    except Exception as e:
        raise exc.UserError("Failed to load recordio-protobuf data with exception:\n{}".format(e))


def get_dmatrix(data_path, content_type, csv_weights=0, is_pipe=False):
    """Create Data Matrix from CSV, LIBSVM, parquet or recordio-protobuf data.

    Assumes that sanity validation for content type has been done.

    :param data_path: Either directory or file
    :param content_type: one of the canonical content types (compared case-insensitively)
    :param csv_weights: Only used if file_type is 'csv'.
                        1 if the instance weights are in the second column of csv file; otherwise, 0
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: xgb.DMatrix or None
    :raises exc.UserError: if the content type is not recognised, loading fails, or the
        loaded data has no labels
    """
    if not (os.path.exists(data_path) or (is_pipe and os.path.exists(data_path + '_0'))):
        return None

    # Default to the given path so files_path is always bound.
    files_path = data_path
    if not (os.path.isfile(data_path) or is_pipe):
        # Use the first directory reported by os.walk that has no sub-directories.
        for root, dirs, files in os.walk(data_path):
            if dirs == []:
                files_path = root
                break

    normalized_content_type = content_type.lower()
    if normalized_content_type == CSV:
        dmatrix = get_csv_dmatrix(files_path, csv_weights, is_pipe)
    elif normalized_content_type == LIBSVM:
        dmatrix = get_libsvm_dmatrix(files_path, is_pipe)
    elif normalized_content_type == PARQUET:
        dmatrix = get_parquet_dmatrix(files_path, is_pipe)
    elif normalized_content_type == RECORDIO_PROTOBUF:
        dmatrix = get_recordio_protobuf_dmatrix(files_path, is_pipe)
    else:
        # Report the bad value instead of failing on an unbound local below.
        raise exc.UserError(_get_invalid_content_type_error_msg(content_type))

    if dmatrix is not None and dmatrix.get_label().size == 0:
        raise exc.UserError(
            "Got input data without labels. Please check the input data set. "
            "If training job is running on multiple instances, please switch "
            "to using single instance if number of records in the data set "
            "is less than number of workers (16 * number of instance) in the cluster.")

    return dmatrix


def get_size(data_path, is_pipe=False):
    """Return size of data files at dir_path.

    :param data_path: Either directory or file
    :param is_pipe: Boolean to indicate if data is being read in pipe mode
    :return: Size of data or 1 if sagemaker pipe found
    :raises exc.UserError: if a file whose name starts with '.' is found under a directory
    """
    if is_pipe and os.path.exists(data_path + '_0'):
        logging.info('Pipe path {} found.'.format(data_path))
        return 1

    if not os.path.exists(data_path):
        logging.info('Path {} does not exist!'.format(data_path))
        return 0

    if os.path.isfile(data_path):
        return os.path.getsize(data_path)

    total_size = 0
    for root, dirs, files in os.walk(data_path):
        for current_file in files:
            if current_file.startswith('.'):
                raise exc.UserError("Hidden file found in the data path! Remove that before training.")
            file_path = os.path.join(root, current_file)
            total_size += os.path.getsize(file_path)
    return total_size