"""Copyright 2018 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ from __future__ import division from __future__ import print_function import os import tensorflow as tf import argparse import numpy as np from sklearn.model_selection import train_test_split import sklearn.datasets from sklearn.preprocessing import OrdinalEncoder # Needed in order to speed up the activation generator, so we don't have to infer types from sklearn # when loading concept files. This is also because the sklearn dataset does not contain the # typing information when loaded. kFloatIndices = [24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40] kIntIndices = [ 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 31, 32 ] kBytesIndices = [1, 2, 3] def make_keras_model(categorical_map): """Simple feedforward model. Uses embedding representation for categorical variables. """ n_features = 41 n_labels = 23 # Make all placeholders inputs = [] models = [] input_layer = tf.keras.layers.Input(shape=(n_features,), name='input') inputs.append(input_layer) # Splits the input vector into multiple variables deconcat = tf.split(input_layer, [1 for i in range(n_features)], 1) for index in range(n_features): # For categorical variables, we create embedding layers if index in categorical_map.keys(): vocab_size = categorical_map[index] inpt = deconcat[index] inputs.append(inpt) embed = tf.keras.layers.Embedding(vocab_size, 200, \ trainable=True, embeddings_initializer=tf.initializers.random_normal())(inpt) embed_reshaped = tf.keras.layers.Reshape(target_shape=(200,))(embed) models.append(embed_reshaped) else: # Else, create a simple input for numerical features inpt = deconcat[index] inputs.append(inpt) models.append(inpt) # Concatenate them into a single vector merge_models = tf.keras.layers.concatenate(models) # Plug them into the DNN net = tf.keras.layers.Dense(1000)(merge_models) net = tf.keras.layers.BatchNormalization()(net) net = tf.keras.layers.Dense(256)(net) net = tf.keras.layers.BatchNormalization()(net) pred = tf.keras.layers.Dense(n_labels, activation='sigmoid')(net) model_full = tf.keras.models.Model(inputs=input_layer, \ outputs=pred) model_full.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \ metrics=['accuracy'], optimizer='adam') return model_full def encode_variables(data): """ Encodes variables using simple ordinal encoding.""" data2 = np.copy(data) encoder = OrdinalEncoder() categorical_indices = kBytesIndices data2[:, categorical_indices] = encoder.fit_transform(data2[:, categorical_indices]) return data2 def encoding_function(x, label_path): """ Matches a label's name with the index from the labels.txt file""" # Open label file and get indices with tf.io.gfile.GFile(label_path, 'r') as f: labels = [l.strip().split(' ')[0] for l in f.readlines()] return labels.index(x.decode('utf-8')) def encode_labels(labels, label_path): """Encoding labels according to order in the labels file, so it is compliant with ModelWrapper""" labels2 = np.copy(labels) encoded = 
def create_categorical_map(data):
  """Maps each categorical column index to its number of unique values."""
  categorical_indices = kBytesIndices
  # Takes the form {index: number of unique values}.
  categorical_map = {}
  for index in categorical_indices:
    categorical_map[index] = np.unique(data[:, [index]]).shape[0]
  return categorical_map


def prepare_dataset(labels_path):
  """Fetches data from sklearn.

  Encodes categorical variables with ordinal encoding.
  """
  data, labels = sklearn.datasets.fetch_kddcup99(return_X_y=True)
  encoded_data = encode_variables(data)
  encoded_labels = encode_labels(labels, labels_path)
  return encoded_data, encoded_labels


def train_and_save_model(model_path, labels_path):
  """Trains a simple feedforward model for the KDD99 dataset."""
  # Prepare the dataset and split it.
  data, labels = prepare_dataset(labels_path)
  train_data, test_data, train_labels, test_labels = train_test_split(
      data, labels, test_size=0.2)

  # Create the categorical map for the embedding layers.
  categorical_map = create_categorical_map(data)
  model = make_keras_model(categorical_map)
  model.summary()

  model.fit(
      train_data,
      train_labels,
      validation_data=(test_data, test_labels),
      epochs=4,
      batch_size=64)
  model.save(model_path)

  # Sanity-check on a small subset of predictions.
  predictions = model.predict(test_data[:10])
  print(predictions)


if __name__ == '__main__':
  parser = argparse.ArgumentParser(description='Create a KDD99 model.')
  parser.add_argument(
      '--path_to_save_model',
      type=str,
      help='Directory where the trained model will be saved.')
  parser.add_argument(
      '--model_name', type=str, help='Name for the model. Example: model.h5')
  parser.add_argument(
      '--labels_path',
      type=str,
      help='Path to the file containing labels for the model.')
  args = parser.parse_args()

  model_path = os.path.join(args.path_to_save_model, args.model_name)

  print('#################INPUTS#################')
  print('Saving model at: ' + model_path)
  print('Using labels from: ' + args.labels_path)
  print('########################################')

  train_and_save_model(model_path, args.labels_path)
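# Example invocation (script name and paths are illustrative, not fixed by
# this file):
#
#   python kdd99_model.py \
#     --path_to_save_model=/tmp/kdd99 \
#     --model_name=model.h5 \
#     --labels_path=/tmp/kdd99/labels.txt
#
# The labels file is expected to list one label per line, with the label name
# first on each line, in the order the ModelWrapper uses for its output
# classes.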