python source code of train

'''
Copyright (c) 2018 Hai Pham, Rutgers University
http://www.cs.rutgers.edu/~hxp1/

This code is free to use for academic/research purpose.

'''

import numpy as np
import cntk as C
import sys
import argparse
from LayerUtils import conv_bn_lrelu, bi_recurrence, flatten
from SysUtils import get_current_time_string, is_Win32, make_dir, ArgParser

C.cntk_py.set_fixed_random_seed(1)

F_DIM = 128
T_DIM = 32

input_dim_model = (1, F_DIM, T_DIM)
input_dim = F_DIM*T_DIM
label_dim = 46

#--------------------------------------
# the audio CNN encoder subnetwork
#--------------------------------------
def audio_encoder(input):
    #---------------------------------------------
    # F-convolution
    #---------------------------------------------
    # 1x128x32
    h = conv_bn_lrelu(input, filter_shape=(5,1), num_filters=32, strides=(2,1), name="conv1")
    # 32x64x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=64, strides=(2,1), name="conv2")
    # 64x32x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=128, strides=(2,1), name="conv3")
    # 128x16x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=256, strides=(2,1), name="conv4")
    # 256x8x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=512, strides=(2,1), name="conv5")
    # 512x4x32
    #---------------------------------------------
    # T-convolution
    #---------------------------------------------
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=512, strides=(1,2), name="t_conv1")
    # 512 x 4 x 16
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=512, strides=(1,2), name="t_conv2")
    # 512 x 4 x 8
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=512, strides=(1,2), name="t_conv3")
    # 512 x 4 x 4
    return h

def audio_encoder_2(input):
    #---------------------------------------------
    # F-convolution
    #---------------------------------------------
    # 1x128x32
    h = conv_bn_lrelu(input, filter_shape=(5,1), num_filters=64, strides=(2,1), name="conv1")
    # 64x64x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=128, strides=(2,1), name="conv2")
    # 128x32x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=256, strides=(2,1), name="conv3")
    # 256x16x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=512, strides=(2,1), name="conv4")
    # 512x8x32
    h = conv_bn_lrelu(h, filter_shape=(3,1), num_filters=1024, strides=(2,1), name="conv5")
    # 1024x4x32
    #---------------------------------------------
    # T-convolution
    #---------------------------------------------
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=1024, strides=(1,2), name="t_conv1")
    # 1024 x 4 x 16
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=1024, strides=(1,2), name="t_conv2")
    # 1024 x 4 x 8
    h = conv_bn_lrelu(h, filter_shape=(1,3), num_filters=1024, strides=(1,2), name="t_conv3")
    # 1024 x 4 x 4
    return h

def audio_encoder_3(input, model_file, cloning=False):
    # Load and freeze pre-trained encoder
    last_layer_name = "t_conv3"
    model = C.load_model(model_file)
    input_node = model.find_by_name("input")
    last_conv = model.find_by_name(last_layer_name)
    if not last_conv:
        raise ValueError("the layer does not exist")
    h = C.combine([last_conv.owner]).clone(C.CloneMethod.clone if cloning else C.CloneMethod.freeze, {input_node: input})
    return h

def create_model(input, net_type="gru", encoder_type=1, model_file=None, e3cloning=False):
    if encoder_type == 1:
        h = audio_encoder(input)
        if net_type.lower() is not "cnn":
            h = flatten(h)
    elif encoder_type == 2:
        h = audio_encoder_2(input)
        # pooling
        h = C.layers.GlobalAveragePooling(name="avgpool")(h)
        h = C.squeeze(h)
    elif encoder_type == 3:
        h = audio_encoder_3(input, model_file, e3cloning)
        if net_type.lower() is not "cnn":
            h = flatten(h)
    else:
        raise ValueError("encoder type {:d} not supported".format(encoder_type))

    if net_type.lower() == "cnn":
        h = C.layers.Dense(1024, init=C.he_normal(), activation=C.tanh)(h)
    elif net_type.lower() == "gru":
        h = C.layers.Recurrence(step_function=C.layers.GRU(256), go_backwards=False, name="rnn")(h)
    elif net_type.lower() == "lstm":
        h = C.layers.Recurrence(step_function=C.layers.LSTM(256), go_backwards=False, name="rnn")(h)
    elif net_type.lower() == "bigru":
        # bi-directional GRU
        h = bi_recurrence(h, C.layers.GRU(128), C.layers.GRU(128), name="bigru")
    elif net_type.lower() == "bilstm":
        # bi-directional LSTM
        h = bi_recurrence(h, C.layers.LSTM(128), C.layers.LSTM(128), name="bilstm")
    h = C.layers.Dropout(0.2)(h)
    # output
    y = C.layers.Dense(label_dim, activation=C.sigmoid, init=C.he_normal(), name="output")(h)
    return y

#--------------------------------------
# loss functions
#--------------------------------------
def l2_loss(output, target):
    return C.reduce_mean(C.square(output - target))

def std_normalized_l2_loss(output, target):
    std_inv = np.array([6.6864805402, 5.2904440280, 3.7165409939, 4.1421640454, 8.1537399389, 7.0312877415, 2.6712380967,
                        2.6372177876, 8.4253649884, 6.7482162880, 9.0849960354, 10.2624412692, 3.1325531319, 3.1091179819,
                        2.7337937590, 2.7336441031, 4.3542467871, 5.4896293687, 6.2003761588, 3.1290341469, 5.7677042738,
                        11.5460919611, 9.9926451700, 5.4259818848, 20.5060642486, 4.7692101480, 3.1681517575, 3.8582905289,
                        3.4222250436, 4.6828286809, 3.0070785113, 2.8936539301, 4.0649030157, 25.3068458731, 6.0030623160,
                        3.1151977458, 7.7773542649, 6.2057372469, 9.9494258692, 4.6865422850, 5.3300697628, 2.7722027974,
                        4.0658663003, 18.1101618617, 3.5390113731, 2.7794520068], dtype=np.float32)
    weights = C.constant(value=std_inv) #.reshape((1, label_dim)))
    dif = output - target
    ret = C.reduce_mean(C.square(C.element_times(dif, weights)))
    return ret

def l1_reg_loss(output):
    # don't need C.abs(output), because output is already non-negative
    # use abs() if your desired output could be negative
    return C.reduce_mean(output)


#----------------------------------------
# create computational graph and learner
#----------------------------------------
def build_graph(config):
    assert(config['type'] in ["cnn", "lstm", "gru", "bilstm", "bigru"])
    if config["type"] == "cnn":
        # static model
        features = C.input_variable(input_dim_model, name="input")
        labels = C.input_variable(label_dim, name="label")
    else:
        # recurrent model
        features = C.sequence.input_variable(input_dim_model, name="input")
        labels = C.sequence.input_variable(label_dim, name="label")
    netoutput = create_model(features, config["type"], config["encoder"], config["pretrained_model"], config["e3_clone"])

    if config["l2_loss_type"] == 1:
        print("Use standard l2 loss")
        ce = l2_loss(netoutput, labels)
    elif config["l2_loss_type"] == 2:
        print("Use variance normalized l2 loss")
        ce = std_normalized_l2_loss(netoutput, labels)
    else:
        raise ValueError("Unsupported loss type")

    # enforce sparsity output
    if config["l1_reg"] > sys.float_info.epsilon:
        ce = ce + config["l1_reg"] * l1_reg_loss(netoutput)
    
    # performance metrics
    pe = C.squared_error(netoutput, labels)

    if config["constlr"]:
        lr_schedule = config["lr"]
    else:
        if config["lr_list"] is not None:
            print("use learning rate schedule from file")
            lr_schedule = config["lr_list"]
        else:
            if config["type"] != "cnn": # default learning rate for recurrent model
                lr_schedule = [0.005] + [0.0025]*2 + [0.001]*4 + [0.0005]*8 + [0.00025]*16 + [0.0001]*1000 + [0.00005]*1000 + [0.000025]
            elif config["lr_schedule"] == 1: # learning rate for CNN
                lr_schedule = [0.005] + [0.0025]*2 + [0.00125]*3 + [0.0005]*4 + [0.00025]*5 + [0.0001]
            elif config["lr_schedule"] == 2:
                lr_schedule = [0.005] + [0.0025]*2 + [0.00125]*3 + [0.0005]*4 + [0.00025]*5 + [0.0001]*100 + [0.00005]*50 + [0.000025]*50 + [0.00001]
            else:
                raise ValueError("unknown learning rate")
    learning_rate = C.learning_parameter_schedule_per_sample(lr_schedule, epoch_size=config["epoch_size"])
    momentum_schedule = C.momentum_schedule(0.9, minibatch_size=300)
    
    learner = C.adam(netoutput.parameters, lr=learning_rate, momentum=momentum_schedule,
                        l2_regularization_weight=0.0001,
                        gradient_clipping_threshold_per_sample=3.0, gradient_clipping_with_truncation=True)
    trainer = C.Trainer(netoutput, (ce, pe), [learner])

    return features, labels, netoutput, trainer


#-----------------------------------
# training procedure
#-----------------------------------

# create reader
def create_reader(path, is_training=True):
    return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(
        features = C.io.StreamDef(field='features', shape=input_dim, is_sparse=False),
        labels = C.io.StreamDef(field='labels', shape=label_dim, is_sparse=False)
    )), randomize=is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)


def train(config):
    features, labels, netoutput, trainer = build_graph(config)

    C.logging.log_number_of_parameters(netoutput) ; print()

    #training config
    epoch_size = config["epoch_size"]
    
    progress_printer = C.logging.ProgressPrinter(freq=200, tag='Training') # more detailed logging

    minibatch_size = config["minibatch_size"]
    max_epochs = config["num_epochs"]
    model_file = config["modelfile"]
    log_file = config["logfile"]

    reader = create_reader(config["datafile"])
    input_map = {features: reader.streams.features, labels: reader.streams.labels}
    
    t = 0
    for epoch in range(max_epochs):         # loop over epochs
        epoch_end = (epoch+1) * epoch_size
        while t < epoch_end:               # loop over minibatches on the epoch
            data = reader.next_minibatch(min(minibatch_size, epoch_end-t), input_map=input_map) # fetch minibatch
            trainer.train_minibatch(data)                                   # update model with it
            t += trainer.previous_minibatch_sample_count                    # count samples processed so far
            progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
        loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)
        with open(log_file, 'a') as csvfile:
            csvfile.write("Epoch:  "+ str(epoch) + " Loss: " + str(loss) + " Metric: " + str(metric) + "\n")
    # save
    netoutput.save(model_file)
    return features, labels, netoutput, trainer


def evaluate(datafile, features, labels, trainer, log_file):
    progress_printer = C.logging.ProgressPrinter(tag="Evaluation", num_epochs=0)
    minibatch_size = 200

    reader = create_reader(datafile, is_training=False)
    input_map = {features: reader.streams.features, labels: reader.streams.labels}

    while True:
        data = reader.next_minibatch(minibatch_size, input_map=input_map)
        if not data:                                 # until we hit the end
            break
        metric = trainer.test_minibatch(data)
        progress_printer.update(0, data[labels].num_samples, metric) # log progress
    loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)
    with open(log_file, 'a') as csvfile:
        csvfile.write("\n\n --- Test error: " + str(metric) + "\n")


def process_args():
    class ThisArgParser(ArgParser):
        def __init__(self):
            super(ThisArgParser, self).__init__()
            self.config["minibatch_size"] = 300
            self.config["lr_schedule"] = 1
            self.config["type"] = "gru"
            self.config["encoder"] = 1
            self.config["pretrained_model"] = None
            self.config["e3_clone"] = False

        def prepare(self):
            super(ThisArgParser, self).prepare()
            self.parser.add_argument("--gru", action="store_true")
            self.parser.add_argument("--lstm", action="store_true")
            self.parser.add_argument("--cnn", action="store_true")
            self.parser.add_argument("--bigru", action="store_true")
            self.parser.add_argument("--bilstm", action="store_true")
            self.parser.add_argument("--l2type", type=int, default=2)
            self.parser.add_argument("--l1reg", type=float, default=0.1)
            self.parser.add_argument("--lrschd", type=int, default=self.config["lr_schedule"])
            self.parser.add_argument("--encoder", type=int, default=self.config["encoder"])
            self.parser.add_argument("--pretrained_model", type=str)
            self.parser.add_argument("--e3clone", action="store_true")
            

        def parse(self):
            super(ThisArgParser, self).parse()
            self.config["lr_schedule"] = self.args.lrschd
            self.config["l2_loss_type"] = self.args.l2type
            self.config["l1_reg"] = self.args.l1reg
            self.config["lr_schedule"] = self.args.lrschd
            self.config["encoder"] = self.args.encoder
            if self.args.pretrained_model:
                self.config["pretrained_model"] = self.args.pretrained_model
    
            if self.args.cnn:
                self.config["type"] = "cnn"
            if self.args.lstm:
                self.config["type"] = "lstm"
            if self.args.gru:
                self.config["type"] = "gru"
            if self.args.bigru:
                self.config["type"] = "bigru"
            if self.args.bilstm:
                self.config["type"] = "bilstm"

            if self.args.e3clone:
                self.config["e3_clone"] = True


    parser = ThisArgParser()
    parser.prepare()
    parser.parse()
    config = parser.config
    return config


def main():
    config = process_args()
    print("training type: {:s}".format(config["type"]))
    print("max epoch: {:d}".format(config["num_epochs"]))

    current_time = get_current_time_string()

    if is_Win32():
        data_dir = "H:/speech_data"
    else:
        data_dir = "/home/hxp1/speech_data"

    # set proper paths
    if config["type"] == "cnn":
        train_file = data_dir + "/audio_exp_train_noseq.ctf"
        test_file = data_dir + "/audio_exp_test_noseq.ctf"
    else:
        train_file = data_dir + "/audio_exp_train.ctf"
        test_file = data_dir + "/audio_exp_test.ctf"

    model_dir = data_dir + "/model_audio2exp_" + current_time
    make_dir(model_dir)

    model_filename = model_dir + "/model_audio2exp_" + current_time
    model_file = model_filename + ".dnn"
    log_file = model_dir + "/training_log.txt"
    
    if config["encoder"] == 3:
        if not config["pretrained_model"]:
            config["pretrained_model"] = data_dir + "/model_audio2exp_2018-07-19-07-16.dnn"
    else:
        config["pretrained_model"] = None

    config["datafile"] = train_file
    config["modelfile"] = model_file
    config["logfile"] = log_file

    features, labels, netoutput, trainer = train(config)
    print ("Training done!")
    # test
    evaluate(test_file, features, labels, trainer, log_file)
    print ("Testing done!")

# script calling
if __name__=='__main__':
    main()