python source code of exp_epic

# !/usr/bin/env python
# -*- coding: UTF-8 -*-

########################################################################
# GNU General Public License v3.0
# GNU GPLv3
# Copyright (c) 2019, Noureldien Hussein
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
########################################################################

"""
This experiment is for EPIC-Kitchens dataset.
"""

import sys
import os
import random
import time
import datetime
import threading
import numpy as np

import tensorflow as tf
import keras.backend as K
from keras.layers import Input, BatchNormalization
from keras.layers import Dense, LeakyReLU, Dropout, Activation, Conv3D
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras.utils import multi_gpu_utils

from nets.keras_layers import ReshapeLayer, TransposeLayer, DepthwiseDilatedConv1DLayer, DepthwiseConv1DLayer, MaxLayer, MeanLayer, NetVLAD
from nets.keras_layers import DepthwiseDenseLayer, ConvOverSpaceLayer

from nets.i3d_keras_epic_kitchens import Inception_Inflated3d_Backbone
from datasets import ds_epic_kitchens
from nets import videograph, timeception
from core import utils, keras_utils, metrics, image_utils
from core.utils import Path as Pth

# region Train

def train_model_on_pickled_features():
    """
    Train model.
    """

    annotation_type = 'noun'
    annot_path = Pth('EPIC-Kitchens/annotation/annot_video_level_many_shots.pkl')
    (y_tr, y_te), n_classes = __load_annotation(annot_path, annotation_type)

    model_type = 'i3d_rgb'
    feature_type = 'mixed_5c'
    n_nodes = 128
    n_timesteps = 64
    n_frames_per_segment = 8
    n_frames_per_video = n_timesteps * n_frames_per_segment
    batch_size_tr = 20
    batch_size_te = 30
    n_epochs = 500
    epoch_offset = 0
    model_name = 'classifier_%s' % (utils.timestamp())
    model_root_path = Pth('EPIC-Kitchens/models')

    features_path = Pth('EPIC-Kitchens/features/features_i3d_mixed_5c_%d_frames.h5', (n_frames_per_video,))
    nodes_path = Pth('EPIC-Kitchens/features_centroids/features_random_%d.pkl', (n_nodes,))
    n_channels, side_dim = utils.get_model_feat_maps_info(model_type, feature_type)
    input_shape = (None, n_timesteps, side_dim, side_dim, n_channels)
    nodes = utils.pkl_load(nodes_path)

    print ('--- start time')
    print (datetime.datetime.now())

    # building the model
    print('... building model %s' % (model_name))
    t1 = time.time()
    model = __load_model_videograph(nodes, n_classes, input_shape)
    t2 = time.time()
    duration = t2 - t1
    print (model.summary(line_length=130, positions=None, print_fn=None))
    print ('... model built, duration (sec): %d' % (duration))

    # load data
    print ('... loading data: %s' % (features_path))
    t1 = time.time()
    # features are extracting using datasets.Epic_Kitchens.i3d_keras_epic_kitchens()
    # we use out-of-box i3d (pre-trained on kinetics, NOT fine-tuned on epic-kitchens) with last conv feature 7*7*1024 'mixed_5c'
    # to get a better performance, you need to write code to randomly sample new frames and extract their features every new epoch
    # please use this function to random sampling, instead of uniform sampling: Epic_Kitchens.__random_sample_frames_per_video_for_i3d()
    # then extract their features, as done in: Epic_Kitchens._901_extract_features_i3d()
    # then train on the extracted features. Please do so in every epoch. It's computationally heavy, but you cannot avoid random sampling to get better results.
    # Even better results if you replace I3D with a 2D/3D CNN that's previously fine-tuned on Epic-Kitchens
    (x_tr, x_te) = utils.h5_load_multi(features_path, ['x_tr', 'x_te'])
    t2 = time.time()

    duration = t2 - t1
    print ('... data loaded: %d' % (duration))

    n_tr = len(x_tr)
    n_te = len(x_te)
    n_batch_tr = utils.calc_num_batches(n_tr, batch_size_tr)
    n_batch_te = utils.calc_num_batches(n_te, batch_size_te)
    print ('... [tr]: n, n_batch, batch_size: %d, %d, %d' % (n_tr, n_batch_tr, batch_size_tr))
    print ('... [te]: n, n_batch, batch_size: %d, %d, %d' % (n_te, n_batch_te, batch_size_te))
    print (x_tr.shape)
    print (x_te.shape)
    print (y_tr.shape)
    print (y_te.shape)

    save_callback = keras_utils.ModelSaveCallback(model, model_name, epoch_offset, model_root_path)
    score_callback = keras_utils.MapScoreCallback(model, None, None, x_te, y_te, batch_size_te, n_classes)
    model_callbacks = [save_callback, score_callback]
    model.fit(x_tr, y_tr, epochs=n_epochs, batch_size=batch_size_tr, validation_split=0.0, validation_data=(x_te, y_te), shuffle=True, callbacks=model_callbacks, verbose=2)

    print ('--- finish time')
    print (datetime.datetime.now())

def train_model_on_video_frames():
    """
    When training model on images, the model won't fit in gpu.
    If trained on several gpus, the batch size will get so small that BatchNorm is not applicable anymore.
    The solution is to use first 3 gpus to extract features from the backbone model (i.e. bottom part, for example: I3D or ResNet),
    and to use the 4th gpu to train our model (i.e. top part) on these features.
    """

    # this is to allow for small cpu utilization by numpy
    # has to be set before importing numpy
    # os.environ["MKL_NUM_THREADS"] = "1"
    # os.environ["NUMEXPR_NUM_THREADS"] = "1"
    # os.environ["OMP_NUM_THREADS"] = "1"

    # if training from scratch
    resume_epoch_num = 0
    is_resume_training = False
    resume_timestamp = ''

    # get the model part to run
    timestamp = utils.timestamp() if not is_resume_training else resume_timestamp
    starting_epoch_num = 0 if not is_resume_training else resume_epoch_num
    n_epochs = 500

    # for i3d-keras
    n_centroids = 128
    n_frames_bottom = 512
    n_frames_top = 64
    n_instances = 3
    model_bottom = __start_train_model_on_video_frames_backbone_i3d_keras
    model_top = __start_train_model_on_video_frames_videograph

    # also, create the files where the training state will be stored
    global TRAIN_STATE
    TRAIN_STATE = TrainingState()

    # bottom part, instance 1
    args_bottom_1 = (n_epochs, starting_epoch_num, n_frames_bottom, n_instances, 1)
    thread_bottom_1 = threading.Thread(target=model_bottom, args=args_bottom_1)

    # bottom part, instance 2
    args_bottom_2 = (n_epochs, starting_epoch_num, n_frames_bottom, n_instances, 2)
    thread_bottom_2 = threading.Thread(target=model_bottom, args=args_bottom_2)

    # bottom part, instance 3
    args_bottom_3 = (n_epochs, starting_epoch_num, n_frames_bottom, n_instances, 3)
    thread_bottom_3 = threading.Thread(target=model_bottom, args=args_bottom_3)

    # top part
    args_top = (n_epochs, n_frames_top, n_centroids, timestamp, is_resume_training, starting_epoch_num)
    thread_top = threading.Thread(target=model_top, args=args_top)

    thread_top.start()
    thread_bottom_1.start()
    thread_bottom_2.start()
    thread_bottom_3.start()

    thread_top.join()
    thread_bottom_1.join()
    thread_bottom_2.join()
    thread_bottom_3.join()

def __start_train_model_on_video_frames_videograph(n_epochs, n_timesteps, n_centroids, timestamp, is_resume_training, start_epoch_num):
    # configure the gpu to be used by keras
    gpu_core_id = 3
    device_id = '/gpu:%d' % gpu_core_id

    # with graph.as_default():
    # with session.as_default():

    graph = tf.Graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    sess = tf.Session(config=config, graph=graph)
    K.set_session(sess)
    with sess:
        with tf.device(device_id):
            __train_model_on_video_frames_videograph(n_epochs, n_timesteps, n_centroids, timestamp, is_resume_training, start_epoch_num)

def __start_train_model_on_video_frames_backbone_i3d_keras(n_epochs, starting_epoch_num, n_frames_per_video, n_instances, instance_num):
    # configure the gpu to be used by keras
    gpu_core_id = instance_num - 1
    device_id = '/gpu:%d' % gpu_core_id

    assert instance_num in [1, 2, 3], 'Sorry, wrong instance number: %d' % (instance_num)

    graph = tf.Graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    sess = tf.Session(config=config, graph=graph)
    K.set_session(sess)
    with sess:
        with tf.device(device_id):
            __train_model_on_video_frames_backbone_i3d_keras(n_epochs, starting_epoch_num, n_frames_per_video, n_instances, instance_num)

def __train_model_on_video_frames_videograph(n_epochs, n_timesteps, n_centroids, timestamp, is_resume_training, start_epoch_num):
    """
    Train model of 3rd gpu, train it on features extracted on first 2 gpus.
    """

    global TRAIN_STATE
    assert (start_epoch_num > 1 and is_resume_training) or (start_epoch_num == 0 and not is_resume_training), 'sorry, either provide resume_epoch_num or set the model as not resuming with resume_epoch_num = 0'

    n_frames_per_segment = 8
    n_frames_per_video = n_frames_per_segment * n_timesteps

    # locations
    model_name = 'classifier_from_video_frames_%s' % (timestamp)
    resume_model_json_path = Pth('EPIC-Kitchens/models/%s/%03d.json', (model_name, start_epoch_num))
    resume_model_weights_path = Pth('EPIC-Kitchens/models/%s/%03d.pkl', (model_name, start_epoch_num))

    frames_root_path = Pth('EPIC-Kitchens/frames_rgb_resized/train')
    features_te_path = Pth('EPIC-Kitchens/features/features_i3d_mixed_5c_%d_frames_te.h5', (n_frames_per_video,))
    centroids_path = Pth('EPIC-Kitchens/features_centroid/features_random_%d_centroids.pkl', (n_centroids,))
    centroids_path = Pth('EPIC-Kitchens/features_centroid/features_sobol_%d_centroids.pkl', (n_centroids,))
    video_names_splits_path = Pth('EPIC-Kitchens/annotation/video_names_splits.pkl')
    frame_relative_pathes_dict_path = Pth('EPIC-Kitchens/annotation/frame_relative_pathes_dict_tr.pkl')
    annot_path = Pth('EPIC-Kitchens/annotation/annot_video_level_many_shots.pkl')

    is_save_centroids = False
    is_save_model = True
    verbose = False

    n_gpus = 1
    n_classes = ds_epic_kitchens.N_NOUNS_MANY_SHOT

    batch_size_tr = 20
    batch_size_te = 40
    n_threads_te = 16

    n_feat_maps = 1024
    featmap_side_dim = 7
    input_shape = (None, n_timesteps, featmap_side_dim, featmap_side_dim, n_feat_maps)

    # load centroids
    centroids = utils.pkl_load(centroids_path)

    print ('--- start time')
    print (datetime.datetime.now())

    # building the model
    print('... building model %s' % (model_name))
    t1 = time.time()

    # load new or previous model
    if is_resume_training:
        custom_objects = {'DepthwiseDilatedConv1DLayer': DepthwiseDilatedConv1DLayer,
                          'DepthwiseConv1DLayer': DepthwiseConv1DLayer,
                          'DepthwiseDenseLayer': DepthwiseDenseLayer,
                          'ConvOverSpaceLayer': ConvOverSpaceLayer,
                          'TransposeLayer': TransposeLayer,
                          'ReshapeLayer': ReshapeLayer,
                          'MeanLayer': MeanLayer,
                          'MaxLayer': MaxLayer}
        model = keras_utils.load_model(resume_model_json_path, resume_model_weights_path, custom_objects=custom_objects, is_compile=False)
        model, _ = __compile_model_for_finetuning(model, n_gpus)
    else:
        model, _ = __load_model_action_vlad(n_classes, input_shape, n_gpus=n_gpus, is_load_weights=False, weight_path='')
        model, _ = __load_model_videograph(centroids, n_classes, input_shape)
        # model, _ = __load_model_timeception(n_classes, input_shape, n_gpus=n_gpus, is_load_weights=False, weight_path='')
        # model, _ = __load_model_mlp_classifier_transformer_centroids_with_graph_embedding(centroids, n_classes, input_shape, n_gpus=n_gpus, is_load_weights=False, weight_path='')

    # dry run to get the model loaded in gpu
    dummy_feature = np.zeros(tuple([batch_size_tr] + list(input_shape[1:])), dtype=np.float32)
    model.predict(dummy_feature)

    t2 = time.time()
    duration = t2 - t1
    print (model.summary(line_length=120, positions=None, print_fn=None))
    print ('... model built, duration (sec): %d' % (duration))

    # load data
    print ('... loading data')
    t1 = time.time()

    (y_tr, _, _, y_te, _, _) = utils.pkl_load(annot_path)
    (video_names_tr, video_names_te) = utils.pkl_load(video_names_splits_path)
    frame_relative_pathes_dict = utils.pkl_load(frame_relative_pathes_dict_path)
    x_te = utils.h5_load(features_te_path)
    print ('... centroids: %s' % (centroids_path))

    n_tr = len(video_names_tr)
    n_te = len(video_names_te)

    # set list of video names and ground truth
    TRAIN_STATE.video_names_tr = video_names_tr
    TRAIN_STATE.class_nums_tr = y_tr

    # sample new frames
    sampled_video_frames_dict = ds_epic_kitchens.__random_sample_frames_per_video_for_i3d(TRAIN_STATE.video_names_tr, frames_root_path, frame_relative_pathes_dict, n_frames_per_segment, n_frames_per_video)
    TRAIN_STATE.video_frames_dict_tr = sampled_video_frames_dict

    del video_names_tr
    del video_names_te
    del y_tr

    n_batch_tr = keras_utils.calc_num_batches(n_tr, batch_size_tr)
    n_batch_te = keras_utils.calc_num_batches(n_te, batch_size_te)
    t2 = time.time()
    duration = t2 - t1
    print ('... data loaded: %d' % duration)
    print ('... [tr]: n, n_batch, batch_size, n_gpus: %d, %d, %d, %d' % (n_tr, n_batch_tr, batch_size_tr, n_gpus))
    print ('... [te]: n, n_batch, batch_size, n_gpus: %d, %d, %d, %d' % (n_te, n_batch_te, batch_size_te, n_gpus))

    # make model top ready
    TRAIN_STATE.model_top_ready = True
    sys.stdout.write('\n')
    for idx_epoch in range(start_epoch_num, n_epochs):

        epoch_num = idx_epoch + 1
        # wait until bottom parts start
        while TRAIN_STATE.model_bottom_1_epoch_start < epoch_num or TRAIN_STATE.model_bottom_2_epoch_start < epoch_num or TRAIN_STATE.model_bottom_3_epoch_start < epoch_num:
            threading._sleep(2.0)
            if verbose:
                print ('... top part is waiting for bottom part to start extracting features for epoch %d' % (epoch_num))

        # epoch started, update counter
        TRAIN_STATE.model_top_epoch_start = epoch_num

        # video names are obtained from the state at the beginning of each epoch
        video_names_tr = TRAIN_STATE.video_names_tr
        y_tr = TRAIN_STATE.class_nums_tr

        loss_tr = 0.0
        loss_tr_b = 0.0
        tt1 = time.time()
        waiting_duration_total = 0

        # loop and train
        for idx_batch_tr in range(n_batch_tr):

            batch_num_tr = idx_batch_tr + 1

            start_idx_batch = idx_batch_tr * batch_size_tr
            stop_idx_batch = (idx_batch_tr + 1) * batch_size_tr
            video_names_tr_batch = video_names_tr[start_idx_batch:stop_idx_batch]
            y_tr_b = y_tr[start_idx_batch:stop_idx_batch]
            is_missing_features = True

            # wait until the festures are loaded
            t1 = time.time()
            while is_missing_features:
                is_missing_features = False
                for _v_name in video_names_tr_batch:
                    if _v_name not in TRAIN_STATE.feats_dict_tr_1 and _v_name not in TRAIN_STATE.feats_dict_tr_2 and _v_name not in TRAIN_STATE.feats_dict_tr_3:
                        is_missing_features = True
                        break
                if is_missing_features:
                    threading._sleep(1.0)
                    if verbose:
                        print ('... model top is waiting for missing videos: %s' % _v_name)
            t2 = time.time()

            x_tr_b = __get_features_from_dictionaries(video_names_tr_batch)
            x_tr_b = np.array(x_tr_b)

            loss_batch_tr = model.train_on_batch(x_tr_b, y_tr_b)

            # after training, remove feats from dictionary (# delete feature and remove key)
            for _v_name in video_names_tr_batch:
                if _v_name in TRAIN_STATE.feats_dict_tr_1:
                    del TRAIN_STATE.feats_dict_tr_1[_v_name]
                    TRAIN_STATE.feats_dict_tr_1.pop(_v_name, None)
                elif _v_name in TRAIN_STATE.feats_dict_tr_2:
                    del TRAIN_STATE.feats_dict_tr_2[_v_name]
                    TRAIN_STATE.feats_dict_tr_2.pop(_v_name, None)
                elif _v_name in TRAIN_STATE.feats_dict_tr_3:
                    del TRAIN_STATE.feats_dict_tr_3[_v_name]
                    TRAIN_STATE.feats_dict_tr_3.pop(_v_name, None)

            loss_tr += loss_batch_tr
            loss_tr_b = loss_tr / float(batch_num_tr)
            tt2 = time.time()
            duration = tt2 - tt1
            waiting_duration = t2 - t1
            waiting_duration_total += waiting_duration
            msg = '%04ds - epoch: %02d/%02d, batch [tr]: %02d/%02d, loss: %0.2f, waited: %.01f  ' % (duration, epoch_num, n_epochs, batch_num_tr, n_batch_tr, loss_tr_b, waiting_duration)
            if verbose:
                print(msg)
            else:
                sys.stdout.write('\r%s' % (msg))

        # test
        y_pred_te = model.predict(x_te, batch_size_te, verbose=0)
        map_te_avg = 100 * metrics.mean_avg_precision_sklearn(y_te, y_pred_te)
        loss_tr /= float(n_batch_tr)

        tt2 = time.time()
        duration = tt2 - tt1
        timestamp_now = utils.timestamp()
        msg = '%04ds - epoch: %02d/%02d, loss [tr]: %0.2f, map [te]: %0.2f%%, waited: %d, finished: %s   \n' % (duration, epoch_num, n_epochs, loss_tr, map_te_avg, waiting_duration_total, timestamp_now)
        if verbose:
            print(msg)
        else:
            sys.stdout.write('\r%s' % (msg))

        # after we're done with training and testing, shuffle the list of training videos, and set in the TRAINING_STATE, also sample new frames
        video_names_tr, y_tr = __shuffle_training_data(TRAIN_STATE.video_names_tr, TRAIN_STATE.class_nums_tr)
        TRAIN_STATE.video_names_tr = video_names_tr
        TRAIN_STATE.class_nums_tr = y_tr
        del video_names_tr, y_tr

        # also, sample new frames
        sampled_video_frames_dict = ds_epic_kitchens.__random_sample_frames_per_video_for_i3d(TRAIN_STATE.video_names_tr, frames_root_path, frame_relative_pathes_dict, n_frames_per_segment, n_frames_per_video)
        TRAIN_STATE.video_frames_dict_tr = sampled_video_frames_dict

        # update counter so the bottom part starts extracting features for the next epoch
        TRAIN_STATE.model_top_epoch_end = epoch_num

        # save the model and nodes, if required
        if is_save_model:
            __save_model(model, model_name, epoch_num)

        if is_save_centroids:
            __save_centroids(model, model_name, epoch_num)

    print ('--- finish time')
    print (datetime.datetime.now())

def __train_model_on_video_frames_backbone_i3d_keras(n_epochs, starting_epoch_num, n_frames_per_video, n_instances, instance_num):
    """
    Extract features from i3d-model to be used by our model.
    """

    verbose = False
    global TRAIN_STATE  # type: TrainingState
    assert instance_num in [1, 2, 3], 'Sorry, wrong instance number: %d' % (instance_num)
    assert n_instances == 3, 'Sorry, wrong number of instances %d' % (n_instances)

    n_threads = 16
    n_frames_per_segment = 8
    max_preloaded_feats = 40
    n_frames_in = n_frames_per_video
    n_frames_out = int(n_frames_in / float(n_frames_per_segment))
    assert n_frames_per_segment * n_frames_out == n_frames_in

    # load the model
    model = Inception_Inflated3d_Backbone()

    # reader for getting video frames
    video_reader = image_utils.AsyncImageReaderEpicKitchensForI3dKerasModel(n_threads=n_threads)

    # wait until model top is ready
    while not TRAIN_STATE.model_top_ready:
        threading._sleep(5.0)
        if verbose:
            print ('... bottom part (%d) is waiting for top part to get ready' % (instance_num))

    # extract features for n epoch
    for idx_epoch in range(starting_epoch_num, n_epochs):

        epoch_num = idx_epoch + 1

        video_frames_dict = TRAIN_STATE.video_frames_dict_tr
        video_names = TRAIN_STATE.video_names_tr
        n_videos = len(video_names)

        # only first instance can modify train_state and get videos from pickle
        if instance_num == 1:
            # model started, update count
            TRAIN_STATE.model_bottom_1_epoch_start = epoch_num
        elif instance_num == 2:
            # model started, update count
            TRAIN_STATE.model_bottom_2_epoch_start = epoch_num
        elif instance_num == 3:
            # model started, update count
            TRAIN_STATE.model_bottom_3_epoch_start = epoch_num
        else:
            raise Exception('Sorry, unknown instance number: %d' % (instance_num))

        if verbose:
            print ('epoch %d by instance %s' % (epoch_num, instance_num))

        # aync reader, and get load images for the first video, we will read the first group of videos
        current_video_name = video_names[instance_num - 1]
        current_video_frames = video_frames_dict[current_video_name]

        # just for clarification, can be reshaped from (256,) into (T, N) = (32, 8)
        # where T is the number of segments in one video, and N is the number of frames in one segment
        # video_group_frames = np.reshape(video_group_frames, tuple([n_frames_out, n_segment_length] + list(video_group_frames.shape[1:])))
        video_reader.load_imgs_in_batch(current_video_frames)

        # extract features only for training videos
        t1 = time.time()

        if verbose:
            print('... extracting features tr')
            print('... start time: %s' % utils.timestamp())

        # loop on list of videos
        for idx_video in range(n_videos):

            if instance_num == 1:
                # wait looping if there are so many features in the dictionary
                while len(TRAIN_STATE.feats_dict_tr_1) > max_preloaded_feats:
                    threading._sleep(1.0)
                    if verbose:
                        print ('... bottom part (%d) is waiting for features in the dictionary to get consumed by top part' % (instance_num))

            elif instance_num == 2:
                # wait looping if there are so many features in the dictionary
                while len(TRAIN_STATE.feats_dict_tr_2) > max_preloaded_feats:
                    threading._sleep(1.0)
                    if verbose:
                        print ('... bottom part (%d) is waiting for features in the dictionary to get consumed by top part' % (instance_num))

            elif instance_num == 3:
                # wait looping if there are so many features in the dictionary
                while len(TRAIN_STATE.feats_dict_tr_3) > max_preloaded_feats:
                    threading._sleep(1.0)
                    if verbose:
                        print ('... bottom part (%d) is waiting for features in the dictionary to get consumed by top part' % (instance_num))

            # loop on groups according to instances
            if instance_num == 1 and idx_video % n_instances != 0:
                continue

            if instance_num == 2 and idx_video % n_instances != 1:
                continue

            if instance_num == 3 and idx_video % n_instances != 2:
                continue

            tg_1 = time.time()
            video_name = video_names[idx_video]
            video_num = idx_video + 1

            # wait until the image_batch is loaded
            t1 = time.time()
            while video_reader.is_busy():
                threading._sleep(0.1)
            t2 = time.time()
            duration_waited = t2 - t1
            if verbose:
                print('\n... ... model bottom (%d), video %d/%d, waited: %d, name: %s' % (instance_num, video_num, n_videos, duration_waited, video_name))

            # get the frames
            frames = video_reader.get_images()  # (G*T*N, 224, 224, 3)

            # pre-load for the next video group, notice that we take into account the number of instances
            if idx_video + n_instances < n_videos:
                next_video_num = video_num + n_instances
                next_video_name = video_names[idx_video + n_instances]
                next_video_frames = video_frames_dict[next_video_name]
                video_reader.load_imgs_in_batch(next_video_frames)
                if verbose:
                    print('\n... ... model bottom (%d), next video %d/%d, name: %s' % (instance_num, next_video_num, n_videos, next_video_name))

            if video_name in TRAIN_STATE.feats_dict_tr_1 or video_name in TRAIN_STATE.feats_dict_tr_2 or video_name in TRAIN_STATE.feats_dict_tr_3:
                raise ('... ... this should not be happening, but features for video %s already exist in the dictionary' % (video_name))

            if len(frames) != n_frames_per_video:
                raise ('... ... wrong n frames for video: %s' % (video_name))

            # reshape to make one dimension carries the frames / segment, while the other dimesion represents the batch size
            frames = np.reshape(frames, [n_frames_out, n_frames_per_segment, 224, 224, 3])  # (T, 8, 224, 224, 3)

            # get features
            features = model.predict(frames)  # (T, 1, 7, 7, 1024)

            # remove temporal axis, as it is one
            features = np.squeeze(features, axis=1)  # (T, 7, 7, 1024)

            # add feature to the dictionary
            if instance_num == 1:
                TRAIN_STATE.feats_dict_tr_1[video_name] = features
            elif instance_num == 2:
                TRAIN_STATE.feats_dict_tr_2[video_name] = features
            elif instance_num == 3:
                TRAIN_STATE.feats_dict_tr_3[video_name] = features

            tg_2 = time.time()
            if verbose:
                print ('took', tg_2 - tg_1)

        t2 = time.time()
        if verbose:
            print('... finish extracting features in %d seconds' % (t2 - t1))

        # after finishing epoch, update counters
        if instance_num == 1:
            TRAIN_STATE.model_bottom_1_epoch_end = epoch_num
        if instance_num == 2:
            TRAIN_STATE.model_bottom_2_epoch_end = epoch_num
        if instance_num == 3:
            TRAIN_STATE.model_bottom_3_epoch_end = epoch_num

        # wait untill the other part finishes
        if instance_num == 1:
            while TRAIN_STATE.model_bottom_1_epoch_end > TRAIN_STATE.model_bottom_2_epoch_end or TRAIN_STATE.model_bottom_1_epoch_end > TRAIN_STATE.model_bottom_3_epoch_end:
                threading._sleep(1.0)
                if verbose:
                    print ('... bottom part (1) is waiting for bottom part (2,3) to finish extracting features on epoch %d' % (epoch_num))
        if instance_num == 2:
            while TRAIN_STATE.model_bottom_2_epoch_end > TRAIN_STATE.model_bottom_1_epoch_end or TRAIN_STATE.model_bottom_2_epoch_end > TRAIN_STATE.model_bottom_3_epoch_end:
                threading._sleep(1.0)
                if verbose:
                    print ('... bottom part (2) is waiting for bottom part (1,3) to finish extracting features on epoch %d' % (epoch_num))
        if instance_num == 3:
            while TRAIN_STATE.model_bottom_3_epoch_end > TRAIN_STATE.model_bottom_1_epoch_end or TRAIN_STATE.model_bottom_3_epoch_end > TRAIN_STATE.model_bottom_2_epoch_end:
                threading._sleep(1.0)
                if verbose:
                    print ('... bottom part (3) is waiting for bottom part (1,2) to finish extracting features on epoch %d' % (epoch_num))

        # if top part is not finished yet, then wait
        while TRAIN_STATE.model_top_epoch_end < TRAIN_STATE.model_bottom_1_epoch_end or TRAIN_STATE.model_top_epoch_end < TRAIN_STATE.model_bottom_2_epoch_end or TRAIN_STATE.model_top_epoch_end < TRAIN_STATE.model_bottom_3_epoch_end:
            threading._sleep(2.0)
            if verbose:
                print ('... bottom part (%d) is waiting for top part to finish training on epoch: %d' % (instance_num, TRAIN_STATE.model_top_epoch_end + 1))

    print('... finish extracting features for all epochs, goodbye!')
    print('... end time: %s' % utils.timestamp())

# endregion

# region Train Helpers

def __get_features_from_dictionaries(video_names_tr_batch):
    global TRAIN_STATE

    features = []
    for v_name in video_names_tr_batch:
        if v_name in TRAIN_STATE.feats_dict_tr_1:
            features.append(TRAIN_STATE.feats_dict_tr_1[v_name])
        elif v_name in TRAIN_STATE.feats_dict_tr_2:
            features.append(TRAIN_STATE.feats_dict_tr_2[v_name])
        elif v_name in TRAIN_STATE.feats_dict_tr_3:
            features.append(TRAIN_STATE.feats_dict_tr_3[v_name])
        else:
            raise Exception('This should not be happening, but a feature is asked for and it does not exist in the dictionaries: %s' % (v_name))

    return features

def __shuffle_training_data(video_names_tr, class_nums_tr):
    n_v = len(video_names_tr)
    idx = np.arange(n_v)
    random.shuffle(idx)
    video_names_tr = video_names_tr[idx]
    class_nums_tr = class_nums_tr[idx]

    return video_names_tr, class_nums_tr

def __save_model_backbone(root_model, model_name, epoch_num):
    model_root_path = Pth('EPIC-Kitchens/models_backbone/%s', (model_name,))
    if not os.path.exists(model_root_path):
        os.mkdir(model_root_path)

    model_path = '%s/%03d.model' % (model_root_path, epoch_num)
    model_json_path = '%s/%03d.json' % (model_root_path, epoch_num)
    model_weight_path = '%s/%03d.pkl' % (model_root_path, epoch_num)

    # for very long model, this does not work
    # self.root_model.save(model_path)
    # only save model definition and weights
    keras_utils.save_model(root_model, model_json_path, model_weight_path)

def __save_model(root_model, model_name, epoch_num):
    model_root_path = Pth('EPIC-Kitchens/models/%s' % (model_name,))
    if not os.path.exists(model_root_path):
        os.mkdir(model_root_path)

    model_path = '%s/%03d.model' % (model_root_path, epoch_num)
    model_json_path = '%s/%03d.json' % (model_root_path, epoch_num)
    model_weight_path = '%s/%03d.pkl' % (model_root_path, epoch_num)

    # for very long model, this does not work
    # self.root_model.save(model_path)
    # only save model definition and weights
    keras_utils.save_model(root_model, model_json_path, model_weight_path)

def __save_centroids(root_model, model_name, epoch_num):
    centroids_root_path = Pth('EPIC-Kitchens/node_features/%s', (model_name,))
    centroids_path = '%s/%03d.pkl' % (centroids_root_path, epoch_num)

    if not os.path.exists(centroids_root_path):
        os.mkdir(centroids_root_path)

    session = K.get_session()
    t_centroids = root_model.get_layer('node_embedding').output  # (1, 20, 1024)
    centroids_embedding = t_centroids.eval(session=session)  # (1, 20, 1024)
    centroids_embedding = np.squeeze(centroids_embedding, axis=0)

    utils.pkl_dump(centroids_embedding, centroids_path)

def __compile_model_for_finetuning(model, n_gpus):
    # optimizer and loss
    loss = keras_utils.LOSSES[3]
    optimizer = Adam(lr=0.01, epsilon=1e-8)
    optimizer = Adam(lr=0.001, epsilon=1e-4)
    optimizer = SGD(lr=0.1, momentum=0.9, decay=0.0000001)
    optimizer = SGD(lr=0.02, momentum=0.8)

    if n_gpus == 1:
        model.compile(loss=loss, optimizer=optimizer)
        parallel_model = model
    else:
        parallel_model = multi_gpu_utils.multi_gpu_model(model, n_gpus)
        parallel_model.compile(loss=loss, optimizer=optimizer)

    return model, parallel_model

# endregion

# region Models

def __load_model_timeception(n_classes, input_shape, n_gpus, is_load_weights, weight_path):
    """
    Model
    """
    # optimizer and loss
    loss = keras_utils.LOSSES[3]
    output_activation = keras_utils.ACTIVATIONS[2]
    optimizer = SGD(lr=0.01)
    optimizer = Adam(lr=0.01, epsilon=1e-8)
    optimizer = Adam(lr=0.01, epsilon=1e-4)

    n_tc_layer = 3
    expansion_factor = 5.0 / 4.0
    _, n_timesteps, side_dim, _, n_channels_in = input_shape
    n_groups = int(n_channels_in / 128.0)
    print ('... n_groups, expansion factor: %d, %.02f' % (n_groups, expansion_factor))

    input_shape = (input_shape[1:])
    t_input = Input(shape=input_shape)  # (None, 20, 7, 7, 1024)
    tensor = t_input

    # timeception layers
    tensor = timeception.timeception_temporal_convolutions(tensor, n_tc_layer, n_groups, expansion_factor, is_dilated=True)

    # spatio-temporal pooling
    tensor = MaxLayer(axis=(1, 2, 3))(tensor)

    # dense layers
    tensor = Dropout(0.5)(tensor)
    tensor = Dense(512)(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = LeakyReLU(alpha=0.2)(tensor)
    tensor = Dropout(0.25)(tensor)
    tensor = Dense(n_classes)(tensor)
    t_output = Activation(output_activation)(tensor)
    model = Model(input=t_input, output=t_output)

    if is_load_weights:
        model.load_weights(weight_path)

    if n_gpus == 1:
        model.compile(loss=loss, optimizer=optimizer)
        parallel_model = model
    else:
        parallel_model = multi_gpu_utils.multi_gpu_model(model, n_gpus)
        parallel_model.compile(loss=loss, optimizer=optimizer)

    return model, parallel_model

def __load_model_action_vlad(n_classes, input_shape, n_gpus, is_load_weights, weight_path):
    """
    Model
    """

    # optimizer and loss
    loss = keras_utils.LOSSES[3]
    output_activation = keras_utils.ACTIVATIONS[2]
    optimizer = SGD(lr=0.01)
    optimizer = Adam(lr=0.01, epsilon=1e-8)
    optimizer = Adam(lr=0.01, epsilon=1e-4)

    expansion_factor = 5.0 / 4.0
    _, n_timesteps, side_dim, _, n_channels_in = input_shape

    input_shape = (input_shape[1:])
    t_input = Input(shape=input_shape)  # (None, 7, 7, 1024)
    tensor = t_input

    # spatial convolution
    n_channels_out = 512
    tensor = Conv3D(n_channels_out, kernel_size=(1, 1, 1), padding='same')(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = Activation('relu')(tensor)
    n_channels_in = n_channels_out

    # reshape for vlad
    tensor = ReshapeLayer((n_channels_in,))(tensor)

    # vlad layer
    max_samples = n_timesteps * side_dim * side_dim
    tensor = NetVLAD(n_channels_in, max_samples, 32)(tensor)

    # dense layers
    tensor = Dropout(0.5)(tensor)
    tensor = Dense(256)(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = LeakyReLU(alpha=0.2)(tensor)
    tensor = Dropout(0.25)(tensor)
    tensor = Dense(n_classes)(tensor)

    t_output = Activation(output_activation)(tensor)
    model = Model(input=t_input, output=t_output)

    if is_load_weights:
        model.load_weights(weight_path)

    if n_gpus == 1:
        model.compile(loss=loss, optimizer=optimizer)
        parallel_model = model
    else:
        parallel_model = multi_gpu_utils.multi_gpu_model(model, n_gpus)
        parallel_model.compile(loss=loss, optimizer=optimizer)

    return model, parallel_model

def __load_model_videograph(nodes, n_classes, input_shape_x):
    """
    Model
    """

    # optimizer and loss
    loss = keras_utils.LOSSES[3]
    output_activation = keras_utils.ACTIVATIONS[2]
    optimizer = SGD(lr=0.01)
    optimizer = Adam(lr=0.01, epsilon=1e-8)
    optimizer = Adam(lr=0.01, epsilon=1e-4)

    # per-layer kernel size and max pooling for nodes and timesteps
    n_graph_layers = 2

    # time kernel
    t_kernel_size = 7
    t_max_size = 3

    # node kernel
    n_kernel_size = 7
    n_max_size = 3
    n_avg_size = 4

    # space kernel
    s_kernel_size = 2
    s_kernel_size = 1

    n_nodes, _ = nodes.shape

    _, n_timesteps, side_dim, side_dim, n_channels_in = input_shape_x
    t_input_x = Input(shape=(n_timesteps, side_dim, side_dim, n_channels_in), name='input_x')  # (None, 64, 1024)
    t_input_n = Input(tensor=tf.constant(nodes, dtype=tf.float32), name='input_n')  # (1, 100, 1024)
    tensor = t_input_x

    # spatial convolution
    tensor = Conv3D(n_channels_in, (1, s_kernel_size, s_kernel_size), padding='VALID', name='conv_s')(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = LeakyReLU(alpha=0.2)(tensor)

    # pool over space
    tensor = MaxLayer(axis=(2, 3), is_keep_dim=True, name='global_pool_s')(tensor)  # (None, 64, 7, 7, 1024)

    # node attention
    tensor = videograph.node_attention(tensor, t_input_n, n_channels_in, activation_type='relu')  # (N, 100, 64, 7, 7, 1024)

    # graph embedding
    tensor = videograph.graph_embedding(tensor, n_graph_layers, n_avg_size, n_kernel_size, t_kernel_size, n_max_size, t_max_size)  # (N, 100, 64, 7, 7, 1024)

    # node pooling
    tensor = MeanLayer(axis=(1,), name='global_pool_n')(tensor)

    # temporal pooling
    tensor = MaxLayer(axis=(1, 2, 3), name='global_pool_t')(tensor)

    # mlp for classification
    tensor = Dropout(0.25)(tensor)
    tensor = Dense(512)(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = LeakyReLU(alpha=0.2)(tensor)
    tensor = Dropout(0.25)(tensor)
    tensor = Dense(n_classes)(tensor)
    t_output = Activation(output_activation)(tensor)

    model = Model(input=[t_input_x, t_input_n], output=t_output)
    model.compile(loss=loss, optimizer=optimizer)
    return model

# endregion

# region Functions

def __load_annotation(annotation_path, annotation_type):
    annotation_types = ['noun', 'verb', 'noun_verb', 'action']
    assert annotation_type in annotation_types

    (y_noun_tr, y_verb_tr, y_actn_tr, y_noun_te, y_verb_te, y_actn_te) = utils.pkl_load(annotation_path)
    if annotation_type == 'noun':
        n_classes = ds_epic_kitchens.N_NOUNS_MANY_SHOT
        (y_tr, y_te) = (y_noun_tr, y_noun_te)
    elif annotation_type == 'verb':
        n_classes = ds_epic_kitchens.N_VERBS_MANY_SHOT
        (y_tr, y_te) = (y_verb_tr, y_verb_te)
    elif annotation_type == 'noun_verb':
        n_classes = ds_epic_kitchens.N_NOUNS_MANY_SHOT + ds_epic_kitchens.N_VERBS_MANY_SHOT
        (y_tr, y_te) = (np.hstack((y_noun_tr, y_verb_tr)), np.hstack((y_noun_te, y_verb_te)))
    elif annotation_type == 'action':
        n_classes = ds_epic_kitchens.N_ACTNS_MANY_SHOT
        (y_tr, y_te) = (y_actn_tr, y_actn_te)
    else:
        raise Exception('Sorry, unknown annotation type: %s' % (annotation_type))

    return (y_tr, y_te), n_classes

# endregion

# region Classes

class TrainingState(object):
    """
    An instance of this object is serialized and saved in the features path. This object is read by both top and bottom layer, as a way for communication.
    """

    def __init__(self):
        self.model_top_epoch_start = 0
        self.model_top_epoch_end = 0
        self.model_bottom_1_epoch_start = 0
        self.model_bottom_2_epoch_start = 0
        self.model_bottom_3_epoch_start = 0
        self.model_bottom_1_epoch_end = 0
        self.model_bottom_2_epoch_end = 0
        self.model_bottom_3_epoch_end = 0
        self.epoch_num_model_top = 0
        self.model_top_ready = False
        self.video_names_tr = None
        self.class_nums_tr = None
        self.video_frames_dict_tr = {}
        self.feats_dict_tr_1 = {}
        self.feats_dict_tr_2 = {}
        self.feats_dict_tr_3 = {}

# endregion