#
# Copyright 2018-2019 IBM Corp. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from maxfw.model import MAXModelWrapper

from config import DEFAULT_MODEL_PATH, DEFAULT_MODEL_DIR, BATCH_SIZE, CROP_SIZE, CHANNELS, NUM_FRAMES_PER_CLIP

import codecs
import os
import random
from six.moves import xrange  # pylint: disable=redefined-builtin
from ffmpy import FFmpeg
import tensorflow as tf
import PIL.Image as Image
import numpy as np
import cv2
import logging

logger = logging.getLogger()


# This method is adapted from https://github.com/hx173149/C3D-tensorflow/blob/master/input_data.py
def get_frames_data(filename, num_frames_per_clip, seed=84):
    '''Given a directory containing extracted frames, return a video clip of
    (num_frames_per_clip) consecutive frames as a list of np arrays.

    :param filename: path of the directory holding the extracted frame images
    :param num_frames_per_clip: number of consecutive frames to load
    :param seed: fixed random seed so the chosen start index is reproducible
    :return: tuple of (list of HxWxC frame arrays, clip start index);
             returns ([], 0) when the directory holds too few frames
    '''
    ret_arr = []
    s_index = 0
    # NOTE(review): os.walk is only used here to list the top-level directory's
    # files — the function returns within the first iteration, so subdirectories
    # are never visited. Behavior kept as-is; dir entries of subfolders are
    # intentionally not treated as frames.
    for _parent, _dirnames, filenames in os.walk(filename):
        if len(filenames) < num_frames_per_clip:
            # Not enough frames to assemble a clip of the requested length.
            return [], s_index
        filenames = sorted(filenames)
        # fixed random seed for each call so repeated calls pick the same clip
        random.seed(seed)
        s_index = random.randint(0, len(filenames) - num_frames_per_clip)  # nosec - randint isn't being used for crypto
        logger.debug('Total frames: {}; start index: {}, end index: {}'.format(
            len(filenames), s_index, s_index + num_frames_per_clip))
        for i in range(s_index, s_index + num_frames_per_clip):
            image_name = os.path.join(str(filename), str(filenames[i]))
            img = Image.open(image_name)
            ret_arr.append(np.array(img))
    return ret_arr, s_index


def convert_video_to_frames(filename):
    '''Splits the video up into 5 frames per second and saves images in directory

    :param filename: Name of the video to be processed
    :return: directory created
    '''
    # Frames go into a sibling directory named after the video (extension dropped).
    video_dir = os.path.splitext(filename)[0]
    # exist_ok avoids the check-then-create race of the previous exists() guard.
    os.makedirs(video_dir, exist_ok=True)
    ff = FFmpeg(inputs={filename: None},
                outputs={video_dir + '/%05d.jpg': '-vf fps=5 -nostats -loglevel 0'})
    ff.run()
    return video_dir


def process_frames(dirname, means, batch_size=BATCH_SIZE, num_frames_per_clip=NUM_FRAMES_PER_CLIP,
                   crop_size=CROP_SIZE):
    '''Load a clip of frames from dirname, scale + center-crop each frame to
    crop_size, subtract the per-frame channel means, and pad the batch by
    duplicating the clip up to batch_size.

    :param dirname: directory of extracted frame images
    :param means: per-frame mean arrays (one per frame in the clip) to subtract
    :param batch_size: number of clips in the returned batch (padded by duplication)
    :param num_frames_per_clip: frames per clip
    :param crop_size: output height/width of each square frame
    :return: float32 np array of shape (batch_size, num_frames_per_clip,
             crop_size, crop_size, channels); if no frames were found the
             batch contains empty clips
    '''
    tmp_data, _ = get_frames_data(dirname, num_frames_per_clip)
    img_datas = []
    data = []
    if len(tmp_data) != 0:
        for j in xrange(len(tmp_data)):
            img = Image.fromarray(tmp_data[j].astype(np.uint8))
            # Scale the short side to crop_size (the +1 guards against rounding
            # the long side below crop_size), then center-crop to a square.
            if img.width > img.height:
                scale = float(crop_size) / float(img.height)
                img = np.array(cv2.resize(np.array(img),
                                          (int(img.width * scale + 1), crop_size))).astype(np.float32)
            else:
                scale = float(crop_size) / float(img.width)
                img = np.array(cv2.resize(np.array(img),
                                          (crop_size, int(img.height * scale + 1)))).astype(np.float32)
            crop_x = int((img.shape[0] - crop_size) / 2)
            crop_y = int((img.shape[1] - crop_size) / 2)
            # Subtract the frame-specific mean so inputs match training stats.
            img = img[crop_x:crop_x + crop_size, crop_y:crop_y + crop_size, :] - means[j]
            img_datas.append(img)
        data.append(img_datas)

    # pad (duplicate) data if less than batch_size, so the model always sees
    # a full batch
    for _ in range(batch_size - len(data)):
        data.append(img_datas)

    np_arr_data = np.array(data).astype(np.float32)
    return np_arr_data


class ModelWrapper(MAXModelWrapper):
    '''Loads the C3D SavedModel and runs video classification over frame clips.'''

    MODEL_META_DATA = {
        'id': 'max-sports-video-classifier',
        'name': 'MAX Sports Video Classifier',
        'description': 'C3D TensorFlow video classification model trained on the Sports1m dataset',
        'type': 'Video Classification',
        'source': 'https://developer.ibm.com/exchanges/models/all/max-sports-video-classifier/',
        'license': 'MIT'
    }

    def __init__(self, path=DEFAULT_MODEL_PATH, model_dir=DEFAULT_MODEL_DIR):
        logger.info('Loading model from: {}...'.format(path))
        sess = tf.Session(graph=tf.Graph())
        # load the graph
        saved_model_path = '{}/{}'.format(path, model_dir)
        model_graph_def = tf.saved_model.load(sess, [tf.saved_model.tag_constants.SERVING], saved_model_path)
        sig_def = model_graph_def.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input_name = sig_def.inputs['inputs'].name
        output_name = sig_def.outputs['scores'].name

        # Load labels from file; the context manager closes the handle
        # (previously it was leaked).
        with codecs.open('./{}/labels.txt'.format(path), "r", encoding="utf-8") as label_file:
            self.labels = [label.strip('\n') for label in label_file.readlines()]

        # set up instance variables and required inputs for inference
        self.sess = sess
        self.model_graph_def = model_graph_def
        self.output_tensor = sess.graph.get_tensor_by_name(output_name)
        self.input_name = input_name
        self.output_name = output_name
        # Per-frame crop means used to normalize inputs in process_frames.
        self.means = np.load('./{}/crop_mean.npy'.format(path)).reshape(
            [NUM_FRAMES_PER_CLIP, CROP_SIZE, CROP_SIZE, CHANNELS])
        logger.info('Loaded model')

    def _pre_process(self, video_path):
        '''Extract frames from the video and build the normalized input batch.'''
        dir_name = convert_video_to_frames(video_path)
        frames = process_frames(dir_name, self.means)
        return frames

    def _predict(self, frames):
        '''Run the session and return the top-3 (index, label, score) tuples.'''
        predict_score = self.sess.run(self.output_tensor,
                                      feed_dict={self.input_name: frames})
        # take mean of scores per frame for the average predicted video score
        predict_score = predict_score.mean(axis=0)
        # Take top 3 labels
        predict_idx = np.argsort(predict_score)[::-1][:3]
        predictions = []
        for label in predict_idx:
            predictions.append((label, self.labels[label], predict_score[label]))
        return predictions

    def _post_process(self, predictions):
        '''Predictions are already in their final form.'''
        return predictions