python source code of tracker

import numpy as np
import cv2
import scipy
import time


from scipy import signal
# from numpy.fft import fftshift

from .config import config
from .features import FHogFeature, TableFeature, mround, ResNet50Feature, VGG16Feature
from .fourier_tools import cfft2, interpolate_dft, shift_sample, full_fourier_coeff,\
        cubic_spline_fourier, compact_fourier_coeff, ifft2, fft2, sample_fs
from .optimize_score import optimize_score
from .sample_space_model import GMM
from .train import train_joint, train_filter
from .scale_filter import ScaleFilter

if config.use_gpu:
    import cupy as cp


class ECOTracker:
    def __init__(self, is_color):
        self._is_color = is_color
        self._frame_num = 0
        self._frames_since_last_train = 0
        if config.use_gpu:
            cp.cuda.Device(config.gpu_id).use()

    def _cosine_window(self, size):
        """
            get the cosine window
        """
        cos_window = np.hanning(int(size[0]+2))[:, np.newaxis].dot(np.hanning(int(size[1]+2))[np.newaxis, :])
        cos_window = cos_window[1:-1, 1:-1][:, :, np.newaxis, np.newaxis].astype(np.float32)
        if config.use_gpu:
            cos_window = cp.asarray(cos_window)
        return cos_window

    def _get_interp_fourier(self, sz):
        """
            compute the fourier series of the interpolation function.
        """
        f1 = np.arange(-(sz[0]-1) / 2, (sz[0]-1)/2+1, dtype=np.float32)[:, np.newaxis] / sz[0]
        interp1_fs = np.real(cubic_spline_fourier(f1, config.interp_bicubic_a) / sz[0])
        f2 = np.arange(-(sz[1]-1) / 2, (sz[1]-1)/2+1, dtype=np.float32)[np.newaxis, :] / sz[1]
        interp2_fs = np.real(cubic_spline_fourier(f2, config.interp_bicubic_a) / sz[1])
        if config.interp_centering:
            f1 = np.arange(-(sz[0]-1) / 2, (sz[0]-1)/2+1, dtype=np.float32)[:, np.newaxis]
            interp1_fs = interp1_fs * np.exp(-1j*np.pi / sz[0] * f1)
            f2 = np.arange(-(sz[1]-1) / 2, (sz[1]-1)/2+1, dtype=np.float32)[np.newaxis, :]
            interp2_fs = interp2_fs * np.exp(-1j*np.pi / sz[1] * f2)

        if config.interp_windowing:
            win1 = np.hanning(sz[0]+2)[:, np.newaxis]
            win2 = np.hanning(sz[1]+2)[np.newaxis, :]
            interp1_fs = interp1_fs * win1[1:-1]
            interp2_fs = interp2_fs * win2[1:-1]
        if not config.use_gpu:
            return (interp1_fs[:, :, np.newaxis, np.newaxis],
                    interp2_fs[:, :, np.newaxis, np.newaxis])
        else:
            return (cp.asarray(interp1_fs[:, :, np.newaxis, np.newaxis]),
                    cp.asarray(interp2_fs[:, :, np.newaxis, np.newaxis]))

    def _get_reg_filter(self, sz, target_sz, reg_window_edge):
        """
            compute the spatial regularization function and drive the
            corresponding filter operation used for optimization
        """
        if config.use_reg_window:
            # normalization factor
            reg_scale = 0.5 * target_sz

            # construct grid
            wrg = np.arange(-(sz[0]-1)/2, (sz[1]-1)/2+1, dtype=np.float32)
            wcg = np.arange(-(sz[0]-1)/2, (sz[1]-1)/2+1, dtype=np.float32)
            wrs, wcs = np.meshgrid(wrg, wcg)

            # construct the regularization window
            reg_window = (reg_window_edge - config.reg_window_min) * (np.abs(wrs/reg_scale[0])**config.reg_window_power + \
                            np.abs(wcs/reg_scale[1])**config.reg_window_power) + config.reg_window_min

            # compute the DFT and enforce sparsity
            reg_window_dft = fft2(reg_window) / np.prod(sz)
            reg_window_dft[np.abs(reg_window_dft) < config.reg_sparsity_threshold* np.max(np.abs(reg_window_dft.flatten()))] = 0

            # do the inverse transform, correct window minimum
            reg_window_sparse = np.real(ifft2(reg_window_dft))
            reg_window_dft[0, 0] = reg_window_dft[0, 0] - np.prod(sz) * np.min(reg_window_sparse.flatten()) + config.reg_window_min
            reg_window_dft = np.fft.fftshift(reg_window_dft).astype(np.complex64)

            # find the regularization filter by removing the zeros
            row_idx = np.logical_not(np.all(reg_window_dft==0, axis=1))
            col_idx = np.logical_not(np.all(reg_window_dft==0, axis=0))
            mask = np.outer(row_idx, col_idx)
            reg_filter = np.real(reg_window_dft[mask]).reshape(np.sum(row_idx), -1)
        else:
            # else use a scaled identity matrix
            reg_filter = config.reg_window_min
        if not config.use_gpu:
            return reg_filter.T
        else:
            return cp.asarray(reg_filter.T)

    def _init_proj_matrix(self, init_sample, compressed_dim, proj_method):
        """
            init the projection matrix
        """
        if config.use_gpu:
            xp = cp.get_array_module(init_sample[0])
        else:
            xp = np
        x = [xp.reshape(x, (-1, x.shape[2])) for x in init_sample]
        x = [z - z.mean(0) for z in x]
        proj_matrix_ = []
        if config.proj_init_method == 'pca':
            for x_, compressed_dim_  in zip(x, compressed_dim):
                proj_matrix, _, _ = xp.linalg.svd(x_.T.dot(x_))
                proj_matrix = proj_matrix[:, :compressed_dim_]
                proj_matrix_.append(proj_matrix)
        elif config.proj_init_method == 'rand_uni':
            for x_, compressed_dim_ in zip(x, compressed_dim):
                proj_matrix = xp.random.uniform(size=(x_.shape[1], compressed_dim_))
                proj_matrix /= xp.sqrt(xp.sum(proj_matrix**2, axis=0, keepdims=True))
                proj_matrix_.append(proj_matrix)
        return proj_matrix_

    def _proj_sample(self, x, P):
        if config.use_gpu:
            xp = cp.get_array_module(x[0])
        else:
            xp = np
        return [xp.matmul(P_.T, x_) for x_, P_ in zip(x, P)]

    def init(self, frame, bbox, total_frame=np.inf):
        """
            frame -- image
            bbox -- need xmin, ymin, width, height
        """
        self._pos = np.array([bbox[1]+(bbox[3]-1)/2., bbox[0]+(bbox[2]-1)/2.], dtype=np.float32)
        self._target_sz = np.array([bbox[3], bbox[2]])
        self._num_samples = min(config.num_samples, total_frame)
        xp = cp if config.use_gpu else np

        # calculate search area and initial scale factor
        search_area = np.prod(self._target_sz * config.search_area_scale)
        if search_area > config.max_image_sample_size:
            self._current_scale_factor = np.sqrt(search_area / config.max_image_sample_size)
        elif search_area < config.min_image_sample_size:
            self._current_scale_factor = np.sqrt(search_area / config.min_image_sample_size)
        else:
            self._current_scale_factor = 1.

        # target size at the initial scale
        self._base_target_sz = self._target_sz / self._current_scale_factor

        # target size, taking padding into account
        if config.search_area_shape == 'proportional':
            self._img_sample_sz = np.floor(self._base_target_sz * config.search_area_scale)
        elif config.search_area_shape == 'square':
            self._img_sample_sz = np.ones((2), dtype=np.float32) * np.sqrt(np.prod(self._base_target_sz * config.search_area_scale))
        else:
            raise("unimplemented")

        features = [feature for feature in config.features
                if ("use_for_color" in feature and feature["use_for_color"] == self._is_color) or
                    "use_for_color" not in feature]

        self._features = []
        cnn_feature_idx = -1
        for idx, feature in enumerate(features):
            if feature['fname'] == 'cn' or feature['fname'] == 'ic':
                self._features.append(TableFeature(**feature))
            elif feature['fname'] == 'fhog':
                self._features.append(FHogFeature(**feature))
            elif feature['fname'].startswith('cnn'):
                cnn_feature_idx = idx
                netname = feature['fname'].split('-')[1]
                if netname == 'resnet50':
                    self._features.append(ResNet50Feature(**feature))
                elif netname == 'vgg16':
                    self._features.append(VGG16Feature(**feature))
            else:
                raise("unimplemented features")
        self._features = sorted(self._features, key=lambda x:x.min_cell_size)

        # calculate image sample size
        if cnn_feature_idx >= 0:
            self._img_sample_sz = self._features[cnn_feature_idx].init_size(self._img_sample_sz)
        else:
            cell_size = [x.min_cell_size for x in self._features]
            self._img_sample_sz = self._features[0].init_size(self._img_sample_sz, cell_size)

        for idx, feature in enumerate(self._features):
            if idx != cnn_feature_idx:
                feature.init_size(self._img_sample_sz)

        if config.use_projection_matrix:
            sample_dim = [ x for feature in self._features for x in feature._compressed_dim ]
        else:
            sample_dim = [ x for feature in self._features for x in feature.num_dim ]

        feature_dim = [ x for feature in self._features for x in feature.num_dim ]

        feature_sz = np.array([x for feature in self._features for x in feature.data_sz ], dtype=np.int32)

        # number of fourier coefficients to save for each filter layer, this will be an odd number
        filter_sz = feature_sz + (feature_sz + 1) % 2

        # the size of the label function DFT. equal to the maximum filter size
        self._k1 = np.argmax(filter_sz, axis=0)[0]
        self._output_sz = filter_sz[self._k1]

        self._num_feature_blocks = len(feature_dim)

        # get the remaining block indices
        self._block_inds = list(range(self._num_feature_blocks))
        self._block_inds.remove(self._k1)

        # how much each feature block has to be padded to the obtain output_sz
        self._pad_sz = [((self._output_sz - filter_sz_) / 2).astype(np.int32) for filter_sz_ in filter_sz]

        # compute the fourier series indices and their transposes
        self._ky = [np.arange(-np.ceil(sz[0]-1)/2, np.floor((sz[0]-1)/2)+1, dtype=np.float32)
                        for sz in filter_sz]
        self._kx = [np.arange(-np.ceil(sz[1]-1)/2, 1, dtype=np.float32)
                        for sz in filter_sz]

        # construct the gaussian label function using poisson formula
        sig_y = np.sqrt(np.prod(np.floor(self._base_target_sz))) * config.output_sigma_factor * (self._output_sz / self._img_sample_sz)
        yf_y = [np.sqrt(2 * np.pi) * sig_y[0] / self._output_sz[0] * np.exp(-2 * (np.pi * sig_y[0] * ky_ / self._output_sz[0])**2)
                    for ky_ in self._ky]
        yf_x = [np.sqrt(2 * np.pi) * sig_y[1] / self._output_sz[1] * np.exp(-2 * (np.pi * sig_y[1] * kx_ / self._output_sz[1])**2)
                    for kx_ in self._kx]
        self._yf = [yf_y_.reshape(-1, 1) * yf_x_ for yf_y_, yf_x_ in zip(yf_y, yf_x)]
        if config.use_gpu:
            self._yf = [cp.asarray(yf) for yf in self._yf]
            self._ky = [cp.asarray(ky) for ky in self._ky]
            self._kx = [cp.asarray(kx) for kx in self._kx]

        # construct cosine window
        self._cos_window = [self._cosine_window(feature_sz_) for feature_sz_ in feature_sz]

        # compute fourier series of interpolation function
        self._interp1_fs = []
        self._interp2_fs = []
        for sz in filter_sz:
            interp1_fs, interp2_fs = self._get_interp_fourier(sz)
            self._interp1_fs.append(interp1_fs)
            self._interp2_fs.append(interp2_fs)

        # get the reg_window_edge parameter
        reg_window_edge = []
        for feature in self._features:
            if hasattr(feature, 'reg_window_edge'):
                reg_window_edge.append(feature.reg_window_edge)
            else:
                reg_window_edge += [config.reg_window_edge for _ in range(len(feature.num_dim))]

        # construct spatial regularization filter
        self._reg_filter = [self._get_reg_filter(self._img_sample_sz, self._base_target_sz, reg_window_edge_)
                                for reg_window_edge_ in reg_window_edge]

        # compute the energy of the filter (used for preconditioner)
        if not config.use_gpu:
            self._reg_energy = [np.real(np.vdot(reg_filter.flatten(), reg_filter.flatten()))
                            for reg_filter in self._reg_filter]
        else:
            self._reg_energy = [cp.real(cp.vdot(reg_filter.flatten(), reg_filter.flatten()))
                            for reg_filter in self._reg_filter]

        if config.use_scale_filter:
            self._scale_filter = ScaleFilter(self._target_sz)
            self._num_scales = self._scale_filter.num_scales
            self._scale_step = self._scale_filter.scale_step
            self._scale_factor = self._scale_filter.scale_factors
        else:
            # use the translation filter to estimate the scale
            self._num_scales = config.number_of_scales
            self._scale_step = config.scale_step
            scale_exp = np.arange(-np.floor((self._num_scales-1)/2), np.ceil((self._num_scales-1)/2)+1)
            self._scale_factor = self._scale_step**scale_exp

        if self._num_scales > 0:
            # force reasonable scale changes
            self._min_scale_factor = self._scale_step ** np.ceil(np.log(np.max(5 / self._img_sample_sz)) / np.log(self._scale_step))
            self._max_scale_factor = self._scale_step ** np.floor(np.log(np.min(frame.shape[:2] / self._base_target_sz)) / np.log(self._scale_step))

        # set conjugate gradient options
        init_CG_opts = {'CG_use_FR': True,
                        'tol': 1e-6,
                        'CG_standard_alpha': True
                       }
        self._CG_opts = {'CG_use_FR': config.CG_use_FR,
                         'tol': 1e-6,
                         'CG_standard_alpha': config.CG_standard_alpha
                        }
        if config.CG_forgetting_rate == np.inf or config.learning_rate >= 1:
            self._CG_opts['init_forget_factor'] = 0.
        else:
            self._CG_opts['init_forget_factor'] = (1 - config.learning_rate) ** config.CG_forgetting_rate

        # init ana allocate
        self._gmm = GMM(self._num_samples)
        self._samplesf = [[]] * self._num_feature_blocks

        for i in range(self._num_feature_blocks):
            if not config.use_gpu:
                self._samplesf[i] = np.zeros((int(filter_sz[i, 0]), int((filter_sz[i, 1]+1)/2),
                    sample_dim[i], config.num_samples), dtype=np.complex64)
            else:
                self._samplesf[i] = cp.zeros((int(filter_sz[i, 0]), int((filter_sz[i, 1]+1)/2),
                    sample_dim[i], config.num_samples), dtype=cp.complex64)

        # allocate
        self._num_training_samples = 0

        # extract sample and init projection matrix
        sample_pos = mround(self._pos)
        sample_scale = self._current_scale_factor
        xl = [x for feature in self._features
                for x in feature.get_features(frame, sample_pos, self._img_sample_sz, self._current_scale_factor) ]  # get features

        if config.use_gpu:
            xl = [cp.asarray(x) for x in xl]

        xlw = [x * y for x, y in zip(xl, self._cos_window)]                                                          # do windowing
        xlf = [cfft2(x) for x in xlw]                                                                                # fourier series
        xlf = interpolate_dft(xlf, self._interp1_fs, self._interp2_fs)                                               # interpolate features,
        xlf = compact_fourier_coeff(xlf)                                                                             # new sample to be added
        shift_sample_ = 2 * np.pi * (self._pos - sample_pos) / (sample_scale * self._img_sample_sz)
        xlf = shift_sample(xlf, shift_sample_, self._kx, self._ky)
        self._proj_matrix = self._init_proj_matrix(xl, sample_dim, config.proj_init_method)
        xlf_proj = self._proj_sample(xlf, self._proj_matrix)
        merged_sample, new_sample, merged_sample_id, new_sample_id = \
            self._gmm.update_sample_space_model(self._samplesf, xlf_proj, self._num_training_samples)
        self._num_training_samples += 1

        if config.update_projection_matrix:
            for i in range(self._num_feature_blocks):
                self._samplesf[i][:, :, :, new_sample_id:new_sample_id+1] = new_sample[i]

        # train_tracker
        self._sample_energy = [xp.real(x * xp.conj(x)) for x in xlf_proj]

        # init conjugate gradient param
        self._CG_state = None
        if config.update_projection_matrix:
            init_CG_opts['maxit'] = np.ceil(config.init_CG_iter / config.init_GN_iter)
            self._hf = [[[]] * self._num_feature_blocks for _ in range(2)]
            feature_dim_sum = float(np.sum(feature_dim))
            proj_energy = [2 * xp.sum(xp.abs(yf_.flatten())**2) / feature_dim_sum * xp.ones_like(P)
                    for P, yf_ in zip(self._proj_matrix, self._yf)]
        else:
            self._CG_opts['maxit'] = config.init_CG_iter
            self._hf = [[[]] * self._num_feature_blocks]

        # init the filter with zeros
        for i in range(self._num_feature_blocks):
            self._hf[0][i] = xp.zeros((int(filter_sz[i, 0]), int((filter_sz[i, 1]+1)/2),
                int(sample_dim[i]), 1), dtype=xp.complex64)

        if config.update_projection_matrix:
            # init Gauss-Newton optimization of the filter and projection matrix
            self._hf, self._proj_matrix = train_joint(
                                                  self._hf,
                                                  self._proj_matrix,
                                                  xlf,
                                                  self._yf,
                                                  self._reg_filter,
                                                  self._sample_energy,
                                                  self._reg_energy,
                                                  proj_energy,
                                                  init_CG_opts)
            # re-project and insert training sample
            xlf_proj = self._proj_sample(xlf, self._proj_matrix)
            # self._sample_energy = [np.real(x * np.conj(x)) for x in xlf_proj]
            for i in range(self._num_feature_blocks):
                self._samplesf[i][:, :, :, 0:1] = xlf_proj[i]

            # udpate the gram matrix since the sample has changed
            if config.distance_matrix_update_type == 'exact':
                # find the norm of the reprojected sample
                new_train_sample_norm = 0.
                for i in range(self._num_feature_blocks):
                    new_train_sample_norm += 2 * xp.real(xp.vdot(xlf_proj[i].flatten(), xlf_proj[i].flatten()))
                self._gmm._gram_matrix[0, 0] = new_train_sample_norm
        self._hf_full = full_fourier_coeff(self._hf)

        if config.use_scale_filter and self._num_scales > 0:
            self._scale_filter.update(frame, self._pos, self._base_target_sz, self._current_scale_factor)
        self._frame_num += 1

    def update(self, frame, train=True, vis=False):
        # target localization step
        xp = cp if config.use_gpu else np
        pos = self._pos
        old_pos = np.zeros((2))
        for _ in range(config.refinement_iterations):
            # if np.any(old_pos != pos):
            if not np.allclose(old_pos, pos):
                old_pos = pos.copy()
                # extract fatures at multiple resolutions
                sample_pos = mround(pos)
                sample_scale = self._current_scale_factor * self._scale_factor
                xt = [x for feature in self._features
                        for x in feature.get_features(frame, sample_pos, self._img_sample_sz, sample_scale) ]  # get features
                if config.use_gpu:
                    xt = [cp.asarray(x) for x in xt]
                xt_proj = self._proj_sample(xt, self._proj_matrix)                                             # project sample
                xt_proj = [feat_map_ * cos_window_
                        for feat_map_, cos_window_ in zip(xt_proj, self._cos_window)]                          # do windowing
                xtf_proj = [cfft2(x) for x in xt_proj]                                                         # compute the fourier series
                xtf_proj = interpolate_dft(xtf_proj, self._interp1_fs, self._interp2_fs)                       # interpolate features to continuous domain

                # compute convolution for each feature block in the fourier domain, then sum over blocks
                scores_fs_feat = [[]] * self._num_feature_blocks
                scores_fs_feat[self._k1] = xp.sum(self._hf_full[self._k1] * xtf_proj[self._k1], 2)
                scores_fs = scores_fs_feat[self._k1]

                # scores_fs_sum shape: height x width x num_scale
                for i in self._block_inds:
                    scores_fs_feat[i] = xp.sum(self._hf_full[i] * xtf_proj[i], 2)
                    scores_fs[self._pad_sz[i][0]:self._output_sz[0]-self._pad_sz[i][0],
                              self._pad_sz[i][1]:self._output_sz[0]-self._pad_sz[i][1]] += scores_fs_feat[i]

                # optimize the continuous score function with newton's method.
                trans_row, trans_col, scale_idx = optimize_score(scores_fs, config.newton_iterations)

                # show score
                if vis:
                    if config.use_gpu:
                       xp = cp
                    self.score = xp.fft.fftshift(sample_fs(scores_fs[:,:,scale_idx],
                            tuple((10*self._output_sz).astype(np.uint32))))
                    if config.use_gpu:
                       self.score = cp.asnumpy(self.score)
                    self.crop_size = self._img_sample_sz * self._current_scale_factor

                # compute the translation vector in pixel-coordinates and round to the cloest integer pixel
                translation_vec = np.array([trans_row, trans_col]) * (self._img_sample_sz / self._output_sz) * \
                                    self._current_scale_factor * self._scale_factor[scale_idx]
                scale_change_factor = self._scale_factor[scale_idx]

                # udpate position
                pos = sample_pos + translation_vec

                if config.clamp_position:
                    pos = np.maximum(np.array(0, 0), np.minimum(np.array(frame.shape[:2]), pos))

                # do scale tracking with scale filter
                if self._num_scales > 0 and config.use_scale_filter:
                    scale_change_factor = self._scale_filter.track(frame, pos, self._base_target_sz,
                           self._current_scale_factor)

                # udpate the scale
                self._current_scale_factor *= scale_change_factor

                # adjust to make sure we are not to large or to small
                if self._current_scale_factor < self._min_scale_factor:
                    self._current_scale_factor = self._min_scale_factor
                elif self._current_scale_factor > self._max_scale_factor:
                    self._current_scale_factor = self._max_scale_factor

        # model udpate step
        if config.learning_rate > 0:
            # use the sample that was used for detection
            sample_scale = sample_scale[scale_idx]
            xlf_proj = [xf[:, :(xf.shape[1]+1)//2, :, scale_idx:scale_idx+1] for xf in xtf_proj]

            # shift the sample so that the target is centered
            shift_sample_ = 2 * np.pi * (pos - sample_pos) / (sample_scale * self._img_sample_sz)
            xlf_proj = shift_sample(xlf_proj, shift_sample_, self._kx, self._ky)

        # update the samplesf to include the new sample. The distance matrix, kernel matrix and prior weight are also updated
        merged_sample, new_sample, merged_sample_id, new_sample_id = \
                self._gmm.update_sample_space_model(self._samplesf, xlf_proj, self._num_training_samples)

        if self._num_training_samples < self._num_samples:
            self._num_training_samples += 1

        if config.learning_rate > 0:
            for i in range(self._num_feature_blocks):
                if merged_sample_id >= 0:
                    self._samplesf[i][:, :, :, merged_sample_id:merged_sample_id+1] = merged_sample[i]
                if new_sample_id >= 0:
                    self._samplesf[i][:, :, :, new_sample_id:new_sample_id+1] = new_sample[i]

        # training filter
        if self._frame_num < config.skip_after_frame or \
                self._frames_since_last_train >= config.train_gap:
            # print("Train filter: ", self._frame_num)
            new_sample_energy = [xp.real(xlf * xp.conj(xlf)) for xlf in xlf_proj]
            self._CG_opts['maxit'] = config.CG_iter
            self._sample_energy = [(1 - config.learning_rate)*se + config.learning_rate*nse
                                for se, nse in zip(self._sample_energy, new_sample_energy)]

            # do conjugate gradient optimization of the filter
            self._hf, self._CG_state = train_filter(
                                                 self._hf,
                                                 self._samplesf,
                                                 self._yf,
                                                 self._reg_filter,
                                                 self._gmm.prior_weights,
                                                 self._sample_energy,
                                                 self._reg_energy,
                                                 self._CG_opts,
                                                 self._CG_state)
            # reconstruct the ful fourier series
            self._hf_full = full_fourier_coeff(self._hf)
            self._frames_since_last_train = 0
        else:
            self._frames_since_last_train += 1
        if config.use_scale_filter:
            self._scale_filter.update(frame, pos, self._base_target_sz, self._current_scale_factor)

        # udpate the target size
        self._target_sz = self._base_target_sz * self._current_scale_factor

        # save position and calculate fps
        bbox = (pos[1] - self._target_sz[1]/2, # xmin
                pos[0] - self._target_sz[0]/2, # ymin
                pos[1] + self._target_sz[1]/2, # xmax
                pos[0] + self._target_sz[0]/2) # ymax
        self._pos = pos
        self._frame_num += 1
        return bbox