# coding=utf-8
'''
 * Copyright 2019 Sipeed Inc.
 * Copyright 2018 Canaan Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

import math
import numpy as np
from tensorlayer import logging

from .k210_constants import *
from dissect import cstruct 

import sys
sys.path.append("..")
from ..edge_constants import *
from ..edge_layer import *

# Fallback sub-op arguments used by gen_layer_struct() when a layer is
# missing the corresponding conv / act / bn / pool component.
default_conv_arg = None
default_act_arg = None
# Batch-norm bypass: do not load parameters; table address 0.
default_bn_arg = {
    'load_para': 0,
    'bwsx_base_addr': 0
}
default_pool_arg = {
    'pool_type': 0,  # bypass
}

###############################################################################
def signed_to_hex(value, width):
    """Return *value* as a two's-complement hex string for a *width*-bit field."""
    modulus = 1 << width
    return hex(int(round(value + modulus)) % modulus)


def min_max_to_scale_bias(minv, maxv):
    """Map a float range [minv, maxv] onto uint8: value = q * scale + bias."""
    return (maxv - minv) / 255, minv

#def signed_to_hex(value, width):
#    if isinstance(value, np.ndarray):
#        value = value.tolist()
#    return hex(int(round((1 << width) + value)) % (1 << width))

def pow_next_log_of_2_no_round(value, bound_shift, shift_max_shift=4):
    """Express value as mul >> ret, with mul scaled to (bound_shift-1) bits.

    shift_max_shift is unused; the parameter is kept so the signature
    matches pow_next_log_of_2().
    """
    mantissa, exponent = np.frexp(np.abs(value))    # |value| = mantissa * 2**exponent
    ret = bound_shift - 1 - exponent                # shift that fills bound_shift bits
    mul = np.sign(value) * mantissa * np.power(2, bound_shift - 1)
    # value == mul >> ret
    return ret, mul

def pow_next_log_of_2(value, bound_shift, shift_max_shift=4):
    """Scale value up by doubling until it nears the bound_shift-bit limit.

    Returns (shr, scaled_value) with value == scaled_value >> shr; shr is
    capped at (1 << shift_max_shift) - 1.
    """
    lower = -(1 << (bound_shift - 2))
    upper = 1 << (bound_shift - 2)
    max_shr = (1 << shift_max_shift) - 1
    shr = 0
    while lower <= value < upper and value != 0 and shr < max_shr:
        value *= 2
        shr += 1
    return shr, value
    
def pow_next_log_of_2_with_maxshift(value, bound_shift, shift_max):
    """Like pow_next_log_of_2(), but the shift cap is given directly.

    Returns (shr, scaled_value) with value == scaled_value >> shr and
    shr < shift_max.
    """
    lower = -(1 << (bound_shift - 2))
    upper = 1 << (bound_shift - 2)
    shr = 0
    while lower <= value < upper and value != 0 and shr < shift_max - 1:
        value *= 2
        shr += 1
    return shr, value
    
def align_4(oft):
    """Round oft up to the next multiple of 4; return [aligned, pad_len]."""
    aligned = ((oft + 3) // 4) * 4
    return [aligned, aligned - oft]

def align_8(oft):
    """Round oft up to the next multiple of 8; return [aligned, pad_len]."""
    aligned = ((oft + 7) // 8) * 8
    return [aligned, aligned - oft]

def align_128(oft):
    """Round oft up to the next multiple of 128; return [aligned, pad_len]."""
    aligned = ((oft + 127) // 128) * 128
    return [aligned, aligned - oft]
###############################################################################
def gen_layer_struct(elayer, idx):
    """Build the KPU register-field structure for one conv layer.

    elayer: layer object exposing .conv/.bn/.act/.pool sub-ops (each may be
            None) plus .to_kmodel_io_param(); each sub-op hands its
            quantization scale on to the next one in the chain.
    idx:    layer index; its parity ping-pongs input/output between the two
            ends of the KPU image RAM.

    Returns (register_dict, (output_scale, output_bias)).
    """
    reserved = 0
    set_to_zero = 0
    img_ram_size = 2 * 1024 * 1024  # KPU on-chip image RAM: 2 MiB

    # we do following ops in order, every ops give scale to next step
    # conv ops scale weight and x, so give swsx to next step
    conv_arg = elayer.conv and elayer.conv.to_kmodel_param() or default_conv_arg
    # bn ops scale b, so give swsxsb to next step
    bn_arg = elayer.bn and elayer.bn.to_kmodel_param(conv_arg['swsx'], conv_arg['scale_w_fix'], ) or default_bn_arg
    act_arg = elayer.act and elayer.act.to_kmodel_param(bn_arg['post_scale']) or default_act_arg
    pool_arg = elayer.pool and elayer.pool.to_kmodel_param() or default_pool_arg
    io_arg = elayer.to_kmodel_io_param()

    # Dequantization (scale, bias) of this layer's output range.
    mino, maxo = elayer.act.min_y, elayer.act.max_y
    output_scale, output_bias = min_max_to_scale_bias(mino, maxo)

    # Input and output feature maps must fit together in image RAM.
    img_input_size = int(math.ceil(io_arg['i_ch_num'] / conv_arg['coef_group']) * 64 * conv_arg['channel_switch_addr'])
    img_output_size = int(math.ceil(io_arg['o_ch_num'] / io_arg['wb_group']) * 64 * io_arg['wb_channel_switch_addr'])

    assert (img_input_size + img_output_size <= img_ram_size)

    logging.debug("----KPU Register Config Info----")
    interrupt_enabe = {
        'int_en': set_to_zero,
        'ram_flag': reserved,
        'full_add': set_to_zero,
        'depth_wise_layer': conv_arg['depth_wise_layer']
    }
    logging.debug("       {}".format(interrupt_enabe))
    # Ping-pong addressing: even layers read from the bottom of image RAM
    # and write to the top, odd layers the reverse. Addresses are stored in
    # 64-byte units.
    image_addr = {
        'image_src_addr': hex(int((0 if not idx & 1 else (img_ram_size - img_input_size)) / 64)),
        'image_dst_addr': hex(int((0 if idx & 1 else (img_ram_size - img_output_size)) / 64))
    }
    logging.debug("       {}".format(image_addr))
    # Channel counts and sizes are stored minus one, per the register encoding.
    image_channel_num = {
        'i_ch_num': hex(io_arg['i_ch_num'] - 1),
        'o_ch_num': hex(io_arg['o_ch_num'] - 1),
        'o_ch_num_coef': hex(conv_arg['o_ch_num_coef'] - 1),
    }
    logging.debug("       {}".format(image_channel_num))
    image_size = {
        'i_row_wid': hex(conv_arg['i_row_wid'] - 1),
        'i_col_high': hex(conv_arg['i_col_high'] - 1),
        'o_row_wid': hex(io_arg['o_row_wid'] - 1),
        'o_col_high': hex(io_arg['o_col_high'] - 1),
    }
    logging.debug("       {}".format(image_size))
    kernel_pool_type_cfg = {
        'kernel_type': conv_arg['kernel_type'],
        'pad_type': conv_arg['pad_type'],
        'pool_type': pool_arg['pool_type'],
        'first_stride': conv_arg['first_stride'],
        'bypass_conv': 0 if elayer.conv else 1,
        'load_para': bn_arg['load_para'],
        'dma_burst_size': io_arg['dma_burst_size'],
        'pad_value': signed_to_hex(conv_arg['pad_value'], 8),
        'bwsx_base_addr': bn_arg['bwsx_base_addr'],
    }

    # The *_print copies replace bulky table fields with a placeholder so
    # the debug log stays readable.
    kernel_pool_type_cfg_print = {
        'kernel_type': conv_arg['kernel_type'],
        'pad_type': conv_arg['pad_type'],
        'pool_type': pool_arg['pool_type'],
        'first_stride': conv_arg['first_stride'],
        'bypass_conv': 0 if elayer.conv else 1,
        'load_para': bn_arg['load_para'],
        'dma_burst_size': io_arg['dma_burst_size'],
        'pad_value': signed_to_hex(conv_arg['pad_value'], 8),
        'bwsx_base_addr': 'too many content',
    }
    logging.debug("       {}".format(kernel_pool_type_cfg_print))
    kernel_load_cfg = {
        'load_coor': conv_arg['load_coor'],
        'load_time': conv_arg['load_time'] - 1,
        'para_size': conv_arg['para_size'],
        'para_start_addr': conv_arg['para_start_addr'],
    }

    kernel_load_cfg_print = {
        'load_coor': conv_arg['load_coor'],
        'load_time': conv_arg['load_time'] - 1,
        'para_size': conv_arg['para_size'],
        'para_start_addr': 'too many content',
    }
    logging.debug("       {}".format(kernel_load_cfg_print))
    kernel_offset = {
        'coef_column_offset': set_to_zero,
        'coef_row_offset': set_to_zero,
    }
    logging.debug("       {}".format(kernel_offset))
    kernel_calc_type_cfg = {
        'channel_switch_addr': hex(conv_arg['channel_switch_addr']),
        'row_switch_addr': hex(conv_arg['row_switch_addr']),
        'coef_size': reserved,
        'coef_group': conv_arg['coef_group'],
        'load_act': 1 if elayer.act else 0,
        'active_addr': act_arg['active_addr']
    }

    kernel_calc_type_cfg_print = {
        'channel_switch_addr': hex(conv_arg['channel_switch_addr']),
        'row_switch_addr': hex(conv_arg['row_switch_addr']),
        'coef_size': reserved,
        'coef_group': conv_arg['coef_group'],
        'load_act': 1 if elayer.act else 0,
        'active_addr': 'too many content'
    }
    logging.debug("       {}".format(kernel_calc_type_cfg_print))
    write_back_cfg = {
        'wb_channel_switch_addr': hex(io_arg['wb_channel_switch_addr']),
        'wb_row_switch_addr': hex(io_arg['wb_row_switch_addr']),
        'wb_group': io_arg['wb_group']
    }
    logging.debug("       {}".format(write_back_cfg))
    conv_value = {
        'shr_w': conv_arg['shr_w'],
        'shr_x': conv_arg['shr_x'],
        'arg_w': signed_to_hex(conv_arg['arg_w'], 24),
        'arg_x': signed_to_hex(conv_arg['arg_x'], 24),
    }
    logging.debug("       {}".format(conv_value))
    conv_value2 = {
        'arg_add': int(round(conv_arg['arg_add'])),
    }
    logging.debug("       {}".format(conv_value2))
    dma_parameter = {
        'send_data_out': io_arg['send_data_out'],
        'channel_byte_num': io_arg['channel_byte_num'] - 1,
        'dma_total_byte': io_arg['dma_total_byte'] - 1,
    }
    logging.debug("       {}".format(dma_parameter))

    return {
        'interrupt_enabe': interrupt_enabe,
        'image_addr': image_addr,
        'image_channel_num': image_channel_num,
        'image_size': image_size,
        'kernel_pool_type_cfg': kernel_pool_type_cfg,
        'kernel_load_cfg': kernel_load_cfg,
        'kernel_offset': kernel_offset,
        'kernel_calc_type_cfg': kernel_calc_type_cfg,
        'write_back_cfg': write_back_cfg,
        'conv_value': conv_value,
        'conv_value2': conv_value2,
        'dma_parameter': dma_parameter
    }, (output_scale, output_bias)

def gen_layer_code(elayer_struct, layer_cfg):
    """Pack the register dict from gen_layer_struct() into 64-bit words.

    Field bit positions come from kpu_layer_config_field_offset and each
    register's 8-byte slot from kpu_layer_config_reg_offset; the packed
    little-endian bytes are stored in layer_cfg.reg_arg.
    """
    regs = elayer_struct[0]
    layer_cfg.reg_arg = bytearray(8 * len(regs))
    for reg_name, fields in regs.items():
        word = 0
        for field_name, field_value in fields.items():
            # Hex strings ('0x..') and decimal strings are parsed; any
            # non-int, non-str value (e.g. a table already emitted
            # elsewhere) contributes 0.
            if isinstance(field_value, str):
                field_value = int(field_value, 16) if '0x' in field_value else int(field_value)
            elif not isinstance(field_value, int):
                field_value = 0
            word |= field_value << kpu_layer_config_field_offset[reg_name][field_name]
        word &= 0xffffffffffffffff
        base = kpu_layer_config_reg_offset[reg_name] * 8
        layer_cfg.reg_arg[base:base + 8] = word.to_bytes(8, 'little')


def gen_bn_code(elayer_struct, layer_cfg):
    """Serialize the per-channel batch-norm table into layer_cfg.bn_arg.

    Each entry packs norm_mul (bits 0-23), norm_add (bits 24-55) and
    norm_shift (bits 56+) into one little-endian 64-bit word.
    """
    entries = elayer_struct[0]['kernel_pool_type_cfg']['bwsx_base_addr']
    layer_cfg.bn_len = 8 * len(entries)
    packed = bytearray()
    for entry in entries:
        word = int(entry['norm_mul'], 16) \
            + (int(entry['norm_add'], 16) << 24) \
            + (int(entry['norm_shift']) << 56)
        packed += word.to_bytes(8, 'little')
    layer_cfg.bn_arg = packed

def gen_act_code(elayer_struct, layer_cfg):
    """Serialize the 16-entry activation table into layer_cfg.act_arg.

    Appends one 64-bit word per segment (dxs | dy << 8 | x << 24, with x in
    36-bit two's complement) followed by two words holding the 16 per-entry
    y biases, one byte each.
    """
    table = elayer_struct[0]['kernel_calc_type_cfg']['active_addr']
    for entry in table:
        packed = int(entry['dxs']) \
            + (int(entry['dy']) << 8) \
            + (int(signed_to_hex(entry['x'], 36), 16) << 24)
        layer_cfg.act_arg += packed.to_bytes(8, 'little')
    biases = [int(entry['y']) for entry in table]
    lo_word = sum(biases[i] << (8 * i) for i in range(8))
    hi_word = sum(biases[i + 8] << (8 * i) for i in range(8))
    layer_cfg.act_arg += lo_word.to_bytes(8, 'little')
    layer_cfg.act_arg += hi_word.to_bytes(8, 'little')

def gen_weights_code(elayer_struct, layer_cfg, eight_bit_mode):
    """Serialize quantized conv weights into layer_cfg.weights_arg.

    Each weight is written as a little-endian two's-complement integer:
    1 byte in eight_bit_mode, otherwise 2 bytes.
    """
    weights = elayer_struct[0]['kernel_load_cfg']['para_start_addr']
    item_size = 1 if eight_bit_mode else 2
    width = 8 * item_size
    layer_cfg.weights_len = item_size * len(weights)
    buf = bytearray()
    for w in weights:
        buf += int(signed_to_hex(w, width), 16).to_bytes(item_size, 'little')
    layer_cfg.weights_arg = buf


class layer_config_struct():
    """Byte blobs and offsets for one serialized KPU layer."""

    def __init__(self):
        # KPU register block
        self.reg_addr_offset = 0
        self.reg_arg = b''
        # activation table
        self.act_addr_offset = 0
        self.act_arg = b''
        # batch-norm table
        self.bn_addr_offset = 0
        self.bn_len = 0
        self.bn_arg = b''
        # quantized conv weights
        self.weights_addr_offset = 0
        self.weights_len = 0
        self.weights_arg = b''
    
    
###############################################################################
class K210Conv:
    """Quantizer for one KPU convolution.

    Maps the float weight and input ranges onto the KPU's fixed-point
    representation and emits the argument dict consumed by
    gen_layer_struct().
    """

    def __init__(self, weights, depth_wise_layer, eight_bit_mode, xy_shape, xw_minmax, quant_func):
        """
        weights:          float conv weights (HWIO; transposed when emitted).
        depth_wise_layer: True for a depthwise convolution.
        eight_bit_mode:   quantize weights to 8 bits (else 16 bits).
        xy_shape:         (input_shape, output_shape), NHWC.
        xw_minmax:        (xmin, xmax, wmin, wmax); wmin/wmax may be scalars
                          (layer-wise) or per-channel arrays (channel-wise).
        quant_func:       external quantization helper (stored, not used here).

        Raises ValueError when the input height is below the hardware
        minimum of 4 rows.
        """
        self.weights = weights
        self.weights_shape = self.weights.shape
        self.input_shape, self.output_shape = xy_shape
        xmin, xmax, wmin, wmax = xw_minmax
        self.stride = 1
        self.depth_wise_layer = depth_wise_layer
        self.eight_bit_mode = eight_bit_mode
        self.quant_func = quant_func

        self.wmax = wmax
        self.wmin = wmin
        self.x_range = xmax - xmin
        self.x_bias = xmin
        if self.x_range == 0:
            self.x_range = 0.00001  # guard later divisions

        if len(wmin.shape) == 0:
            # Scalar min/max: layer-wise quantization.
            self.is_chwise = False
            self.w_range = wmax - wmin
            self.w_bias = wmin
            if self.w_range == 0:
                self.w_range = 0.00001
            self.w_range_all = wmax - wmin
            self.w_bias_all = wmin
        else:
            # Per-channel min/max arrays: channel-wise quantization.
            self.is_chwise = True
            self.w_range_all = max(wmax) - min(wmin)
            self.w_bias_all = min(wmin)
            self.w_range = wmax - wmin
            self.w_bias = wmin
            if self.w_range_all == 0:
                self.w_range_all = 0.00001
            # Bug fix: the old `for _range in self.w_range: _range = 0.00001`
            # loop only rebound the loop variable and never changed the
            # array; patch zero ranges for real so later divisions are safe.
            self.w_range = np.where(self.w_range == 0, 0.00001, self.w_range)

        if self.input_shape[1] < 4:
            # The KPU cannot process feature maps shorter than 4 rows; tell
            # the user to pad on the CPU side, then abort.
            tensor_height = self.input_shape[1]
            logging.info('[error] feature map required height>4 which this layer height is {}' \
                  .format(tensor_height))
            self.input_shape = list(self.input_shape)
            self.output_shape = list(self.output_shape)
            old_input_wh = self.input_shape[1:3]
            old_output_wh = self.output_shape[1:3]
            self.input_shape[1:3] = [4, 4]
            self.output_shape[1:3] = [4, 4]
            notice = 'this layer heigh-width MUST padding from {}x{}=>{}x{} to 4x4=>4x4 in CPU before continue.' \
                .format(*old_input_wh, *old_output_wh)
            logging.info('[notice] ' + ('=' * 71))
            logging.info('[notice] ' + notice)
            logging.info('[notice] ' + ('=' * 71))
            raise ValueError('conv height must > 4' )

    @staticmethod
    def q(value, scale, bias):
        """Quantize a float value: q = (value - bias) / scale."""
        return (value - bias) / scale

    def para_mult_loads(self, weights_shape, output_shape, kernel_size):
        """Plan how the weights are split across KPU weight-buffer loads.

        Returns (load_time, para_size, o_ch_num_coef): the number of loads,
        bytes per load, and output channels handled per load.
        """
        weight_buffer_size = 2 * 9 * 4096  # KPU coefficient buffer, bytes
        weights_ich = int(weights_shape[2])
        weights_och = int(weights_shape[3])
        weight_data_size = 1 if self.eight_bit_mode else 2

        # Bytes of weights belonging to one output channel.
        if self.depth_wise_layer:
            o_ch_weights_size = int(weights_shape[0]) * int(weights_shape[1]) * weight_data_size
        else:
            o_ch_weights_size = int(weights_shape[0]) * int(weights_shape[1]) * int(weights_shape[2]) * weight_data_size

        # 1x1 kernels are padded out to the buffer's 9-byte granules.
        if int(weights_shape[0]) == 1:
            o_ch_weights_size_pad = math.ceil(o_ch_weights_size / 8) * 9
        else:
            o_ch_weights_size_pad = o_ch_weights_size
            assert (int(weights_shape[0]) == 3)

        if kernel_size == 3:
            load_time = math.ceil(weights_och / math.floor(4096 * 2 / weight_data_size / weights_ich))
        elif kernel_size == 1:
            load_time = math.ceil(weights_och / math.floor(4096 * 8 * 2 / weight_data_size / weights_ich))
        else:
            load_time = None
            assert (None)  # only 1x1 and 3x3 kernels are supported

        o_ch_num = int(output_shape[3])
        o_ch_num_coef = math.floor(weight_buffer_size / o_ch_weights_size_pad)

        if self.eight_bit_mode:
            # In 8-bit mode the final burst must end inside the first half
            # of the weight buffer; shrink channels-per-load until it does.
            half_weight_buffer_size = weight_buffer_size / 2
            while True:
                last_ch_idx = (o_ch_num - 1) % o_ch_num_coef
                last_addr_end = (last_ch_idx + 1) * o_ch_weights_size_pad
                if last_addr_end < half_weight_buffer_size:
                    break

                o_ch_num_coef = o_ch_num_coef - 1
                if o_ch_num_coef <= 0:
                    # Bug fix: this was `assert ('cannot fix ...')`, which is
                    # always true and never fired; the division below then
                    # raised ZeroDivisionError instead.
                    raise ValueError('cannot fix last_addr_end to first half part')
                load_time = math.ceil(o_ch_num / o_ch_num_coef)

        assert (load_time <= 64)

        o_ch_num_coef = min(o_ch_num_coef, o_ch_num)
        para_size = o_ch_num_coef * o_ch_weights_size
        return load_time, para_size, o_ch_num_coef

    def to_kmodel_param(self):
        """Emit the conv-stage argument dict (register fields + weights).

        Returns the dict read by gen_layer_struct(); 'swsx' and
        'scale_w_fix' are the scales handed to the BN stage.
        """
        input_shape = self.input_shape
        output_shape = self.output_shape
        weights_shape = self.weights_shape
        weights = self.weights.transpose([3, 2, 0, 1])  # HWIO -> OIHW
        stride = self.stride

        weight_data_size = 1 if self.eight_bit_mode else 2
        kernel_size = int(weights_shape[0])

        # img i
        i_row_wid = int(input_shape[2])
        i_col_high = int(input_shape[1])
        coef_group = 1 if i_row_wid > 32 else (2 if i_row_wid > 16 else 4)
        row_switch_addr = math.ceil(i_row_wid / 64)
        channel_switch_addr = i_col_high * row_switch_addr
        # conv
        depth_wise_layer = 1 if self.depth_wise_layer else 0
        kernel_type = {1: 0, 3: 1}[kernel_size]
        pad_type = 0
        load_coor = 1

        first_stride = 0 if stride == 1 else 1
        assert (256 >= (i_col_high if first_stride == 0 else i_col_high / 2))

        load_time, para_size, o_ch_num_coef = self.para_mult_loads(weights_shape, output_shape, kernel_size)

        x_qmax = 255
        w_qmax = (1 << (8 * weight_data_size)) - 1

        # scale channel weight to full range first
        if self.is_chwise:
            logging.debug("---- channel wise scale ----")
            wmax_all = max(self.wmax)
            wmin_all = min(self.wmin)
            scale_w  = np.zeros(weights.shape[0])
            for i in range(weights.shape[0]):
                s1 = wmax_all / self.wmax[i]
                s2 = wmin_all / self.wmin[i]
                # Largest factor that keeps this channel inside the global
                # range; a negative ratio means that bound sits on the other
                # side of zero, so use the other candidate.
                s = max(s1, s2) if (s1 < 0 or s2 < 0) else min(s1, s2)
                if s <= 0:
                    raise ValueError("channel wise scale error!")
                weights[i] *= s
                scale_w[i] = s
            # TODO: use quant_func
            wmin_all = weights.min()
            wmax_all = weights.max()
            self.w_range_all = wmax_all - wmin_all
            self.w_bias_all = wmin_all
            scale_w_fix = 1/scale_w  # inverse factors, undone in BN stage
        else:
            scale_w_fix = 1

        bias_x, scale_x = self.x_bias, self.x_range / x_qmax
        bias_w_all, scale_w_all = self.w_bias_all, self.w_range_all / w_qmax

        bx_div_sx = bias_x / scale_x
        bw_div_sw_all = bias_w_all / scale_w_all

        shr_x, arg_x = pow_next_log_of_2(bw_div_sw_all, 24) #bw_div_sw = arg_x >> shr_x
        shr_w, arg_w = pow_next_log_of_2(bx_div_sx, 24) #bx_div_sx = arg_w >> shr_w
        arg_add = kernel_size * kernel_size * bw_div_sw_all * bx_div_sx
        pad_value = 0 -bx_div_sx
        swsx = scale_w_all * scale_x

        logging.debug("---- Doing conv quant  x = xq*scale_x + bias_x ----")
        logging.debug("quant X: bias %f, range %f ---> bias %f, scale %f"% \
            (self.x_bias, self.x_range, bias_x, scale_x))
        logging.debug("quant W: bias %f, range %f ---> bias %f, scale %f"% \
            (self.w_bias_all, self.w_range_all, bias_w_all, scale_w_all))

        weight_q = ((weights - bias_w_all) / scale_w_all)
        # Bug fix: np.product was removed in NumPy 2.0; np.prod is the
        # canonical spelling (identical semantics).
        para_start_addr = [int(round(item)) for item in np.reshape(weight_q, (np.prod(weight_q.shape),))]

        return {
            'swsx': swsx,
            'coef_group': coef_group,
            'channel_switch_addr': channel_switch_addr,
            'depth_wise_layer': depth_wise_layer,
            'o_ch_num_coef': o_ch_num_coef,
            'i_row_wid': i_row_wid,
            'i_col_high': i_col_high,
            'kernel_type': kernel_type,
            'pad_type': pad_type,
            'first_stride': first_stride,
            'pad_value': pad_value,
            'load_coor': load_coor,
            'load_time': load_time,
            'para_size': para_size,
            'para_start_addr': para_start_addr,
            'row_switch_addr': row_switch_addr,
            'shr_w': shr_w,
            'shr_x': shr_x,
            'arg_w': arg_w,
            'arg_x': arg_x,
            'arg_add': arg_add, 
            'scale_w_fix': scale_w_fix
        }


class K210BN:
    """Folds batch-norm into the KPU per-channel (mul, shift, add) table."""

    def __init__(self, mean, var, gamma, beta, epsilon, eight_bit_mode):
        # Raw BN statistics/parameters; folding happens in to_kmodel_param().
        self.mean = mean
        self.var = var
        self.gamma = gamma
        self.beta = beta
        self.epsilon = epsilon
        self.eight_bit_mode = eight_bit_mode

    @staticmethod
    def get_bn(scale, shift, bias):
        """Pack one channel's scale/shift/bias into hex register fields."""
        norm_shift, norm_mul = shift, scale
        return {
            'norm_mul': signed_to_hex(norm_mul, 24),
            'norm_add': signed_to_hex(bias, 32),
            'norm_shift': norm_shift
        }

    def to_kmodel_param(self, swsx=1, scale_w_fix=1):
        """Quantize folded BN into per-channel fixed-point table entries.

        swsx:        combined weight*input scale handed on by the conv stage.
        scale_w_fix: per-channel weight rescale factors; a plain int (1)
                     means layer-wise quantization was used.

        Returns locals() -- callers index the result like a dict; the keys
        consumed downstream are 'bwsx_base_addr', 'post_scale' and
        'load_para'.
        """
        # Fold BN into y = x * scale + bias.
        rsqrt_var = 1.0 / np.sqrt(self.var + self.epsilon)
        scale = self.gamma * rsqrt_var  #8.775
        bias = self.beta - self.gamma * self.mean * rsqrt_var

        # todo: rewrite this, make max_abs mul is +-(1<<N)
        # now we need convert bias from float to 36bit int
        bmax = max(abs(np.min(scale)), abs(np.max(scale)))
        brange = bmax
        sb = brange / 255
        if np.min(scale) == np.max(scale):
            sb = 1  # constant scale: no per-channel spread to normalize
        swsxsb = swsx * sb
        load_para = 1

        # Fixed post-shift applied downstream by the activation stage.
        act_shift = 10
        post_scale = np.power(2, act_shift)
        if type(scale_w_fix) == int: #not channel wise
            swsxsb = swsxsb*scale_w_fix*post_scale
            out_shift, out_mul = pow_next_log_of_2_with_maxshift(swsxsb, 22, 15+1) # out_mul>>out_shift
            bn_shift = np.ones(len(bias)) *(out_shift)
            scale = (scale / sb * out_mul).round().astype('int32')
            bias = (bias * post_scale ).round().astype('int32')
        else:   #channel wise
            # scale_w_fix is a per-channel array here; quantize each channel
            # with its own shift/multiplier.
            swsxsb = swsxsb * scale_w_fix * post_scale
            #logging.debug("{}, {}".format(swsxsb, scale_w_fix))
            bn_shift = np.ones(len(bias))
            for i in range(len(bias)):
                out_shift, out_mul = pow_next_log_of_2_with_maxshift(swsxsb[i], 22, 15+1)
                bn_shift[i] = out_shift
                scale[i] = (scale[i] / sb * out_mul).round().astype('int32')
                bias[i] = (bias[i] * post_scale ).round().astype('int32')
                #logging.debug("ch %d: swsxsb=%f, shift=%d, mul=%f, bn_shift=%d,scale(17bit)=0x%x, bias=0x%x"%(i,swsxsb[i],out_shift, out_mul, bn_shift[i] , int(scale[i]), int(bias[i])))

        bwsx_base_addr = [
            self.get_bn(s, shift, b)
            for s,shift,b in zip(scale, bn_shift, bias)
        ]

        return locals()


class K210Act:
    """Builds the KPU's 16-segment piecewise-linear activation table.

    Each table entry holds a segment start x (in the post-BN fixed-point
    domain), an output bias y (0..255) and a slope encoded as dy >> dxs.
    """

    def __init__(self, min_y, max_y, ty, eight_bit_mode):
        """
        min_y/max_y: float output range of the layer.
        ty:          activation name, or a (name, leaky_mul) pair carrying
                     the negative-side slope for leaky relu.
        """
        if isinstance(ty, list) or isinstance(ty, tuple):
            self.ty = ty[0]
            self.leaky_mul = ty[1]
        else:
            self.ty = ty
        self.eight_bit_mode = eight_bit_mode
        self.min_y = min_y
        self.max_y = max_y

    @staticmethod
    def leaky_relu(x, v_mul):
        return x if x >= 0 else x * v_mul

    @staticmethod
    def leaky_relu_inverse(y, v_mul):
        return y if y >= 0 else y / v_mul

    @staticmethod
    def relu_inverse(y):
        # NOTE(review): identity is only a true inverse on the y > 0 part of
        # relu; the clipped region is not recoverable -- confirm intended.
        return y

    @staticmethod
    def relu6_inverse(y):
        # NOTE(review): same caveat as relu_inverse.
        return y

    @staticmethod
    def leaky_table(min_y, max_y, v_mul):
        """(x, y, dy/dx) breakpoints for leaky relu over [min_y, max_y]."""
        range_y = max_y - min_y
        y_table = [min_y + i * range_y / 15 for i in range(15)]
        y_table.append(max_y)
        if 0 not in y_table:
            y_table.append(0)  # always include the kink at zero
        y_table = sorted(y_table)
        x_table = [K210Act.leaky_relu_inverse(it, v_mul) for it in y_table]
        dydx = [(y_table[i + 1] - y_table[i]) / (x_table[i + 1] - x_table[i]) for i in range(len(y_table) - 1)]
        return zip(x_table, y_table, dydx)

    @staticmethod
    def relu_table(min_y, max_y):
        """(x, y, dy/dx) breakpoints for relu over [min_y, max_y]."""
        range_y = max_y - min_y
        y_table = [min_y + i * range_y / 15 for i in range(15)]
        y_table.append(max_y)
        if 0 not in y_table:
            y_table.append(0)
        y_table = sorted(y_table)
        x_table = [K210Act.relu_inverse(it) for it in y_table]
        dydx = [(y_table[i + 1] - y_table[i]) / (x_table[i + 1] - x_table[i]) for i in range(len(y_table) - 1)]
        return zip(x_table, y_table, dydx)

    @staticmethod
    def relu6_table(min_y, max_y):
        """(x, y, dy/dx) breakpoints for relu6 over [min_y, max_y]."""
        range_y = max_y - min_y
        y_table = [min_y + i * range_y / 15 for i in range(15)]
        y_table.append(max_y)
        if 0 not in y_table:
            y_table.append(0)
        y_table = sorted(y_table)
        x_table = [K210Act.relu6_inverse(it) for it in y_table]
        dydx = [(y_table[i + 1] - y_table[i]) / (x_table[i + 1] - x_table[i]) for i in range(len(y_table) - 1)]
        return zip(x_table, y_table, dydx)

    @staticmethod
    def linear_table(min_y, max_y):
        """(x, y, dy/dx=1) breakpoints for the identity activation."""
        range_y = max_y - min_y
        y_table = [min_y + i * range_y / 15 for i in range(15)]
        if 0 not in y_table:
            y_table.append(0)
        y_table.append(max_y)
        y_table = sorted(y_table)
        return zip(y_table, y_table, [1] * (len(y_table) - 1))

    @staticmethod
    def find_shift(dydx):
        """Encode a positive slope as (shift, mul) with mul just below 2^14."""
        ret_shift = 0
        while abs(dydx) < (1 << 14) and dydx > 0:
            dydx = dydx * 2
            ret_shift = ret_shift + 1
        return ret_shift, dydx

    @staticmethod
    def table_to_act(act_table, min_y, max_y, eight_bit_mode, post_scale):
        """Convert float (x, y, dydx) breakpoints to hardware table entries.

        post_scale: the BN stage's fixed-point output scale; x breakpoints
        are expressed in that post-BN domain. A sentinel entry at
        x = 0x800000000 (hardware minimum) anchors the table.
        """
        def act_table_aux(x, y, dydx):
            y_scale = (max_y - min_y) / 255
            y_bias = min_y
            x_fix = x * post_scale                  # x scale
            y_fix = (y - y_bias) / y_scale          # y scale to 0~255
            dydx_fix = dydx / y_scale / post_scale  # slope, y/

            yf_q = round(y_fix)                     # y bias, 0~255
            yf_err = y_fix - yf_q                   # rounding error
            xfy = x_fix - yf_err / dydx_fix         # shift x to absorb the y error
            return xfy, yf_q, dydx_fix              # xstart, y bias, y_mul>>shift

        act_table = [(0x800000000, 0, 0)] + [act_table_aux(x, y, dydx) for x, y, dydx in act_table]

        def ret_aux(x, y, dydx):
            dxss, dys = K210Act.find_shift(dydx)
            assert (dys >= 0)
            return {'x': int(round(x)), 'y': int(round(y)), 'dxs': dxss, 'dy': int(round(dys))}

        return [ret_aux(x, y, dydx) for x, y, dydx in act_table]

    def to_kmodel_param(self, post_scale):
        """Build the activation argument dict: {'active_addr': 16 entries}.

        Supported tensorlayer activation names: relu, relu6,
        leaky_relu/lrelu, linear.

        Raises ValueError for unsupported activation types.
        """
        act_tab = None
        if self.ty == 'leaky_relu' or self.ty == 'lrelu' :
            act_tab = list(K210Act.leaky_table(self.min_y, self.max_y, self.leaky_mul))
        elif self.ty == 'relu':
            act_tab = list(K210Act.relu_table(self.min_y, self.max_y))
        elif self.ty == 'relu6':
            act_tab = list(K210Act.relu6_table(self.min_y, self.max_y))
        elif self.ty == 'linear':
            act_tab = list(K210Act.linear_table(self.min_y, self.max_y))
        else:
            # Bug fix: this was `assert ValueError(...)`, which always passed
            # (an exception instance is truthy) and let a confusing
            # `list(None)` TypeError surface below instead.
            raise ValueError('{} active is not supported.'.format(self.ty))

        active_tab = K210Act.table_to_act(list(act_tab), self.min_y, self.max_y, self.eight_bit_mode, post_scale)

        return {'active_addr': active_tab[:16]}


class K210Pool:
    """Maps a (pool_type, size, stride) triple onto the KPU pool_type code."""

    def __init__(self, pool_type, size, stride):
        self.pool_type = pool_type
        self.size = size
        self.stride = stride

    def to_kmodel_param(self):
        """Return {'pool_type': code}, or None for an unknown pool type.

        Raises KeyError when the (size, stride) combination is not
        supported by the hardware for the given pool type.
        """
        key = (self.size, self.stride)
        if self.pool_type == 'MaxPool':
            code = {(2, 2): 1, (4, 4): 3, (2, 1): 9}[key]
        elif self.pool_type == 'AvgPool':
            code = {(2, 2): 2, (4, 4): 4, (2, 1): 8}[key]
        elif self.pool_type == 'leftPool':
            code = {(2, 2): 5, (4, 4): 7}[key]
        elif self.pool_type == 'rightPool':
            code = 6
        else:
            return None
        return {'pool_type': code}


class K210_Conv_Layer:
    """One KPU layer: conv + batch-norm + activation (+ optional pooling).

    Wraps the K210Conv/K210BN/K210Act/K210Pool quantizers and serializes
    them into a kmodel layer blob via to_kmodel().
    """

    def __init__(self, iwo_minmax, ico_shapes, conv_weights_isdw, bn_mean_var_gamma_beta_epsilon, act_type,
                 pool_type_size_stride, conv_idx, output_en, quant_func, eight_bit_mode=False):
        """
        iwo_minmax:           (input, weights, output) min/max pairs.
        ico_shapes:           (input, conv, output) NHWC shapes.
        conv_weights_isdw:    (weights, is_depthwise).
        pool_type_size_stride: (type, size, stride) or None for no pooling.
        output_en:            True when the result is DMA'd back to CPU RAM.
        """
        logging.info("### init K210_Conv_Layer")
        input_min, input_max, weights_min, weights_max, output_min, output_max = iwo_minmax
        input_shape, conv_shape, output_shape = ico_shapes
        conv_weights, conv_isdw = conv_weights_isdw
        self.conv_idx = conv_idx
        # KPU consist of conv, pool, bn, act
        # Conv
        self.type = EL_K210_CONV
        self.typename = "EL_K210_CONV"
        self.conv = K210Conv(
            conv_weights,
            conv_isdw,
            eight_bit_mode, [input_shape, conv_shape],
            [input_min, input_max, weights_min, weights_max],
            quant_func
        )

        # BatchNorm
        bn_mean, bn_var, bn_gamma, bn_beta, bn_epsilon = bn_mean_var_gamma_beta_epsilon
        self.bn = K210BN(
            bn_mean,
            bn_var,
            bn_gamma,
            bn_beta,
            bn_epsilon,
            eight_bit_mode,
        )

        # Activation: also defines the layer's output quantization range.
        self.act = K210Act(output_min, output_max, act_type, eight_bit_mode=eight_bit_mode)

        if pool_type_size_stride is not None:
            pool_type, pool_size, pool_stride = pool_type_size_stride
            # SAME-padded size-2 pooling needs an even channel dimension.
            if pool_size == 2 and conv_shape[3] % 2 != 0:
                raise ValueError(
                    "this layer unsupport padding mode SAME of pooling"
                )

            #if conv_isdw and pool_size != 1:
            #    raise ValueError(
            #        'this layer not supported DepthwiseConv2d followed by pooling witch pool_size is not 1.'
            #    )

            self.pool = K210Pool(pool_type, pool_size, pool_stride)
        else:
            self.pool = None

        # Only layers that DMA their output to CPU memory reserve space.
        self.output_en = output_en
        if output_en:
            self.memsize = output_shape[1]*output_shape[2]*output_shape[3]
            self.outsize = self.memsize
        else:
            self.memsize = 0    #no need normal mem
            self.outsize = self.memsize

    @staticmethod
    def batch(iter, n=1):
        """Yield successive n-sized chunks of a sequence.

        NOTE(review): the parameter name shadows the builtin `iter`;
        renaming would change the keyword interface, so it is only flagged.
        """
        l = len(iter)
        for ndx in range(0, l, n):
            yield iter[ndx:min(ndx + n, l)]

    def to_kmodel_io_param(self):
        """Collect this layer's I/O geometry and DMA parameters.

        Returns locals() -- consumed as a dict by gen_layer_struct().
        """
        output_shape = self.conv.output_shape

        weights_shape = self.conv.weights_shape
        input_shape = self.conv.input_shape
        i_row_wid = int(input_shape[1])
        img_data_size = 1

        coef_group = 1 if i_row_wid > 32 else (2 if i_row_wid > 16 else 4)

        # io
        i_ch_num = int(weights_shape[2])
        o_ch_num = int(output_shape[3])
        # img o
        o_row_wid = int(output_shape[2])
        o_col_high = int(output_shape[1])
        wb_group = 1 if o_row_wid > 32 else (2 if o_row_wid > 16 else 4)
        wb_row_switch_addr = math.ceil(o_row_wid / 64)
        wb_channel_switch_addr = o_col_high * wb_row_switch_addr
        channel_byte_num = o_row_wid * o_col_high

        int_en = 1 if self.output_en else 0
        image_src_addr = None
        image_dst_addr = None
        dma_total_byte = o_row_wid * o_col_high * o_ch_num
        dma_burst_size = 0xf
        send_data_out = 1 if self.output_en else 0
        return locals()

    def to_kmodel(self, arg_oft, eight_bit_mode, buf_map):
        """Serialize the layer to (header, body_bytes, buf_map, (scale, bias)).

        arg_oft: byte offset of this layer's argument area inside the
                 kmodel; the sub-blobs (regs, weights, bn table, act table)
                 are aligned within it.
        """
        struct = gen_layer_struct(self, self.conv_idx)
        output_scale, output_bias = struct[1]
        cparser = cstruct.cstruct()
        cparser.load(kmodel_def)
        layer_header = cparser.kpu_model_layer_header_t()
        layer_config = layer_config_struct()    #bin
        # gen some bins
        gen_layer_code(struct, layer_config)    #kpu reg info
        gen_act_code(struct, layer_config)      #act table
        gen_bn_code(struct, layer_config)       #bn table
        gen_weights_code(struct, layer_config, eight_bit_mode)
                                                #conv weights
        layer_arg = cparser.kpu_model_conv_layer_argument_t()
        layer_arg_size = len(layer_arg.dumps())
        act_len = len(layer_config.act_arg)     
        #fill layer arg param
        layer = layer_arg
        layer.flags = 1 if self.output_en else 0
        buf_map, _, layer.main_mem_out_address = cal_in_out_addr(buf_map, self.outsize)
        # Registers are 8-byte aligned; weights/bn/act blobs 128-byte aligned.
        [layer.layer_offset, pad_layer] = align_8(arg_oft + layer_arg_size)
        [layer.weights_offset, pad_weights] = align_128(layer.layer_offset + 12*8)   
        [layer.bn_offset, pad_bn] = align_128(layer.weights_offset + layer_config.weights_len)  
        [layer.act_offset, pad_act] = align_128(layer.bn_offset + layer_config.bn_len)
       
        layer_bin = layer.dumps() + \
            (b'\0'*pad_layer) + layer_config.reg_arg + \
            (b'\0'*pad_weights) + layer_config.weights_arg + \
            (b'\0'*pad_bn) + layer_config.bn_arg + \
            (b'\0'*pad_act) + layer_config.act_arg
        layer_header.type = EL_K210_CONV
        layer_header.body_size = len(layer_bin)
        
        logging.info("###K210 Conv Layer @0x%x"%arg_oft)
        if layer.flags :
            logging.info("output en, main_mem_out_address = 0x%x"%(layer.main_mem_out_address))
        logging.info("layer_offset=0x%x, weights_offset=0x%x, bn_offset=0x%x, act_offset=0x%x"%(layer.layer_offset, layer.weights_offset, layer.bn_offset, layer.act_offset))
        
        return layer_header, layer_bin, buf_map, (output_scale, output_bias)
        

class K210_Upload_Layer:
    """Marker layer describing a feature map to upload into KPU RAM.

    Records the geometry (width/height/channel) of the tensor produced
    by the layer preceding *idx* — the data that must be uploaded.
    """
    def __init__(self, network, idx):
        self.type = EL_K210_UPLOAD
        self.typename = "EL_K210_UPLOAD"
        # Geometry comes from the OUTPUT of the previous layer (idx-1).
        prev_layer = network.all_layers[idx - 1]
        out_shape = prev_layer._nodes[0].out_tensors[0].shape
        # NOTE(review): with NHWC tensors, shape[1] is usually height and
        # shape[2] width; the original assigns them the other way round.
        # Harmless if the consumer uses the same convention — confirm.
        self.width = out_shape[1]
        self.height = out_shape[2]
        self.channel = out_shape[3]
    
    
################################################################################
# in kpu.c this does not actually add zeros; it just copies the data into KPU RAM at the right offset
class AddPadding_Layer:
    """CPU-side layer that moves a too-small feature map into KPU RAM.

    Despite the name, no zeros are written on the device side: the data
    is simply placed at the correct offsets inside KPU RAM.
    """
    def __init__(self, network, idx, tl_type_list, meta_info):
        logging.info("### init AddPadding_Layer")
        self.type = EL_K210_ADD_PADDING
        self.typename = "EL_K210_ADD_PADDING"
        in_shape = network.all_layers[idx]._nodes[0].in_tensors[0].shape
        if len(in_shape) != 4:
            raise ValueError('K210 only support 4-d input tensor!')
        self.channels = in_shape[3]
        self.memsize = in_shape[1] * in_shape[2] * in_shape[3]
        # Destination is KPU RAM, so no main-memory output is consumed.
        self.outsize = 0     #put to kpu
        logging.debug("AddPadding_Layer, channel=%d" % self.channels)

    def to_kmodel(self, arg_oft, eight_bit_mode, buf_map):
        """Serialize this layer; returns (header, body bytes, buf_map, (scale, bias))."""
        parser = cstruct.cstruct()
        parser.load(kmodel_def)
        header = parser.kpu_model_layer_header_t()
        body = parser.kpu_model_add_padding_layer_argument_t()
        # fill layer body
        body.flags = 0
        _, body.main_mem_in_address, _ = cal_in_out_addr(buf_map, self.outsize)
        body.kpu_mem_out_address = 0
        # Output now lives in KPU RAM: reset the main-memory allocator state.
        buf_map['pingpong'] = 0
        buf_map['last_addr'] = 0
        body.channels = self.channels
        # fill header
        header.type = EL_K210_ADD_PADDING
        header.body_size = len(body)
        # header, bin, memsize, (s,b)
        return header, body.dumps(), buf_map, (0, 0)
        
class RemovePadding_Layer:
    """CPU-side layer that copies a padded feature map back out of KPU RAM,
    cropped to *output_shape*."""
    def __init__(self, network, idx, tl_type_list, meta_info, output_shape):
        logging.info("### init RemovePadding_Layer")
        self.type = EL_K210_REMOVE_PADDING
        self.typename = "EL_K210_REMOVE_PADDING"
        out_tensor_shape = network.all_layers[idx]._nodes[0].out_tensors[0].shape
        if len(out_tensor_shape) != 4:
            raise ValueError('K210 only support 4-d input tensor!')
        self.channels = out_tensor_shape[3]
        # Both the working-memory footprint and the produced size are the
        # cropped (un-padded) element count.
        elems = output_shape[1] * output_shape[2] * output_shape[3]
        self.memsize = elems
        self.outsize = elems
        logging.debug("RemovePadding_Layer, channel=%d" % self.channels)

    def to_kmodel(self, arg_oft, eight_bit_mode, buf_map):
        """Serialize this layer; returns (header, body bytes, buf_map, (scale, bias))."""
        parser = cstruct.cstruct()
        parser.load(kmodel_def)
        header = parser.kpu_model_layer_header_t()
        body = parser.kpu_model_remove_padding_layer_argument_t()
        # fill layer body
        body.flags = 0
        (buf_map,
         body.main_mem_in_address,
         body.main_mem_out_address) = cal_in_out_addr(buf_map, self.outsize)
        body.channels = self.channels
        # fill header
        header.type = EL_K210_REMOVE_PADDING
        header.body_size = len(body)
        # header, bin, memsize, (s,b)
        return header, body.dumps(), buf_map, (0, 0)
        
################################################################################   
#NCHW       NHWC
#EL_K210_CONV                         :   [['Conv2d'],
#                                        ['Conv2d', 'BatchNorm'],
#                                        ['DepthwiseConv2d'],
#                                        ['DepthwiseConv2d', 'BatchNorm']],
def gen_k210_conv_layer(network, idx, tl_type_list, meta_info):
    """Convert the TL layer pattern starting at layers[idx] into K210 edge layers.

    *tl_type_list* is the matched class-name pattern, one of ['Conv2d'],
    ['Conv2d', 'BatchNorm'], ['DepthwiseConv2d'],
    ['DepthwiseConv2d', 'BatchNorm'], optionally prefixed with 'ZeroPad2d'
    for stride-2 convolutions.

    Side effects: mutates meta_info in place ('conv_idx', 'is_inai',
    'last_min', 'last_max') so the next pattern sees the correct
    quantization/placement state.

    Returns (layer_list, meta_info), where layer_list holds the K210 conv
    layer plus, when needed, upload / add-padding / remove-padding wrappers.
    """
    layer_list = []
    layers = network.all_layers

    logging.debug("gen k210 conv layer from tl_type_list: {}".format(tl_type_list))

    # check if the padding is right
    if (tl_type_list[0] == 'ZeroPad2d'):
        # Stride-2 convs must be expressed as ZeroPad2d((1,1),(1,1)) +
        # 'VALID' conv with strides (2,2) -- validate that contract.
        zeropad_layer = layers[idx]
        if zeropad_layer.layer_args['padding'] != ((1, 1), (1, 1)):
            raise ValueError('K210 assume use ((1, 1), (1, 1)) zero padding!' )
        conv_layer = layers[idx+1]
        padding = conv_layer.layer_args['padding']
        strides = conv_layer.layer_args['strides']
        if padding != 'VALID':
            raise ValueError("K210 assume conv layer after zeropad use padding = 'VALID'" )
        if strides != (2, 2):
            raise ValueError("K210 assume conv layer after zeropad use strides = (2,2)" )

        bn_layer = layers[idx+2] if len(tl_type_list) > 2 else None
        conv_isdw = (tl_type_list[1] == 'DepthwiseConv2d')
    else:
        conv_layer = layers[idx]
        padding = conv_layer.layer_args['padding']
        strides = conv_layer.layer_args['strides']
        if strides != (1, 1):
            raise ValueError("K210 assume conv layer which stride > (1,1) use ZeroPad2d ahead " )

        bn_layer = layers[idx+1] if len(tl_type_list) > 1 else None
        conv_isdw = (tl_type_list[0] == 'DepthwiseConv2d')

    # valid parm check
    if conv_layer.layer_args['dilation_rate'] != (1,1):
        raise ValueError('only support (1,1) dilation_rate!')
    if conv_layer.layer_args['data_format'] != 'channels_last':
        raise ValueError('only support channels_last data_format!')
    if conv_layer.layer_args['filter_size'] not in ((1,1), (3,3)):
        raise ValueError('only support 1x1 or 3x3 filter_size!')
    if conv_layer.layer_args['layer_type'] != 'normal':
        raise ValueError('only support normal layer_type!')

    # Conv2d or DepthwiseConv2d Layer
    input_shape = conv_layer._nodes[0].in_tensors[0].shape
    conved_shape = conv_layer._nodes[0].out_tensors[0].shape
    output_shape = conved_shape
    input_shape = input_shape.as_list()
    conved_shape = conved_shape.as_list()
    output_shape = output_shape.as_list()

    if (tl_type_list[0] == 'ZeroPad2d'):    # strip the explicit pad from the input shape
        input_shape[1] -= 2
        input_shape[2] -= 2

    if len(input_shape) != 4:
        raise ValueError('K210 only support 4-d input tensor!')
    if input_shape[3] > 1024:
        raise ValueError('K210 only support max 1024 channel feature!')

    # The KPU needs at least 4x4 feature maps: smaller maps are padded up
    # on the CPU side first (AddPadding) and cropped back after (RemovePadding).
    small_conv_flag = 0
    if input_shape[2] < 4 or input_shape[1] < 4:
        logging.info("too small conv, padding it first!")
        small_conv_flag = 1
        addpadding_layer = AddPadding_Layer(network, idx, tl_type_list, meta_info)
        if input_shape[1] < 4:
            input_shape[1] = 4
            conved_shape[1] = 4
            output_shape[1] = 4
        if input_shape[2] < 4:
            input_shape[2] = 4
            conved_shape[2] = 4
            output_shape[2] = 4

    conv_weights = conv_layer.all_weights[0].numpy()
    conv_bias = conv_layer.b.numpy() if hasattr(conv_layer, 'b') else 0
    # Channel-wise weight quantization only for non-dw conv followed by BN.
    weights_min, weights_max, _ = meta_info['quant_func'](network, conv_layer, meta_info['dataset'], is_weights=True, is_chwise=(bn_layer is not None) and (not conv_isdw))

    # Pool in Conv Layer: stride-2 conv is realized on the KPU as a
    # stride-1 conv followed by left pooling.
    stride = conv_layer.layer_args['strides']
    if stride[0] != stride[1]:
        raise ValueError('only support square stride !')
    pool_size = stride[0]   #square size
    pool_stride = stride[0] #stride step
    if stride != (1,1):
        pool_type_size_stride = ['leftPool', pool_size, pool_stride]
    else:
        pool_type_size_stride = None

    if pool_size > 1 and input_shape[1] % pool_size != 0:
        if conv_layer.layer_args['padding'] == 'SAME':
            raise ValueError("at {} unsupport padding mode SAME of pooling with size > 1".format(conv_layer.layer_args['name']))

    # BN Layer: fold the conv bias into the BN mean; without BN, emit
    # identity parameters so the KPU BN stage is a pass-through.
    if bn_layer is not None:
        bn_mean_var_gamma_beta_epsilon = [
            bn_layer.moving_mean.numpy().flatten()-(conv_bias),
            bn_layer.moving_var.numpy().flatten(),
            bn_layer.gamma.numpy().flatten(),
            bn_layer.beta.numpy().flatten(),
            bn_layer.epsilon,
        ]
    else:
        bn_mean_var_gamma_beta_epsilon = [
            np.zeros([conved_shape[3]]), np.ones([conved_shape[3]]), np.ones([conved_shape[3]]), np.zeros([conved_shape[3]]), np.zeros([conved_shape[3]])
        ]

    # Act Layer: the output range is quantized on the last layer of the
    # pattern; the activation comes from BN when present, else linear bypass.
    if bn_layer is not None:
        act_min_y, act_max_y, _ = meta_info['quant_func'](network, bn_layer, meta_info['dataset'])
        act = bn_layer.layer_args['act']
        act_type = act if act is not None else 'linear'
    else:   #no act, use linear to bypass
        act_min_y, act_max_y, _ = meta_info['quant_func'](network, conv_layer, meta_info['dataset'])
        act_type = 'linear'
    eight_bit_mode = (meta_info['quant_bit'] == 8)

    # is it need output to normal memory?
    output_en = False
    # if next layer fork to another branch, we should output it to ram
    if idx+len(tl_type_list) >= len(layers):
        output_en = True
    else:
        next_layer = layers[idx+len(tl_type_list)]
        next_layer_config = next_layer.config
        next_shape = next_layer._nodes[0].in_tensors[0].shape
        this_layer_node = layers[idx+len(tl_type_list)-1].config['args']['name'] +'_node_0'
        if (next_layer_config['prev_layer'][0] != this_layer_node):    #fork branch
            output_en = True
        # if next layer is non conv layer, output it to ram
        elif (next_layer_config['class'] != 'Conv2d') and (next_layer_config['class'] != 'DepthwiseConv2d'):
            output_en = True
        # if next layer is conv layer, but use cpu calculate
        elif next_shape[1] < 4 or next_shape[2] < 4:
            output_en = True

    if small_conv_flag == 1:    #TODO: need download to cpu ram first
        logging.info("small conv, need padding, conv_idx reset to 0")
        meta_info['conv_idx'] = 0
    else:   #k210
        if not meta_info['is_inai']:  #not in ai ram, we need upload it first
            logging.info("need upload, conv_idx reset to 0")
            meta_info['conv_idx'] = 0
    kl_args = {
        'iwo_minmax': [meta_info['last_min'], meta_info['last_max'], weights_min, weights_max, act_min_y, act_max_y],
        'ico_shapes': [input_shape, conved_shape, output_shape],
        'conv_weights_isdw':[conv_weights, conv_isdw],
        'bn_mean_var_gamma_beta_epsilon': bn_mean_var_gamma_beta_epsilon,
        'act_type': act_type,
        'pool_type_size_stride':pool_type_size_stride,
        'conv_idx':meta_info['conv_idx'],
        'output_en':output_en,
        'quant_func' : meta_info['quant_func'],
        'eight_bit_mode': eight_bit_mode,
    }
    # fix some critical condition
    # kl_args_fixed = k210_layer_post_fix(kl_args)

    # log layer info
    output_min = act_min_y
    output_max = act_max_y
    layer_shape_trans = [
        int(input_shape[1]), int(input_shape[2]), int(input_shape[3]),
        int(output_shape[1]), int(output_shape[2]), int(output_shape[3])
    ]
    if bn_layer is not None:
        output_name = bn_layer.layer_args['name']
    else:
        output_name = conv_layer.layer_args['name']
    logging.info("in min:%f, max:%f;  out min %f, max: %f"%(meta_info['last_min'], meta_info['last_max'], output_min, output_max))
    # module-level min_max_to_scale_bias (the identical nested duplicate
    # that used to shadow it here has been removed)
    input_scale, input_bias = min_max_to_scale_bias(meta_info['last_min'], meta_info['last_max'])
    output_scale, output_bias = min_max_to_scale_bias(output_min, output_max)

    logging.info("**********gen_conv_layer")
    logging.info('     shape(HWC): {}x{}x{} ==> {}x{}x{}'.format(*layer_shape_trans))
    logging.info('     scale,bias: ({},{}) ==> ({},{})'.format(input_scale, input_bias, output_scale, output_bias))

    #convert to k210 layer
    kconv_layer = K210_Conv_Layer(**kl_args)

    if small_conv_flag == 1:    #TODO: need download to cpu ram first
        removepadding_layer = RemovePadding_Layer(network, idx, tl_type_list, meta_info, output_shape)
        layer_list.append(addpadding_layer)
        layer_list.append(kconv_layer)
        layer_list.append(removepadding_layer)
        meta_info['conv_idx']   = 0
        meta_info['is_inai']    = False
    else:   #k210
        if not meta_info['is_inai']:  #not in ai ram, we need upload it first
            # NOTE(review): Upload_Layer is expected to come from the
            # edge_layer star-import; confirm it is not meant to be the
            # local K210_Upload_Layer defined above.
            upload_layer = Upload_Layer(network, idx-1)
            layer_list.append(upload_layer)
            layer_list.append(kconv_layer)
            meta_info['conv_idx']   = 0
            meta_info['is_inai']    = True
        else:   #in ai ram already
            layer_list.append(kconv_layer)
            meta_info['conv_idx']   += 1
            meta_info['is_inai']    = True
    # The quantized output range becomes the next layer's input range
    # (identical in all three branches above, so hoisted here).
    meta_info['last_min']   = act_min_y
    meta_info['last_max']   = act_max_y

    return layer_list, meta_info


def k210_layer_post_fix(el_list):
    """Post-process the generated edge-layer list in place.

    The only fix currently applied (fix_dw_with_strde2) works around a
    K210 chip quirk: a depthwise conv with stride 2 cannot apply left
    pooling itself, so its downsampling is deferred to a following
    non-dw 1x1 conv layer, whose shapes are expanded to compensate.
    Mutates the K210 conv layer objects in *el_list*; returns None.
    """
    def expand_wh(shape_):
        # Double H and W of an NHWC shape list; N and C are unchanged.
        shape_1 = shape_[1] * 2
        shape_2 = shape_[2] * 2
        return [shape_[0], shape_1, shape_2, shape_[3]]
    def fix_dw_with_strde2(el_list):
        # True while a stride-2 dw conv's downsampling still has to be
        # realized by a later layer's left pooling.
        lack_of_left_pooling = False
        # NOTE(review): starts True and is only ever set False (never
        # reset) once a non-conv layer is seen — confirm that is intended.
        last_is_conv = True
        for index in range(len(el_list)):
            el = el_list[index]
            logging.debug("Layer %d:"%index)
            if el.type == EL_K210_CONV :
                conv_part = el.conv
                pool_part = el.pool
                input_shape = conv_part.input_shape
                output_shape = conv_part.output_shape
                conv_shape = output_shape
                conv_weights = conv_part.weights
                conv_isdw = conv_part.depth_wise_layer
                if pool_part is not None:
                    pool_type_size_stride = [pool_part.pool_type, pool_part.size, pool_part.stride]
                else:
                    pool_type_size_stride = None
                conv_kernel_size = int(conv_weights.shape[0])
                # Infer the stride from the input/output width ratio; the
                # +1 makes odd widths round up the same way the HW does.
                conv_stride = int((int(input_shape[2]) + 1) / int(conv_shape[2]))

                logging.debug("conv_stride=%d, conv_isdw=%d, conv_kernel_size=%d, pool==None:%d"%(conv_stride, conv_isdw, conv_kernel_size, (pool_type_size_stride is None)))

                if lack_of_left_pooling:
                    if last_is_conv == False:
                        raise ValueError('run fix_dw_with_strde2 failed, last not conv layer')
                    if not conv_isdw and conv_kernel_size == 1 and pool_type_size_stride is None:
                        # fix in current layer: take over the deferred
                        # downsampling with an explicit 2x2 left pool.
                        input_shape = expand_wh(input_shape)
                        conv_shape = expand_wh(conv_shape)
                        lack_of_left_pooling = False
                        el.pool = K210Pool('leftPool', 2, 2)
                        #el.conv.output_shape = conv_shape
                        el.conv.input_shape = input_shape
                        logging.debug("Fixed: in normal 1x1 conv")
                    else:
                        if not (conv_kernel_size == 1 and pool_type_size_stride is None):
                            raise ValueError(
                                'run fix_dw_with_strde2 failed. ' +
                                'can not delay left_pooling over current layer, ' +
                                'current layer conv_kernel_size:{}, pool_type_size_stride:{}' \
                                .format(conv_kernel_size, pool_type_size_stride)
                            )

                        # delay fix in after layers
                        # NOTE(review): conv_shape/output_shape are expanded
                        # here but only input_shape is written back (the
                        # output_shape write is commented out) — confirm
                        # this asymmetry is intentional.
                        input_shape = expand_wh(input_shape)
                        conv_shape = expand_wh(conv_shape)
                        output_shape = expand_wh(output_shape)
                        el.conv.input_shape = input_shape
                        #el.conv.output_shape = conv_shape
                        logging.debug("Fixed: in next dw 1x1 conv")

                if conv_stride == 2:
                    if not conv_isdw:
                        # non-dw stride-2 convs were already handled earlier
                        logging.debug("we have done before")
                    else:
                        # dw layer needs to fix it later, it is chip bug/feature
                        lack_of_left_pooling = True
                        el.pool = None
                        conv_shape = expand_wh(conv_shape)
                        output_shape = expand_wh(output_shape)
                        el.conv.output_shape = conv_shape
                        logging.debug("dw conv stride fix in later layer")
                elif conv_stride != 1:
                    raise ValueError('unsupported stride!')
                else :
                    logging.debug("stride == 1, nothing to do")
            else:
                last_is_conv = False
                logging.debug("Not Conv Layer")

        # A deferred downsampling that never found a host layer is fatal.
        if lack_of_left_pooling:
            raise ValueError('run fix_dw_with_strde2 failed. no more layers for fix.')
        return

    logging.debug("----Start fix stride----")
    fix_dw_with_strde2(el_list)
    logging.debug("----End fix stride----")
    return